{ "best_metric": 0.04665221, "best_model_checkpoint": "/home/sushant/D1/MIUA/kvasir-format/training2/v0-20250204-233638/checkpoint-65200", "epoch": 5.0, "eval_steps": 200, "global_step": 65595, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.622532205198568e-05, "grad_norm": 4.528491020202637, "learning_rate": 9.999999994265464e-05, "loss": 1.9246152639389038, "memory(GiB)": 27.45, "step": 1, "token_acc": 0.6773049645390071, "train_speed(iter/s)": 0.144363 }, { "epoch": 0.00038112661025992833, "grad_norm": 1.751957893371582, "learning_rate": 9.999999856636602e-05, "loss": 1.202160120010376, "memory(GiB)": 55.68, "step": 5, "token_acc": 0.6860735910572893, "train_speed(iter/s)": 0.253992 }, { "epoch": 0.0007622532205198567, "grad_norm": 0.46129679679870605, "learning_rate": 9.999999426546414e-05, "loss": 0.6056160449981689, "memory(GiB)": 75.01, "step": 10, "token_acc": 0.8009572072072072, "train_speed(iter/s)": 0.261969 }, { "epoch": 0.001143379830779785, "grad_norm": 0.3930913209915161, "learning_rate": 9.999998709729462e-05, "loss": 0.626449728012085, "memory(GiB)": 75.01, "step": 15, "token_acc": 0.7716919739696312, "train_speed(iter/s)": 0.281731 }, { "epoch": 0.0015245064410397133, "grad_norm": 0.5390567183494568, "learning_rate": 9.999997706185787e-05, "loss": 0.510352373123169, "memory(GiB)": 75.01, "step": 20, "token_acc": 0.7957918467029871, "train_speed(iter/s)": 0.286165 }, { "epoch": 0.0019056330512996417, "grad_norm": 0.3768530488014221, "learning_rate": 9.999996415915447e-05, "loss": 0.47327170372009275, "memory(GiB)": 75.01, "step": 25, "token_acc": 0.8311387900355872, "train_speed(iter/s)": 0.286581 }, { "epoch": 0.00228675966155957, "grad_norm": 0.9930228590965271, "learning_rate": 9.999994838918515e-05, "loss": 0.5166455745697022, "memory(GiB)": 75.01, "step": 30, "token_acc": 0.81693028705331, "train_speed(iter/s)": 0.292458 }, { "epoch": 0.0026678862718194983, "grad_norm": 0.2230115830898285, "learning_rate": 9.99999297519508e-05, "loss": 0.4076542854309082, "memory(GiB)": 101.64, "step": 35, "token_acc": 0.8522645578720345, "train_speed(iter/s)": 0.282366 }, { "epoch": 0.0030490128820794267, "grad_norm": 0.5454061031341553, "learning_rate": 9.999990824745254e-05, "loss": 0.45731205940246583, "memory(GiB)": 101.64, "step": 40, "token_acc": 0.8287697647512534, "train_speed(iter/s)": 0.284384 }, { "epoch": 0.003430139492339355, "grad_norm": 0.6889848709106445, "learning_rate": 9.999988387569155e-05, "loss": 0.42831859588623045, "memory(GiB)": 101.64, "step": 45, "token_acc": 0.852589641434263, "train_speed(iter/s)": 0.290818 }, { "epoch": 0.0038112661025992835, "grad_norm": 0.5877910256385803, "learning_rate": 9.999985663666924e-05, "loss": 0.4950111389160156, "memory(GiB)": 101.64, "step": 50, "token_acc": 0.8203249442497611, "train_speed(iter/s)": 0.288583 }, { "epoch": 0.004192392712859211, "grad_norm": 0.40256166458129883, "learning_rate": 9.99998265303872e-05, "loss": 0.5511288166046142, "memory(GiB)": 101.64, "step": 55, "token_acc": 0.7799592517132803, "train_speed(iter/s)": 0.28885 }, { "epoch": 0.00457351932311914, "grad_norm": 0.490213006734848, "learning_rate": 9.999979355684712e-05, "loss": 0.4428769588470459, "memory(GiB)": 122.96, "step": 60, "token_acc": 0.8411040656471466, "train_speed(iter/s)": 0.287094 }, { "epoch": 0.004954645933379068, "grad_norm": 0.7190462350845337, "learning_rate": 9.999975771605092e-05, "loss": 0.4864951133728027, "memory(GiB)": 122.96, "step": 65, "token_acc": 0.8173539518900343, "train_speed(iter/s)": 0.285589 }, { "epoch": 0.005335772543638997, "grad_norm": 0.6336663365364075, "learning_rate": 9.999971900800063e-05, "loss": 0.3921785354614258, "memory(GiB)": 122.96, "step": 70, "token_acc": 0.8481855764813964, "train_speed(iter/s)": 0.286862 }, { "epoch": 0.005716899153898925, "grad_norm": 0.25349387526512146, "learning_rate": 9.99996774326985e-05, "loss": 0.4328573226928711, "memory(GiB)": 122.96, "step": 75, "token_acc": 0.8483591097698981, "train_speed(iter/s)": 0.286473 }, { "epoch": 0.006098025764158853, "grad_norm": 0.8416159152984619, "learning_rate": 9.999963299014687e-05, "loss": 0.4801319122314453, "memory(GiB)": 122.96, "step": 80, "token_acc": 0.8177020506634499, "train_speed(iter/s)": 0.286314 }, { "epoch": 0.006479152374418782, "grad_norm": 1.5706214904785156, "learning_rate": 9.999958568034832e-05, "loss": 0.5126121044158936, "memory(GiB)": 122.96, "step": 85, "token_acc": 0.7738552953512757, "train_speed(iter/s)": 0.288732 }, { "epoch": 0.00686027898467871, "grad_norm": 0.3812929391860962, "learning_rate": 9.999953550330556e-05, "loss": 0.4611966133117676, "memory(GiB)": 122.96, "step": 90, "token_acc": 0.8080450187578158, "train_speed(iter/s)": 0.289571 }, { "epoch": 0.0072414055949386385, "grad_norm": 0.5538367629051208, "learning_rate": 9.999948245902148e-05, "loss": 0.49420366287231443, "memory(GiB)": 122.96, "step": 95, "token_acc": 0.8193152214365463, "train_speed(iter/s)": 0.290225 }, { "epoch": 0.007622532205198567, "grad_norm": 0.6944217085838318, "learning_rate": 9.999942654749909e-05, "loss": 0.5629151344299317, "memory(GiB)": 122.96, "step": 100, "token_acc": 0.7596277278562259, "train_speed(iter/s)": 0.291891 }, { "epoch": 0.008003658815458496, "grad_norm": 0.43158140778541565, "learning_rate": 9.999936776874162e-05, "loss": 0.4801185607910156, "memory(GiB)": 122.96, "step": 105, "token_acc": 0.8235092529129541, "train_speed(iter/s)": 0.293535 }, { "epoch": 0.008384785425718423, "grad_norm": 0.5736222863197327, "learning_rate": 9.999930612275243e-05, "loss": 0.38431410789489745, "memory(GiB)": 122.96, "step": 110, "token_acc": 0.8450786255096098, "train_speed(iter/s)": 0.296656 }, { "epoch": 0.008765912035978351, "grad_norm": 0.6656768321990967, "learning_rate": 9.999924160953506e-05, "loss": 0.38381361961364746, "memory(GiB)": 122.96, "step": 115, "token_acc": 0.8520732162868883, "train_speed(iter/s)": 0.298319 }, { "epoch": 0.00914703864623828, "grad_norm": 1.1191989183425903, "learning_rate": 9.999917422909322e-05, "loss": 0.45654473304748533, "memory(GiB)": 122.96, "step": 120, "token_acc": 0.8286196707471507, "train_speed(iter/s)": 0.299776 }, { "epoch": 0.009528165256498208, "grad_norm": 1.0045593976974487, "learning_rate": 9.999910398143075e-05, "loss": 0.4776431083679199, "memory(GiB)": 122.96, "step": 125, "token_acc": 0.8123598305507101, "train_speed(iter/s)": 0.3002 }, { "epoch": 0.009909291866758136, "grad_norm": 0.7601901888847351, "learning_rate": 9.999903086655171e-05, "loss": 0.4598785400390625, "memory(GiB)": 122.96, "step": 130, "token_acc": 0.8288269331930563, "train_speed(iter/s)": 0.298578 }, { "epoch": 0.010290418477018065, "grad_norm": 0.42522627115249634, "learning_rate": 9.999895488446025e-05, "loss": 0.3817711591720581, "memory(GiB)": 122.96, "step": 135, "token_acc": 0.8470910752360645, "train_speed(iter/s)": 0.298282 }, { "epoch": 0.010671545087277993, "grad_norm": 0.44290637969970703, "learning_rate": 9.999887603516075e-05, "loss": 0.4092557430267334, "memory(GiB)": 122.96, "step": 140, "token_acc": 0.8293347873500545, "train_speed(iter/s)": 0.29882 }, { "epoch": 0.011052671697537922, "grad_norm": 0.48456859588623047, "learning_rate": 9.999879431865775e-05, "loss": 0.4724125385284424, "memory(GiB)": 122.96, "step": 145, "token_acc": 0.8090737240075614, "train_speed(iter/s)": 0.298429 }, { "epoch": 0.01143379830779785, "grad_norm": 0.5638869404792786, "learning_rate": 9.99987097349559e-05, "loss": 0.4015656471252441, "memory(GiB)": 122.96, "step": 150, "token_acc": 0.832496971794428, "train_speed(iter/s)": 0.298221 }, { "epoch": 0.011814924918057778, "grad_norm": 0.6350157260894775, "learning_rate": 9.99986222840601e-05, "loss": 0.3583400011062622, "memory(GiB)": 122.96, "step": 155, "token_acc": 0.8633340052408789, "train_speed(iter/s)": 0.298018 }, { "epoch": 0.012196051528317707, "grad_norm": 0.7299162745475769, "learning_rate": 9.999853196597531e-05, "loss": 0.41852712631225586, "memory(GiB)": 122.96, "step": 160, "token_acc": 0.7804347826086957, "train_speed(iter/s)": 0.298977 }, { "epoch": 0.012577178138577635, "grad_norm": 0.8121874928474426, "learning_rate": 9.999843878070673e-05, "loss": 0.4489565849304199, "memory(GiB)": 122.96, "step": 165, "token_acc": 0.8196943972835314, "train_speed(iter/s)": 0.29989 }, { "epoch": 0.012958304748837563, "grad_norm": 0.405847430229187, "learning_rate": 9.999834272825971e-05, "loss": 0.3791668176651001, "memory(GiB)": 122.96, "step": 170, "token_acc": 0.8517262023548194, "train_speed(iter/s)": 0.299447 }, { "epoch": 0.013339431359097492, "grad_norm": 0.8079923391342163, "learning_rate": 9.999824380863975e-05, "loss": 0.4233840465545654, "memory(GiB)": 122.96, "step": 175, "token_acc": 0.8390994657847876, "train_speed(iter/s)": 0.296935 }, { "epoch": 0.01372055796935742, "grad_norm": 1.0664503574371338, "learning_rate": 9.999814202185254e-05, "loss": 0.5039567470550537, "memory(GiB)": 122.96, "step": 180, "token_acc": 0.813451425408248, "train_speed(iter/s)": 0.297514 }, { "epoch": 0.014101684579617349, "grad_norm": 0.9102248549461365, "learning_rate": 9.999803736790391e-05, "loss": 0.45125503540039064, "memory(GiB)": 122.96, "step": 185, "token_acc": 0.8211943389713496, "train_speed(iter/s)": 0.298491 }, { "epoch": 0.014482811189877277, "grad_norm": 0.5232564210891724, "learning_rate": 9.999792984679986e-05, "loss": 0.3918847799301147, "memory(GiB)": 122.96, "step": 190, "token_acc": 0.836739843552864, "train_speed(iter/s)": 0.298917 }, { "epoch": 0.014863937800137205, "grad_norm": 0.7289205193519592, "learning_rate": 9.999781945854652e-05, "loss": 0.47884335517883303, "memory(GiB)": 122.96, "step": 195, "token_acc": 0.8149618320610686, "train_speed(iter/s)": 0.298475 }, { "epoch": 0.015245064410397134, "grad_norm": 0.34483739733695984, "learning_rate": 9.999770620315028e-05, "loss": 0.3874505996704102, "memory(GiB)": 122.96, "step": 200, "token_acc": 0.8417340191036002, "train_speed(iter/s)": 0.298135 }, { "epoch": 0.015245064410397134, "eval_loss": 0.35053905844688416, "eval_runtime": 194.0246, "eval_samples_per_second": 2.732, "eval_steps_per_second": 2.732, "eval_token_acc": 0.8361996265285224, "step": 200 }, { "epoch": 0.015626191020657064, "grad_norm": 0.869731068611145, "learning_rate": 9.99975900806176e-05, "loss": 0.48143720626831055, "memory(GiB)": 122.96, "step": 205, "token_acc": 0.8356118754525707, "train_speed(iter/s)": 0.232612 }, { "epoch": 0.016007317630916992, "grad_norm": 1.4615966081619263, "learning_rate": 9.999747109095514e-05, "loss": 0.4894747734069824, "memory(GiB)": 122.96, "step": 210, "token_acc": 0.7937556289402582, "train_speed(iter/s)": 0.234395 }, { "epoch": 0.016388444241176917, "grad_norm": 1.5600606203079224, "learning_rate": 9.999734923416974e-05, "loss": 0.4120072841644287, "memory(GiB)": 122.96, "step": 215, "token_acc": 0.8441496163682864, "train_speed(iter/s)": 0.235235 }, { "epoch": 0.016769570851436846, "grad_norm": 0.9481793642044067, "learning_rate": 9.999722451026837e-05, "loss": 0.44687700271606445, "memory(GiB)": 122.96, "step": 220, "token_acc": 0.8321864058915852, "train_speed(iter/s)": 0.235175 }, { "epoch": 0.017150697461696774, "grad_norm": 0.8998978137969971, "learning_rate": 9.999709691925818e-05, "loss": 0.44192066192626955, "memory(GiB)": 122.96, "step": 225, "token_acc": 0.8430707876370888, "train_speed(iter/s)": 0.236345 }, { "epoch": 0.017531824071956702, "grad_norm": 0.6256055235862732, "learning_rate": 9.999696646114651e-05, "loss": 0.42284693717956545, "memory(GiB)": 122.96, "step": 230, "token_acc": 0.8319887758681165, "train_speed(iter/s)": 0.237912 }, { "epoch": 0.01791295068221663, "grad_norm": 1.3327051401138306, "learning_rate": 9.999683313594083e-05, "loss": 0.3381479263305664, "memory(GiB)": 122.96, "step": 235, "token_acc": 0.8573446327683616, "train_speed(iter/s)": 0.238446 }, { "epoch": 0.01829407729247656, "grad_norm": 1.0581331253051758, "learning_rate": 9.999669694364878e-05, "loss": 0.4608944892883301, "memory(GiB)": 122.96, "step": 240, "token_acc": 0.8080599328338931, "train_speed(iter/s)": 0.239826 }, { "epoch": 0.018675203902736488, "grad_norm": 1.003026008605957, "learning_rate": 9.999655788427817e-05, "loss": 0.4396797180175781, "memory(GiB)": 122.96, "step": 245, "token_acc": 0.82760663507109, "train_speed(iter/s)": 0.241188 }, { "epoch": 0.019056330512996416, "grad_norm": 2.65301251411438, "learning_rate": 9.999641595783699e-05, "loss": 0.4889055252075195, "memory(GiB)": 122.96, "step": 250, "token_acc": 0.7832465688594415, "train_speed(iter/s)": 0.24279 }, { "epoch": 0.019437457123256344, "grad_norm": 0.6014110445976257, "learning_rate": 9.999627116433335e-05, "loss": 0.36986827850341797, "memory(GiB)": 122.96, "step": 255, "token_acc": 0.8325727324586423, "train_speed(iter/s)": 0.244074 }, { "epoch": 0.019818583733516273, "grad_norm": 0.6963219046592712, "learning_rate": 9.999612350377559e-05, "loss": 0.40722270011901857, "memory(GiB)": 122.96, "step": 260, "token_acc": 0.8420222947866232, "train_speed(iter/s)": 0.244368 }, { "epoch": 0.0201997103437762, "grad_norm": 0.495159387588501, "learning_rate": 9.999597297617213e-05, "loss": 0.3730503797531128, "memory(GiB)": 122.96, "step": 265, "token_acc": 0.8629600626468285, "train_speed(iter/s)": 0.245124 }, { "epoch": 0.02058083695403613, "grad_norm": 0.43080323934555054, "learning_rate": 9.999581958153165e-05, "loss": 0.4320246696472168, "memory(GiB)": 122.96, "step": 270, "token_acc": 0.8030902822117226, "train_speed(iter/s)": 0.246039 }, { "epoch": 0.020961963564296058, "grad_norm": 0.9531773924827576, "learning_rate": 9.999566331986293e-05, "loss": 0.4725799560546875, "memory(GiB)": 122.96, "step": 275, "token_acc": 0.784859448928472, "train_speed(iter/s)": 0.247121 }, { "epoch": 0.021343090174555986, "grad_norm": 1.5243268013000488, "learning_rate": 9.999550419117489e-05, "loss": 0.3928366661071777, "memory(GiB)": 122.96, "step": 280, "token_acc": 0.827170582226762, "train_speed(iter/s)": 0.248078 }, { "epoch": 0.021724216784815915, "grad_norm": 0.95951247215271, "learning_rate": 9.999534219547673e-05, "loss": 0.37721011638641355, "memory(GiB)": 122.96, "step": 285, "token_acc": 0.8557114228456913, "train_speed(iter/s)": 0.248213 }, { "epoch": 0.022105343395075843, "grad_norm": 0.7358139753341675, "learning_rate": 9.999517733277769e-05, "loss": 0.33346145153045653, "memory(GiB)": 122.96, "step": 290, "token_acc": 0.8577140953015662, "train_speed(iter/s)": 0.249156 }, { "epoch": 0.02248647000533577, "grad_norm": 0.4751031696796417, "learning_rate": 9.999500960308723e-05, "loss": 0.3752711772918701, "memory(GiB)": 122.96, "step": 295, "token_acc": 0.8514014839241549, "train_speed(iter/s)": 0.249662 }, { "epoch": 0.0228675966155957, "grad_norm": 0.6474721431732178, "learning_rate": 9.999483900641498e-05, "loss": 0.3588348388671875, "memory(GiB)": 122.96, "step": 300, "token_acc": 0.8601759175007583, "train_speed(iter/s)": 0.250854 }, { "epoch": 0.023248723225855628, "grad_norm": 0.7326136827468872, "learning_rate": 9.999466554277072e-05, "loss": 0.4338569164276123, "memory(GiB)": 122.96, "step": 305, "token_acc": 0.8326075122482308, "train_speed(iter/s)": 0.251247 }, { "epoch": 0.023629849836115557, "grad_norm": 0.4683168828487396, "learning_rate": 9.999448921216438e-05, "loss": 0.4193264484405518, "memory(GiB)": 122.96, "step": 310, "token_acc": 0.8413978494623656, "train_speed(iter/s)": 0.251192 }, { "epoch": 0.024010976446375485, "grad_norm": 0.46098604798316956, "learning_rate": 9.999431001460611e-05, "loss": 0.4343080997467041, "memory(GiB)": 122.96, "step": 315, "token_acc": 0.8151284627529783, "train_speed(iter/s)": 0.251629 }, { "epoch": 0.024392103056635413, "grad_norm": 0.5721769332885742, "learning_rate": 9.999412795010615e-05, "loss": 0.429913330078125, "memory(GiB)": 122.96, "step": 320, "token_acc": 0.8274948483956432, "train_speed(iter/s)": 0.252658 }, { "epoch": 0.024773229666895342, "grad_norm": 0.30305206775665283, "learning_rate": 9.999394301867495e-05, "loss": 0.47609596252441405, "memory(GiB)": 122.96, "step": 325, "token_acc": 0.8352841800410281, "train_speed(iter/s)": 0.252577 }, { "epoch": 0.02515435627715527, "grad_norm": 0.38029199838638306, "learning_rate": 9.999375522032313e-05, "loss": 0.35333943367004395, "memory(GiB)": 122.96, "step": 330, "token_acc": 0.8452774715178243, "train_speed(iter/s)": 0.253365 }, { "epoch": 0.0255354828874152, "grad_norm": 0.9743106365203857, "learning_rate": 9.999356455506143e-05, "loss": 0.4180303573608398, "memory(GiB)": 122.96, "step": 335, "token_acc": 0.8379776405806775, "train_speed(iter/s)": 0.253802 }, { "epoch": 0.025916609497675127, "grad_norm": 1.1414568424224854, "learning_rate": 9.999337102290083e-05, "loss": 0.4610589981079102, "memory(GiB)": 122.96, "step": 340, "token_acc": 0.8129402556744064, "train_speed(iter/s)": 0.254605 }, { "epoch": 0.026297736107935055, "grad_norm": 0.7228065729141235, "learning_rate": 9.999317462385238e-05, "loss": 0.4676199913024902, "memory(GiB)": 122.96, "step": 345, "token_acc": 0.7753973738769868, "train_speed(iter/s)": 0.255637 }, { "epoch": 0.026678862718194984, "grad_norm": 0.38195085525512695, "learning_rate": 9.999297535792736e-05, "loss": 0.3708998203277588, "memory(GiB)": 122.96, "step": 350, "token_acc": 0.8554350505299483, "train_speed(iter/s)": 0.25576 }, { "epoch": 0.027059989328454912, "grad_norm": 0.4837827682495117, "learning_rate": 9.99927732251372e-05, "loss": 0.42815542221069336, "memory(GiB)": 122.96, "step": 355, "token_acc": 0.8350820006111846, "train_speed(iter/s)": 0.255386 }, { "epoch": 0.02744111593871484, "grad_norm": 0.5391905903816223, "learning_rate": 9.999256822549349e-05, "loss": 0.41931886672973634, "memory(GiB)": 122.96, "step": 360, "token_acc": 0.8338378206149435, "train_speed(iter/s)": 0.255675 }, { "epoch": 0.02782224254897477, "grad_norm": 0.5956388115882874, "learning_rate": 9.999236035900799e-05, "loss": 0.4532865047454834, "memory(GiB)": 122.96, "step": 365, "token_acc": 0.8296431362333941, "train_speed(iter/s)": 0.256309 }, { "epoch": 0.028203369159234697, "grad_norm": 0.6002485156059265, "learning_rate": 9.999214962569261e-05, "loss": 0.38429875373840333, "memory(GiB)": 122.96, "step": 370, "token_acc": 0.8312441534144059, "train_speed(iter/s)": 0.256908 }, { "epoch": 0.028584495769494626, "grad_norm": 0.7980237603187561, "learning_rate": 9.999193602555946e-05, "loss": 0.38420839309692384, "memory(GiB)": 122.96, "step": 375, "token_acc": 0.8263841421736159, "train_speed(iter/s)": 0.257219 }, { "epoch": 0.028965622379754554, "grad_norm": 1.9750065803527832, "learning_rate": 9.999171955862075e-05, "loss": 0.39796831607818606, "memory(GiB)": 122.96, "step": 380, "token_acc": 0.8312723722746502, "train_speed(iter/s)": 0.257562 }, { "epoch": 0.029346748990014482, "grad_norm": 0.6654983162879944, "learning_rate": 9.999150022488891e-05, "loss": 0.3543954372406006, "memory(GiB)": 122.96, "step": 385, "token_acc": 0.8525121555915721, "train_speed(iter/s)": 0.258147 }, { "epoch": 0.02972787560027441, "grad_norm": 0.6314485669136047, "learning_rate": 9.999127802437654e-05, "loss": 0.420991039276123, "memory(GiB)": 122.96, "step": 390, "token_acc": 0.8441044579266684, "train_speed(iter/s)": 0.258796 }, { "epoch": 0.03010900221053434, "grad_norm": 0.7217302322387695, "learning_rate": 9.999105295709635e-05, "loss": 0.34807798862457273, "memory(GiB)": 122.96, "step": 395, "token_acc": 0.8563394683026585, "train_speed(iter/s)": 0.259362 }, { "epoch": 0.030490128820794268, "grad_norm": 0.8781742453575134, "learning_rate": 9.999082502306126e-05, "loss": 0.44669809341430666, "memory(GiB)": 122.96, "step": 400, "token_acc": 0.7998065764023211, "train_speed(iter/s)": 0.260001 }, { "epoch": 0.030490128820794268, "eval_loss": 0.31395217776298523, "eval_runtime": 184.9028, "eval_samples_per_second": 2.866, "eval_steps_per_second": 2.866, "eval_token_acc": 0.8472531775194265, "step": 400 }, { "epoch": 0.030871255431054196, "grad_norm": 1.139358401298523, "learning_rate": 9.999059422228434e-05, "loss": 0.48725070953369143, "memory(GiB)": 122.96, "step": 405, "token_acc": 0.8459585915635554, "train_speed(iter/s)": 0.232722 }, { "epoch": 0.03125238204131413, "grad_norm": 0.7483011484146118, "learning_rate": 9.999036055477883e-05, "loss": 0.33954520225524903, "memory(GiB)": 122.96, "step": 410, "token_acc": 0.8506464556397682, "train_speed(iter/s)": 0.233269 }, { "epoch": 0.031633508651574056, "grad_norm": 0.6765672564506531, "learning_rate": 9.999012402055812e-05, "loss": 0.3194799184799194, "memory(GiB)": 122.96, "step": 415, "token_acc": 0.8708342409061207, "train_speed(iter/s)": 0.234019 }, { "epoch": 0.032014635261833985, "grad_norm": 0.7512357831001282, "learning_rate": 9.998988461963578e-05, "loss": 0.4671616554260254, "memory(GiB)": 122.96, "step": 420, "token_acc": 0.8207862818904225, "train_speed(iter/s)": 0.234798 }, { "epoch": 0.03239576187209391, "grad_norm": 0.8870928287506104, "learning_rate": 9.998964235202554e-05, "loss": 0.32560741901397705, "memory(GiB)": 122.96, "step": 425, "token_acc": 0.86752281176073, "train_speed(iter/s)": 0.235599 }, { "epoch": 0.032776888482353835, "grad_norm": 0.7087309956550598, "learning_rate": 9.99893972177413e-05, "loss": 0.3868344783782959, "memory(GiB)": 122.96, "step": 430, "token_acc": 0.8306651262584585, "train_speed(iter/s)": 0.236148 }, { "epoch": 0.03315801509261376, "grad_norm": 0.4838944673538208, "learning_rate": 9.998914921679712e-05, "loss": 0.41447787284851073, "memory(GiB)": 122.96, "step": 435, "token_acc": 0.8408316291751875, "train_speed(iter/s)": 0.236785 }, { "epoch": 0.03353914170287369, "grad_norm": 1.3201626539230347, "learning_rate": 9.998889834920718e-05, "loss": 0.3463392496109009, "memory(GiB)": 122.96, "step": 440, "token_acc": 0.8546085615117625, "train_speed(iter/s)": 0.237625 }, { "epoch": 0.03392026831313362, "grad_norm": 1.3567754030227661, "learning_rate": 9.998864461498592e-05, "loss": 0.3136441707611084, "memory(GiB)": 122.96, "step": 445, "token_acc": 0.8744167962674961, "train_speed(iter/s)": 0.238504 }, { "epoch": 0.03430139492339355, "grad_norm": 0.6349946856498718, "learning_rate": 9.998838801414785e-05, "loss": 0.3825905084609985, "memory(GiB)": 122.96, "step": 450, "token_acc": 0.8521446007206789, "train_speed(iter/s)": 0.238717 }, { "epoch": 0.034682521533653476, "grad_norm": 0.7947877049446106, "learning_rate": 9.998812854670772e-05, "loss": 0.4890433311462402, "memory(GiB)": 122.96, "step": 455, "token_acc": 0.8064168819982773, "train_speed(iter/s)": 0.239305 }, { "epoch": 0.035063648143913405, "grad_norm": 1.0431760549545288, "learning_rate": 9.998786621268038e-05, "loss": 0.36441137790679934, "memory(GiB)": 122.96, "step": 460, "token_acc": 0.8524590163934426, "train_speed(iter/s)": 0.239876 }, { "epoch": 0.03544477475417333, "grad_norm": 1.1932413578033447, "learning_rate": 9.998760101208087e-05, "loss": 0.41066956520080566, "memory(GiB)": 122.96, "step": 465, "token_acc": 0.8544642857142857, "train_speed(iter/s)": 0.240276 }, { "epoch": 0.03582590136443326, "grad_norm": 1.577191710472107, "learning_rate": 9.998733294492444e-05, "loss": 0.3186405897140503, "memory(GiB)": 122.96, "step": 470, "token_acc": 0.8375808292126283, "train_speed(iter/s)": 0.241127 }, { "epoch": 0.03620702797469319, "grad_norm": 0.7229084372520447, "learning_rate": 9.998706201122641e-05, "loss": 0.42525033950805663, "memory(GiB)": 122.96, "step": 475, "token_acc": 0.8299128479055383, "train_speed(iter/s)": 0.241785 }, { "epoch": 0.03658815458495312, "grad_norm": 0.41239479184150696, "learning_rate": 9.998678821100235e-05, "loss": 0.3694032669067383, "memory(GiB)": 122.96, "step": 480, "token_acc": 0.8582842724978974, "train_speed(iter/s)": 0.242385 }, { "epoch": 0.03696928119521305, "grad_norm": 1.160628318786621, "learning_rate": 9.998651154426796e-05, "loss": 0.39645626544952395, "memory(GiB)": 122.96, "step": 485, "token_acc": 0.8437158469945355, "train_speed(iter/s)": 0.24281 }, { "epoch": 0.037350407805472975, "grad_norm": 0.5555111169815063, "learning_rate": 9.99862320110391e-05, "loss": 0.31522388458251954, "memory(GiB)": 122.96, "step": 490, "token_acc": 0.8616138763197587, "train_speed(iter/s)": 0.243348 }, { "epoch": 0.037731534415732904, "grad_norm": 0.32308223843574524, "learning_rate": 9.998594961133181e-05, "loss": 0.40636935234069826, "memory(GiB)": 122.96, "step": 495, "token_acc": 0.8518052811208909, "train_speed(iter/s)": 0.243587 }, { "epoch": 0.03811266102599283, "grad_norm": 0.8555343747138977, "learning_rate": 9.998566434516226e-05, "loss": 0.415405797958374, "memory(GiB)": 122.96, "step": 500, "token_acc": 0.8308977035490606, "train_speed(iter/s)": 0.243855 }, { "epoch": 0.03849378763625276, "grad_norm": 0.8992118835449219, "learning_rate": 9.99853762125468e-05, "loss": 0.40863685607910155, "memory(GiB)": 122.96, "step": 505, "token_acc": 0.8112258406142441, "train_speed(iter/s)": 0.24454 }, { "epoch": 0.03887491424651269, "grad_norm": 0.48962679505348206, "learning_rate": 9.998508521350201e-05, "loss": 0.3729743242263794, "memory(GiB)": 122.96, "step": 510, "token_acc": 0.8407970859224341, "train_speed(iter/s)": 0.24501 }, { "epoch": 0.03925604085677262, "grad_norm": 0.7814298868179321, "learning_rate": 9.998479134804453e-05, "loss": 0.31490697860717776, "memory(GiB)": 122.96, "step": 515, "token_acc": 0.8582721626199887, "train_speed(iter/s)": 0.245637 }, { "epoch": 0.039637167467032546, "grad_norm": 0.4278663992881775, "learning_rate": 9.998449461619121e-05, "loss": 0.3379648447036743, "memory(GiB)": 122.96, "step": 520, "token_acc": 0.8733570159857904, "train_speed(iter/s)": 0.245892 }, { "epoch": 0.040018294077292474, "grad_norm": 0.6275119185447693, "learning_rate": 9.99841950179591e-05, "loss": 0.4239678382873535, "memory(GiB)": 122.96, "step": 525, "token_acc": 0.8335500650195059, "train_speed(iter/s)": 0.246305 }, { "epoch": 0.0403994206875524, "grad_norm": 1.5933401584625244, "learning_rate": 9.998389255336535e-05, "loss": 0.3082572460174561, "memory(GiB)": 122.96, "step": 530, "token_acc": 0.8701431492842536, "train_speed(iter/s)": 0.24708 }, { "epoch": 0.04078054729781233, "grad_norm": 0.5892673134803772, "learning_rate": 9.998358722242731e-05, "loss": 0.4067962646484375, "memory(GiB)": 122.96, "step": 535, "token_acc": 0.8366409024441195, "train_speed(iter/s)": 0.247567 }, { "epoch": 0.04116167390807226, "grad_norm": 0.4958355724811554, "learning_rate": 9.998327902516251e-05, "loss": 0.42982120513916017, "memory(GiB)": 122.96, "step": 540, "token_acc": 0.8277149777957207, "train_speed(iter/s)": 0.247619 }, { "epoch": 0.04154280051833219, "grad_norm": 0.3523523211479187, "learning_rate": 9.998296796158859e-05, "loss": 0.33466572761535646, "memory(GiB)": 122.96, "step": 545, "token_acc": 0.8730695524743547, "train_speed(iter/s)": 0.247475 }, { "epoch": 0.041923927128592116, "grad_norm": 0.7282472848892212, "learning_rate": 9.998265403172343e-05, "loss": 0.3507689476013184, "memory(GiB)": 122.96, "step": 550, "token_acc": 0.8283169533169533, "train_speed(iter/s)": 0.248143 }, { "epoch": 0.042305053738852044, "grad_norm": 0.8247482180595398, "learning_rate": 9.998233723558499e-05, "loss": 0.4202248573303223, "memory(GiB)": 122.96, "step": 555, "token_acc": 0.8383141762452108, "train_speed(iter/s)": 0.24834 }, { "epoch": 0.04268618034911197, "grad_norm": 0.5519070029258728, "learning_rate": 9.998201757319146e-05, "loss": 0.3287726163864136, "memory(GiB)": 122.96, "step": 560, "token_acc": 0.8721461187214612, "train_speed(iter/s)": 0.248729 }, { "epoch": 0.0430673069593719, "grad_norm": 0.39140045642852783, "learning_rate": 9.998169504456118e-05, "loss": 0.3594696044921875, "memory(GiB)": 122.96, "step": 565, "token_acc": 0.8660205245153934, "train_speed(iter/s)": 0.249071 }, { "epoch": 0.04344843356963183, "grad_norm": 0.46012499928474426, "learning_rate": 9.99813696497126e-05, "loss": 0.40318799018859863, "memory(GiB)": 122.96, "step": 570, "token_acc": 0.8423368907470569, "train_speed(iter/s)": 0.249162 }, { "epoch": 0.04382956017989176, "grad_norm": 0.8278694152832031, "learning_rate": 9.998104138866445e-05, "loss": 0.3010098934173584, "memory(GiB)": 122.96, "step": 575, "token_acc": 0.8664741168629193, "train_speed(iter/s)": 0.249552 }, { "epoch": 0.044210686790151686, "grad_norm": 0.45664680004119873, "learning_rate": 9.99807102614355e-05, "loss": 0.3806741237640381, "memory(GiB)": 122.96, "step": 580, "token_acc": 0.8451010488616014, "train_speed(iter/s)": 0.249605 }, { "epoch": 0.044591813400411615, "grad_norm": 0.9615135788917542, "learning_rate": 9.998037626804475e-05, "loss": 0.36945419311523436, "memory(GiB)": 122.96, "step": 585, "token_acc": 0.8483567172784932, "train_speed(iter/s)": 0.249867 }, { "epoch": 0.04497294001067154, "grad_norm": 0.9080905914306641, "learning_rate": 9.998003940851137e-05, "loss": 0.26711182594299315, "memory(GiB)": 122.96, "step": 590, "token_acc": 0.8865629420084865, "train_speed(iter/s)": 0.250311 }, { "epoch": 0.04535406662093147, "grad_norm": 0.4471183717250824, "learning_rate": 9.997969968285465e-05, "loss": 0.3248094320297241, "memory(GiB)": 122.96, "step": 595, "token_acc": 0.8773170265622015, "train_speed(iter/s)": 0.25054 }, { "epoch": 0.0457351932311914, "grad_norm": 0.42623019218444824, "learning_rate": 9.997935709109412e-05, "loss": 0.3596322536468506, "memory(GiB)": 122.96, "step": 600, "token_acc": 0.8488296488946684, "train_speed(iter/s)": 0.250607 }, { "epoch": 0.0457351932311914, "eval_loss": 0.28870463371276855, "eval_runtime": 184.503, "eval_samples_per_second": 2.873, "eval_steps_per_second": 2.873, "eval_token_acc": 0.8538341063791338, "step": 600 }, { "epoch": 0.04611631984145133, "grad_norm": 0.6523170471191406, "learning_rate": 9.997901163324936e-05, "loss": 0.395623517036438, "memory(GiB)": 122.96, "step": 605, "token_acc": 0.8534692844487833, "train_speed(iter/s)": 0.232978 }, { "epoch": 0.046497446451711257, "grad_norm": 4.184655666351318, "learning_rate": 9.997866330934023e-05, "loss": 0.3646019220352173, "memory(GiB)": 122.96, "step": 610, "token_acc": 0.8678474114441417, "train_speed(iter/s)": 0.233265 }, { "epoch": 0.046878573061971185, "grad_norm": 0.8908939361572266, "learning_rate": 9.997831211938669e-05, "loss": 0.38559694290161134, "memory(GiB)": 122.96, "step": 615, "token_acc": 0.8545313107740179, "train_speed(iter/s)": 0.233727 }, { "epoch": 0.04725969967223111, "grad_norm": 1.195573091506958, "learning_rate": 9.997795806340886e-05, "loss": 0.28753085136413575, "memory(GiB)": 122.96, "step": 620, "token_acc": 0.8619008935824533, "train_speed(iter/s)": 0.234258 }, { "epoch": 0.04764082628249104, "grad_norm": 1.7803226709365845, "learning_rate": 9.997760114142706e-05, "loss": 0.29034254550933836, "memory(GiB)": 122.96, "step": 625, "token_acc": 0.8762759591693066, "train_speed(iter/s)": 0.234406 }, { "epoch": 0.04802195289275097, "grad_norm": 1.0109211206436157, "learning_rate": 9.997724135346179e-05, "loss": 0.4173550605773926, "memory(GiB)": 122.96, "step": 630, "token_acc": 0.8509212730318257, "train_speed(iter/s)": 0.234911 }, { "epoch": 0.0484030795030109, "grad_norm": 1.0904144048690796, "learning_rate": 9.997687869953363e-05, "loss": 0.4255978107452393, "memory(GiB)": 122.96, "step": 635, "token_acc": 0.8513257575757576, "train_speed(iter/s)": 0.235067 }, { "epoch": 0.04878420611327083, "grad_norm": 0.6953149437904358, "learning_rate": 9.99765131796634e-05, "loss": 0.3378890514373779, "memory(GiB)": 122.96, "step": 640, "token_acc": 0.8601312551271534, "train_speed(iter/s)": 0.235478 }, { "epoch": 0.049165332723530755, "grad_norm": 0.8727684020996094, "learning_rate": 9.997614479387205e-05, "loss": 0.4118244171142578, "memory(GiB)": 122.96, "step": 645, "token_acc": 0.8357826614534178, "train_speed(iter/s)": 0.2357 }, { "epoch": 0.049546459333790684, "grad_norm": 1.953823208808899, "learning_rate": 9.997577354218073e-05, "loss": 0.3344784498214722, "memory(GiB)": 122.96, "step": 650, "token_acc": 0.8690078037904125, "train_speed(iter/s)": 0.236235 }, { "epoch": 0.04992758594405061, "grad_norm": 1.8870900869369507, "learning_rate": 9.99753994246107e-05, "loss": 0.25777881145477294, "memory(GiB)": 122.96, "step": 655, "token_acc": 0.9104046242774566, "train_speed(iter/s)": 0.236905 }, { "epoch": 0.05030871255431054, "grad_norm": 1.2711288928985596, "learning_rate": 9.997502244118344e-05, "loss": 0.3343281984329224, "memory(GiB)": 122.96, "step": 660, "token_acc": 0.8643035170312933, "train_speed(iter/s)": 0.237409 }, { "epoch": 0.05068983916457047, "grad_norm": 1.5099661350250244, "learning_rate": 9.997464259192055e-05, "loss": 0.3363473892211914, "memory(GiB)": 122.96, "step": 665, "token_acc": 0.8750624687656172, "train_speed(iter/s)": 0.237861 }, { "epoch": 0.0510709657748304, "grad_norm": 0.9354046583175659, "learning_rate": 9.997425987684381e-05, "loss": 0.2955607891082764, "memory(GiB)": 122.96, "step": 670, "token_acc": 0.8609826589595375, "train_speed(iter/s)": 0.238294 }, { "epoch": 0.051452092385090326, "grad_norm": 1.2429924011230469, "learning_rate": 9.997387429597518e-05, "loss": 0.4170533180236816, "memory(GiB)": 122.96, "step": 675, "token_acc": 0.8409090909090909, "train_speed(iter/s)": 0.238694 }, { "epoch": 0.051833218995350254, "grad_norm": 0.6801356077194214, "learning_rate": 9.997348584933677e-05, "loss": 0.3688316822052002, "memory(GiB)": 122.96, "step": 680, "token_acc": 0.8493396015222745, "train_speed(iter/s)": 0.239034 }, { "epoch": 0.05221434560561018, "grad_norm": 0.49400457739830017, "learning_rate": 9.997309453695084e-05, "loss": 0.3974099636077881, "memory(GiB)": 122.96, "step": 685, "token_acc": 0.8362831858407079, "train_speed(iter/s)": 0.239304 }, { "epoch": 0.05259547221587011, "grad_norm": 1.8052300214767456, "learning_rate": 9.997270035883985e-05, "loss": 0.33188796043395996, "memory(GiB)": 122.96, "step": 690, "token_acc": 0.8397753551370994, "train_speed(iter/s)": 0.239741 }, { "epoch": 0.05297659882613004, "grad_norm": 0.5305536985397339, "learning_rate": 9.99723033150264e-05, "loss": 0.33836963176727297, "memory(GiB)": 122.96, "step": 695, "token_acc": 0.8622799164428528, "train_speed(iter/s)": 0.239988 }, { "epoch": 0.05335772543638997, "grad_norm": 0.9012311697006226, "learning_rate": 9.997190340553327e-05, "loss": 0.47834248542785646, "memory(GiB)": 122.96, "step": 700, "token_acc": 0.7888947092718701, "train_speed(iter/s)": 0.240509 }, { "epoch": 0.053738852046649896, "grad_norm": 0.736178457736969, "learning_rate": 9.997150063038335e-05, "loss": 0.44519920349121095, "memory(GiB)": 122.96, "step": 705, "token_acc": 0.8241646291768541, "train_speed(iter/s)": 0.240913 }, { "epoch": 0.054119978656909824, "grad_norm": 3.8395962715148926, "learning_rate": 9.997109498959977e-05, "loss": 0.4126322269439697, "memory(GiB)": 122.96, "step": 710, "token_acc": 0.8461144321093083, "train_speed(iter/s)": 0.241237 }, { "epoch": 0.05450110526716975, "grad_norm": 0.6108548641204834, "learning_rate": 9.99706864832058e-05, "loss": 0.3138087272644043, "memory(GiB)": 122.96, "step": 715, "token_acc": 0.8558558558558559, "train_speed(iter/s)": 0.241453 }, { "epoch": 0.05488223187742968, "grad_norm": 0.4257928729057312, "learning_rate": 9.997027511122484e-05, "loss": 0.40671577453613283, "memory(GiB)": 122.96, "step": 720, "token_acc": 0.8495662949194548, "train_speed(iter/s)": 0.241816 }, { "epoch": 0.05526335848768961, "grad_norm": 1.4600634574890137, "learning_rate": 9.996986087368049e-05, "loss": 0.34661815166473386, "memory(GiB)": 122.96, "step": 725, "token_acc": 0.8383768913342503, "train_speed(iter/s)": 0.242306 }, { "epoch": 0.05564448509794954, "grad_norm": 0.9344198107719421, "learning_rate": 9.996944377059651e-05, "loss": 0.3573688268661499, "memory(GiB)": 122.96, "step": 730, "token_acc": 0.8417240770181946, "train_speed(iter/s)": 0.242616 }, { "epoch": 0.056025611708209466, "grad_norm": 0.9474985003471375, "learning_rate": 9.996902380199684e-05, "loss": 0.38165082931518557, "memory(GiB)": 122.96, "step": 735, "token_acc": 0.840377358490566, "train_speed(iter/s)": 0.242905 }, { "epoch": 0.056406738318469395, "grad_norm": 0.588057279586792, "learning_rate": 9.996860096790551e-05, "loss": 0.39323973655700684, "memory(GiB)": 122.96, "step": 740, "token_acc": 0.8546952938625028, "train_speed(iter/s)": 0.243271 }, { "epoch": 0.05678786492872932, "grad_norm": 0.709802508354187, "learning_rate": 9.996817526834681e-05, "loss": 0.3299025297164917, "memory(GiB)": 122.96, "step": 745, "token_acc": 0.8650793650793651, "train_speed(iter/s)": 0.243641 }, { "epoch": 0.05716899153898925, "grad_norm": 0.5205948948860168, "learning_rate": 9.996774670334514e-05, "loss": 0.3359744310379028, "memory(GiB)": 122.96, "step": 750, "token_acc": 0.871193210184723, "train_speed(iter/s)": 0.243566 }, { "epoch": 0.05755011814924918, "grad_norm": 1.0935746431350708, "learning_rate": 9.996731527292506e-05, "loss": 0.34018242359161377, "memory(GiB)": 122.96, "step": 755, "token_acc": 0.8499406880189798, "train_speed(iter/s)": 0.243834 }, { "epoch": 0.05793124475950911, "grad_norm": 0.7897275686264038, "learning_rate": 9.996688097711133e-05, "loss": 0.4345408916473389, "memory(GiB)": 122.96, "step": 760, "token_acc": 0.8001955671447197, "train_speed(iter/s)": 0.244267 }, { "epoch": 0.05831237136976904, "grad_norm": 0.8327571153640747, "learning_rate": 9.996644381592887e-05, "loss": 0.3156434059143066, "memory(GiB)": 122.96, "step": 765, "token_acc": 0.8763452205106562, "train_speed(iter/s)": 0.244545 }, { "epoch": 0.058693497980028965, "grad_norm": 1.4792498350143433, "learning_rate": 9.996600378940271e-05, "loss": 0.2986889362335205, "memory(GiB)": 122.96, "step": 770, "token_acc": 0.8552897088498126, "train_speed(iter/s)": 0.244874 }, { "epoch": 0.05907462459028889, "grad_norm": 0.537405788898468, "learning_rate": 9.99655608975581e-05, "loss": 0.329836368560791, "memory(GiB)": 122.96, "step": 775, "token_acc": 0.8491984678677826, "train_speed(iter/s)": 0.244992 }, { "epoch": 0.05945575120054882, "grad_norm": 0.8622668981552124, "learning_rate": 9.996511514042047e-05, "loss": 0.3233332633972168, "memory(GiB)": 122.96, "step": 780, "token_acc": 0.8738760056791292, "train_speed(iter/s)": 0.24533 }, { "epoch": 0.05983687781080875, "grad_norm": 0.9952224493026733, "learning_rate": 9.996466651801532e-05, "loss": 0.34236159324646, "memory(GiB)": 122.96, "step": 785, "token_acc": 0.8532408411012357, "train_speed(iter/s)": 0.245643 }, { "epoch": 0.06021800442106868, "grad_norm": 0.7318833470344543, "learning_rate": 9.996421503036844e-05, "loss": 0.34651975631713866, "memory(GiB)": 122.96, "step": 790, "token_acc": 0.8621695178849145, "train_speed(iter/s)": 0.246023 }, { "epoch": 0.06059913103132861, "grad_norm": 0.6247409582138062, "learning_rate": 9.996376067750566e-05, "loss": 0.27611613273620605, "memory(GiB)": 122.96, "step": 795, "token_acc": 0.8787362349239644, "train_speed(iter/s)": 0.246005 }, { "epoch": 0.060980257641588535, "grad_norm": 0.38995155692100525, "learning_rate": 9.996330345945309e-05, "loss": 0.2856283187866211, "memory(GiB)": 122.96, "step": 800, "token_acc": 0.8682824025289779, "train_speed(iter/s)": 0.246117 }, { "epoch": 0.060980257641588535, "eval_loss": 0.2596844434738159, "eval_runtime": 179.5753, "eval_samples_per_second": 2.951, "eval_steps_per_second": 2.951, "eval_token_acc": 0.8607538702487801, "step": 800 }, { "epoch": 0.061361384251848464, "grad_norm": 0.9964883923530579, "learning_rate": 9.996284337623692e-05, "loss": 0.3654125690460205, "memory(GiB)": 122.96, "step": 805, "token_acc": 0.8609802450976822, "train_speed(iter/s)": 0.233627 }, { "epoch": 0.06174251086210839, "grad_norm": 2.0521633625030518, "learning_rate": 9.996238042788353e-05, "loss": 0.34076025485992434, "memory(GiB)": 122.96, "step": 810, "token_acc": 0.8723227282179247, "train_speed(iter/s)": 0.233991 }, { "epoch": 0.06212363747236832, "grad_norm": 0.40977340936660767, "learning_rate": 9.996191461441947e-05, "loss": 0.33114984035491946, "memory(GiB)": 122.96, "step": 815, "token_acc": 0.8523692928129051, "train_speed(iter/s)": 0.234164 }, { "epoch": 0.06250476408262826, "grad_norm": 0.4104391634464264, "learning_rate": 9.996144593587147e-05, "loss": 0.3621030569076538, "memory(GiB)": 122.96, "step": 820, "token_acc": 0.8619943759463552, "train_speed(iter/s)": 0.234559 }, { "epoch": 0.06288589069288818, "grad_norm": 0.46353575587272644, "learning_rate": 9.996097439226639e-05, "loss": 0.3756758213043213, "memory(GiB)": 122.96, "step": 825, "token_acc": 0.8572417373247967, "train_speed(iter/s)": 0.234786 }, { "epoch": 0.06326701730314811, "grad_norm": 0.6371397972106934, "learning_rate": 9.996049998363128e-05, "loss": 0.34291815757751465, "memory(GiB)": 122.96, "step": 830, "token_acc": 0.8714689265536724, "train_speed(iter/s)": 0.235211 }, { "epoch": 0.06364814391340803, "grad_norm": 1.5204706192016602, "learning_rate": 9.996002270999334e-05, "loss": 0.39930191040039065, "memory(GiB)": 122.96, "step": 835, "token_acc": 0.8456093361392768, "train_speed(iter/s)": 0.23549 }, { "epoch": 0.06402927052366797, "grad_norm": 0.4767929017543793, "learning_rate": 9.995954257137994e-05, "loss": 0.41980547904968263, "memory(GiB)": 122.96, "step": 840, "token_acc": 0.8417827298050139, "train_speed(iter/s)": 0.235662 }, { "epoch": 0.06441039713392789, "grad_norm": 0.48393455147743225, "learning_rate": 9.995905956781861e-05, "loss": 0.27288260459899905, "memory(GiB)": 122.96, "step": 845, "token_acc": 0.8750187097739859, "train_speed(iter/s)": 0.235896 }, { "epoch": 0.06479152374418783, "grad_norm": 1.389690637588501, "learning_rate": 9.995857369933705e-05, "loss": 0.2528748273849487, "memory(GiB)": 122.96, "step": 850, "token_acc": 0.8775692582663092, "train_speed(iter/s)": 0.236291 }, { "epoch": 0.06517265035444775, "grad_norm": 3.3885436058044434, "learning_rate": 9.995808496596313e-05, "loss": 0.2959144592285156, "memory(GiB)": 122.96, "step": 855, "token_acc": 0.8768359211078472, "train_speed(iter/s)": 0.236637 }, { "epoch": 0.06555377696470767, "grad_norm": 1.3807339668273926, "learning_rate": 9.995759336772487e-05, "loss": 0.2919294357299805, "memory(GiB)": 122.96, "step": 860, "token_acc": 0.869021190716448, "train_speed(iter/s)": 0.236979 }, { "epoch": 0.0659349035749676, "grad_norm": 0.7133747935295105, "learning_rate": 9.995709890465048e-05, "loss": 0.32124958038330076, "memory(GiB)": 122.96, "step": 865, "token_acc": 0.8601161665053243, "train_speed(iter/s)": 0.237208 }, { "epoch": 0.06631603018522753, "grad_norm": 1.3251805305480957, "learning_rate": 9.995660157676828e-05, "loss": 0.2846105098724365, "memory(GiB)": 122.96, "step": 870, "token_acc": 0.8661876092445135, "train_speed(iter/s)": 0.237601 }, { "epoch": 0.06669715679548746, "grad_norm": 0.5018326044082642, "learning_rate": 9.99561013841068e-05, "loss": 0.3328379154205322, "memory(GiB)": 122.96, "step": 875, "token_acc": 0.84472049689441, "train_speed(iter/s)": 0.237976 }, { "epoch": 0.06707828340574738, "grad_norm": 1.7366342544555664, "learning_rate": 9.995559832669475e-05, "loss": 0.26777560710906984, "memory(GiB)": 122.96, "step": 880, "token_acc": 0.8835669781931464, "train_speed(iter/s)": 0.238405 }, { "epoch": 0.06745941001600732, "grad_norm": 0.63080233335495, "learning_rate": 9.995509240456093e-05, "loss": 0.3424130916595459, "memory(GiB)": 122.96, "step": 885, "token_acc": 0.8552557616638561, "train_speed(iter/s)": 0.238749 }, { "epoch": 0.06784053662626724, "grad_norm": 0.6132400631904602, "learning_rate": 9.995458361773439e-05, "loss": 0.3544909000396729, "memory(GiB)": 122.96, "step": 890, "token_acc": 0.8791465932553338, "train_speed(iter/s)": 0.238945 }, { "epoch": 0.06822166323652717, "grad_norm": 1.3464102745056152, "learning_rate": 9.995407196624431e-05, "loss": 0.3807902574539185, "memory(GiB)": 122.96, "step": 895, "token_acc": 0.8664766498574874, "train_speed(iter/s)": 0.239296 }, { "epoch": 0.0686027898467871, "grad_norm": 0.9776564836502075, "learning_rate": 9.995355745012001e-05, "loss": 0.2929649353027344, "memory(GiB)": 122.96, "step": 900, "token_acc": 0.881107189299647, "train_speed(iter/s)": 0.23948 }, { "epoch": 0.06898391645704703, "grad_norm": 0.5579814910888672, "learning_rate": 9.995304006939101e-05, "loss": 0.2820343017578125, "memory(GiB)": 122.96, "step": 905, "token_acc": 0.8640106241699868, "train_speed(iter/s)": 0.239846 }, { "epoch": 0.06936504306730695, "grad_norm": 1.1656994819641113, "learning_rate": 9.995251982408697e-05, "loss": 0.2548708438873291, "memory(GiB)": 122.96, "step": 910, "token_acc": 0.8831203407880724, "train_speed(iter/s)": 0.240131 }, { "epoch": 0.06974616967756689, "grad_norm": 0.44933393597602844, "learning_rate": 9.995199671423772e-05, "loss": 0.3587518215179443, "memory(GiB)": 122.96, "step": 915, "token_acc": 0.8552851569756641, "train_speed(iter/s)": 0.240361 }, { "epoch": 0.07012729628782681, "grad_norm": 1.205645203590393, "learning_rate": 9.995147073987326e-05, "loss": 0.38188230991363525, "memory(GiB)": 122.96, "step": 920, "token_acc": 0.8455592105263158, "train_speed(iter/s)": 0.240513 }, { "epoch": 0.07050842289808675, "grad_norm": 1.0860291719436646, "learning_rate": 9.995094190102376e-05, "loss": 0.2624641418457031, "memory(GiB)": 122.96, "step": 925, "token_acc": 0.8802816901408451, "train_speed(iter/s)": 0.240717 }, { "epoch": 0.07088954950834667, "grad_norm": 0.7993502020835876, "learning_rate": 9.995041019771956e-05, "loss": 0.24951796531677245, "memory(GiB)": 122.96, "step": 930, "token_acc": 0.8836772983114447, "train_speed(iter/s)": 0.240964 }, { "epoch": 0.0712706761186066, "grad_norm": 0.6376010775566101, "learning_rate": 9.994987562999111e-05, "loss": 0.2515277862548828, "memory(GiB)": 122.96, "step": 935, "token_acc": 0.8518062397372742, "train_speed(iter/s)": 0.241248 }, { "epoch": 0.07165180272886652, "grad_norm": 1.588802695274353, "learning_rate": 9.994933819786908e-05, "loss": 0.2447023868560791, "memory(GiB)": 122.96, "step": 940, "token_acc": 0.8923917612442203, "train_speed(iter/s)": 0.241676 }, { "epoch": 0.07203292933912646, "grad_norm": 0.6258974075317383, "learning_rate": 9.994879790138434e-05, "loss": 0.3769711971282959, "memory(GiB)": 122.96, "step": 945, "token_acc": 0.804950917626974, "train_speed(iter/s)": 0.242087 }, { "epoch": 0.07241405594938638, "grad_norm": 1.1066696643829346, "learning_rate": 9.994825474056779e-05, "loss": 0.30419011116027833, "memory(GiB)": 122.96, "step": 950, "token_acc": 0.8668415874057068, "train_speed(iter/s)": 0.242473 }, { "epoch": 0.07279518255964632, "grad_norm": 1.1749000549316406, "learning_rate": 9.994770871545065e-05, "loss": 0.27971978187561036, "memory(GiB)": 122.96, "step": 955, "token_acc": 0.8687992582290218, "train_speed(iter/s)": 0.242874 }, { "epoch": 0.07317630916990624, "grad_norm": 0.40407902002334595, "learning_rate": 9.994715982606418e-05, "loss": 0.3986749887466431, "memory(GiB)": 122.96, "step": 960, "token_acc": 0.8473098330241188, "train_speed(iter/s)": 0.243142 }, { "epoch": 0.07355743578016617, "grad_norm": 0.5659456253051758, "learning_rate": 9.994660807243988e-05, "loss": 0.30558109283447266, "memory(GiB)": 122.96, "step": 965, "token_acc": 0.8496570121951219, "train_speed(iter/s)": 0.243374 }, { "epoch": 0.0739385623904261, "grad_norm": 0.5996445417404175, "learning_rate": 9.994605345460939e-05, "loss": 0.3275512456893921, "memory(GiB)": 122.96, "step": 970, "token_acc": 0.8582995951417004, "train_speed(iter/s)": 0.243737 }, { "epoch": 0.07431968900068603, "grad_norm": 1.0778344869613647, "learning_rate": 9.994549597260452e-05, "loss": 0.38637099266052244, "memory(GiB)": 122.96, "step": 975, "token_acc": 0.8534602879702741, "train_speed(iter/s)": 0.244063 }, { "epoch": 0.07470081561094595, "grad_norm": 0.603624701499939, "learning_rate": 9.994493562645721e-05, "loss": 0.3450649261474609, "memory(GiB)": 122.96, "step": 980, "token_acc": 0.8646434805962132, "train_speed(iter/s)": 0.244064 }, { "epoch": 0.07508194222120589, "grad_norm": 1.0091614723205566, "learning_rate": 9.994437241619964e-05, "loss": 0.44653897285461425, "memory(GiB)": 122.96, "step": 985, "token_acc": 0.8378594249201278, "train_speed(iter/s)": 0.244172 }, { "epoch": 0.07546306883146581, "grad_norm": 0.6648703217506409, "learning_rate": 9.994380634186406e-05, "loss": 0.274582052230835, "memory(GiB)": 122.96, "step": 990, "token_acc": 0.8760886777513855, "train_speed(iter/s)": 0.244417 }, { "epoch": 0.07584419544172574, "grad_norm": 0.5610662698745728, "learning_rate": 9.994323740348297e-05, "loss": 0.3271946907043457, "memory(GiB)": 122.96, "step": 995, "token_acc": 0.8453681710213776, "train_speed(iter/s)": 0.244679 }, { "epoch": 0.07622532205198566, "grad_norm": 0.6179755926132202, "learning_rate": 9.994266560108897e-05, "loss": 0.34681341648101804, "memory(GiB)": 122.96, "step": 1000, "token_acc": 0.8584206491183235, "train_speed(iter/s)": 0.245014 }, { "epoch": 0.07622532205198566, "eval_loss": 0.24275702238082886, "eval_runtime": 182.8854, "eval_samples_per_second": 2.898, "eval_steps_per_second": 2.898, "eval_token_acc": 0.8643530510210228, "step": 1000 }, { "epoch": 0.0766064486622456, "grad_norm": 0.5093851685523987, "learning_rate": 9.994209093471488e-05, "loss": 0.36008760929107664, "memory(GiB)": 122.96, "step": 1005, "token_acc": 0.8644340491884612, "train_speed(iter/s)": 0.234598 }, { "epoch": 0.07698757527250552, "grad_norm": 0.7020618915557861, "learning_rate": 9.994151340439362e-05, "loss": 0.39787113666534424, "memory(GiB)": 122.96, "step": 1010, "token_acc": 0.8376172990616075, "train_speed(iter/s)": 0.234846 }, { "epoch": 0.07736870188276546, "grad_norm": 1.0691577196121216, "learning_rate": 9.99409330101583e-05, "loss": 0.3583847999572754, "memory(GiB)": 122.96, "step": 1015, "token_acc": 0.846822130772748, "train_speed(iter/s)": 0.235139 }, { "epoch": 0.07774982849302538, "grad_norm": 0.6342916488647461, "learning_rate": 9.994034975204226e-05, "loss": 0.32695066928863525, "memory(GiB)": 122.96, "step": 1020, "token_acc": 0.8698096885813149, "train_speed(iter/s)": 0.235449 }, { "epoch": 0.07813095510328531, "grad_norm": 1.0491770505905151, "learning_rate": 9.993976363007891e-05, "loss": 0.2463550090789795, "memory(GiB)": 122.96, "step": 1025, "token_acc": 0.8688442211055276, "train_speed(iter/s)": 0.235883 }, { "epoch": 0.07851208171354523, "grad_norm": 0.6031205058097839, "learning_rate": 9.993917464430185e-05, "loss": 0.338565468788147, "memory(GiB)": 122.96, "step": 1030, "token_acc": 0.8475743348982786, "train_speed(iter/s)": 0.236207 }, { "epoch": 0.07889320832380517, "grad_norm": 2.2066619396209717, "learning_rate": 9.993858279474487e-05, "loss": 0.3718540191650391, "memory(GiB)": 122.96, "step": 1035, "token_acc": 0.8558182055635366, "train_speed(iter/s)": 0.236272 }, { "epoch": 0.07927433493406509, "grad_norm": 0.6382604837417603, "learning_rate": 9.993798808144192e-05, "loss": 0.2756333351135254, "memory(GiB)": 122.96, "step": 1040, "token_acc": 0.8514399771884802, "train_speed(iter/s)": 0.236617 }, { "epoch": 0.07965546154432503, "grad_norm": 0.6233166456222534, "learning_rate": 9.99373905044271e-05, "loss": 0.25544419288635256, "memory(GiB)": 122.96, "step": 1045, "token_acc": 0.8737040527803959, "train_speed(iter/s)": 0.236925 }, { "epoch": 0.08003658815458495, "grad_norm": 0.42494675517082214, "learning_rate": 9.993679006373465e-05, "loss": 0.269423508644104, "memory(GiB)": 122.96, "step": 1050, "token_acc": 0.8596367483688944, "train_speed(iter/s)": 0.237201 }, { "epoch": 0.08041771476484488, "grad_norm": 0.8544996380805969, "learning_rate": 9.993618675939904e-05, "loss": 0.3445001125335693, "memory(GiB)": 122.96, "step": 1055, "token_acc": 0.8548939082819986, "train_speed(iter/s)": 0.237337 }, { "epoch": 0.0807988413751048, "grad_norm": 0.8256821036338806, "learning_rate": 9.993558059145485e-05, "loss": 0.35902574062347414, "memory(GiB)": 122.96, "step": 1060, "token_acc": 0.8614755254619834, "train_speed(iter/s)": 0.237479 }, { "epoch": 0.08117996798536474, "grad_norm": 0.9025565981864929, "learning_rate": 9.993497155993684e-05, "loss": 0.2837635040283203, "memory(GiB)": 122.96, "step": 1065, "token_acc": 0.8841632088520055, "train_speed(iter/s)": 0.237813 }, { "epoch": 0.08156109459562466, "grad_norm": 1.369053840637207, "learning_rate": 9.993435966487995e-05, "loss": 0.3160522937774658, "memory(GiB)": 122.96, "step": 1070, "token_acc": 0.8536357986326911, "train_speed(iter/s)": 0.238128 }, { "epoch": 0.0819422212058846, "grad_norm": 1.2012816667556763, "learning_rate": 9.993374490631924e-05, "loss": 0.23799140453338624, "memory(GiB)": 122.96, "step": 1075, "token_acc": 0.8656273199703044, "train_speed(iter/s)": 0.238359 }, { "epoch": 0.08232334781614452, "grad_norm": 0.5712122321128845, "learning_rate": 9.993312728428998e-05, "loss": 0.3389333009719849, "memory(GiB)": 122.96, "step": 1080, "token_acc": 0.8610755441741357, "train_speed(iter/s)": 0.238663 }, { "epoch": 0.08270447442640445, "grad_norm": 0.9947119355201721, "learning_rate": 9.99325067988276e-05, "loss": 0.3630231380462646, "memory(GiB)": 122.96, "step": 1085, "token_acc": 0.8538951636258326, "train_speed(iter/s)": 0.238824 }, { "epoch": 0.08308560103666437, "grad_norm": 0.6455538868904114, "learning_rate": 9.993188344996767e-05, "loss": 0.3080313682556152, "memory(GiB)": 122.96, "step": 1090, "token_acc": 0.8792769528728211, "train_speed(iter/s)": 0.23896 }, { "epoch": 0.08346672764692431, "grad_norm": 0.946351170539856, "learning_rate": 9.993125723774592e-05, "loss": 0.391520619392395, "memory(GiB)": 122.96, "step": 1095, "token_acc": 0.8449853587115667, "train_speed(iter/s)": 0.239167 }, { "epoch": 0.08384785425718423, "grad_norm": 0.6497941017150879, "learning_rate": 9.99306281621983e-05, "loss": 0.31511199474334717, "memory(GiB)": 122.96, "step": 1100, "token_acc": 0.8728813559322034, "train_speed(iter/s)": 0.239223 }, { "epoch": 0.08422898086744417, "grad_norm": 0.49627619981765747, "learning_rate": 9.992999622336084e-05, "loss": 0.3096617698669434, "memory(GiB)": 122.96, "step": 1105, "token_acc": 0.8768566493955094, "train_speed(iter/s)": 0.239381 }, { "epoch": 0.08461010747770409, "grad_norm": 0.4730762541294098, "learning_rate": 9.992936142126982e-05, "loss": 0.2844571590423584, "memory(GiB)": 122.96, "step": 1110, "token_acc": 0.8689743045232533, "train_speed(iter/s)": 0.239649 }, { "epoch": 0.08499123408796402, "grad_norm": 0.8984159827232361, "learning_rate": 9.992872375596161e-05, "loss": 0.36366708278656007, "memory(GiB)": 122.96, "step": 1115, "token_acc": 0.854664914586071, "train_speed(iter/s)": 0.239866 }, { "epoch": 0.08537236069822395, "grad_norm": 0.9417855143547058, "learning_rate": 9.992808322747279e-05, "loss": 0.2679781675338745, "memory(GiB)": 122.96, "step": 1120, "token_acc": 0.8914285714285715, "train_speed(iter/s)": 0.24006 }, { "epoch": 0.08575348730848388, "grad_norm": 0.837355375289917, "learning_rate": 9.992743983584009e-05, "loss": 0.29336235523223875, "memory(GiB)": 122.96, "step": 1125, "token_acc": 0.8756307992837375, "train_speed(iter/s)": 0.240274 }, { "epoch": 0.0861346139187438, "grad_norm": 0.63333660364151, "learning_rate": 9.992679358110042e-05, "loss": 0.25902409553527833, "memory(GiB)": 122.96, "step": 1130, "token_acc": 0.8917301414581066, "train_speed(iter/s)": 0.240508 }, { "epoch": 0.08651574052900374, "grad_norm": 1.2863445281982422, "learning_rate": 9.992614446329082e-05, "loss": 0.3177644729614258, "memory(GiB)": 122.96, "step": 1135, "token_acc": 0.8709876543209877, "train_speed(iter/s)": 0.240876 }, { "epoch": 0.08689686713926366, "grad_norm": 0.6825645565986633, "learning_rate": 9.992549248244852e-05, "loss": 0.4172356128692627, "memory(GiB)": 122.96, "step": 1140, "token_acc": 0.8535829567462879, "train_speed(iter/s)": 0.240918 }, { "epoch": 0.0872779937495236, "grad_norm": 0.44224679470062256, "learning_rate": 9.99248376386109e-05, "loss": 0.2891047954559326, "memory(GiB)": 122.96, "step": 1145, "token_acc": 0.8595665811234674, "train_speed(iter/s)": 0.241054 }, { "epoch": 0.08765912035978352, "grad_norm": 0.6717022657394409, "learning_rate": 9.992417993181553e-05, "loss": 0.3258754014968872, "memory(GiB)": 122.96, "step": 1150, "token_acc": 0.8505302378905131, "train_speed(iter/s)": 0.241231 }, { "epoch": 0.08804024697004345, "grad_norm": 0.6225957274436951, "learning_rate": 9.992351936210012e-05, "loss": 0.38921537399291994, "memory(GiB)": 122.96, "step": 1155, "token_acc": 0.8573030557891785, "train_speed(iter/s)": 0.241496 }, { "epoch": 0.08842137358030337, "grad_norm": 0.8881798386573792, "learning_rate": 9.992285592950255e-05, "loss": 0.46387519836425783, "memory(GiB)": 122.96, "step": 1160, "token_acc": 0.7927281148679944, "train_speed(iter/s)": 0.241755 }, { "epoch": 0.08880250019056331, "grad_norm": 0.951678454875946, "learning_rate": 9.992218963406085e-05, "loss": 0.3640100955963135, "memory(GiB)": 122.96, "step": 1165, "token_acc": 0.8696841241624504, "train_speed(iter/s)": 0.241867 }, { "epoch": 0.08918362680082323, "grad_norm": 0.7177585363388062, "learning_rate": 9.992152047581324e-05, "loss": 0.2853125810623169, "memory(GiB)": 122.96, "step": 1170, "token_acc": 0.8570491803278688, "train_speed(iter/s)": 0.242103 }, { "epoch": 0.08956475341108316, "grad_norm": 0.5735414624214172, "learning_rate": 9.992084845479811e-05, "loss": 0.3742716312408447, "memory(GiB)": 122.96, "step": 1175, "token_acc": 0.8662235147486498, "train_speed(iter/s)": 0.242321 }, { "epoch": 0.08994588002134309, "grad_norm": 2.1042873859405518, "learning_rate": 9.992017357105398e-05, "loss": 0.3318443298339844, "memory(GiB)": 122.96, "step": 1180, "token_acc": 0.8697549513259483, "train_speed(iter/s)": 0.242456 }, { "epoch": 0.09032700663160302, "grad_norm": 0.5025081634521484, "learning_rate": 9.991949582461955e-05, "loss": 0.34972808361053465, "memory(GiB)": 122.96, "step": 1185, "token_acc": 0.8520315342631898, "train_speed(iter/s)": 0.24268 }, { "epoch": 0.09070813324186294, "grad_norm": 1.5829836130142212, "learning_rate": 9.991881521553368e-05, "loss": 0.23997759819030762, "memory(GiB)": 122.96, "step": 1190, "token_acc": 0.8969979296066253, "train_speed(iter/s)": 0.242924 }, { "epoch": 0.09108925985212288, "grad_norm": 0.47387367486953735, "learning_rate": 9.991813174383542e-05, "loss": 0.3860133647918701, "memory(GiB)": 122.96, "step": 1195, "token_acc": 0.8561777777777778, "train_speed(iter/s)": 0.243106 }, { "epoch": 0.0914703864623828, "grad_norm": 0.4456857740879059, "learning_rate": 9.991744540956395e-05, "loss": 0.18847503662109374, "memory(GiB)": 122.96, "step": 1200, "token_acc": 0.9123062015503876, "train_speed(iter/s)": 0.243434 }, { "epoch": 0.0914703864623828, "eval_loss": 0.2286582589149475, "eval_runtime": 186.3302, "eval_samples_per_second": 2.844, "eval_steps_per_second": 2.844, "eval_token_acc": 0.8675832781157762, "step": 1200 }, { "epoch": 0.09185151307264273, "grad_norm": 2.2292988300323486, "learning_rate": 9.991675621275863e-05, "loss": 0.3584898948669434, "memory(GiB)": 122.96, "step": 1205, "token_acc": 0.8669753736055568, "train_speed(iter/s)": 0.234783 }, { "epoch": 0.09223263968290266, "grad_norm": 0.7625659704208374, "learning_rate": 9.991606415345899e-05, "loss": 0.43616819381713867, "memory(GiB)": 122.96, "step": 1210, "token_acc": 0.8140200286123033, "train_speed(iter/s)": 0.235004 }, { "epoch": 0.09261376629316259, "grad_norm": 3.531126022338867, "learning_rate": 9.991536923170471e-05, "loss": 0.3124186277389526, "memory(GiB)": 122.96, "step": 1215, "token_acc": 0.8570397111913357, "train_speed(iter/s)": 0.235333 }, { "epoch": 0.09299489290342251, "grad_norm": 1.0546016693115234, "learning_rate": 9.991467144753564e-05, "loss": 0.3472038984298706, "memory(GiB)": 122.96, "step": 1220, "token_acc": 0.8600472813238771, "train_speed(iter/s)": 0.235462 }, { "epoch": 0.09337601951368245, "grad_norm": 1.1548722982406616, "learning_rate": 9.991397080099179e-05, "loss": 0.3956183433532715, "memory(GiB)": 122.96, "step": 1225, "token_acc": 0.8519461622408149, "train_speed(iter/s)": 0.235432 }, { "epoch": 0.09375714612394237, "grad_norm": 0.8083139657974243, "learning_rate": 9.991326729211333e-05, "loss": 0.3487874746322632, "memory(GiB)": 122.96, "step": 1230, "token_acc": 0.8684613031508106, "train_speed(iter/s)": 0.235559 }, { "epoch": 0.0941382727342023, "grad_norm": 1.159450888633728, "learning_rate": 9.991256092094064e-05, "loss": 0.22699143886566162, "memory(GiB)": 122.96, "step": 1235, "token_acc": 0.8984327294931813, "train_speed(iter/s)": 0.235754 }, { "epoch": 0.09451939934446223, "grad_norm": 0.6520458459854126, "learning_rate": 9.991185168751417e-05, "loss": 0.2988017797470093, "memory(GiB)": 122.96, "step": 1240, "token_acc": 0.8737962493664471, "train_speed(iter/s)": 0.236066 }, { "epoch": 0.09490052595472216, "grad_norm": 1.3390988111495972, "learning_rate": 9.991113959187465e-05, "loss": 0.3303156137466431, "memory(GiB)": 122.96, "step": 1245, "token_acc": 0.8714442013129103, "train_speed(iter/s)": 0.236323 }, { "epoch": 0.09528165256498208, "grad_norm": 0.8916099667549133, "learning_rate": 9.991042463406291e-05, "loss": 0.2293933868408203, "memory(GiB)": 122.96, "step": 1250, "token_acc": 0.8765576323987538, "train_speed(iter/s)": 0.2365 }, { "epoch": 0.09566277917524202, "grad_norm": 1.4430028200149536, "learning_rate": 9.990970681411991e-05, "loss": 0.30124123096466066, "memory(GiB)": 122.96, "step": 1255, "token_acc": 0.8611197030621713, "train_speed(iter/s)": 0.23678 }, { "epoch": 0.09604390578550194, "grad_norm": 0.9661933779716492, "learning_rate": 9.990898613208683e-05, "loss": 0.26608264446258545, "memory(GiB)": 122.96, "step": 1260, "token_acc": 0.8742969628796401, "train_speed(iter/s)": 0.237038 }, { "epoch": 0.09642503239576188, "grad_norm": 1.7531121969223022, "learning_rate": 9.9908262588005e-05, "loss": 0.2631381511688232, "memory(GiB)": 122.96, "step": 1265, "token_acc": 0.8767017724120216, "train_speed(iter/s)": 0.237276 }, { "epoch": 0.0968061590060218, "grad_norm": 1.0919272899627686, "learning_rate": 9.99075361819159e-05, "loss": 0.3002780914306641, "memory(GiB)": 122.96, "step": 1270, "token_acc": 0.8698266713830917, "train_speed(iter/s)": 0.237582 }, { "epoch": 0.09718728561628173, "grad_norm": 0.4003828465938568, "learning_rate": 9.990680691386122e-05, "loss": 0.31622114181518557, "memory(GiB)": 122.96, "step": 1275, "token_acc": 0.8653295128939829, "train_speed(iter/s)": 0.237778 }, { "epoch": 0.09756841222654165, "grad_norm": 0.869187593460083, "learning_rate": 9.990607478388277e-05, "loss": 0.2420757532119751, "memory(GiB)": 122.96, "step": 1280, "token_acc": 0.8963707914298207, "train_speed(iter/s)": 0.23793 }, { "epoch": 0.09794953883680159, "grad_norm": 0.9983544945716858, "learning_rate": 9.99053397920225e-05, "loss": 0.317791748046875, "memory(GiB)": 122.96, "step": 1285, "token_acc": 0.8691065662002153, "train_speed(iter/s)": 0.238116 }, { "epoch": 0.09833066544706151, "grad_norm": 0.9206972718238831, "learning_rate": 9.990460193832259e-05, "loss": 0.38195137977600097, "memory(GiB)": 122.96, "step": 1290, "token_acc": 0.8528873356631719, "train_speed(iter/s)": 0.238219 }, { "epoch": 0.09871179205732145, "grad_norm": 0.7186734676361084, "learning_rate": 9.990386122282536e-05, "loss": 0.27911901473999023, "memory(GiB)": 122.96, "step": 1295, "token_acc": 0.8647210822313421, "train_speed(iter/s)": 0.238401 }, { "epoch": 0.09909291866758137, "grad_norm": 1.2912830114364624, "learning_rate": 9.990311764557325e-05, "loss": 0.240960693359375, "memory(GiB)": 122.96, "step": 1300, "token_acc": 0.8952569169960475, "train_speed(iter/s)": 0.238724 }, { "epoch": 0.0994740452778413, "grad_norm": 0.8165742754936218, "learning_rate": 9.990237120660893e-05, "loss": 0.29543271064758303, "memory(GiB)": 122.96, "step": 1305, "token_acc": 0.885692068429238, "train_speed(iter/s)": 0.238797 }, { "epoch": 0.09985517188810122, "grad_norm": 1.6282451152801514, "learning_rate": 9.990162190597518e-05, "loss": 0.35995898246765134, "memory(GiB)": 122.96, "step": 1310, "token_acc": 0.8355452971725332, "train_speed(iter/s)": 0.239023 }, { "epoch": 0.10023629849836116, "grad_norm": 1.3043659925460815, "learning_rate": 9.990086974371501e-05, "loss": 0.3592132329940796, "memory(GiB)": 122.96, "step": 1315, "token_acc": 0.8727954450770726, "train_speed(iter/s)": 0.2391 }, { "epoch": 0.10061742510862108, "grad_norm": 0.828529417514801, "learning_rate": 9.990011471987152e-05, "loss": 0.2668302059173584, "memory(GiB)": 122.96, "step": 1320, "token_acc": 0.8741192153875452, "train_speed(iter/s)": 0.239246 }, { "epoch": 0.10099855171888102, "grad_norm": 0.37373417615890503, "learning_rate": 9.989935683448801e-05, "loss": 0.2705280303955078, "memory(GiB)": 122.96, "step": 1325, "token_acc": 0.8837427632608356, "train_speed(iter/s)": 0.239329 }, { "epoch": 0.10137967832914094, "grad_norm": 1.517012119293213, "learning_rate": 9.989859608760796e-05, "loss": 0.2926818609237671, "memory(GiB)": 122.96, "step": 1330, "token_acc": 0.8362573099415205, "train_speed(iter/s)": 0.239624 }, { "epoch": 0.10176080493940087, "grad_norm": 0.7579478621482849, "learning_rate": 9.989783247927496e-05, "loss": 0.27550106048583983, "memory(GiB)": 122.96, "step": 1335, "token_acc": 0.8929016189290162, "train_speed(iter/s)": 0.239897 }, { "epoch": 0.1021419315496608, "grad_norm": 0.7756716012954712, "learning_rate": 9.989706600953284e-05, "loss": 0.3209331750869751, "memory(GiB)": 122.96, "step": 1340, "token_acc": 0.8760054934275063, "train_speed(iter/s)": 0.240064 }, { "epoch": 0.10252305815992073, "grad_norm": 0.5040755867958069, "learning_rate": 9.989629667842553e-05, "loss": 0.3144501209259033, "memory(GiB)": 122.96, "step": 1345, "token_acc": 0.8818351560416111, "train_speed(iter/s)": 0.240099 }, { "epoch": 0.10290418477018065, "grad_norm": 1.6665209531784058, "learning_rate": 9.989552448599715e-05, "loss": 0.2784042596817017, "memory(GiB)": 122.96, "step": 1350, "token_acc": 0.8801115241635687, "train_speed(iter/s)": 0.240348 }, { "epoch": 0.10328531138044059, "grad_norm": 0.9837320446968079, "learning_rate": 9.9894749432292e-05, "loss": 0.22038679122924804, "memory(GiB)": 122.96, "step": 1355, "token_acc": 0.8879042977403633, "train_speed(iter/s)": 0.240546 }, { "epoch": 0.10366643799070051, "grad_norm": 0.9569424390792847, "learning_rate": 9.98939715173545e-05, "loss": 0.32283544540405273, "memory(GiB)": 122.96, "step": 1360, "token_acc": 0.8532675709001233, "train_speed(iter/s)": 0.240871 }, { "epoch": 0.10404756460096044, "grad_norm": 1.1798166036605835, "learning_rate": 9.989319074122926e-05, "loss": 0.3877655744552612, "memory(GiB)": 122.96, "step": 1365, "token_acc": 0.8407489803485354, "train_speed(iter/s)": 0.241065 }, { "epoch": 0.10442869121122036, "grad_norm": 0.763592004776001, "learning_rate": 9.989240710396105e-05, "loss": 0.38001341819763185, "memory(GiB)": 122.96, "step": 1370, "token_acc": 0.8498631565228724, "train_speed(iter/s)": 0.241176 }, { "epoch": 0.1048098178214803, "grad_norm": 1.0278587341308594, "learning_rate": 9.989162060559486e-05, "loss": 0.33876938819885255, "memory(GiB)": 122.96, "step": 1375, "token_acc": 0.8855004158580538, "train_speed(iter/s)": 0.241295 }, { "epoch": 0.10519094443174022, "grad_norm": 0.7296426296234131, "learning_rate": 9.989083124617573e-05, "loss": 0.3480438947677612, "memory(GiB)": 122.96, "step": 1380, "token_acc": 0.8747120326067694, "train_speed(iter/s)": 0.241363 }, { "epoch": 0.10557207104200016, "grad_norm": 0.4497007131576538, "learning_rate": 9.989003902574896e-05, "loss": 0.26218817234039304, "memory(GiB)": 122.96, "step": 1385, "token_acc": 0.881966014982642, "train_speed(iter/s)": 0.241579 }, { "epoch": 0.10595319765226008, "grad_norm": 1.4578773975372314, "learning_rate": 9.988924394435997e-05, "loss": 0.33171398639678956, "memory(GiB)": 122.96, "step": 1390, "token_acc": 0.8681318681318682, "train_speed(iter/s)": 0.24182 }, { "epoch": 0.10633432426252001, "grad_norm": 1.069934606552124, "learning_rate": 9.988844600205434e-05, "loss": 0.36407334804534913, "memory(GiB)": 122.96, "step": 1395, "token_acc": 0.8591804392121349, "train_speed(iter/s)": 0.242027 }, { "epoch": 0.10671545087277994, "grad_norm": 1.2276278734207153, "learning_rate": 9.988764519887786e-05, "loss": 0.23869426250457765, "memory(GiB)": 122.96, "step": 1400, "token_acc": 0.8888683431952663, "train_speed(iter/s)": 0.2422 }, { "epoch": 0.10671545087277994, "eval_loss": 0.22189576923847198, "eval_runtime": 190.7476, "eval_samples_per_second": 2.779, "eval_steps_per_second": 2.779, "eval_token_acc": 0.8742093849768087, "step": 1400 }, { "epoch": 0.10709657748303987, "grad_norm": 0.7361118793487549, "learning_rate": 9.988684153487642e-05, "loss": 0.30362708568573, "memory(GiB)": 122.96, "step": 1405, "token_acc": 0.874016388593066, "train_speed(iter/s)": 0.234743 }, { "epoch": 0.10747770409329979, "grad_norm": 0.8145195841789246, "learning_rate": 9.988603501009614e-05, "loss": 0.2677700281143188, "memory(GiB)": 122.96, "step": 1410, "token_acc": 0.8877637130801688, "train_speed(iter/s)": 0.234873 }, { "epoch": 0.10785883070355973, "grad_norm": 0.43503913283348083, "learning_rate": 9.988522562458324e-05, "loss": 0.2873891830444336, "memory(GiB)": 122.96, "step": 1415, "token_acc": 0.8896593307205306, "train_speed(iter/s)": 0.234953 }, { "epoch": 0.10823995731381965, "grad_norm": 0.5774110555648804, "learning_rate": 9.988441337838414e-05, "loss": 0.2769100904464722, "memory(GiB)": 122.96, "step": 1420, "token_acc": 0.8816752011704463, "train_speed(iter/s)": 0.235138 }, { "epoch": 0.10862108392407958, "grad_norm": 0.5970641374588013, "learning_rate": 9.988359827154543e-05, "loss": 0.3621359348297119, "memory(GiB)": 122.96, "step": 1425, "token_acc": 0.8643317230273752, "train_speed(iter/s)": 0.235256 }, { "epoch": 0.1090022105343395, "grad_norm": 1.4232295751571655, "learning_rate": 9.988278030411385e-05, "loss": 0.4518928050994873, "memory(GiB)": 122.96, "step": 1430, "token_acc": 0.8198607326672722, "train_speed(iter/s)": 0.235493 }, { "epoch": 0.10938333714459944, "grad_norm": 1.0085704326629639, "learning_rate": 9.98819594761363e-05, "loss": 0.23796632289886474, "memory(GiB)": 122.96, "step": 1435, "token_acc": 0.9016029593094944, "train_speed(iter/s)": 0.235703 }, { "epoch": 0.10976446375485936, "grad_norm": 1.0184452533721924, "learning_rate": 9.988113578765986e-05, "loss": 0.32533955574035645, "memory(GiB)": 122.96, "step": 1440, "token_acc": 0.870767960363336, "train_speed(iter/s)": 0.235975 }, { "epoch": 0.1101455903651193, "grad_norm": 0.7100988030433655, "learning_rate": 9.988030923873175e-05, "loss": 0.37515578269958494, "memory(GiB)": 122.96, "step": 1445, "token_acc": 0.8475103734439834, "train_speed(iter/s)": 0.236224 }, { "epoch": 0.11052671697537922, "grad_norm": 0.9686647057533264, "learning_rate": 9.987947982939938e-05, "loss": 0.3003732681274414, "memory(GiB)": 122.96, "step": 1450, "token_acc": 0.8784977908689249, "train_speed(iter/s)": 0.236474 }, { "epoch": 0.11090784358563915, "grad_norm": 0.7090170979499817, "learning_rate": 9.987864755971033e-05, "loss": 0.34993298053741456, "memory(GiB)": 122.96, "step": 1455, "token_acc": 0.8628963153384748, "train_speed(iter/s)": 0.236702 }, { "epoch": 0.11128897019589908, "grad_norm": 2.440566301345825, "learning_rate": 9.987781242971228e-05, "loss": 0.3135263919830322, "memory(GiB)": 122.96, "step": 1460, "token_acc": 0.8648468708388815, "train_speed(iter/s)": 0.236859 }, { "epoch": 0.11167009680615901, "grad_norm": 0.7448179721832275, "learning_rate": 9.987697443945316e-05, "loss": 0.2933166742324829, "memory(GiB)": 122.96, "step": 1465, "token_acc": 0.8869983948635634, "train_speed(iter/s)": 0.237022 }, { "epoch": 0.11205122341641893, "grad_norm": 0.8079035878181458, "learning_rate": 9.987613358898101e-05, "loss": 0.39398696422576907, "memory(GiB)": 122.96, "step": 1470, "token_acc": 0.8522899188557282, "train_speed(iter/s)": 0.237173 }, { "epoch": 0.11243235002667887, "grad_norm": 0.8160979747772217, "learning_rate": 9.987528987834407e-05, "loss": 0.335716700553894, "memory(GiB)": 122.96, "step": 1475, "token_acc": 0.8657760814249363, "train_speed(iter/s)": 0.23726 }, { "epoch": 0.11281347663693879, "grad_norm": 0.7681664824485779, "learning_rate": 9.987444330759068e-05, "loss": 0.19432508945465088, "memory(GiB)": 122.96, "step": 1480, "token_acc": 0.9091144484722942, "train_speed(iter/s)": 0.237457 }, { "epoch": 0.11319460324719872, "grad_norm": 0.6677245497703552, "learning_rate": 9.987359387676943e-05, "loss": 0.25733270645141604, "memory(GiB)": 122.96, "step": 1485, "token_acc": 0.8769722388655882, "train_speed(iter/s)": 0.237593 }, { "epoch": 0.11357572985745865, "grad_norm": 1.208254098892212, "learning_rate": 9.987274158592901e-05, "loss": 0.32406370639801024, "memory(GiB)": 122.96, "step": 1490, "token_acc": 0.8802365697387876, "train_speed(iter/s)": 0.237667 }, { "epoch": 0.11395685646771858, "grad_norm": 1.6211367845535278, "learning_rate": 9.98718864351183e-05, "loss": 0.23168556690216063, "memory(GiB)": 122.96, "step": 1495, "token_acc": 0.8855488141202427, "train_speed(iter/s)": 0.237868 }, { "epoch": 0.1143379830779785, "grad_norm": 1.0309813022613525, "learning_rate": 9.987102842438632e-05, "loss": 0.30960783958435056, "memory(GiB)": 122.96, "step": 1500, "token_acc": 0.8748577929465301, "train_speed(iter/s)": 0.237943 }, { "epoch": 0.11471910968823844, "grad_norm": 1.669249176979065, "learning_rate": 9.98701675537823e-05, "loss": 0.28080313205718993, "memory(GiB)": 122.96, "step": 1505, "token_acc": 0.8971603990790483, "train_speed(iter/s)": 0.238177 }, { "epoch": 0.11510023629849836, "grad_norm": 0.6794307231903076, "learning_rate": 9.98693038233556e-05, "loss": 0.3472938537597656, "memory(GiB)": 122.96, "step": 1510, "token_acc": 0.8624587847385775, "train_speed(iter/s)": 0.238256 }, { "epoch": 0.1154813629087583, "grad_norm": 0.47566187381744385, "learning_rate": 9.986843723315574e-05, "loss": 0.29816570281982424, "memory(GiB)": 122.96, "step": 1515, "token_acc": 0.8914603779112348, "train_speed(iter/s)": 0.238321 }, { "epoch": 0.11586248951901822, "grad_norm": 0.5965967774391174, "learning_rate": 9.986756778323243e-05, "loss": 0.2521768569946289, "memory(GiB)": 122.96, "step": 1520, "token_acc": 0.8954648526077098, "train_speed(iter/s)": 0.23854 }, { "epoch": 0.11624361612927815, "grad_norm": 0.897229790687561, "learning_rate": 9.986669547363552e-05, "loss": 0.353248143196106, "memory(GiB)": 122.96, "step": 1525, "token_acc": 0.8665246028671058, "train_speed(iter/s)": 0.238696 }, { "epoch": 0.11662474273953807, "grad_norm": 0.39634236693382263, "learning_rate": 9.986582030441503e-05, "loss": 0.31049978733062744, "memory(GiB)": 122.96, "step": 1530, "token_acc": 0.8638324873096447, "train_speed(iter/s)": 0.238821 }, { "epoch": 0.11700586934979801, "grad_norm": 1.684451937675476, "learning_rate": 9.986494227562115e-05, "loss": 0.3601567506790161, "memory(GiB)": 122.96, "step": 1535, "token_acc": 0.8356112376613516, "train_speed(iter/s)": 0.239089 }, { "epoch": 0.11738699596005793, "grad_norm": 0.76612389087677, "learning_rate": 9.986406138730425e-05, "loss": 0.26783323287963867, "memory(GiB)": 122.96, "step": 1540, "token_acc": 0.8914873713751169, "train_speed(iter/s)": 0.239333 }, { "epoch": 0.11776812257031787, "grad_norm": 0.9397041201591492, "learning_rate": 9.986317763951481e-05, "loss": 0.49721155166625974, "memory(GiB)": 122.96, "step": 1545, "token_acc": 0.8011350737797956, "train_speed(iter/s)": 0.239524 }, { "epoch": 0.11814924918057779, "grad_norm": 0.5398195385932922, "learning_rate": 9.986229103230352e-05, "loss": 0.2924522876739502, "memory(GiB)": 122.96, "step": 1550, "token_acc": 0.8673213941844791, "train_speed(iter/s)": 0.239715 }, { "epoch": 0.11853037579083772, "grad_norm": 1.0232571363449097, "learning_rate": 9.986140156572124e-05, "loss": 0.20286931991577148, "memory(GiB)": 122.96, "step": 1555, "token_acc": 0.90625, "train_speed(iter/s)": 0.239916 }, { "epoch": 0.11891150240109764, "grad_norm": 1.2158668041229248, "learning_rate": 9.986050923981896e-05, "loss": 0.29258365631103517, "memory(GiB)": 122.96, "step": 1560, "token_acc": 0.8631926952141058, "train_speed(iter/s)": 0.240055 }, { "epoch": 0.11929262901135758, "grad_norm": 0.8256687521934509, "learning_rate": 9.985961405464785e-05, "loss": 0.272230863571167, "memory(GiB)": 122.96, "step": 1565, "token_acc": 0.8911049062624651, "train_speed(iter/s)": 0.240305 }, { "epoch": 0.1196737556216175, "grad_norm": 1.230517029762268, "learning_rate": 9.985871601025927e-05, "loss": 0.1582486152648926, "memory(GiB)": 122.96, "step": 1570, "token_acc": 0.9267326732673268, "train_speed(iter/s)": 0.240612 }, { "epoch": 0.12005488223187744, "grad_norm": 0.5284491777420044, "learning_rate": 9.985781510670468e-05, "loss": 0.2275702476501465, "memory(GiB)": 122.96, "step": 1575, "token_acc": 0.8994169096209913, "train_speed(iter/s)": 0.240816 }, { "epoch": 0.12043600884213736, "grad_norm": 1.0637966394424438, "learning_rate": 9.985691134403579e-05, "loss": 0.27314493656158445, "memory(GiB)": 122.96, "step": 1580, "token_acc": 0.8875368731563422, "train_speed(iter/s)": 0.240998 }, { "epoch": 0.12081713545239729, "grad_norm": 0.7315467596054077, "learning_rate": 9.985600472230438e-05, "loss": 0.24440603256225585, "memory(GiB)": 122.96, "step": 1585, "token_acc": 0.9009735744089012, "train_speed(iter/s)": 0.241177 }, { "epoch": 0.12119826206265721, "grad_norm": 1.1298457384109497, "learning_rate": 9.985509524156245e-05, "loss": 0.38775172233581545, "memory(GiB)": 122.96, "step": 1590, "token_acc": 0.8499304589707928, "train_speed(iter/s)": 0.241257 }, { "epoch": 0.12157938867291715, "grad_norm": 1.1297736167907715, "learning_rate": 9.985418290186216e-05, "loss": 0.3445568084716797, "memory(GiB)": 122.96, "step": 1595, "token_acc": 0.8621887666473653, "train_speed(iter/s)": 0.241487 }, { "epoch": 0.12196051528317707, "grad_norm": 1.1153380870819092, "learning_rate": 9.985326770325587e-05, "loss": 0.2280574321746826, "memory(GiB)": 122.96, "step": 1600, "token_acc": 0.8745032419995816, "train_speed(iter/s)": 0.241677 }, { "epoch": 0.12196051528317707, "eval_loss": 0.21934306621551514, "eval_runtime": 187.9074, "eval_samples_per_second": 2.821, "eval_steps_per_second": 2.821, "eval_token_acc": 0.8781850490934281, "step": 1600 }, { "epoch": 0.122341641893437, "grad_norm": 0.8126972317695618, "learning_rate": 9.985234964579599e-05, "loss": 0.3189119815826416, "memory(GiB)": 122.96, "step": 1605, "token_acc": 0.8780609588210538, "train_speed(iter/s)": 0.235103 }, { "epoch": 0.12272276850369693, "grad_norm": 0.5869296789169312, "learning_rate": 9.985142872953521e-05, "loss": 0.24471602439880372, "memory(GiB)": 122.96, "step": 1610, "token_acc": 0.8847689674843126, "train_speed(iter/s)": 0.235352 }, { "epoch": 0.12310389511395686, "grad_norm": 0.9726985692977905, "learning_rate": 9.985050495452634e-05, "loss": 0.3701769828796387, "memory(GiB)": 122.96, "step": 1615, "token_acc": 0.8579792572766811, "train_speed(iter/s)": 0.23548 }, { "epoch": 0.12348502172421678, "grad_norm": 0.5328789949417114, "learning_rate": 9.984957832082233e-05, "loss": 0.17985012531280517, "memory(GiB)": 122.96, "step": 1620, "token_acc": 0.9097542242703534, "train_speed(iter/s)": 0.235689 }, { "epoch": 0.12386614833447672, "grad_norm": 0.6906195878982544, "learning_rate": 9.984864882847635e-05, "loss": 0.24752767086029054, "memory(GiB)": 122.96, "step": 1625, "token_acc": 0.8722466960352423, "train_speed(iter/s)": 0.235802 }, { "epoch": 0.12424727494473664, "grad_norm": 0.7773347496986389, "learning_rate": 9.984771647754168e-05, "loss": 0.3235522985458374, "memory(GiB)": 122.96, "step": 1630, "token_acc": 0.8742690058479532, "train_speed(iter/s)": 0.236023 }, { "epoch": 0.12462840155499658, "grad_norm": 0.7570682764053345, "learning_rate": 9.984678126807178e-05, "loss": 0.36856827735900877, "memory(GiB)": 122.96, "step": 1635, "token_acc": 0.8507347254447022, "train_speed(iter/s)": 0.236264 }, { "epoch": 0.1250095281652565, "grad_norm": 0.5138041973114014, "learning_rate": 9.98458432001203e-05, "loss": 0.3217953681945801, "memory(GiB)": 122.96, "step": 1640, "token_acc": 0.8672319959498798, "train_speed(iter/s)": 0.236375 }, { "epoch": 0.12539065477551642, "grad_norm": 0.7398234009742737, "learning_rate": 9.984490227374103e-05, "loss": 0.2245692014694214, "memory(GiB)": 122.96, "step": 1645, "token_acc": 0.9133550488599349, "train_speed(iter/s)": 0.236479 }, { "epoch": 0.12577178138577635, "grad_norm": 0.7383055686950684, "learning_rate": 9.984395848898793e-05, "loss": 0.26722254753112795, "memory(GiB)": 122.96, "step": 1650, "token_acc": 0.8760414549888235, "train_speed(iter/s)": 0.236648 }, { "epoch": 0.1261529079960363, "grad_norm": 0.7976310849189758, "learning_rate": 9.984301184591509e-05, "loss": 0.3560169696807861, "memory(GiB)": 122.96, "step": 1655, "token_acc": 0.8408034219825181, "train_speed(iter/s)": 0.23679 }, { "epoch": 0.12653403460629623, "grad_norm": 0.7168083786964417, "learning_rate": 9.984206234457683e-05, "loss": 0.24000742435455322, "memory(GiB)": 122.96, "step": 1660, "token_acc": 0.8801005608199575, "train_speed(iter/s)": 0.236988 }, { "epoch": 0.12691516121655613, "grad_norm": 0.6233680844306946, "learning_rate": 9.98411099850276e-05, "loss": 0.2531128883361816, "memory(GiB)": 122.96, "step": 1665, "token_acc": 0.8903225806451613, "train_speed(iter/s)": 0.237157 }, { "epoch": 0.12729628782681607, "grad_norm": 0.6759544610977173, "learning_rate": 9.9840154767322e-05, "loss": 0.22955164909362794, "memory(GiB)": 122.96, "step": 1670, "token_acc": 0.8998077333903012, "train_speed(iter/s)": 0.237333 }, { "epoch": 0.127677414437076, "grad_norm": 1.4952698945999146, "learning_rate": 9.98391966915148e-05, "loss": 0.25714402198791503, "memory(GiB)": 122.96, "step": 1675, "token_acc": 0.8724091520861372, "train_speed(iter/s)": 0.237546 }, { "epoch": 0.12805854104733594, "grad_norm": 0.2818395793437958, "learning_rate": 9.983823575766097e-05, "loss": 0.25081815719604494, "memory(GiB)": 122.96, "step": 1680, "token_acc": 0.8753277711561382, "train_speed(iter/s)": 0.237744 }, { "epoch": 0.12843966765759585, "grad_norm": 1.3141353130340576, "learning_rate": 9.983727196581558e-05, "loss": 0.2657924652099609, "memory(GiB)": 122.96, "step": 1685, "token_acc": 0.8488372093023255, "train_speed(iter/s)": 0.237992 }, { "epoch": 0.12882079426785578, "grad_norm": 1.5208375453948975, "learning_rate": 9.983630531603393e-05, "loss": 0.2389721155166626, "memory(GiB)": 122.96, "step": 1690, "token_acc": 0.901840490797546, "train_speed(iter/s)": 0.238136 }, { "epoch": 0.12920192087811572, "grad_norm": 1.6626639366149902, "learning_rate": 9.983533580837143e-05, "loss": 0.4329957962036133, "memory(GiB)": 122.96, "step": 1695, "token_acc": 0.8464687819856704, "train_speed(iter/s)": 0.23828 }, { "epoch": 0.12958304748837565, "grad_norm": 0.9874489903450012, "learning_rate": 9.983436344288368e-05, "loss": 0.33500621318817136, "memory(GiB)": 122.96, "step": 1700, "token_acc": 0.8754257907542579, "train_speed(iter/s)": 0.23843 }, { "epoch": 0.12996417409863556, "grad_norm": 0.5268015265464783, "learning_rate": 9.983338821962647e-05, "loss": 0.2787923336029053, "memory(GiB)": 122.96, "step": 1705, "token_acc": 0.8704839809134287, "train_speed(iter/s)": 0.238568 }, { "epoch": 0.1303453007088955, "grad_norm": 1.4461921453475952, "learning_rate": 9.983241013865568e-05, "loss": 0.32826807498931887, "memory(GiB)": 122.96, "step": 1710, "token_acc": 0.8601455133387227, "train_speed(iter/s)": 0.238768 }, { "epoch": 0.13072642731915543, "grad_norm": 0.9359697699546814, "learning_rate": 9.983142920002742e-05, "loss": 0.3203974485397339, "memory(GiB)": 122.96, "step": 1715, "token_acc": 0.8750318147111225, "train_speed(iter/s)": 0.238944 }, { "epoch": 0.13110755392941534, "grad_norm": 0.6711606979370117, "learning_rate": 9.983044540379795e-05, "loss": 0.28200974464416506, "memory(GiB)": 122.96, "step": 1720, "token_acc": 0.8971155612713577, "train_speed(iter/s)": 0.239066 }, { "epoch": 0.13148868053967527, "grad_norm": 0.6044157147407532, "learning_rate": 9.982945875002367e-05, "loss": 0.3281579494476318, "memory(GiB)": 122.96, "step": 1725, "token_acc": 0.8809922896413007, "train_speed(iter/s)": 0.239226 }, { "epoch": 0.1318698071499352, "grad_norm": 1.8961362838745117, "learning_rate": 9.982846923876116e-05, "loss": 0.24423377513885497, "memory(GiB)": 122.96, "step": 1730, "token_acc": 0.9073339940535183, "train_speed(iter/s)": 0.239474 }, { "epoch": 0.13225093376019514, "grad_norm": 1.1634563207626343, "learning_rate": 9.982747687006719e-05, "loss": 0.2893033027648926, "memory(GiB)": 122.96, "step": 1735, "token_acc": 0.8914563697172393, "train_speed(iter/s)": 0.239596 }, { "epoch": 0.13263206037045505, "grad_norm": 0.7382217049598694, "learning_rate": 9.982648164399864e-05, "loss": 0.3639101982116699, "memory(GiB)": 122.96, "step": 1740, "token_acc": 0.8460291734197731, "train_speed(iter/s)": 0.239742 }, { "epoch": 0.133013186980715, "grad_norm": 0.7665942311286926, "learning_rate": 9.98254835606126e-05, "loss": 0.2679546356201172, "memory(GiB)": 122.96, "step": 1745, "token_acc": 0.896854764107308, "train_speed(iter/s)": 0.239782 }, { "epoch": 0.13339431359097492, "grad_norm": 1.0224422216415405, "learning_rate": 9.982448261996629e-05, "loss": 0.2662910223007202, "memory(GiB)": 122.96, "step": 1750, "token_acc": 0.8751046025104603, "train_speed(iter/s)": 0.239966 }, { "epoch": 0.13377544020123486, "grad_norm": 0.23131869733333588, "learning_rate": 9.982347882211711e-05, "loss": 0.17339050769805908, "memory(GiB)": 122.96, "step": 1755, "token_acc": 0.9032822757111597, "train_speed(iter/s)": 0.240097 }, { "epoch": 0.13415656681149477, "grad_norm": 0.873803436756134, "learning_rate": 9.982247216712264e-05, "loss": 0.32798640727996825, "memory(GiB)": 122.96, "step": 1760, "token_acc": 0.8807142857142857, "train_speed(iter/s)": 0.240245 }, { "epoch": 0.1345376934217547, "grad_norm": 1.3829140663146973, "learning_rate": 9.982146265504057e-05, "loss": 0.3122976064682007, "memory(GiB)": 122.96, "step": 1765, "token_acc": 0.881060116354234, "train_speed(iter/s)": 0.240384 }, { "epoch": 0.13491882003201464, "grad_norm": 1.5748395919799805, "learning_rate": 9.982045028592884e-05, "loss": 0.34187591075897217, "memory(GiB)": 122.96, "step": 1770, "token_acc": 0.8577038577038577, "train_speed(iter/s)": 0.240542 }, { "epoch": 0.13529994664227457, "grad_norm": 0.7609471678733826, "learning_rate": 9.981943505984548e-05, "loss": 0.3357156991958618, "memory(GiB)": 122.96, "step": 1775, "token_acc": 0.8569656883009508, "train_speed(iter/s)": 0.240721 }, { "epoch": 0.13568107325253448, "grad_norm": 0.4783948063850403, "learning_rate": 9.98184169768487e-05, "loss": 0.28749325275421145, "memory(GiB)": 122.96, "step": 1780, "token_acc": 0.8792022792022792, "train_speed(iter/s)": 0.240889 }, { "epoch": 0.13606219986279441, "grad_norm": 0.7209165096282959, "learning_rate": 9.981739603699691e-05, "loss": 0.2604418992996216, "memory(GiB)": 122.96, "step": 1785, "token_acc": 0.8742548963951178, "train_speed(iter/s)": 0.240976 }, { "epoch": 0.13644332647305435, "grad_norm": 1.444502592086792, "learning_rate": 9.981637224034862e-05, "loss": 0.32602221965789796, "memory(GiB)": 122.96, "step": 1790, "token_acc": 0.8773459435495788, "train_speed(iter/s)": 0.241047 }, { "epoch": 0.13682445308331428, "grad_norm": 0.8601609468460083, "learning_rate": 9.981534558696258e-05, "loss": 0.2807781219482422, "memory(GiB)": 122.96, "step": 1795, "token_acc": 0.8816680205794747, "train_speed(iter/s)": 0.241138 }, { "epoch": 0.1372055796935742, "grad_norm": 1.36115300655365, "learning_rate": 9.981431607689762e-05, "loss": 0.26647086143493653, "memory(GiB)": 122.96, "step": 1800, "token_acc": 0.8788659793814433, "train_speed(iter/s)": 0.241304 }, { "epoch": 0.1372055796935742, "eval_loss": 0.20519913733005524, "eval_runtime": 187.5612, "eval_samples_per_second": 2.826, "eval_steps_per_second": 2.826, "eval_token_acc": 0.8880263237154389, "step": 1800 }, { "epoch": 0.13758670630383413, "grad_norm": 0.9871574640274048, "learning_rate": 9.98132837102128e-05, "loss": 0.3051834344863892, "memory(GiB)": 122.96, "step": 1805, "token_acc": 0.8870226101218369, "train_speed(iter/s)": 0.235602 }, { "epoch": 0.13796783291409406, "grad_norm": 0.8670793175697327, "learning_rate": 9.981224848696733e-05, "loss": 0.20040321350097656, "memory(GiB)": 122.96, "step": 1810, "token_acc": 0.926112510495382, "train_speed(iter/s)": 0.235783 }, { "epoch": 0.138348959524354, "grad_norm": 0.8690879940986633, "learning_rate": 9.981121040722057e-05, "loss": 0.2009223222732544, "memory(GiB)": 122.96, "step": 1815, "token_acc": 0.9164007657945118, "train_speed(iter/s)": 0.236036 }, { "epoch": 0.1387300861346139, "grad_norm": 0.7145750522613525, "learning_rate": 9.981016947103204e-05, "loss": 0.3130302429199219, "memory(GiB)": 122.96, "step": 1820, "token_acc": 0.8684786574242978, "train_speed(iter/s)": 0.236171 }, { "epoch": 0.13911121274487384, "grad_norm": 1.1669580936431885, "learning_rate": 9.980912567846143e-05, "loss": 0.32374980449676516, "memory(GiB)": 122.96, "step": 1825, "token_acc": 0.8695733532934131, "train_speed(iter/s)": 0.2363 }, { "epoch": 0.13949233935513378, "grad_norm": 0.9976136684417725, "learning_rate": 9.980807902956862e-05, "loss": 0.23629510402679443, "memory(GiB)": 122.96, "step": 1830, "token_acc": 0.8929453746911886, "train_speed(iter/s)": 0.236458 }, { "epoch": 0.1398734659653937, "grad_norm": 1.4495136737823486, "learning_rate": 9.98070295244136e-05, "loss": 0.2617368459701538, "memory(GiB)": 122.96, "step": 1835, "token_acc": 0.8948871750928307, "train_speed(iter/s)": 0.236667 }, { "epoch": 0.14025459257565362, "grad_norm": 0.8070573210716248, "learning_rate": 9.980597716305658e-05, "loss": 0.24829914569854736, "memory(GiB)": 122.96, "step": 1840, "token_acc": 0.8998002151529122, "train_speed(iter/s)": 0.236736 }, { "epoch": 0.14063571918591355, "grad_norm": 1.3157578706741333, "learning_rate": 9.980492194555788e-05, "loss": 0.24158000946044922, "memory(GiB)": 122.96, "step": 1845, "token_acc": 0.8975365940735451, "train_speed(iter/s)": 0.236934 }, { "epoch": 0.1410168457961735, "grad_norm": 0.7841731905937195, "learning_rate": 9.980386387197805e-05, "loss": 0.39017138481140134, "memory(GiB)": 122.96, "step": 1850, "token_acc": 0.8480383454642286, "train_speed(iter/s)": 0.237082 }, { "epoch": 0.14139797240643343, "grad_norm": 1.1406445503234863, "learning_rate": 9.980280294237774e-05, "loss": 0.22648651599884034, "memory(GiB)": 122.96, "step": 1855, "token_acc": 0.8981968444778362, "train_speed(iter/s)": 0.237304 }, { "epoch": 0.14177909901669333, "grad_norm": 1.1483399868011475, "learning_rate": 9.98017391568178e-05, "loss": 0.2971674919128418, "memory(GiB)": 122.96, "step": 1860, "token_acc": 0.8874868559411146, "train_speed(iter/s)": 0.237507 }, { "epoch": 0.14216022562695327, "grad_norm": 1.3195897340774536, "learning_rate": 9.980067251535922e-05, "loss": 0.3522101879119873, "memory(GiB)": 122.96, "step": 1865, "token_acc": 0.8594978442810043, "train_speed(iter/s)": 0.237663 }, { "epoch": 0.1425413522372132, "grad_norm": 0.5908188223838806, "learning_rate": 9.979960301806317e-05, "loss": 0.3194094657897949, "memory(GiB)": 122.96, "step": 1870, "token_acc": 0.8613229787824157, "train_speed(iter/s)": 0.237768 }, { "epoch": 0.14292247884747314, "grad_norm": 1.5131807327270508, "learning_rate": 9.9798530664991e-05, "loss": 0.30899298191070557, "memory(GiB)": 122.96, "step": 1875, "token_acc": 0.8873239436619719, "train_speed(iter/s)": 0.237915 }, { "epoch": 0.14330360545773305, "grad_norm": 0.9198616147041321, "learning_rate": 9.979745545620418e-05, "loss": 0.2468344211578369, "memory(GiB)": 122.96, "step": 1880, "token_acc": 0.9106749816581071, "train_speed(iter/s)": 0.238061 }, { "epoch": 0.14368473206799298, "grad_norm": 1.0523463487625122, "learning_rate": 9.979637739176437e-05, "loss": 0.3103508472442627, "memory(GiB)": 122.96, "step": 1885, "token_acc": 0.8868662186927306, "train_speed(iter/s)": 0.238106 }, { "epoch": 0.14406585867825292, "grad_norm": 1.820716381072998, "learning_rate": 9.979529647173342e-05, "loss": 0.3548320770263672, "memory(GiB)": 122.96, "step": 1890, "token_acc": 0.8629969418960245, "train_speed(iter/s)": 0.238248 }, { "epoch": 0.14444698528851285, "grad_norm": 0.843774676322937, "learning_rate": 9.979421269617328e-05, "loss": 0.25311760902404784, "memory(GiB)": 122.96, "step": 1895, "token_acc": 0.8962025316455696, "train_speed(iter/s)": 0.238477 }, { "epoch": 0.14482811189877276, "grad_norm": 1.19166898727417, "learning_rate": 9.979312606514613e-05, "loss": 0.20560357570648194, "memory(GiB)": 122.96, "step": 1900, "token_acc": 0.893775228383259, "train_speed(iter/s)": 0.238622 }, { "epoch": 0.1452092385090327, "grad_norm": 0.6783161759376526, "learning_rate": 9.979203657871426e-05, "loss": 0.2838034152984619, "memory(GiB)": 122.96, "step": 1905, "token_acc": 0.8902870625245772, "train_speed(iter/s)": 0.238749 }, { "epoch": 0.14559036511929263, "grad_norm": 0.5095587968826294, "learning_rate": 9.979094423694015e-05, "loss": 0.21685729026794434, "memory(GiB)": 122.96, "step": 1910, "token_acc": 0.916065911431514, "train_speed(iter/s)": 0.238916 }, { "epoch": 0.14597149172955257, "grad_norm": 0.6157704591751099, "learning_rate": 9.978984903988644e-05, "loss": 0.27637245655059817, "memory(GiB)": 122.96, "step": 1915, "token_acc": 0.877690176859152, "train_speed(iter/s)": 0.239071 }, { "epoch": 0.14635261833981247, "grad_norm": 0.5641518235206604, "learning_rate": 9.978875098761595e-05, "loss": 0.22433173656463623, "memory(GiB)": 122.96, "step": 1920, "token_acc": 0.9058589638638755, "train_speed(iter/s)": 0.239135 }, { "epoch": 0.1467337449500724, "grad_norm": 1.3883605003356934, "learning_rate": 9.978765008019165e-05, "loss": 0.345409631729126, "memory(GiB)": 122.96, "step": 1925, "token_acc": 0.8713904931141715, "train_speed(iter/s)": 0.239249 }, { "epoch": 0.14711487156033234, "grad_norm": 1.2038651704788208, "learning_rate": 9.978654631767665e-05, "loss": 0.3504453182220459, "memory(GiB)": 122.96, "step": 1930, "token_acc": 0.8709171674941207, "train_speed(iter/s)": 0.239432 }, { "epoch": 0.14749599817059228, "grad_norm": 1.9789129495620728, "learning_rate": 9.978543970013425e-05, "loss": 0.2633713960647583, "memory(GiB)": 122.96, "step": 1935, "token_acc": 0.878415551072804, "train_speed(iter/s)": 0.239562 }, { "epoch": 0.1478771247808522, "grad_norm": 0.604311466217041, "learning_rate": 9.978433022762794e-05, "loss": 0.24987993240356446, "memory(GiB)": 122.96, "step": 1940, "token_acc": 0.8959395656279509, "train_speed(iter/s)": 0.239657 }, { "epoch": 0.14825825139111212, "grad_norm": 0.878157913684845, "learning_rate": 9.97832179002213e-05, "loss": 0.2976180076599121, "memory(GiB)": 122.96, "step": 1945, "token_acc": 0.8881148706283765, "train_speed(iter/s)": 0.2397 }, { "epoch": 0.14863937800137206, "grad_norm": 0.7941150665283203, "learning_rate": 9.978210271797813e-05, "loss": 0.3298220157623291, "memory(GiB)": 122.96, "step": 1950, "token_acc": 0.8717737789203085, "train_speed(iter/s)": 0.239703 }, { "epoch": 0.149020504611632, "grad_norm": 0.8733285665512085, "learning_rate": 9.97809846809624e-05, "loss": 0.3037374496459961, "memory(GiB)": 122.96, "step": 1955, "token_acc": 0.8667074289208193, "train_speed(iter/s)": 0.239856 }, { "epoch": 0.1494016312218919, "grad_norm": 1.3003733158111572, "learning_rate": 9.97798637892382e-05, "loss": 0.23170437812805175, "memory(GiB)": 122.96, "step": 1960, "token_acc": 0.8865487222690256, "train_speed(iter/s)": 0.240042 }, { "epoch": 0.14978275783215184, "grad_norm": 0.9020549654960632, "learning_rate": 9.977874004286982e-05, "loss": 0.3253824710845947, "memory(GiB)": 122.96, "step": 1965, "token_acc": 0.8826634185952091, "train_speed(iter/s)": 0.240101 }, { "epoch": 0.15016388444241177, "grad_norm": 1.3014813661575317, "learning_rate": 9.977761344192171e-05, "loss": 0.21997196674346925, "memory(GiB)": 122.96, "step": 1970, "token_acc": 0.9049307122139865, "train_speed(iter/s)": 0.240288 }, { "epoch": 0.1505450110526717, "grad_norm": 1.3385120630264282, "learning_rate": 9.977648398645846e-05, "loss": 0.3753857135772705, "memory(GiB)": 122.96, "step": 1975, "token_acc": 0.8583705172698728, "train_speed(iter/s)": 0.240382 }, { "epoch": 0.15092613766293161, "grad_norm": 1.4283915758132935, "learning_rate": 9.977535167654483e-05, "loss": 0.22391667366027831, "memory(GiB)": 122.96, "step": 1980, "token_acc": 0.9056902002107482, "train_speed(iter/s)": 0.240522 }, { "epoch": 0.15130726427319155, "grad_norm": 1.1512219905853271, "learning_rate": 9.977421651224579e-05, "loss": 0.34629182815551757, "memory(GiB)": 122.96, "step": 1985, "token_acc": 0.8764965343415249, "train_speed(iter/s)": 0.240591 }, { "epoch": 0.15168839088345148, "grad_norm": 1.2600854635238647, "learning_rate": 9.97730784936264e-05, "loss": 0.30235106945037843, "memory(GiB)": 122.96, "step": 1990, "token_acc": 0.8854909955827387, "train_speed(iter/s)": 0.240772 }, { "epoch": 0.15206951749371142, "grad_norm": 0.447229266166687, "learning_rate": 9.977193762075194e-05, "loss": 0.21035428047180177, "memory(GiB)": 122.96, "step": 1995, "token_acc": 0.8959897061977268, "train_speed(iter/s)": 0.240918 }, { "epoch": 0.15245064410397133, "grad_norm": 0.8872116804122925, "learning_rate": 9.977079389368782e-05, "loss": 0.24118647575378419, "memory(GiB)": 122.96, "step": 2000, "token_acc": 0.9077608142493638, "train_speed(iter/s)": 0.241137 }, { "epoch": 0.15245064410397133, "eval_loss": 0.20668385922908783, "eval_runtime": 184.1995, "eval_samples_per_second": 2.877, "eval_steps_per_second": 2.877, "eval_token_acc": 0.885714715980965, "step": 2000 }, { "epoch": 0.15283177071423126, "grad_norm": 1.1372349262237549, "learning_rate": 9.976964731249965e-05, "loss": 0.265593957901001, "memory(GiB)": 122.96, "step": 2005, "token_acc": 0.8852057777030927, "train_speed(iter/s)": 0.236015 }, { "epoch": 0.1532128973244912, "grad_norm": 2.123196601867676, "learning_rate": 9.976849787725315e-05, "loss": 0.20663881301879883, "memory(GiB)": 122.96, "step": 2010, "token_acc": 0.898103220114689, "train_speed(iter/s)": 0.236148 }, { "epoch": 0.15359402393475113, "grad_norm": 1.8054008483886719, "learning_rate": 9.976734558801426e-05, "loss": 0.27978086471557617, "memory(GiB)": 122.96, "step": 2015, "token_acc": 0.8739085772984078, "train_speed(iter/s)": 0.236294 }, { "epoch": 0.15397515054501104, "grad_norm": 0.8437690138816833, "learning_rate": 9.976619044484905e-05, "loss": 0.21602773666381836, "memory(GiB)": 122.96, "step": 2020, "token_acc": 0.8905552010210593, "train_speed(iter/s)": 0.236479 }, { "epoch": 0.15435627715527098, "grad_norm": 1.1259104013442993, "learning_rate": 9.976503244782376e-05, "loss": 0.22199716567993164, "memory(GiB)": 122.96, "step": 2025, "token_acc": 0.9176019709827539, "train_speed(iter/s)": 0.236648 }, { "epoch": 0.1547374037655309, "grad_norm": 0.92354416847229, "learning_rate": 9.976387159700479e-05, "loss": 0.19872859716415406, "memory(GiB)": 122.96, "step": 2030, "token_acc": 0.8967343336275375, "train_speed(iter/s)": 0.236763 }, { "epoch": 0.15511853037579085, "grad_norm": 1.0854308605194092, "learning_rate": 9.976270789245872e-05, "loss": 0.17748762369155885, "memory(GiB)": 122.96, "step": 2035, "token_acc": 0.9172433679354095, "train_speed(iter/s)": 0.236926 }, { "epoch": 0.15549965698605075, "grad_norm": 1.5527721643447876, "learning_rate": 9.976154133425228e-05, "loss": 0.2648783683776855, "memory(GiB)": 122.96, "step": 2040, "token_acc": 0.8918850806451613, "train_speed(iter/s)": 0.237077 }, { "epoch": 0.1558807835963107, "grad_norm": 1.4501986503601074, "learning_rate": 9.976037192245237e-05, "loss": 0.21418545246124268, "memory(GiB)": 122.96, "step": 2045, "token_acc": 0.877910447761194, "train_speed(iter/s)": 0.237183 }, { "epoch": 0.15626191020657063, "grad_norm": 0.7141785025596619, "learning_rate": 9.975919965712603e-05, "loss": 0.2681345224380493, "memory(GiB)": 122.96, "step": 2050, "token_acc": 0.8947151114781172, "train_speed(iter/s)": 0.237214 }, { "epoch": 0.15664303681683056, "grad_norm": 1.6140422821044922, "learning_rate": 9.975802453834053e-05, "loss": 0.2997597694396973, "memory(GiB)": 122.96, "step": 2055, "token_acc": 0.8774156877605154, "train_speed(iter/s)": 0.237345 }, { "epoch": 0.15702416342709047, "grad_norm": 1.0279288291931152, "learning_rate": 9.975684656616321e-05, "loss": 0.1712648630142212, "memory(GiB)": 122.96, "step": 2060, "token_acc": 0.9270758122743682, "train_speed(iter/s)": 0.237401 }, { "epoch": 0.1574052900373504, "grad_norm": 1.6670385599136353, "learning_rate": 9.975566574066164e-05, "loss": 0.2513648509979248, "memory(GiB)": 122.96, "step": 2065, "token_acc": 0.8996113989637305, "train_speed(iter/s)": 0.237488 }, { "epoch": 0.15778641664761034, "grad_norm": 0.7845636010169983, "learning_rate": 9.975448206190355e-05, "loss": 0.22825837135314941, "memory(GiB)": 122.96, "step": 2070, "token_acc": 0.8874788494077834, "train_speed(iter/s)": 0.237677 }, { "epoch": 0.15816754325787027, "grad_norm": 0.340576708316803, "learning_rate": 9.975329552995678e-05, "loss": 0.17792811393737792, "memory(GiB)": 122.96, "step": 2075, "token_acc": 0.9224137931034483, "train_speed(iter/s)": 0.237859 }, { "epoch": 0.15854866986813018, "grad_norm": 1.1290943622589111, "learning_rate": 9.975210614488939e-05, "loss": 0.30759634971618655, "memory(GiB)": 122.96, "step": 2080, "token_acc": 0.8793243462725353, "train_speed(iter/s)": 0.237907 }, { "epoch": 0.15892979647839012, "grad_norm": 2.1738476753234863, "learning_rate": 9.975091390676961e-05, "loss": 0.3035086154937744, "memory(GiB)": 122.96, "step": 2085, "token_acc": 0.8951680672268908, "train_speed(iter/s)": 0.238019 }, { "epoch": 0.15931092308865005, "grad_norm": 1.1320093870162964, "learning_rate": 9.97497188156658e-05, "loss": 0.3861212968826294, "memory(GiB)": 122.96, "step": 2090, "token_acc": 0.8492569002123143, "train_speed(iter/s)": 0.238173 }, { "epoch": 0.15969204969891, "grad_norm": 0.9966463446617126, "learning_rate": 9.974852087164645e-05, "loss": 0.3135369300842285, "memory(GiB)": 122.96, "step": 2095, "token_acc": 0.8503618919373843, "train_speed(iter/s)": 0.238323 }, { "epoch": 0.1600731763091699, "grad_norm": 0.21104849874973297, "learning_rate": 9.974732007478031e-05, "loss": 0.23520112037658691, "memory(GiB)": 122.96, "step": 2100, "token_acc": 0.9032090424840847, "train_speed(iter/s)": 0.23838 }, { "epoch": 0.16045430291942983, "grad_norm": 0.7030470967292786, "learning_rate": 9.974611642513622e-05, "loss": 0.24723215103149415, "memory(GiB)": 122.96, "step": 2105, "token_acc": 0.9000299760191847, "train_speed(iter/s)": 0.238483 }, { "epoch": 0.16083542952968977, "grad_norm": 0.6704449653625488, "learning_rate": 9.97449099227832e-05, "loss": 0.2639194965362549, "memory(GiB)": 122.96, "step": 2110, "token_acc": 0.9041769041769042, "train_speed(iter/s)": 0.238566 }, { "epoch": 0.1612165561399497, "grad_norm": 0.9133105874061584, "learning_rate": 9.974370056779044e-05, "loss": 0.30077123641967773, "memory(GiB)": 122.96, "step": 2115, "token_acc": 0.8836830311672439, "train_speed(iter/s)": 0.238699 }, { "epoch": 0.1615976827502096, "grad_norm": 0.6244292259216309, "learning_rate": 9.974248836022728e-05, "loss": 0.27397823333740234, "memory(GiB)": 122.96, "step": 2120, "token_acc": 0.8956294846705806, "train_speed(iter/s)": 0.238718 }, { "epoch": 0.16197880936046954, "grad_norm": 1.4584237337112427, "learning_rate": 9.974127330016325e-05, "loss": 0.3413764476776123, "memory(GiB)": 122.96, "step": 2125, "token_acc": 0.8930018158960749, "train_speed(iter/s)": 0.238755 }, { "epoch": 0.16235993597072948, "grad_norm": 0.59260094165802, "learning_rate": 9.974005538766803e-05, "loss": 0.24233701229095458, "memory(GiB)": 122.96, "step": 2130, "token_acc": 0.9027848966342047, "train_speed(iter/s)": 0.238858 }, { "epoch": 0.16274106258098942, "grad_norm": 1.5250327587127686, "learning_rate": 9.973883462281146e-05, "loss": 0.3027610778808594, "memory(GiB)": 122.96, "step": 2135, "token_acc": 0.8899117039263573, "train_speed(iter/s)": 0.238961 }, { "epoch": 0.16312218919124932, "grad_norm": 1.7640290260314941, "learning_rate": 9.973761100566354e-05, "loss": 0.21650912761688232, "memory(GiB)": 122.96, "step": 2140, "token_acc": 0.9123244929797192, "train_speed(iter/s)": 0.239135 }, { "epoch": 0.16350331580150926, "grad_norm": 2.394850015640259, "learning_rate": 9.973638453629443e-05, "loss": 0.2717435836791992, "memory(GiB)": 122.96, "step": 2145, "token_acc": 0.9024311183144247, "train_speed(iter/s)": 0.239295 }, { "epoch": 0.1638844424117692, "grad_norm": 1.7957568168640137, "learning_rate": 9.973515521477448e-05, "loss": 0.27200021743774416, "memory(GiB)": 122.96, "step": 2150, "token_acc": 0.8996625421822272, "train_speed(iter/s)": 0.239406 }, { "epoch": 0.16426556902202913, "grad_norm": 1.2874975204467773, "learning_rate": 9.973392304117418e-05, "loss": 0.28477942943573, "memory(GiB)": 122.96, "step": 2155, "token_acc": 0.8939520624303233, "train_speed(iter/s)": 0.239507 }, { "epoch": 0.16464669563228904, "grad_norm": 0.6684015393257141, "learning_rate": 9.973268801556418e-05, "loss": 0.251181435585022, "memory(GiB)": 122.96, "step": 2160, "token_acc": 0.897028897028897, "train_speed(iter/s)": 0.239632 }, { "epoch": 0.16502782224254897, "grad_norm": 1.5154551267623901, "learning_rate": 9.973145013801532e-05, "loss": 0.3319683074951172, "memory(GiB)": 122.96, "step": 2165, "token_acc": 0.8660578386605784, "train_speed(iter/s)": 0.239782 }, { "epoch": 0.1654089488528089, "grad_norm": 0.9595803618431091, "learning_rate": 9.973020940859856e-05, "loss": 0.2190579891204834, "memory(GiB)": 122.96, "step": 2170, "token_acc": 0.8989776046738072, "train_speed(iter/s)": 0.239905 }, { "epoch": 0.16579007546306884, "grad_norm": 0.9698189496994019, "learning_rate": 9.972896582738509e-05, "loss": 0.3877572536468506, "memory(GiB)": 122.96, "step": 2175, "token_acc": 0.8187591597459697, "train_speed(iter/s)": 0.240065 }, { "epoch": 0.16617120207332875, "grad_norm": 0.5570741891860962, "learning_rate": 9.972771939444618e-05, "loss": 0.3530031442642212, "memory(GiB)": 122.96, "step": 2180, "token_acc": 0.8819084438977243, "train_speed(iter/s)": 0.240032 }, { "epoch": 0.16655232868358869, "grad_norm": 0.7085857391357422, "learning_rate": 9.972647010985335e-05, "loss": 0.19612634181976318, "memory(GiB)": 122.96, "step": 2185, "token_acc": 0.8913894324853229, "train_speed(iter/s)": 0.240194 }, { "epoch": 0.16693345529384862, "grad_norm": 0.5055636763572693, "learning_rate": 9.97252179736782e-05, "loss": 0.2705382823944092, "memory(GiB)": 122.96, "step": 2190, "token_acc": 0.898902054571149, "train_speed(iter/s)": 0.240138 }, { "epoch": 0.16731458190410856, "grad_norm": 1.396628975868225, "learning_rate": 9.972396298599255e-05, "loss": 0.2956686973571777, "memory(GiB)": 122.96, "step": 2195, "token_acc": 0.8897560975609756, "train_speed(iter/s)": 0.240296 }, { "epoch": 0.16769570851436846, "grad_norm": 0.9214076995849609, "learning_rate": 9.972270514686836e-05, "loss": 0.3205678462982178, "memory(GiB)": 122.96, "step": 2200, "token_acc": 0.8811737712836614, "train_speed(iter/s)": 0.240373 }, { "epoch": 0.16769570851436846, "eval_loss": 0.19462092220783234, "eval_runtime": 192.7243, "eval_samples_per_second": 2.75, "eval_steps_per_second": 2.75, "eval_token_acc": 0.8964821396301428, "step": 2200 }, { "epoch": 0.1680768351246284, "grad_norm": 1.0538688898086548, "learning_rate": 9.97214444563778e-05, "loss": 0.3501262664794922, "memory(GiB)": 122.96, "step": 2205, "token_acc": 0.8955404645529651, "train_speed(iter/s)": 0.235489 }, { "epoch": 0.16845796173488833, "grad_norm": 1.378690242767334, "learning_rate": 9.972018091459312e-05, "loss": 0.3005554676055908, "memory(GiB)": 122.96, "step": 2210, "token_acc": 0.8797385620915033, "train_speed(iter/s)": 0.235616 }, { "epoch": 0.16883908834514827, "grad_norm": 1.0340962409973145, "learning_rate": 9.971891452158679e-05, "loss": 0.29862303733825685, "memory(GiB)": 122.96, "step": 2215, "token_acc": 0.8950563746747615, "train_speed(iter/s)": 0.23574 }, { "epoch": 0.16922021495540818, "grad_norm": 1.8549689054489136, "learning_rate": 9.971764527743146e-05, "loss": 0.2782621145248413, "memory(GiB)": 122.96, "step": 2220, "token_acc": 0.8913583312639682, "train_speed(iter/s)": 0.235774 }, { "epoch": 0.1696013415656681, "grad_norm": 0.8636431097984314, "learning_rate": 9.971637318219987e-05, "loss": 0.1819000720977783, "memory(GiB)": 122.96, "step": 2225, "token_acc": 0.9229157938835358, "train_speed(iter/s)": 0.235953 }, { "epoch": 0.16998246817592805, "grad_norm": 0.9445730447769165, "learning_rate": 9.971509823596499e-05, "loss": 0.24790711402893068, "memory(GiB)": 122.96, "step": 2230, "token_acc": 0.8953188352377442, "train_speed(iter/s)": 0.236072 }, { "epoch": 0.17036359478618798, "grad_norm": 0.7660523056983948, "learning_rate": 9.971382043879993e-05, "loss": 0.26198453903198243, "memory(GiB)": 122.96, "step": 2235, "token_acc": 0.8889661164205039, "train_speed(iter/s)": 0.236161 }, { "epoch": 0.1707447213964479, "grad_norm": 0.9888396859169006, "learning_rate": 9.971253979077799e-05, "loss": 0.27241039276123047, "memory(GiB)": 122.96, "step": 2240, "token_acc": 0.8815615932471643, "train_speed(iter/s)": 0.236324 }, { "epoch": 0.17112584800670783, "grad_norm": 1.3814053535461426, "learning_rate": 9.971125629197257e-05, "loss": 0.2658933162689209, "memory(GiB)": 122.96, "step": 2245, "token_acc": 0.8860200046522447, "train_speed(iter/s)": 0.236459 }, { "epoch": 0.17150697461696776, "grad_norm": 0.8118754625320435, "learning_rate": 9.970996994245729e-05, "loss": 0.2792464256286621, "memory(GiB)": 122.96, "step": 2250, "token_acc": 0.8834281072298944, "train_speed(iter/s)": 0.236567 }, { "epoch": 0.1718881012272277, "grad_norm": 0.7182120680809021, "learning_rate": 9.970868074230592e-05, "loss": 0.28833248615264895, "memory(GiB)": 122.96, "step": 2255, "token_acc": 0.9071070136538686, "train_speed(iter/s)": 0.236592 }, { "epoch": 0.1722692278374876, "grad_norm": 0.7893413305282593, "learning_rate": 9.970738869159238e-05, "loss": 0.1648984432220459, "memory(GiB)": 122.96, "step": 2260, "token_acc": 0.9259119969913502, "train_speed(iter/s)": 0.236673 }, { "epoch": 0.17265035444774754, "grad_norm": 0.7585846185684204, "learning_rate": 9.970609379039077e-05, "loss": 0.2591114521026611, "memory(GiB)": 122.96, "step": 2265, "token_acc": 0.8794814664776179, "train_speed(iter/s)": 0.236763 }, { "epoch": 0.17303148105800747, "grad_norm": 1.5939124822616577, "learning_rate": 9.970479603877536e-05, "loss": 0.2379659652709961, "memory(GiB)": 122.96, "step": 2270, "token_acc": 0.9015221402214022, "train_speed(iter/s)": 0.236847 }, { "epoch": 0.1734126076682674, "grad_norm": 1.9114692211151123, "learning_rate": 9.970349543682053e-05, "loss": 0.2918152570724487, "memory(GiB)": 122.96, "step": 2275, "token_acc": 0.8930031803725579, "train_speed(iter/s)": 0.236987 }, { "epoch": 0.17379373427852732, "grad_norm": 0.40412506461143494, "learning_rate": 9.970219198460091e-05, "loss": 0.18986796140670775, "memory(GiB)": 122.96, "step": 2280, "token_acc": 0.8945222929936306, "train_speed(iter/s)": 0.237061 }, { "epoch": 0.17417486088878725, "grad_norm": 0.9701446890830994, "learning_rate": 9.970088568219123e-05, "loss": 0.31163616180419923, "memory(GiB)": 122.96, "step": 2285, "token_acc": 0.8847457627118644, "train_speed(iter/s)": 0.237126 }, { "epoch": 0.1745559874990472, "grad_norm": 0.6267778873443604, "learning_rate": 9.969957652966638e-05, "loss": 0.30379929542541506, "memory(GiB)": 122.96, "step": 2290, "token_acc": 0.90121500264131, "train_speed(iter/s)": 0.237136 }, { "epoch": 0.17493711410930712, "grad_norm": 0.8504393696784973, "learning_rate": 9.969826452710147e-05, "loss": 0.28214635848999026, "memory(GiB)": 122.96, "step": 2295, "token_acc": 0.8850802456905092, "train_speed(iter/s)": 0.237282 }, { "epoch": 0.17531824071956703, "grad_norm": 0.9132372736930847, "learning_rate": 9.96969496745717e-05, "loss": 0.3197032928466797, "memory(GiB)": 122.96, "step": 2300, "token_acc": 0.8903364116094987, "train_speed(iter/s)": 0.237369 }, { "epoch": 0.17569936732982697, "grad_norm": 0.912675678730011, "learning_rate": 9.969563197215249e-05, "loss": 0.2892963647842407, "memory(GiB)": 122.96, "step": 2305, "token_acc": 0.8918021324943192, "train_speed(iter/s)": 0.237464 }, { "epoch": 0.1760804939400869, "grad_norm": 1.037492275238037, "learning_rate": 9.96943114199194e-05, "loss": 0.3183117866516113, "memory(GiB)": 122.96, "step": 2310, "token_acc": 0.868801652892562, "train_speed(iter/s)": 0.23756 }, { "epoch": 0.17646162055034684, "grad_norm": 1.219845175743103, "learning_rate": 9.969298801794817e-05, "loss": 0.41812591552734374, "memory(GiB)": 122.96, "step": 2315, "token_acc": 0.8625424251774144, "train_speed(iter/s)": 0.237607 }, { "epoch": 0.17684274716060674, "grad_norm": 2.3860533237457275, "learning_rate": 9.969166176631468e-05, "loss": 0.2722818613052368, "memory(GiB)": 122.96, "step": 2320, "token_acc": 0.9027064862342511, "train_speed(iter/s)": 0.237722 }, { "epoch": 0.17722387377086668, "grad_norm": 0.7258024215698242, "learning_rate": 9.969033266509498e-05, "loss": 0.20225563049316406, "memory(GiB)": 122.96, "step": 2325, "token_acc": 0.9, "train_speed(iter/s)": 0.23782 }, { "epoch": 0.17760500038112662, "grad_norm": 0.7142091989517212, "learning_rate": 9.96890007143653e-05, "loss": 0.24534997940063477, "memory(GiB)": 122.96, "step": 2330, "token_acc": 0.9017755334745072, "train_speed(iter/s)": 0.237896 }, { "epoch": 0.17798612699138655, "grad_norm": 0.6796156764030457, "learning_rate": 9.9687665914202e-05, "loss": 0.2938971519470215, "memory(GiB)": 122.96, "step": 2335, "token_acc": 0.8955114054451803, "train_speed(iter/s)": 0.23799 }, { "epoch": 0.17836725360164646, "grad_norm": 1.2401645183563232, "learning_rate": 9.968632826468165e-05, "loss": 0.23135616779327392, "memory(GiB)": 122.96, "step": 2340, "token_acc": 0.9143423258649596, "train_speed(iter/s)": 0.238033 }, { "epoch": 0.1787483802119064, "grad_norm": 1.4266185760498047, "learning_rate": 9.968498776588093e-05, "loss": 0.28963940143585204, "memory(GiB)": 122.96, "step": 2345, "token_acc": 0.8730591119585944, "train_speed(iter/s)": 0.238151 }, { "epoch": 0.17912950682216633, "grad_norm": 1.120025634765625, "learning_rate": 9.968364441787674e-05, "loss": 0.28890018463134765, "memory(GiB)": 122.96, "step": 2350, "token_acc": 0.8675510633817646, "train_speed(iter/s)": 0.238247 }, { "epoch": 0.17951063343242626, "grad_norm": 1.624452829360962, "learning_rate": 9.968229822074611e-05, "loss": 0.2971433401107788, "memory(GiB)": 122.96, "step": 2355, "token_acc": 0.8989539748953975, "train_speed(iter/s)": 0.238331 }, { "epoch": 0.17989176004268617, "grad_norm": 1.5802233219146729, "learning_rate": 9.968094917456622e-05, "loss": 0.22624199390411376, "memory(GiB)": 122.96, "step": 2360, "token_acc": 0.9141007697690693, "train_speed(iter/s)": 0.238413 }, { "epoch": 0.1802728866529461, "grad_norm": 2.044118881225586, "learning_rate": 9.967959727941445e-05, "loss": 0.28411753177642823, "memory(GiB)": 122.96, "step": 2365, "token_acc": 0.8900576008014025, "train_speed(iter/s)": 0.238537 }, { "epoch": 0.18065401326320604, "grad_norm": 1.1494495868682861, "learning_rate": 9.967824253536832e-05, "loss": 0.23213346004486085, "memory(GiB)": 122.96, "step": 2370, "token_acc": 0.9068166169382603, "train_speed(iter/s)": 0.23862 }, { "epoch": 0.18103513987346598, "grad_norm": 0.9547458291053772, "learning_rate": 9.96768849425055e-05, "loss": 0.28689911365509035, "memory(GiB)": 122.96, "step": 2375, "token_acc": 0.8853850818677986, "train_speed(iter/s)": 0.238732 }, { "epoch": 0.18141626648372589, "grad_norm": 0.9832682609558105, "learning_rate": 9.967552450090389e-05, "loss": 0.16776807308197023, "memory(GiB)": 122.96, "step": 2380, "token_acc": 0.9188034188034188, "train_speed(iter/s)": 0.238894 }, { "epoch": 0.18179739309398582, "grad_norm": 1.1946977376937866, "learning_rate": 9.967416121064144e-05, "loss": 0.27297630310058596, "memory(GiB)": 122.96, "step": 2385, "token_acc": 0.8710927408938259, "train_speed(iter/s)": 0.239047 }, { "epoch": 0.18217851970424576, "grad_norm": 1.58622145652771, "learning_rate": 9.967279507179638e-05, "loss": 0.2590020656585693, "memory(GiB)": 122.96, "step": 2390, "token_acc": 0.8862559241706162, "train_speed(iter/s)": 0.239167 }, { "epoch": 0.1825596463145057, "grad_norm": 12.433429718017578, "learning_rate": 9.967142608444702e-05, "loss": 0.2869682550430298, "memory(GiB)": 122.96, "step": 2395, "token_acc": 0.894065446478092, "train_speed(iter/s)": 0.239272 }, { "epoch": 0.1829407729247656, "grad_norm": 0.9357939958572388, "learning_rate": 9.967005424867188e-05, "loss": 0.2666119813919067, "memory(GiB)": 122.96, "step": 2400, "token_acc": 0.8774570024570024, "train_speed(iter/s)": 0.239404 }, { "epoch": 0.1829407729247656, "eval_loss": 0.18799619376659393, "eval_runtime": 189.4045, "eval_samples_per_second": 2.798, "eval_steps_per_second": 2.798, "eval_token_acc": 0.8982892596831517, "step": 2400 }, { "epoch": 0.18332189953502553, "grad_norm": 0.6364712715148926, "learning_rate": 9.966867956454963e-05, "loss": 0.2422574281692505, "memory(GiB)": 122.96, "step": 2405, "token_acc": 0.8983217166688313, "train_speed(iter/s)": 0.23506 }, { "epoch": 0.18370302614528547, "grad_norm": 1.0952332019805908, "learning_rate": 9.966730203215911e-05, "loss": 0.28159847259521487, "memory(GiB)": 122.96, "step": 2410, "token_acc": 0.9008742244782854, "train_speed(iter/s)": 0.235151 }, { "epoch": 0.1840841527555454, "grad_norm": 1.0299922227859497, "learning_rate": 9.966592165157929e-05, "loss": 0.32275192737579345, "memory(GiB)": 122.96, "step": 2415, "token_acc": 0.8790554414784394, "train_speed(iter/s)": 0.235247 }, { "epoch": 0.1844652793658053, "grad_norm": 0.916754424571991, "learning_rate": 9.966453842288934e-05, "loss": 0.2838776111602783, "memory(GiB)": 122.96, "step": 2420, "token_acc": 0.8908661198387321, "train_speed(iter/s)": 0.235288 }, { "epoch": 0.18484640597606525, "grad_norm": 0.8388864994049072, "learning_rate": 9.966315234616857e-05, "loss": 0.16249300241470338, "memory(GiB)": 122.96, "step": 2425, "token_acc": 0.9173982442138867, "train_speed(iter/s)": 0.235459 }, { "epoch": 0.18522753258632518, "grad_norm": 0.7737628221511841, "learning_rate": 9.966176342149649e-05, "loss": 0.17313649654388427, "memory(GiB)": 122.96, "step": 2430, "token_acc": 0.9202240387526491, "train_speed(iter/s)": 0.235553 }, { "epoch": 0.18560865919658512, "grad_norm": 1.4051605463027954, "learning_rate": 9.966037164895275e-05, "loss": 0.29317526817321776, "memory(GiB)": 122.96, "step": 2435, "token_acc": 0.8829466973614308, "train_speed(iter/s)": 0.235642 }, { "epoch": 0.18598978580684503, "grad_norm": 1.3052648305892944, "learning_rate": 9.965897702861712e-05, "loss": 0.3096605777740479, "memory(GiB)": 122.96, "step": 2440, "token_acc": 0.878177966101695, "train_speed(iter/s)": 0.23578 }, { "epoch": 0.18637091241710496, "grad_norm": 0.8876438140869141, "learning_rate": 9.965757956056962e-05, "loss": 0.29300546646118164, "memory(GiB)": 122.96, "step": 2445, "token_acc": 0.8965463108320251, "train_speed(iter/s)": 0.23586 }, { "epoch": 0.1867520390273649, "grad_norm": 1.0546586513519287, "learning_rate": 9.965617924489038e-05, "loss": 0.29401659965515137, "memory(GiB)": 122.96, "step": 2450, "token_acc": 0.8965925925925926, "train_speed(iter/s)": 0.235944 }, { "epoch": 0.18713316563762483, "grad_norm": 1.3227176666259766, "learning_rate": 9.965477608165969e-05, "loss": 0.2643908500671387, "memory(GiB)": 122.96, "step": 2455, "token_acc": 0.8815762746292911, "train_speed(iter/s)": 0.236052 }, { "epoch": 0.18751429224788474, "grad_norm": 0.7268185615539551, "learning_rate": 9.965337007095801e-05, "loss": 0.30490238666534425, "memory(GiB)": 122.96, "step": 2460, "token_acc": 0.8823322177206271, "train_speed(iter/s)": 0.236138 }, { "epoch": 0.18789541885814467, "grad_norm": 0.8339267373085022, "learning_rate": 9.965196121286597e-05, "loss": 0.25411453247070315, "memory(GiB)": 122.96, "step": 2465, "token_acc": 0.8869687062120505, "train_speed(iter/s)": 0.236257 }, { "epoch": 0.1882765454684046, "grad_norm": 1.2658578157424927, "learning_rate": 9.965054950746438e-05, "loss": 0.23019378185272216, "memory(GiB)": 122.96, "step": 2470, "token_acc": 0.9074605451936872, "train_speed(iter/s)": 0.236405 }, { "epoch": 0.18865767207866455, "grad_norm": 1.0540765523910522, "learning_rate": 9.964913495483418e-05, "loss": 0.27036480903625487, "memory(GiB)": 122.96, "step": 2475, "token_acc": 0.8906513668579142, "train_speed(iter/s)": 0.236477 }, { "epoch": 0.18903879868892445, "grad_norm": 0.7257567048072815, "learning_rate": 9.964771755505649e-05, "loss": 0.24350364208221437, "memory(GiB)": 122.96, "step": 2480, "token_acc": 0.910212819544138, "train_speed(iter/s)": 0.236536 }, { "epoch": 0.1894199252991844, "grad_norm": 1.064794659614563, "learning_rate": 9.964629730821258e-05, "loss": 0.27180159091949463, "memory(GiB)": 122.96, "step": 2485, "token_acc": 0.9041190723433714, "train_speed(iter/s)": 0.23664 }, { "epoch": 0.18980105190944432, "grad_norm": 1.8649581670761108, "learning_rate": 9.964487421438393e-05, "loss": 0.22177386283874512, "memory(GiB)": 122.96, "step": 2490, "token_acc": 0.9085553278688525, "train_speed(iter/s)": 0.23679 }, { "epoch": 0.19018217851970426, "grad_norm": 1.0809030532836914, "learning_rate": 9.96434482736521e-05, "loss": 0.2417988061904907, "memory(GiB)": 122.96, "step": 2495, "token_acc": 0.8990056541236109, "train_speed(iter/s)": 0.236898 }, { "epoch": 0.19056330512996417, "grad_norm": 1.0591228008270264, "learning_rate": 9.96420194860989e-05, "loss": 0.22393262386322021, "memory(GiB)": 122.96, "step": 2500, "token_acc": 0.9252017380509001, "train_speed(iter/s)": 0.236947 }, { "epoch": 0.1909444317402241, "grad_norm": 0.9385740756988525, "learning_rate": 9.964058785180626e-05, "loss": 0.3084451198577881, "memory(GiB)": 122.96, "step": 2505, "token_acc": 0.8759355210132412, "train_speed(iter/s)": 0.237072 }, { "epoch": 0.19132555835048404, "grad_norm": 2.0610008239746094, "learning_rate": 9.963915337085624e-05, "loss": 0.28600873947143557, "memory(GiB)": 122.96, "step": 2510, "token_acc": 0.8976744186046511, "train_speed(iter/s)": 0.237206 }, { "epoch": 0.19170668496074397, "grad_norm": 0.8145803809165955, "learning_rate": 9.963771604333114e-05, "loss": 0.2500518798828125, "memory(GiB)": 122.96, "step": 2515, "token_acc": 0.872556684910086, "train_speed(iter/s)": 0.237302 }, { "epoch": 0.19208781157100388, "grad_norm": 0.9426414966583252, "learning_rate": 9.963627586931337e-05, "loss": 0.24223783016204833, "memory(GiB)": 122.96, "step": 2520, "token_acc": 0.9089930822444273, "train_speed(iter/s)": 0.237414 }, { "epoch": 0.19246893818126382, "grad_norm": 0.7931233048439026, "learning_rate": 9.963483284888553e-05, "loss": 0.27820515632629395, "memory(GiB)": 122.96, "step": 2525, "token_acc": 0.8989247311827957, "train_speed(iter/s)": 0.237491 }, { "epoch": 0.19285006479152375, "grad_norm": 0.896723210811615, "learning_rate": 9.963338698213035e-05, "loss": 0.21210806369781493, "memory(GiB)": 122.96, "step": 2530, "token_acc": 0.9124891335844683, "train_speed(iter/s)": 0.237642 }, { "epoch": 0.1932311914017837, "grad_norm": 15.007954597473145, "learning_rate": 9.963193826913075e-05, "loss": 0.20797309875488282, "memory(GiB)": 122.96, "step": 2535, "token_acc": 0.9022679448211363, "train_speed(iter/s)": 0.237759 }, { "epoch": 0.1936123180120436, "grad_norm": 0.9164189100265503, "learning_rate": 9.963048670996983e-05, "loss": 0.18652567863464356, "memory(GiB)": 122.96, "step": 2540, "token_acc": 0.9158704883227177, "train_speed(iter/s)": 0.237831 }, { "epoch": 0.19399344462230353, "grad_norm": 0.7434625029563904, "learning_rate": 9.96290323047308e-05, "loss": 0.31068108081817625, "memory(GiB)": 122.96, "step": 2545, "token_acc": 0.8727373476172885, "train_speed(iter/s)": 0.237936 }, { "epoch": 0.19437457123256346, "grad_norm": 1.120155930519104, "learning_rate": 9.962757505349706e-05, "loss": 0.2795144557952881, "memory(GiB)": 122.96, "step": 2550, "token_acc": 0.8749647191645498, "train_speed(iter/s)": 0.238072 }, { "epoch": 0.1947556978428234, "grad_norm": 0.8912761211395264, "learning_rate": 9.962611495635222e-05, "loss": 0.27224392890930177, "memory(GiB)": 122.96, "step": 2555, "token_acc": 0.8871421134825611, "train_speed(iter/s)": 0.23806 }, { "epoch": 0.1951368244530833, "grad_norm": 1.1786255836486816, "learning_rate": 9.962465201337995e-05, "loss": 0.35443429946899413, "memory(GiB)": 122.96, "step": 2560, "token_acc": 0.8769391408114559, "train_speed(iter/s)": 0.238125 }, { "epoch": 0.19551795106334324, "grad_norm": 0.7733458876609802, "learning_rate": 9.96231862246642e-05, "loss": 0.23119680881500243, "memory(GiB)": 122.96, "step": 2565, "token_acc": 0.9175105485232068, "train_speed(iter/s)": 0.238186 }, { "epoch": 0.19589907767360318, "grad_norm": 2.044074535369873, "learning_rate": 9.962171759028898e-05, "loss": 0.2671044826507568, "memory(GiB)": 122.96, "step": 2570, "token_acc": 0.9100946372239748, "train_speed(iter/s)": 0.238293 }, { "epoch": 0.19628020428386309, "grad_norm": 0.9324264526367188, "learning_rate": 9.962024611033853e-05, "loss": 0.23819494247436523, "memory(GiB)": 122.96, "step": 2575, "token_acc": 0.9002039428959892, "train_speed(iter/s)": 0.238326 }, { "epoch": 0.19666133089412302, "grad_norm": 2.344309091567993, "learning_rate": 9.961877178489723e-05, "loss": 0.22369728088378907, "memory(GiB)": 122.96, "step": 2580, "token_acc": 0.9132791327913279, "train_speed(iter/s)": 0.238457 }, { "epoch": 0.19704245750438296, "grad_norm": 0.7523909211158752, "learning_rate": 9.961729461404963e-05, "loss": 0.25209481716156007, "memory(GiB)": 122.96, "step": 2585, "token_acc": 0.8942350332594236, "train_speed(iter/s)": 0.238563 }, { "epoch": 0.1974235841146429, "grad_norm": 0.7247744202613831, "learning_rate": 9.961581459788046e-05, "loss": 0.2027686357498169, "memory(GiB)": 122.96, "step": 2590, "token_acc": 0.9085735963581184, "train_speed(iter/s)": 0.238716 }, { "epoch": 0.1978047107249028, "grad_norm": 0.8465806245803833, "learning_rate": 9.961433173647454e-05, "loss": 0.2022995948791504, "memory(GiB)": 122.96, "step": 2595, "token_acc": 0.9069709127382146, "train_speed(iter/s)": 0.238828 }, { "epoch": 0.19818583733516273, "grad_norm": 0.6251646280288696, "learning_rate": 9.961284602991693e-05, "loss": 0.2387778043746948, "memory(GiB)": 122.96, "step": 2600, "token_acc": 0.9080708813742939, "train_speed(iter/s)": 0.238802 }, { "epoch": 0.19818583733516273, "eval_loss": 0.1864442676305771, "eval_runtime": 183.519, "eval_samples_per_second": 2.888, "eval_steps_per_second": 2.888, "eval_token_acc": 0.9033416661646889, "step": 2600 }, { "epoch": 0.19856696394542267, "grad_norm": 0.9214624166488647, "learning_rate": 9.961135747829285e-05, "loss": 0.24641871452331543, "memory(GiB)": 122.96, "step": 2605, "token_acc": 0.9031442360488975, "train_speed(iter/s)": 0.234914 }, { "epoch": 0.1989480905556826, "grad_norm": 0.5550523400306702, "learning_rate": 9.960986608168765e-05, "loss": 0.25508747100830076, "memory(GiB)": 122.96, "step": 2610, "token_acc": 0.8790882061446977, "train_speed(iter/s)": 0.235043 }, { "epoch": 0.1993292171659425, "grad_norm": 2.148348093032837, "learning_rate": 9.960837184018683e-05, "loss": 0.21692023277282715, "memory(GiB)": 122.96, "step": 2615, "token_acc": 0.9107769423558897, "train_speed(iter/s)": 0.235179 }, { "epoch": 0.19971034377620245, "grad_norm": 0.6543712019920349, "learning_rate": 9.96068747538761e-05, "loss": 0.22900691032409667, "memory(GiB)": 122.96, "step": 2620, "token_acc": 0.9188916876574307, "train_speed(iter/s)": 0.23529 }, { "epoch": 0.20009147038646238, "grad_norm": 0.811438262462616, "learning_rate": 9.960537482284131e-05, "loss": 0.26583716869354246, "memory(GiB)": 122.96, "step": 2625, "token_acc": 0.9007904500725924, "train_speed(iter/s)": 0.235214 }, { "epoch": 0.20047259699672232, "grad_norm": 1.0107061862945557, "learning_rate": 9.960387204716847e-05, "loss": 0.27449994087219237, "memory(GiB)": 122.96, "step": 2630, "token_acc": 0.8955324909747292, "train_speed(iter/s)": 0.235307 }, { "epoch": 0.20085372360698223, "grad_norm": 0.8719041347503662, "learning_rate": 9.960236642694376e-05, "loss": 0.2563391923904419, "memory(GiB)": 122.96, "step": 2635, "token_acc": 0.9129880071446798, "train_speed(iter/s)": 0.235357 }, { "epoch": 0.20123485021724216, "grad_norm": 1.2094417810440063, "learning_rate": 9.960085796225351e-05, "loss": 0.26113128662109375, "memory(GiB)": 122.96, "step": 2640, "token_acc": 0.8726790450928382, "train_speed(iter/s)": 0.235461 }, { "epoch": 0.2016159768275021, "grad_norm": 2.2879045009613037, "learning_rate": 9.959934665318425e-05, "loss": 0.26352725028991697, "memory(GiB)": 122.96, "step": 2645, "token_acc": 0.8966889525628781, "train_speed(iter/s)": 0.235543 }, { "epoch": 0.20199710343776203, "grad_norm": 1.3269717693328857, "learning_rate": 9.959783249982262e-05, "loss": 0.25647845268249514, "memory(GiB)": 122.96, "step": 2650, "token_acc": 0.9062092922275293, "train_speed(iter/s)": 0.235604 }, { "epoch": 0.20237823004802194, "grad_norm": 0.7037633657455444, "learning_rate": 9.959631550225544e-05, "loss": 0.2731334686279297, "memory(GiB)": 122.96, "step": 2655, "token_acc": 0.9054154856034612, "train_speed(iter/s)": 0.235643 }, { "epoch": 0.20275935665828188, "grad_norm": 0.7377872467041016, "learning_rate": 9.959479566056973e-05, "loss": 0.2736817359924316, "memory(GiB)": 122.96, "step": 2660, "token_acc": 0.906088387400094, "train_speed(iter/s)": 0.235695 }, { "epoch": 0.2031404832685418, "grad_norm": 1.9063527584075928, "learning_rate": 9.959327297485266e-05, "loss": 0.25736782550811765, "memory(GiB)": 122.96, "step": 2665, "token_acc": 0.9065420560747663, "train_speed(iter/s)": 0.235828 }, { "epoch": 0.20352160987880175, "grad_norm": 0.6375294327735901, "learning_rate": 9.95917474451915e-05, "loss": 0.211942720413208, "memory(GiB)": 122.96, "step": 2670, "token_acc": 0.9118173063695144, "train_speed(iter/s)": 0.235854 }, { "epoch": 0.20390273648906165, "grad_norm": 1.0100284814834595, "learning_rate": 9.959021907167377e-05, "loss": 0.2032244920730591, "memory(GiB)": 122.96, "step": 2675, "token_acc": 0.9070769230769231, "train_speed(iter/s)": 0.235943 }, { "epoch": 0.2042838630993216, "grad_norm": 0.9140448570251465, "learning_rate": 9.95886878543871e-05, "loss": 0.09047021269798279, "memory(GiB)": 122.96, "step": 2680, "token_acc": 0.9531970995385629, "train_speed(iter/s)": 0.236109 }, { "epoch": 0.20466498970958152, "grad_norm": 1.1633543968200684, "learning_rate": 9.958715379341929e-05, "loss": 0.23606657981872559, "memory(GiB)": 122.96, "step": 2685, "token_acc": 0.8705308775731311, "train_speed(iter/s)": 0.236248 }, { "epoch": 0.20504611631984146, "grad_norm": 0.824425458908081, "learning_rate": 9.958561688885834e-05, "loss": 0.26520609855651855, "memory(GiB)": 122.96, "step": 2690, "token_acc": 0.90436688057599, "train_speed(iter/s)": 0.236297 }, { "epoch": 0.20542724293010137, "grad_norm": 1.1514136791229248, "learning_rate": 9.958407714079237e-05, "loss": 0.2350114107131958, "memory(GiB)": 122.96, "step": 2695, "token_acc": 0.9132678541701349, "train_speed(iter/s)": 0.236375 }, { "epoch": 0.2058083695403613, "grad_norm": 0.7021647691726685, "learning_rate": 9.958253454930965e-05, "loss": 0.23689453601837157, "memory(GiB)": 122.96, "step": 2700, "token_acc": 0.9215900802334063, "train_speed(iter/s)": 0.236418 }, { "epoch": 0.20618949615062124, "grad_norm": 0.9109348058700562, "learning_rate": 9.958098911449869e-05, "loss": 0.24244177341461182, "memory(GiB)": 122.96, "step": 2705, "token_acc": 0.8824043333915779, "train_speed(iter/s)": 0.236492 }, { "epoch": 0.20657062276088117, "grad_norm": 1.3590582609176636, "learning_rate": 9.957944083644808e-05, "loss": 0.34205060005187987, "memory(GiB)": 122.96, "step": 2710, "token_acc": 0.8714831047645978, "train_speed(iter/s)": 0.236568 }, { "epoch": 0.20695174937114108, "grad_norm": 0.8659549355506897, "learning_rate": 9.95778897152466e-05, "loss": 0.24093873500823976, "memory(GiB)": 122.96, "step": 2715, "token_acc": 0.9198751300728408, "train_speed(iter/s)": 0.236639 }, { "epoch": 0.20733287598140102, "grad_norm": 1.3699175119400024, "learning_rate": 9.957633575098323e-05, "loss": 0.21360211372375487, "memory(GiB)": 122.96, "step": 2720, "token_acc": 0.90732889158086, "train_speed(iter/s)": 0.236793 }, { "epoch": 0.20771400259166095, "grad_norm": 1.0688623189926147, "learning_rate": 9.957477894374707e-05, "loss": 0.21014256477355958, "memory(GiB)": 122.96, "step": 2725, "token_acc": 0.9089754445385266, "train_speed(iter/s)": 0.2369 }, { "epoch": 0.2080951292019209, "grad_norm": 1.6176766157150269, "learning_rate": 9.957321929362737e-05, "loss": 0.3316061496734619, "memory(GiB)": 122.96, "step": 2730, "token_acc": 0.879504753482202, "train_speed(iter/s)": 0.237014 }, { "epoch": 0.2084762558121808, "grad_norm": 0.7980506420135498, "learning_rate": 9.957165680071362e-05, "loss": 0.27490577697753904, "memory(GiB)": 122.96, "step": 2735, "token_acc": 0.8975659229208925, "train_speed(iter/s)": 0.237087 }, { "epoch": 0.20885738242244073, "grad_norm": 0.5326893925666809, "learning_rate": 9.957009146509537e-05, "loss": 0.20973918437957764, "memory(GiB)": 122.96, "step": 2740, "token_acc": 0.915633423180593, "train_speed(iter/s)": 0.237124 }, { "epoch": 0.20923850903270066, "grad_norm": 2.553304672241211, "learning_rate": 9.956852328686243e-05, "loss": 0.2609852313995361, "memory(GiB)": 122.96, "step": 2745, "token_acc": 0.9146508443633045, "train_speed(iter/s)": 0.237208 }, { "epoch": 0.2096196356429606, "grad_norm": 1.7659505605697632, "learning_rate": 9.956695226610469e-05, "loss": 0.1911757469177246, "memory(GiB)": 122.96, "step": 2750, "token_acc": 0.9115418894830659, "train_speed(iter/s)": 0.237335 }, { "epoch": 0.2100007622532205, "grad_norm": 1.21354079246521, "learning_rate": 9.956537840291226e-05, "loss": 0.21332383155822754, "memory(GiB)": 122.96, "step": 2755, "token_acc": 0.9128984432913269, "train_speed(iter/s)": 0.237479 }, { "epoch": 0.21038188886348044, "grad_norm": 1.082421898841858, "learning_rate": 9.956380169737538e-05, "loss": 0.28225853443145754, "memory(GiB)": 122.96, "step": 2760, "token_acc": 0.9047619047619048, "train_speed(iter/s)": 0.237556 }, { "epoch": 0.21076301547374038, "grad_norm": 1.0504456758499146, "learning_rate": 9.956222214958449e-05, "loss": 0.2824878454208374, "memory(GiB)": 122.96, "step": 2765, "token_acc": 0.8720089930945881, "train_speed(iter/s)": 0.237648 }, { "epoch": 0.2111441420840003, "grad_norm": 1.2931427955627441, "learning_rate": 9.956063975963016e-05, "loss": 0.2612369298934937, "memory(GiB)": 122.96, "step": 2770, "token_acc": 0.8914285714285715, "train_speed(iter/s)": 0.237761 }, { "epoch": 0.21152526869426022, "grad_norm": 0.6961236596107483, "learning_rate": 9.955905452760312e-05, "loss": 0.20278472900390626, "memory(GiB)": 122.96, "step": 2775, "token_acc": 0.8832467013194722, "train_speed(iter/s)": 0.237907 }, { "epoch": 0.21190639530452016, "grad_norm": 1.669674277305603, "learning_rate": 9.955746645359429e-05, "loss": 0.26557509899139403, "memory(GiB)": 122.96, "step": 2780, "token_acc": 0.8906009244992296, "train_speed(iter/s)": 0.238059 }, { "epoch": 0.2122875219147801, "grad_norm": 0.8587361574172974, "learning_rate": 9.955587553769472e-05, "loss": 0.21491804122924804, "memory(GiB)": 122.96, "step": 2785, "token_acc": 0.9086875291919664, "train_speed(iter/s)": 0.238183 }, { "epoch": 0.21266864852504003, "grad_norm": 1.6778310537338257, "learning_rate": 9.955428177999567e-05, "loss": 0.2557761430740356, "memory(GiB)": 122.96, "step": 2790, "token_acc": 0.9022629730784237, "train_speed(iter/s)": 0.238268 }, { "epoch": 0.21304977513529993, "grad_norm": 0.9391726851463318, "learning_rate": 9.955268518058852e-05, "loss": 0.2547274589538574, "memory(GiB)": 122.96, "step": 2795, "token_acc": 0.9068892161675667, "train_speed(iter/s)": 0.23833 }, { "epoch": 0.21343090174555987, "grad_norm": 0.7006628513336182, "learning_rate": 9.955108573956482e-05, "loss": 0.22569553852081298, "memory(GiB)": 122.96, "step": 2800, "token_acc": 0.9115835485793761, "train_speed(iter/s)": 0.23841 }, { "epoch": 0.21343090174555987, "eval_loss": 0.18260350823402405, "eval_runtime": 177.8285, "eval_samples_per_second": 2.98, "eval_steps_per_second": 2.98, "eval_token_acc": 0.9065191855912295, "step": 2800 }, { "epoch": 0.2138120283558198, "grad_norm": 1.637231707572937, "learning_rate": 9.954948345701631e-05, "loss": 0.2199695110321045, "memory(GiB)": 122.96, "step": 2805, "token_acc": 0.9066239962245507, "train_speed(iter/s)": 0.234986 }, { "epoch": 0.21419315496607974, "grad_norm": 0.727758526802063, "learning_rate": 9.954787833303484e-05, "loss": 0.19925637245178224, "memory(GiB)": 122.96, "step": 2810, "token_acc": 0.9264099037138928, "train_speed(iter/s)": 0.235057 }, { "epoch": 0.21457428157633965, "grad_norm": 1.5735499858856201, "learning_rate": 9.95462703677125e-05, "loss": 0.28996098041534424, "memory(GiB)": 122.96, "step": 2815, "token_acc": 0.8951467944877172, "train_speed(iter/s)": 0.235185 }, { "epoch": 0.21495540818659958, "grad_norm": 0.6912621855735779, "learning_rate": 9.954465956114147e-05, "loss": 0.31084303855895995, "memory(GiB)": 122.96, "step": 2820, "token_acc": 0.8873048200950441, "train_speed(iter/s)": 0.235253 }, { "epoch": 0.21533653479685952, "grad_norm": 1.4661269187927246, "learning_rate": 9.954304591341412e-05, "loss": 0.2760448455810547, "memory(GiB)": 122.96, "step": 2825, "token_acc": 0.8987269618088543, "train_speed(iter/s)": 0.235357 }, { "epoch": 0.21571766140711945, "grad_norm": 0.8792543411254883, "learning_rate": 9.9541429424623e-05, "loss": 0.21004006862640381, "memory(GiB)": 122.96, "step": 2830, "token_acc": 0.8876543209876543, "train_speed(iter/s)": 0.235492 }, { "epoch": 0.21609878801737936, "grad_norm": 1.5133110284805298, "learning_rate": 9.953981009486082e-05, "loss": 0.28317430019378664, "memory(GiB)": 122.96, "step": 2835, "token_acc": 0.8931870669745958, "train_speed(iter/s)": 0.235575 }, { "epoch": 0.2164799146276393, "grad_norm": 2.760392665863037, "learning_rate": 9.953818792422041e-05, "loss": 0.26878812313079836, "memory(GiB)": 122.96, "step": 2840, "token_acc": 0.915051726650153, "train_speed(iter/s)": 0.235666 }, { "epoch": 0.21686104123789923, "grad_norm": 1.522216796875, "learning_rate": 9.953656291279479e-05, "loss": 0.1937252998352051, "memory(GiB)": 122.96, "step": 2845, "token_acc": 0.9190668888359914, "train_speed(iter/s)": 0.235762 }, { "epoch": 0.21724216784815917, "grad_norm": 0.9268574118614197, "learning_rate": 9.953493506067719e-05, "loss": 0.2693490743637085, "memory(GiB)": 122.96, "step": 2850, "token_acc": 0.8991386843090751, "train_speed(iter/s)": 0.235786 }, { "epoch": 0.21762329445841908, "grad_norm": 0.9280834794044495, "learning_rate": 9.953330436796093e-05, "loss": 0.13875045776367187, "memory(GiB)": 122.96, "step": 2855, "token_acc": 0.938239159001314, "train_speed(iter/s)": 0.235938 }, { "epoch": 0.218004421068679, "grad_norm": 0.8001015782356262, "learning_rate": 9.953167083473952e-05, "loss": 0.24726285934448242, "memory(GiB)": 122.96, "step": 2860, "token_acc": 0.889272030651341, "train_speed(iter/s)": 0.236063 }, { "epoch": 0.21838554767893895, "grad_norm": 1.183586597442627, "learning_rate": 9.953003446110665e-05, "loss": 0.27178049087524414, "memory(GiB)": 122.96, "step": 2865, "token_acc": 0.9158696076517665, "train_speed(iter/s)": 0.236116 }, { "epoch": 0.21876667428919888, "grad_norm": 0.8943457007408142, "learning_rate": 9.952839524715613e-05, "loss": 0.22229323387145997, "memory(GiB)": 122.96, "step": 2870, "token_acc": 0.8989441930618401, "train_speed(iter/s)": 0.236244 }, { "epoch": 0.2191478008994588, "grad_norm": 1.0590628385543823, "learning_rate": 9.952675319298202e-05, "loss": 0.20779016017913818, "memory(GiB)": 122.96, "step": 2875, "token_acc": 0.9083465999545144, "train_speed(iter/s)": 0.236364 }, { "epoch": 0.21952892750971872, "grad_norm": 0.7936352491378784, "learning_rate": 9.952510829867842e-05, "loss": 0.1972639560699463, "memory(GiB)": 122.96, "step": 2880, "token_acc": 0.9008097165991903, "train_speed(iter/s)": 0.23648 }, { "epoch": 0.21991005411997866, "grad_norm": 1.21434485912323, "learning_rate": 9.952346056433968e-05, "loss": 0.3077178955078125, "memory(GiB)": 122.96, "step": 2885, "token_acc": 0.8817857142857143, "train_speed(iter/s)": 0.236566 }, { "epoch": 0.2202911807302386, "grad_norm": 1.7283354997634888, "learning_rate": 9.95218099900603e-05, "loss": 0.31396069526672366, "memory(GiB)": 122.96, "step": 2890, "token_acc": 0.904284919309961, "train_speed(iter/s)": 0.236629 }, { "epoch": 0.2206723073404985, "grad_norm": 1.0907855033874512, "learning_rate": 9.952015657593494e-05, "loss": 0.24241323471069337, "memory(GiB)": 122.96, "step": 2895, "token_acc": 0.8955707598127476, "train_speed(iter/s)": 0.236718 }, { "epoch": 0.22105343395075844, "grad_norm": 1.0760691165924072, "learning_rate": 9.951850032205838e-05, "loss": 0.25373985767364504, "memory(GiB)": 122.96, "step": 2900, "token_acc": 0.9000498919008815, "train_speed(iter/s)": 0.236767 }, { "epoch": 0.22143456056101837, "grad_norm": 1.8809890747070312, "learning_rate": 9.951684122852564e-05, "loss": 0.22792911529541016, "memory(GiB)": 122.96, "step": 2905, "token_acc": 0.9071487263763353, "train_speed(iter/s)": 0.236888 }, { "epoch": 0.2218156871712783, "grad_norm": 1.1051658391952515, "learning_rate": 9.951517929543184e-05, "loss": 0.1528017520904541, "memory(GiB)": 122.96, "step": 2910, "token_acc": 0.9208523592085236, "train_speed(iter/s)": 0.237053 }, { "epoch": 0.22219681378153822, "grad_norm": 1.2754848003387451, "learning_rate": 9.951351452287227e-05, "loss": 0.2564688682556152, "memory(GiB)": 122.96, "step": 2915, "token_acc": 0.8857971014492754, "train_speed(iter/s)": 0.237171 }, { "epoch": 0.22257794039179815, "grad_norm": 1.2825138568878174, "learning_rate": 9.951184691094242e-05, "loss": 0.2512900590896606, "memory(GiB)": 122.96, "step": 2920, "token_acc": 0.9047619047619048, "train_speed(iter/s)": 0.237251 }, { "epoch": 0.2229590670020581, "grad_norm": 0.869334876537323, "learning_rate": 9.951017645973791e-05, "loss": 0.16372698545455933, "memory(GiB)": 122.96, "step": 2925, "token_acc": 0.9442231075697212, "train_speed(iter/s)": 0.237403 }, { "epoch": 0.22334019361231802, "grad_norm": 1.1826997995376587, "learning_rate": 9.950850316935454e-05, "loss": 0.23336520195007324, "memory(GiB)": 122.96, "step": 2930, "token_acc": 0.9092443277704857, "train_speed(iter/s)": 0.237488 }, { "epoch": 0.22372132022257793, "grad_norm": 1.1250578165054321, "learning_rate": 9.950682703988827e-05, "loss": 0.21436161994934083, "memory(GiB)": 122.96, "step": 2935, "token_acc": 0.920939147101102, "train_speed(iter/s)": 0.237597 }, { "epoch": 0.22410244683283786, "grad_norm": 0.8603762984275818, "learning_rate": 9.950514807143519e-05, "loss": 0.2755215883255005, "memory(GiB)": 122.96, "step": 2940, "token_acc": 0.8976268031642625, "train_speed(iter/s)": 0.237701 }, { "epoch": 0.2244835734430978, "grad_norm": 1.2917633056640625, "learning_rate": 9.950346626409161e-05, "loss": 0.24103894233703613, "memory(GiB)": 122.96, "step": 2945, "token_acc": 0.9054545454545454, "train_speed(iter/s)": 0.237774 }, { "epoch": 0.22486470005335774, "grad_norm": 0.2856813371181488, "learning_rate": 9.950178161795398e-05, "loss": 0.16997655630111694, "memory(GiB)": 122.96, "step": 2950, "token_acc": 0.9125567322239032, "train_speed(iter/s)": 0.237899 }, { "epoch": 0.22524582666361764, "grad_norm": 1.7071458101272583, "learning_rate": 9.950009413311887e-05, "loss": 0.3091901302337646, "memory(GiB)": 122.96, "step": 2955, "token_acc": 0.8827639751552795, "train_speed(iter/s)": 0.238016 }, { "epoch": 0.22562695327387758, "grad_norm": 0.6557905077934265, "learning_rate": 9.949840380968307e-05, "loss": 0.3077983379364014, "memory(GiB)": 122.96, "step": 2960, "token_acc": 0.910167686984296, "train_speed(iter/s)": 0.238017 }, { "epoch": 0.2260080798841375, "grad_norm": 0.7364832758903503, "learning_rate": 9.949671064774352e-05, "loss": 0.20038692951202391, "memory(GiB)": 122.96, "step": 2965, "token_acc": 0.9223257878384377, "train_speed(iter/s)": 0.238117 }, { "epoch": 0.22638920649439745, "grad_norm": 0.9470295906066895, "learning_rate": 9.94950146473973e-05, "loss": 0.24209947586059571, "memory(GiB)": 122.96, "step": 2970, "token_acc": 0.9153509353854319, "train_speed(iter/s)": 0.238166 }, { "epoch": 0.22677033310465736, "grad_norm": 1.2002469301223755, "learning_rate": 9.949331580874168e-05, "loss": 0.20953779220581054, "memory(GiB)": 122.96, "step": 2975, "token_acc": 0.9175981284117494, "train_speed(iter/s)": 0.238284 }, { "epoch": 0.2271514597149173, "grad_norm": 0.8938823342323303, "learning_rate": 9.949161413187407e-05, "loss": 0.22920627593994142, "memory(GiB)": 122.96, "step": 2980, "token_acc": 0.9086670323642347, "train_speed(iter/s)": 0.238405 }, { "epoch": 0.22753258632517723, "grad_norm": 1.5594325065612793, "learning_rate": 9.948990961689206e-05, "loss": 0.285231876373291, "memory(GiB)": 122.96, "step": 2985, "token_acc": 0.8833192923336142, "train_speed(iter/s)": 0.238508 }, { "epoch": 0.22791371293543716, "grad_norm": 2.552048921585083, "learning_rate": 9.94882022638934e-05, "loss": 0.20185692310333253, "memory(GiB)": 122.96, "step": 2990, "token_acc": 0.9234211834908006, "train_speed(iter/s)": 0.238596 }, { "epoch": 0.22829483954569707, "grad_norm": 0.729674756526947, "learning_rate": 9.948649207297598e-05, "loss": 0.23562324047088623, "memory(GiB)": 122.96, "step": 2995, "token_acc": 0.9075667328480035, "train_speed(iter/s)": 0.23871 }, { "epoch": 0.228675966155957, "grad_norm": 1.557638168334961, "learning_rate": 9.94847790442379e-05, "loss": 0.28632240295410155, "memory(GiB)": 122.96, "step": 3000, "token_acc": 0.8821952091526636, "train_speed(iter/s)": 0.238797 }, { "epoch": 0.228675966155957, "eval_loss": 0.17560459673404694, "eval_runtime": 173.4133, "eval_samples_per_second": 3.056, "eval_steps_per_second": 3.056, "eval_token_acc": 0.9075432202879344, "step": 3000 }, { "epoch": 0.22905709276621694, "grad_norm": 1.2724863290786743, "learning_rate": 9.948306317777738e-05, "loss": 0.21831119060516357, "memory(GiB)": 122.96, "step": 3005, "token_acc": 0.9069361671573203, "train_speed(iter/s)": 0.23563 }, { "epoch": 0.22943821937647688, "grad_norm": 2.302628993988037, "learning_rate": 9.948134447369282e-05, "loss": 0.2699413537979126, "memory(GiB)": 122.96, "step": 3010, "token_acc": 0.9053231192300137, "train_speed(iter/s)": 0.235736 }, { "epoch": 0.22981934598673678, "grad_norm": 0.5406407117843628, "learning_rate": 9.947962293208276e-05, "loss": 0.1842190980911255, "memory(GiB)": 122.96, "step": 3015, "token_acc": 0.9181071737251513, "train_speed(iter/s)": 0.23583 }, { "epoch": 0.23020047259699672, "grad_norm": 1.6050798892974854, "learning_rate": 9.947789855304594e-05, "loss": 0.28391437530517577, "memory(GiB)": 122.96, "step": 3020, "token_acc": 0.8878093916261856, "train_speed(iter/s)": 0.235921 }, { "epoch": 0.23058159920725665, "grad_norm": 1.3347240686416626, "learning_rate": 9.947617133668126e-05, "loss": 0.33120408058166506, "memory(GiB)": 122.96, "step": 3025, "token_acc": 0.8814759597465524, "train_speed(iter/s)": 0.236004 }, { "epoch": 0.2309627258175166, "grad_norm": 2.320920467376709, "learning_rate": 9.947444128308774e-05, "loss": 0.31291651725769043, "memory(GiB)": 122.96, "step": 3030, "token_acc": 0.8815145713677941, "train_speed(iter/s)": 0.236111 }, { "epoch": 0.2313438524277765, "grad_norm": 1.0707837343215942, "learning_rate": 9.94727083923646e-05, "loss": 0.2676500082015991, "memory(GiB)": 122.96, "step": 3035, "token_acc": 0.9064327485380117, "train_speed(iter/s)": 0.23622 }, { "epoch": 0.23172497903803643, "grad_norm": 0.6968094706535339, "learning_rate": 9.947097266461122e-05, "loss": 0.21562507152557372, "memory(GiB)": 122.96, "step": 3040, "token_acc": 0.9158584770114943, "train_speed(iter/s)": 0.236183 }, { "epoch": 0.23210610564829637, "grad_norm": 1.3075065612792969, "learning_rate": 9.946923409992713e-05, "loss": 0.25662598609924314, "memory(GiB)": 122.96, "step": 3045, "token_acc": 0.9027862829148806, "train_speed(iter/s)": 0.23626 }, { "epoch": 0.2324872322585563, "grad_norm": 1.158544898033142, "learning_rate": 9.946749269841202e-05, "loss": 0.32373528480529784, "memory(GiB)": 122.96, "step": 3050, "token_acc": 0.8936643835616438, "train_speed(iter/s)": 0.236346 }, { "epoch": 0.2328683588688162, "grad_norm": 0.8692493438720703, "learning_rate": 9.946574846016576e-05, "loss": 0.2111349105834961, "memory(GiB)": 122.96, "step": 3055, "token_acc": 0.9240481357269679, "train_speed(iter/s)": 0.236442 }, { "epoch": 0.23324948547907615, "grad_norm": 0.7209978699684143, "learning_rate": 9.946400138528839e-05, "loss": 0.2535152196884155, "memory(GiB)": 122.96, "step": 3060, "token_acc": 0.910182119205298, "train_speed(iter/s)": 0.236534 }, { "epoch": 0.23363061208933608, "grad_norm": 1.6100413799285889, "learning_rate": 9.946225147388008e-05, "loss": 0.1996615171432495, "memory(GiB)": 122.96, "step": 3065, "token_acc": 0.9344827586206896, "train_speed(iter/s)": 0.236649 }, { "epoch": 0.23401173869959602, "grad_norm": 0.5853747129440308, "learning_rate": 9.946049872604118e-05, "loss": 0.19076142311096192, "memory(GiB)": 122.96, "step": 3070, "token_acc": 0.9263308603410126, "train_speed(iter/s)": 0.23671 }, { "epoch": 0.23439286530985592, "grad_norm": 1.0467084646224976, "learning_rate": 9.94587431418722e-05, "loss": 0.1573173999786377, "memory(GiB)": 122.96, "step": 3075, "token_acc": 0.9301282051282052, "train_speed(iter/s)": 0.23678 }, { "epoch": 0.23477399192011586, "grad_norm": 0.873877763748169, "learning_rate": 9.945698472147381e-05, "loss": 0.22424046993255614, "memory(GiB)": 122.96, "step": 3080, "token_acc": 0.9121919014084507, "train_speed(iter/s)": 0.236876 }, { "epoch": 0.2351551185303758, "grad_norm": 0.9595664739608765, "learning_rate": 9.945522346494687e-05, "loss": 0.2645240783691406, "memory(GiB)": 122.96, "step": 3085, "token_acc": 0.8848433530906011, "train_speed(iter/s)": 0.236986 }, { "epoch": 0.23553624514063573, "grad_norm": 1.1004600524902344, "learning_rate": 9.945345937239235e-05, "loss": 0.23919687271118165, "memory(GiB)": 122.96, "step": 3090, "token_acc": 0.8791384124451536, "train_speed(iter/s)": 0.237075 }, { "epoch": 0.23591737175089564, "grad_norm": 1.029441237449646, "learning_rate": 9.945169244391143e-05, "loss": 0.27618086338043213, "memory(GiB)": 122.96, "step": 3095, "token_acc": 0.9003135830995214, "train_speed(iter/s)": 0.237119 }, { "epoch": 0.23629849836115557, "grad_norm": 1.2731555700302124, "learning_rate": 9.944992267960544e-05, "loss": 0.2633334159851074, "memory(GiB)": 122.96, "step": 3100, "token_acc": 0.9025434399395619, "train_speed(iter/s)": 0.237231 }, { "epoch": 0.2366796249714155, "grad_norm": 0.790225625038147, "learning_rate": 9.944815007957586e-05, "loss": 0.24857263565063475, "memory(GiB)": 122.96, "step": 3105, "token_acc": 0.9057723431037583, "train_speed(iter/s)": 0.237307 }, { "epoch": 0.23706075158167544, "grad_norm": 0.7994900345802307, "learning_rate": 9.944637464392432e-05, "loss": 0.2545635461807251, "memory(GiB)": 122.96, "step": 3110, "token_acc": 0.9111872857588034, "train_speed(iter/s)": 0.237402 }, { "epoch": 0.23744187819193535, "grad_norm": 0.7714027762413025, "learning_rate": 9.944459637275267e-05, "loss": 0.22408416271209716, "memory(GiB)": 122.96, "step": 3115, "token_acc": 0.9147859922178988, "train_speed(iter/s)": 0.237497 }, { "epoch": 0.2378230048021953, "grad_norm": 0.4827883839607239, "learning_rate": 9.944281526616288e-05, "loss": 0.19041059017181397, "memory(GiB)": 122.96, "step": 3120, "token_acc": 0.9184692179700499, "train_speed(iter/s)": 0.237577 }, { "epoch": 0.23820413141245522, "grad_norm": 1.3856830596923828, "learning_rate": 9.944103132425706e-05, "loss": 0.1797332763671875, "memory(GiB)": 122.96, "step": 3125, "token_acc": 0.921984472637758, "train_speed(iter/s)": 0.237655 }, { "epoch": 0.23858525802271516, "grad_norm": 0.8524709343910217, "learning_rate": 9.943924454713754e-05, "loss": 0.22646827697753907, "memory(GiB)": 122.96, "step": 3130, "token_acc": 0.907399299474606, "train_speed(iter/s)": 0.237745 }, { "epoch": 0.23896638463297507, "grad_norm": 1.333315372467041, "learning_rate": 9.943745493490675e-05, "loss": 0.1917391300201416, "memory(GiB)": 122.96, "step": 3135, "token_acc": 0.9235703812316716, "train_speed(iter/s)": 0.237837 }, { "epoch": 0.239347511243235, "grad_norm": 0.5328627824783325, "learning_rate": 9.943566248766736e-05, "loss": 0.2412860155105591, "memory(GiB)": 122.96, "step": 3140, "token_acc": 0.907120318287831, "train_speed(iter/s)": 0.237897 }, { "epoch": 0.23972863785349494, "grad_norm": 0.8098029494285583, "learning_rate": 9.943386720552212e-05, "loss": 0.28713202476501465, "memory(GiB)": 122.96, "step": 3145, "token_acc": 0.8859700330803658, "train_speed(iter/s)": 0.237997 }, { "epoch": 0.24010976446375487, "grad_norm": 0.9971261620521545, "learning_rate": 9.9432069088574e-05, "loss": 0.28592920303344727, "memory(GiB)": 122.96, "step": 3150, "token_acc": 0.8952585692105578, "train_speed(iter/s)": 0.238002 }, { "epoch": 0.24049089107401478, "grad_norm": 0.6764193773269653, "learning_rate": 9.943026813692613e-05, "loss": 0.24353208541870117, "memory(GiB)": 122.96, "step": 3155, "token_acc": 0.902889050740471, "train_speed(iter/s)": 0.238097 }, { "epoch": 0.24087201768427471, "grad_norm": 2.8658385276794434, "learning_rate": 9.942846435068174e-05, "loss": 0.1882363438606262, "memory(GiB)": 122.96, "step": 3160, "token_acc": 0.9081547453381575, "train_speed(iter/s)": 0.238204 }, { "epoch": 0.24125314429453465, "grad_norm": 0.3709406554698944, "learning_rate": 9.94266577299443e-05, "loss": 0.250874662399292, "memory(GiB)": 122.96, "step": 3165, "token_acc": 0.8764044943820225, "train_speed(iter/s)": 0.238344 }, { "epoch": 0.24163427090479458, "grad_norm": 0.7960156798362732, "learning_rate": 9.942484827481743e-05, "loss": 0.23544712066650392, "memory(GiB)": 122.96, "step": 3170, "token_acc": 0.8917011438306157, "train_speed(iter/s)": 0.238455 }, { "epoch": 0.2420153975150545, "grad_norm": 0.9147986769676208, "learning_rate": 9.942303598540486e-05, "loss": 0.28400382995605467, "memory(GiB)": 122.96, "step": 3175, "token_acc": 0.9008578027053777, "train_speed(iter/s)": 0.238509 }, { "epoch": 0.24239652412531443, "grad_norm": 1.8006359338760376, "learning_rate": 9.942122086181051e-05, "loss": 0.26421821117401123, "memory(GiB)": 122.96, "step": 3180, "token_acc": 0.8852361028093245, "train_speed(iter/s)": 0.23865 }, { "epoch": 0.24277765073557436, "grad_norm": 0.9444708824157715, "learning_rate": 9.94194029041385e-05, "loss": 0.2546346664428711, "memory(GiB)": 122.96, "step": 3185, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 0.238735 }, { "epoch": 0.2431587773458343, "grad_norm": 0.47868970036506653, "learning_rate": 9.941758211249307e-05, "loss": 0.15003867149353028, "memory(GiB)": 122.96, "step": 3190, "token_acc": 0.9171067738231917, "train_speed(iter/s)": 0.238832 }, { "epoch": 0.2435399039560942, "grad_norm": 0.9796867370605469, "learning_rate": 9.941575848697861e-05, "loss": 0.2325118064880371, "memory(GiB)": 122.96, "step": 3195, "token_acc": 0.9111183994752378, "train_speed(iter/s)": 0.238948 }, { "epoch": 0.24392103056635414, "grad_norm": 0.6418392658233643, "learning_rate": 9.941393202769975e-05, "loss": 0.22563722133636474, "memory(GiB)": 122.96, "step": 3200, "token_acc": 0.9069649211997967, "train_speed(iter/s)": 0.23902 }, { "epoch": 0.24392103056635414, "eval_loss": 0.17317873239517212, "eval_runtime": 184.3518, "eval_samples_per_second": 2.875, "eval_steps_per_second": 2.875, "eval_token_acc": 0.9117221854105174, "step": 3200 }, { "epoch": 0.24430215717661408, "grad_norm": 1.0575624704360962, "learning_rate": 9.941210273476119e-05, "loss": 0.3449040651321411, "memory(GiB)": 122.96, "step": 3205, "token_acc": 0.9105958094516747, "train_speed(iter/s)": 0.235864 }, { "epoch": 0.244683283786874, "grad_norm": 0.9067206382751465, "learning_rate": 9.941027060826782e-05, "loss": 0.22462077140808107, "memory(GiB)": 122.96, "step": 3210, "token_acc": 0.9096739711384286, "train_speed(iter/s)": 0.235948 }, { "epoch": 0.24506441039713392, "grad_norm": 0.9184339642524719, "learning_rate": 9.940843564832474e-05, "loss": 0.27300095558166504, "memory(GiB)": 122.96, "step": 3215, "token_acc": 0.8946373091989734, "train_speed(iter/s)": 0.236002 }, { "epoch": 0.24544553700739385, "grad_norm": 2.172671318054199, "learning_rate": 9.940659785503714e-05, "loss": 0.2927708148956299, "memory(GiB)": 122.96, "step": 3220, "token_acc": 0.8986756621689156, "train_speed(iter/s)": 0.236034 }, { "epoch": 0.2458266636176538, "grad_norm": 1.213593602180481, "learning_rate": 9.940475722851043e-05, "loss": 0.25940144062042236, "memory(GiB)": 122.96, "step": 3225, "token_acc": 0.9004392386530015, "train_speed(iter/s)": 0.236127 }, { "epoch": 0.24620779022791373, "grad_norm": 0.9681768417358398, "learning_rate": 9.940291376885019e-05, "loss": 0.21597435474395751, "memory(GiB)": 122.96, "step": 3230, "token_acc": 0.9092138884713663, "train_speed(iter/s)": 0.2362 }, { "epoch": 0.24658891683817363, "grad_norm": 1.9438751935958862, "learning_rate": 9.940106747616207e-05, "loss": 0.19244909286499023, "memory(GiB)": 122.96, "step": 3235, "token_acc": 0.9123775601068567, "train_speed(iter/s)": 0.236292 }, { "epoch": 0.24697004344843357, "grad_norm": 2.376807928085327, "learning_rate": 9.9399218350552e-05, "loss": 0.24726567268371583, "memory(GiB)": 122.96, "step": 3240, "token_acc": 0.9115367077063383, "train_speed(iter/s)": 0.236426 }, { "epoch": 0.2473511700586935, "grad_norm": 1.0981007814407349, "learning_rate": 9.939736639212597e-05, "loss": 0.23982903957366944, "memory(GiB)": 122.96, "step": 3245, "token_acc": 0.9087575179697814, "train_speed(iter/s)": 0.236494 }, { "epoch": 0.24773229666895344, "grad_norm": 1.2942782640457153, "learning_rate": 9.939551160099023e-05, "loss": 0.20327491760253907, "memory(GiB)": 122.96, "step": 3250, "token_acc": 0.9225153476024556, "train_speed(iter/s)": 0.23657 }, { "epoch": 0.24811342327921335, "grad_norm": 0.9806642532348633, "learning_rate": 9.939365397725114e-05, "loss": 0.2752419710159302, "memory(GiB)": 122.96, "step": 3255, "token_acc": 0.9080308444611623, "train_speed(iter/s)": 0.236646 }, { "epoch": 0.24849454988947328, "grad_norm": 0.7788330316543579, "learning_rate": 9.939179352101517e-05, "loss": 0.33398849964141847, "memory(GiB)": 122.96, "step": 3260, "token_acc": 0.885455590686979, "train_speed(iter/s)": 0.236713 }, { "epoch": 0.24887567649973322, "grad_norm": 0.5816663503646851, "learning_rate": 9.938993023238908e-05, "loss": 0.24198424816131592, "memory(GiB)": 122.96, "step": 3265, "token_acc": 0.9095539435098009, "train_speed(iter/s)": 0.236797 }, { "epoch": 0.24925680310999315, "grad_norm": 1.1989275217056274, "learning_rate": 9.938806411147968e-05, "loss": 0.3066340446472168, "memory(GiB)": 122.96, "step": 3270, "token_acc": 0.8832648487112439, "train_speed(iter/s)": 0.23686 }, { "epoch": 0.24963792972025306, "grad_norm": 1.0874223709106445, "learning_rate": 9.938619515839398e-05, "loss": 0.2677659749984741, "memory(GiB)": 122.96, "step": 3275, "token_acc": 0.8883374689826302, "train_speed(iter/s)": 0.236928 }, { "epoch": 0.250019056330513, "grad_norm": 0.8574619293212891, "learning_rate": 9.938432337323917e-05, "loss": 0.1826395273208618, "memory(GiB)": 122.96, "step": 3280, "token_acc": 0.9138014527845036, "train_speed(iter/s)": 0.237014 }, { "epoch": 0.25040018294077293, "grad_norm": 0.7394911646842957, "learning_rate": 9.93824487561226e-05, "loss": 0.2682335376739502, "memory(GiB)": 122.96, "step": 3285, "token_acc": 0.9012509981368113, "train_speed(iter/s)": 0.237124 }, { "epoch": 0.25078130955103284, "grad_norm": 0.939775288105011, "learning_rate": 9.938057130715172e-05, "loss": 0.1726750135421753, "memory(GiB)": 122.96, "step": 3290, "token_acc": 0.9339160839160839, "train_speed(iter/s)": 0.23724 }, { "epoch": 0.2511624361612928, "grad_norm": 0.9436139464378357, "learning_rate": 9.937869102643427e-05, "loss": 0.19674288034439086, "memory(GiB)": 122.96, "step": 3295, "token_acc": 0.9157088122605364, "train_speed(iter/s)": 0.237346 }, { "epoch": 0.2515435627715527, "grad_norm": 0.8856566548347473, "learning_rate": 9.937680791407802e-05, "loss": 0.2688908576965332, "memory(GiB)": 122.96, "step": 3300, "token_acc": 0.8824618232299861, "train_speed(iter/s)": 0.237478 }, { "epoch": 0.2519246893818126, "grad_norm": 0.8745558261871338, "learning_rate": 9.937492197019098e-05, "loss": 0.23878166675567628, "memory(GiB)": 122.96, "step": 3305, "token_acc": 0.908303520994163, "train_speed(iter/s)": 0.237573 }, { "epoch": 0.2523058159920726, "grad_norm": 1.1011661291122437, "learning_rate": 9.937303319488128e-05, "loss": 0.18626414537429808, "memory(GiB)": 122.96, "step": 3310, "token_acc": 0.9165925266903915, "train_speed(iter/s)": 0.237671 }, { "epoch": 0.2526869426023325, "grad_norm": 0.6140720844268799, "learning_rate": 9.937114158825724e-05, "loss": 0.2733391046524048, "memory(GiB)": 122.96, "step": 3315, "token_acc": 0.9027979945010512, "train_speed(iter/s)": 0.237743 }, { "epoch": 0.25306806921259245, "grad_norm": 0.9019266366958618, "learning_rate": 9.936924715042735e-05, "loss": 0.24545023441314698, "memory(GiB)": 122.96, "step": 3320, "token_acc": 0.909877800407332, "train_speed(iter/s)": 0.237777 }, { "epoch": 0.25344919582285236, "grad_norm": 0.6621191501617432, "learning_rate": 9.936734988150025e-05, "loss": 0.19869577884674072, "memory(GiB)": 122.96, "step": 3325, "token_acc": 0.9315490288962577, "train_speed(iter/s)": 0.237863 }, { "epoch": 0.25383032243311227, "grad_norm": 0.8921169638633728, "learning_rate": 9.936544978158471e-05, "loss": 0.19741605520248412, "memory(GiB)": 122.96, "step": 3330, "token_acc": 0.9223942208462332, "train_speed(iter/s)": 0.237938 }, { "epoch": 0.25421144904337223, "grad_norm": 1.2893176078796387, "learning_rate": 9.936354685078971e-05, "loss": 0.26518831253051756, "memory(GiB)": 122.96, "step": 3335, "token_acc": 0.8884177047442353, "train_speed(iter/s)": 0.238038 }, { "epoch": 0.25459257565363214, "grad_norm": 0.9657509326934814, "learning_rate": 9.936164108922439e-05, "loss": 0.34899344444274905, "memory(GiB)": 122.96, "step": 3340, "token_acc": 0.8854574786817239, "train_speed(iter/s)": 0.238124 }, { "epoch": 0.25497370226389204, "grad_norm": 1.2672642469406128, "learning_rate": 9.935973249699799e-05, "loss": 0.2303415298461914, "memory(GiB)": 122.96, "step": 3345, "token_acc": 0.9177974947807933, "train_speed(iter/s)": 0.238219 }, { "epoch": 0.255354828874152, "grad_norm": 0.9719008803367615, "learning_rate": 9.935782107422e-05, "loss": 0.18209826946258545, "memory(GiB)": 122.96, "step": 3350, "token_acc": 0.9254746387078493, "train_speed(iter/s)": 0.238317 }, { "epoch": 0.2557359554844119, "grad_norm": 1.4309730529785156, "learning_rate": 9.935590682100003e-05, "loss": 0.1636431932449341, "memory(GiB)": 122.96, "step": 3355, "token_acc": 0.9257690182354025, "train_speed(iter/s)": 0.238402 }, { "epoch": 0.2561170820946719, "grad_norm": 1.1790541410446167, "learning_rate": 9.935398973744785e-05, "loss": 0.2573086261749268, "memory(GiB)": 122.96, "step": 3360, "token_acc": 0.9025433800808177, "train_speed(iter/s)": 0.238499 }, { "epoch": 0.2564982087049318, "grad_norm": 3.0535683631896973, "learning_rate": 9.935206982367338e-05, "loss": 0.2728036165237427, "memory(GiB)": 122.96, "step": 3365, "token_acc": 0.8803030303030303, "train_speed(iter/s)": 0.238621 }, { "epoch": 0.2568793353151917, "grad_norm": 1.0349944829940796, "learning_rate": 9.935014707978672e-05, "loss": 0.21414058208465575, "memory(GiB)": 122.96, "step": 3370, "token_acc": 0.9285714285714286, "train_speed(iter/s)": 0.238728 }, { "epoch": 0.25726046192545166, "grad_norm": 0.7342001795768738, "learning_rate": 9.934822150589814e-05, "loss": 0.23120806217193604, "memory(GiB)": 122.96, "step": 3375, "token_acc": 0.9028726287262873, "train_speed(iter/s)": 0.238766 }, { "epoch": 0.25764158853571156, "grad_norm": 1.517551064491272, "learning_rate": 9.934629310211805e-05, "loss": 0.250041127204895, "memory(GiB)": 122.96, "step": 3380, "token_acc": 0.9158024257338724, "train_speed(iter/s)": 0.238852 }, { "epoch": 0.25802271514597147, "grad_norm": 1.1558641195297241, "learning_rate": 9.934436186855707e-05, "loss": 0.2873279094696045, "memory(GiB)": 122.96, "step": 3385, "token_acc": 0.8852150537634409, "train_speed(iter/s)": 0.238946 }, { "epoch": 0.25840384175623143, "grad_norm": 1.0578992366790771, "learning_rate": 9.93424278053259e-05, "loss": 0.2011775016784668, "memory(GiB)": 122.96, "step": 3390, "token_acc": 0.9093851132686084, "train_speed(iter/s)": 0.238994 }, { "epoch": 0.25878496836649134, "grad_norm": 1.4725910425186157, "learning_rate": 9.934049091253548e-05, "loss": 0.24157338142395018, "memory(GiB)": 122.96, "step": 3395, "token_acc": 0.9071748878923767, "train_speed(iter/s)": 0.239063 }, { "epoch": 0.2591660949767513, "grad_norm": 2.4549434185028076, "learning_rate": 9.933855119029689e-05, "loss": 0.26102404594421386, "memory(GiB)": 122.96, "step": 3400, "token_acc": 0.9064890204786578, "train_speed(iter/s)": 0.239152 }, { "epoch": 0.2591660949767513, "eval_loss": 0.16867278516292572, "eval_runtime": 174.9074, "eval_samples_per_second": 3.03, "eval_steps_per_second": 3.03, "eval_token_acc": 0.9134615384615384, "step": 3400 }, { "epoch": 0.2595472215870112, "grad_norm": 1.1177119016647339, "learning_rate": 9.933660863872132e-05, "loss": 0.23485116958618163, "memory(GiB)": 122.96, "step": 3405, "token_acc": 0.9128931023686261, "train_speed(iter/s)": 0.236354 }, { "epoch": 0.2599283481972711, "grad_norm": 1.0032718181610107, "learning_rate": 9.933466325792022e-05, "loss": 0.20799870491027833, "memory(GiB)": 122.96, "step": 3410, "token_acc": 0.9240226986128626, "train_speed(iter/s)": 0.236414 }, { "epoch": 0.2603094748075311, "grad_norm": 1.2010462284088135, "learning_rate": 9.93327150480051e-05, "loss": 0.2600240230560303, "memory(GiB)": 122.96, "step": 3415, "token_acc": 0.9068931721659589, "train_speed(iter/s)": 0.23648 }, { "epoch": 0.260690601417791, "grad_norm": 0.5950063467025757, "learning_rate": 9.933076400908772e-05, "loss": 0.1761531114578247, "memory(GiB)": 122.96, "step": 3420, "token_acc": 0.9365230651925276, "train_speed(iter/s)": 0.236548 }, { "epoch": 0.2610717280280509, "grad_norm": 0.9771429300308228, "learning_rate": 9.932881014127994e-05, "loss": 0.21790056228637694, "memory(GiB)": 122.96, "step": 3425, "token_acc": 0.9130969460955114, "train_speed(iter/s)": 0.236617 }, { "epoch": 0.26145285463831086, "grad_norm": 1.1972030401229858, "learning_rate": 9.932685344469381e-05, "loss": 0.2700880289077759, "memory(GiB)": 122.96, "step": 3430, "token_acc": 0.8914512338425382, "train_speed(iter/s)": 0.236685 }, { "epoch": 0.26183398124857077, "grad_norm": 1.2683387994766235, "learning_rate": 9.932489391944155e-05, "loss": 0.19975472688674928, "memory(GiB)": 122.96, "step": 3435, "token_acc": 0.8982843137254902, "train_speed(iter/s)": 0.236784 }, { "epoch": 0.2622151078588307, "grad_norm": 1.400316834449768, "learning_rate": 9.93229315656355e-05, "loss": 0.19390435218811036, "memory(GiB)": 122.96, "step": 3440, "token_acc": 0.9198506176386096, "train_speed(iter/s)": 0.236883 }, { "epoch": 0.26259623446909064, "grad_norm": 1.341288685798645, "learning_rate": 9.932096638338823e-05, "loss": 0.2701341390609741, "memory(GiB)": 122.96, "step": 3445, "token_acc": 0.8865601257532093, "train_speed(iter/s)": 0.236984 }, { "epoch": 0.26297736107935055, "grad_norm": 0.8914660215377808, "learning_rate": 9.931899837281241e-05, "loss": 0.21112780570983886, "memory(GiB)": 122.96, "step": 3450, "token_acc": 0.9133243606998654, "train_speed(iter/s)": 0.237023 }, { "epoch": 0.2633584876896105, "grad_norm": 1.003970742225647, "learning_rate": 9.93170275340209e-05, "loss": 0.24273712635040284, "memory(GiB)": 122.96, "step": 3455, "token_acc": 0.9034447300771208, "train_speed(iter/s)": 0.237041 }, { "epoch": 0.2637396142998704, "grad_norm": 0.6195768117904663, "learning_rate": 9.931505386712672e-05, "loss": 0.1922098159790039, "memory(GiB)": 122.96, "step": 3460, "token_acc": 0.9400851063829787, "train_speed(iter/s)": 0.23713 }, { "epoch": 0.2641207409101303, "grad_norm": 1.3243917226791382, "learning_rate": 9.931307737224304e-05, "loss": 0.255895733833313, "memory(GiB)": 122.96, "step": 3465, "token_acc": 0.9087471641038568, "train_speed(iter/s)": 0.237229 }, { "epoch": 0.2645018675203903, "grad_norm": 2.165931463241577, "learning_rate": 9.931109804948323e-05, "loss": 0.20632717609405518, "memory(GiB)": 122.96, "step": 3470, "token_acc": 0.9187958011487424, "train_speed(iter/s)": 0.237309 }, { "epoch": 0.2648829941306502, "grad_norm": 0.7715216279029846, "learning_rate": 9.930911589896076e-05, "loss": 0.16089391708374023, "memory(GiB)": 122.96, "step": 3475, "token_acc": 0.9304748547385293, "train_speed(iter/s)": 0.237385 }, { "epoch": 0.2652641207409101, "grad_norm": 0.7170222997665405, "learning_rate": 9.930713092078934e-05, "loss": 0.24356555938720703, "memory(GiB)": 122.96, "step": 3480, "token_acc": 0.8989237248479176, "train_speed(iter/s)": 0.237508 }, { "epoch": 0.26564524735117007, "grad_norm": 0.5387678146362305, "learning_rate": 9.930514311508277e-05, "loss": 0.20824167728424073, "memory(GiB)": 122.96, "step": 3485, "token_acc": 0.8899310344827587, "train_speed(iter/s)": 0.237612 }, { "epoch": 0.26602637396143, "grad_norm": 1.108339548110962, "learning_rate": 9.930315248195504e-05, "loss": 0.18840088844299316, "memory(GiB)": 122.96, "step": 3490, "token_acc": 0.9052688560086611, "train_speed(iter/s)": 0.237673 }, { "epoch": 0.26640750057168994, "grad_norm": 0.7743140459060669, "learning_rate": 9.930115902152031e-05, "loss": 0.1913951277732849, "memory(GiB)": 122.96, "step": 3495, "token_acc": 0.9241052177662785, "train_speed(iter/s)": 0.237783 }, { "epoch": 0.26678862718194984, "grad_norm": 0.9980422258377075, "learning_rate": 9.929916273389288e-05, "loss": 0.23452506065368653, "memory(GiB)": 122.96, "step": 3500, "token_acc": 0.9047390329811079, "train_speed(iter/s)": 0.237842 }, { "epoch": 0.26716975379220975, "grad_norm": 0.8973496556282043, "learning_rate": 9.929716361918726e-05, "loss": 0.2480475425720215, "memory(GiB)": 122.96, "step": 3505, "token_acc": 0.9011361709343734, "train_speed(iter/s)": 0.2379 }, { "epoch": 0.2675508804024697, "grad_norm": 0.6842448711395264, "learning_rate": 9.929516167751807e-05, "loss": 0.23145952224731445, "memory(GiB)": 122.96, "step": 3510, "token_acc": 0.9164243267163991, "train_speed(iter/s)": 0.237966 }, { "epoch": 0.2679320070127296, "grad_norm": 0.7292288541793823, "learning_rate": 9.929315690900012e-05, "loss": 0.19792087078094484, "memory(GiB)": 122.96, "step": 3515, "token_acc": 0.9211363154547382, "train_speed(iter/s)": 0.237966 }, { "epoch": 0.26831313362298953, "grad_norm": 0.7580230832099915, "learning_rate": 9.929114931374837e-05, "loss": 0.1762663722038269, "memory(GiB)": 122.96, "step": 3520, "token_acc": 0.9299137399876771, "train_speed(iter/s)": 0.238022 }, { "epoch": 0.2686942602332495, "grad_norm": 1.1261988878250122, "learning_rate": 9.928913889187793e-05, "loss": 0.24382281303405762, "memory(GiB)": 122.96, "step": 3525, "token_acc": 0.9020307966971658, "train_speed(iter/s)": 0.238088 }, { "epoch": 0.2690753868435094, "grad_norm": 1.465376853942871, "learning_rate": 9.928712564350412e-05, "loss": 0.2871715545654297, "memory(GiB)": 122.96, "step": 3530, "token_acc": 0.9099351085253972, "train_speed(iter/s)": 0.238108 }, { "epoch": 0.26945651345376936, "grad_norm": 0.7649602293968201, "learning_rate": 9.928510956874238e-05, "loss": 0.18003413677215577, "memory(GiB)": 122.96, "step": 3535, "token_acc": 0.9056809905316825, "train_speed(iter/s)": 0.238216 }, { "epoch": 0.26983764006402927, "grad_norm": 1.277117371559143, "learning_rate": 9.928309066770829e-05, "loss": 0.21303415298461914, "memory(GiB)": 122.96, "step": 3540, "token_acc": 0.9257958801498127, "train_speed(iter/s)": 0.238272 }, { "epoch": 0.2702187666742892, "grad_norm": 0.9539743661880493, "learning_rate": 9.928106894051767e-05, "loss": 0.23207998275756836, "memory(GiB)": 122.96, "step": 3545, "token_acc": 0.916243654822335, "train_speed(iter/s)": 0.238356 }, { "epoch": 0.27059989328454914, "grad_norm": 2.1878561973571777, "learning_rate": 9.927904438728643e-05, "loss": 0.15991783142089844, "memory(GiB)": 122.96, "step": 3550, "token_acc": 0.9404517453798767, "train_speed(iter/s)": 0.238444 }, { "epoch": 0.27098101989480905, "grad_norm": 0.7157286405563354, "learning_rate": 9.92770170081307e-05, "loss": 0.21476442813873292, "memory(GiB)": 122.96, "step": 3555, "token_acc": 0.9276504297994269, "train_speed(iter/s)": 0.238521 }, { "epoch": 0.27136214650506896, "grad_norm": 0.6534081697463989, "learning_rate": 9.927498680316669e-05, "loss": 0.23420584201812744, "memory(GiB)": 122.96, "step": 3560, "token_acc": 0.9083296606434552, "train_speed(iter/s)": 0.238588 }, { "epoch": 0.2717432731153289, "grad_norm": 1.2813585996627808, "learning_rate": 9.927295377251087e-05, "loss": 0.25551881790161135, "memory(GiB)": 122.96, "step": 3565, "token_acc": 0.8934195725534309, "train_speed(iter/s)": 0.238674 }, { "epoch": 0.27212439972558883, "grad_norm": 2.6385676860809326, "learning_rate": 9.927091791627979e-05, "loss": 0.21334559917449952, "memory(GiB)": 122.96, "step": 3570, "token_acc": 0.933417614833544, "train_speed(iter/s)": 0.238739 }, { "epoch": 0.2725055263358488, "grad_norm": 1.2019128799438477, "learning_rate": 9.926887923459023e-05, "loss": 0.2088496208190918, "memory(GiB)": 122.96, "step": 3575, "token_acc": 0.9285811825754472, "train_speed(iter/s)": 0.23876 }, { "epoch": 0.2728866529461087, "grad_norm": 0.41644740104675293, "learning_rate": 9.926683772755907e-05, "loss": 0.27340919971466066, "memory(GiB)": 122.96, "step": 3580, "token_acc": 0.8933244621867377, "train_speed(iter/s)": 0.23885 }, { "epoch": 0.2732677795563686, "grad_norm": 1.343954086303711, "learning_rate": 9.926479339530338e-05, "loss": 0.19977269172668458, "memory(GiB)": 122.96, "step": 3585, "token_acc": 0.9330977620730271, "train_speed(iter/s)": 0.238946 }, { "epoch": 0.27364890616662857, "grad_norm": 1.3609980344772339, "learning_rate": 9.926274623794043e-05, "loss": 0.2534071445465088, "memory(GiB)": 122.96, "step": 3590, "token_acc": 0.9095354523227384, "train_speed(iter/s)": 0.239034 }, { "epoch": 0.2740300327768885, "grad_norm": 0.633151113986969, "learning_rate": 9.926069625558758e-05, "loss": 0.23006744384765626, "memory(GiB)": 122.96, "step": 3595, "token_acc": 0.9043081199126464, "train_speed(iter/s)": 0.239118 }, { "epoch": 0.2744111593871484, "grad_norm": 1.6816885471343994, "learning_rate": 9.92586434483624e-05, "loss": 0.21232750415802001, "memory(GiB)": 122.96, "step": 3600, "token_acc": 0.9212889593238246, "train_speed(iter/s)": 0.239199 }, { "epoch": 0.2744111593871484, "eval_loss": 0.1650475561618805, "eval_runtime": 180.7892, "eval_samples_per_second": 2.932, "eval_steps_per_second": 2.932, "eval_token_acc": 0.9163604602132401, "step": 3600 }, { "epoch": 0.27479228599740835, "grad_norm": 1.3818657398223877, "learning_rate": 9.925658781638259e-05, "loss": 0.319395112991333, "memory(GiB)": 122.96, "step": 3605, "token_acc": 0.915130125980439, "train_speed(iter/s)": 0.236451 }, { "epoch": 0.27517341260766826, "grad_norm": 4.460085391998291, "learning_rate": 9.925452935976607e-05, "loss": 0.1826668381690979, "memory(GiB)": 122.96, "step": 3610, "token_acc": 0.9064607352621274, "train_speed(iter/s)": 0.236538 }, { "epoch": 0.2755545392179282, "grad_norm": 1.6251816749572754, "learning_rate": 9.925246807863085e-05, "loss": 0.180306339263916, "memory(GiB)": 122.96, "step": 3615, "token_acc": 0.9257668711656442, "train_speed(iter/s)": 0.236649 }, { "epoch": 0.2759356658281881, "grad_norm": 0.5841586589813232, "learning_rate": 9.925040397309514e-05, "loss": 0.2512223243713379, "memory(GiB)": 122.96, "step": 3620, "token_acc": 0.9098647210822314, "train_speed(iter/s)": 0.236724 }, { "epoch": 0.27631679243844803, "grad_norm": 1.2649734020233154, "learning_rate": 9.924833704327732e-05, "loss": 0.2753078699111938, "memory(GiB)": 122.96, "step": 3625, "token_acc": 0.890748740100792, "train_speed(iter/s)": 0.236797 }, { "epoch": 0.276697919048708, "grad_norm": 1.1432018280029297, "learning_rate": 9.924626728929591e-05, "loss": 0.23215384483337403, "memory(GiB)": 122.96, "step": 3630, "token_acc": 0.9213973799126638, "train_speed(iter/s)": 0.236859 }, { "epoch": 0.2770790456589679, "grad_norm": 0.7991051077842712, "learning_rate": 9.92441947112696e-05, "loss": 0.16317994594573976, "memory(GiB)": 122.96, "step": 3635, "token_acc": 0.9264406151565685, "train_speed(iter/s)": 0.236926 }, { "epoch": 0.2774601722692278, "grad_norm": 1.2029961347579956, "learning_rate": 9.924211930931724e-05, "loss": 0.1973349094390869, "memory(GiB)": 122.96, "step": 3640, "token_acc": 0.8882870683818551, "train_speed(iter/s)": 0.237036 }, { "epoch": 0.2778412988794878, "grad_norm": 1.158065915107727, "learning_rate": 9.924004108355785e-05, "loss": 0.14963338375091553, "memory(GiB)": 122.96, "step": 3645, "token_acc": 0.9291154071470415, "train_speed(iter/s)": 0.237168 }, { "epoch": 0.2782224254897477, "grad_norm": 1.0671676397323608, "learning_rate": 9.923796003411061e-05, "loss": 0.1647646427154541, "memory(GiB)": 122.96, "step": 3650, "token_acc": 0.9395120298203998, "train_speed(iter/s)": 0.237236 }, { "epoch": 0.27860355210000765, "grad_norm": 0.6510996222496033, "learning_rate": 9.923587616109486e-05, "loss": 0.26507527828216554, "memory(GiB)": 122.96, "step": 3655, "token_acc": 0.8826979472140762, "train_speed(iter/s)": 0.23735 }, { "epoch": 0.27898467871026755, "grad_norm": 1.2515778541564941, "learning_rate": 9.923378946463009e-05, "loss": 0.2802137851715088, "memory(GiB)": 122.96, "step": 3660, "token_acc": 0.9048991354466859, "train_speed(iter/s)": 0.237417 }, { "epoch": 0.27936580532052746, "grad_norm": 1.608299732208252, "learning_rate": 9.923169994483596e-05, "loss": 0.2557238578796387, "memory(GiB)": 122.96, "step": 3665, "token_acc": 0.8941034897713598, "train_speed(iter/s)": 0.237527 }, { "epoch": 0.2797469319307874, "grad_norm": 0.6452434659004211, "learning_rate": 9.922960760183231e-05, "loss": 0.24433546066284179, "memory(GiB)": 122.96, "step": 3670, "token_acc": 0.9148444718201417, "train_speed(iter/s)": 0.23758 }, { "epoch": 0.28012805854104733, "grad_norm": 0.6923612952232361, "learning_rate": 9.922751243573911e-05, "loss": 0.2526653528213501, "memory(GiB)": 122.96, "step": 3675, "token_acc": 0.8977532368621478, "train_speed(iter/s)": 0.23766 }, { "epoch": 0.28050918515130724, "grad_norm": 1.4394978284835815, "learning_rate": 9.922541444667651e-05, "loss": 0.17130155563354493, "memory(GiB)": 122.96, "step": 3680, "token_acc": 0.918280485209619, "train_speed(iter/s)": 0.237755 }, { "epoch": 0.2808903117615672, "grad_norm": 1.5354546308517456, "learning_rate": 9.922331363476484e-05, "loss": 0.21357426643371583, "memory(GiB)": 122.96, "step": 3685, "token_acc": 0.9027603513174404, "train_speed(iter/s)": 0.237835 }, { "epoch": 0.2812714383718271, "grad_norm": 1.8061509132385254, "learning_rate": 9.922121000012454e-05, "loss": 0.25745654106140137, "memory(GiB)": 122.96, "step": 3690, "token_acc": 0.9044991511035654, "train_speed(iter/s)": 0.237891 }, { "epoch": 0.2816525649820871, "grad_norm": 0.9210394024848938, "learning_rate": 9.921910354287629e-05, "loss": 0.22261836528778076, "memory(GiB)": 122.96, "step": 3695, "token_acc": 0.918961335425881, "train_speed(iter/s)": 0.237927 }, { "epoch": 0.282033691592347, "grad_norm": 0.6888054013252258, "learning_rate": 9.921699426314083e-05, "loss": 0.19088282585144042, "memory(GiB)": 122.96, "step": 3700, "token_acc": 0.9281833215213796, "train_speed(iter/s)": 0.238 }, { "epoch": 0.2824148182026069, "grad_norm": 1.356488585472107, "learning_rate": 9.921488216103915e-05, "loss": 0.19038491249084472, "memory(GiB)": 122.96, "step": 3705, "token_acc": 0.9301692865779927, "train_speed(iter/s)": 0.238094 }, { "epoch": 0.28279594481286685, "grad_norm": 1.6840875148773193, "learning_rate": 9.921276723669236e-05, "loss": 0.2070256233215332, "memory(GiB)": 122.96, "step": 3710, "token_acc": 0.9185203094777563, "train_speed(iter/s)": 0.238199 }, { "epoch": 0.28317707142312676, "grad_norm": 0.9993820190429688, "learning_rate": 9.921064949022176e-05, "loss": 0.2563567399978638, "memory(GiB)": 122.96, "step": 3715, "token_acc": 0.907211961301671, "train_speed(iter/s)": 0.238284 }, { "epoch": 0.28355819803338667, "grad_norm": 0.8365705609321594, "learning_rate": 9.920852892174876e-05, "loss": 0.2962583065032959, "memory(GiB)": 122.96, "step": 3720, "token_acc": 0.8812075741336192, "train_speed(iter/s)": 0.238354 }, { "epoch": 0.28393932464364663, "grad_norm": 0.8835217356681824, "learning_rate": 9.920640553139498e-05, "loss": 0.19074175357818604, "memory(GiB)": 122.96, "step": 3725, "token_acc": 0.9218841086627025, "train_speed(iter/s)": 0.238338 }, { "epoch": 0.28432045125390654, "grad_norm": 0.5738128423690796, "learning_rate": 9.92042793192822e-05, "loss": 0.14768129587173462, "memory(GiB)": 122.96, "step": 3730, "token_acc": 0.9305974652987327, "train_speed(iter/s)": 0.238448 }, { "epoch": 0.2847015778641665, "grad_norm": 0.8134034872055054, "learning_rate": 9.920215028553233e-05, "loss": 0.26588714122772217, "memory(GiB)": 122.96, "step": 3735, "token_acc": 0.8905180840664711, "train_speed(iter/s)": 0.238537 }, { "epoch": 0.2850827044744264, "grad_norm": 0.9359223246574402, "learning_rate": 9.920001843026747e-05, "loss": 0.2341309070587158, "memory(GiB)": 122.96, "step": 3740, "token_acc": 0.9027423469387755, "train_speed(iter/s)": 0.238636 }, { "epoch": 0.2854638310846863, "grad_norm": 0.6174564361572266, "learning_rate": 9.919788375360988e-05, "loss": 0.23413732051849365, "memory(GiB)": 122.96, "step": 3745, "token_acc": 0.9065331425846447, "train_speed(iter/s)": 0.238707 }, { "epoch": 0.2858449576949463, "grad_norm": 0.9077537655830383, "learning_rate": 9.919574625568194e-05, "loss": 0.23686749935150148, "memory(GiB)": 122.96, "step": 3750, "token_acc": 0.9133136554295479, "train_speed(iter/s)": 0.238737 }, { "epoch": 0.2862260843052062, "grad_norm": 0.7610635757446289, "learning_rate": 9.919360593660625e-05, "loss": 0.16650717258453368, "memory(GiB)": 122.96, "step": 3755, "token_acc": 0.9236000906823849, "train_speed(iter/s)": 0.23881 }, { "epoch": 0.2866072109154661, "grad_norm": 0.5986360311508179, "learning_rate": 9.919146279650557e-05, "loss": 0.2183854341506958, "memory(GiB)": 122.96, "step": 3760, "token_acc": 0.8936312849162011, "train_speed(iter/s)": 0.238913 }, { "epoch": 0.28698833752572606, "grad_norm": 1.6000279188156128, "learning_rate": 9.918931683550275e-05, "loss": 0.2984266996383667, "memory(GiB)": 122.96, "step": 3765, "token_acc": 0.8878923766816144, "train_speed(iter/s)": 0.238999 }, { "epoch": 0.28736946413598596, "grad_norm": 0.5970331430435181, "learning_rate": 9.91871680537209e-05, "loss": 0.26599960327148436, "memory(GiB)": 122.96, "step": 3770, "token_acc": 0.9116186693147964, "train_speed(iter/s)": 0.239061 }, { "epoch": 0.2877505907462459, "grad_norm": 0.8425403833389282, "learning_rate": 9.91850164512832e-05, "loss": 0.22742514610290526, "memory(GiB)": 122.96, "step": 3775, "token_acc": 0.9037956970439042, "train_speed(iter/s)": 0.239136 }, { "epoch": 0.28813171735650583, "grad_norm": 1.4226981401443481, "learning_rate": 9.918286202831306e-05, "loss": 0.23566601276397706, "memory(GiB)": 122.96, "step": 3780, "token_acc": 0.9046314058646204, "train_speed(iter/s)": 0.239223 }, { "epoch": 0.28851284396676574, "grad_norm": 1.1607344150543213, "learning_rate": 9.918070478493401e-05, "loss": 0.20050652027130128, "memory(GiB)": 122.96, "step": 3785, "token_acc": 0.9032388663967611, "train_speed(iter/s)": 0.239334 }, { "epoch": 0.2888939705770257, "grad_norm": 0.902807891368866, "learning_rate": 9.917854472126978e-05, "loss": 0.2734500885009766, "memory(GiB)": 122.96, "step": 3790, "token_acc": 0.8924634014097235, "train_speed(iter/s)": 0.23941 }, { "epoch": 0.2892750971872856, "grad_norm": 1.7995579242706299, "learning_rate": 9.917638183744422e-05, "loss": 0.26230859756469727, "memory(GiB)": 122.96, "step": 3795, "token_acc": 0.89185667752443, "train_speed(iter/s)": 0.239481 }, { "epoch": 0.2896562237975455, "grad_norm": 0.9189042448997498, "learning_rate": 9.917421613358135e-05, "loss": 0.2629070520401001, "memory(GiB)": 122.96, "step": 3800, "token_acc": 0.904055390702275, "train_speed(iter/s)": 0.239589 }, { "epoch": 0.2896562237975455, "eval_loss": 0.1631811261177063, "eval_runtime": 175.635, "eval_samples_per_second": 3.018, "eval_steps_per_second": 3.018, "eval_token_acc": 0.9171360159026565, "step": 3800 }, { "epoch": 0.2900373504078055, "grad_norm": 0.9109868407249451, "learning_rate": 9.917204760980541e-05, "loss": 0.18599164485931396, "memory(GiB)": 122.96, "step": 3805, "token_acc": 0.9177390644605912, "train_speed(iter/s)": 0.237048 }, { "epoch": 0.2904184770180654, "grad_norm": 1.229832649230957, "learning_rate": 9.916987626624072e-05, "loss": 0.17642409801483155, "memory(GiB)": 122.96, "step": 3810, "token_acc": 0.9292261777872521, "train_speed(iter/s)": 0.237141 }, { "epoch": 0.29079960362832535, "grad_norm": 0.8242142200469971, "learning_rate": 9.916770210301178e-05, "loss": 0.13773136138916015, "memory(GiB)": 122.96, "step": 3815, "token_acc": 0.9271480608527962, "train_speed(iter/s)": 0.237237 }, { "epoch": 0.29118073023858526, "grad_norm": 1.1672394275665283, "learning_rate": 9.916552512024331e-05, "loss": 0.23176062107086182, "memory(GiB)": 122.96, "step": 3820, "token_acc": 0.9193378480060196, "train_speed(iter/s)": 0.23727 }, { "epoch": 0.29156185684884517, "grad_norm": 0.7400051951408386, "learning_rate": 9.916334531806013e-05, "loss": 0.23980484008789063, "memory(GiB)": 122.96, "step": 3825, "token_acc": 0.9237995824634656, "train_speed(iter/s)": 0.23735 }, { "epoch": 0.29194298345910513, "grad_norm": 0.9316531419754028, "learning_rate": 9.916116269658724e-05, "loss": 0.2162524700164795, "memory(GiB)": 122.96, "step": 3830, "token_acc": 0.9218097957390732, "train_speed(iter/s)": 0.237436 }, { "epoch": 0.29232411006936504, "grad_norm": 1.165887713432312, "learning_rate": 9.91589772559498e-05, "loss": 0.22804114818572999, "memory(GiB)": 122.96, "step": 3835, "token_acc": 0.9287211740041929, "train_speed(iter/s)": 0.23751 }, { "epoch": 0.29270523667962495, "grad_norm": 1.6244484186172485, "learning_rate": 9.915678899627315e-05, "loss": 0.1847672462463379, "memory(GiB)": 122.96, "step": 3840, "token_acc": 0.9242572641201436, "train_speed(iter/s)": 0.237599 }, { "epoch": 0.2930863632898849, "grad_norm": 1.3747047185897827, "learning_rate": 9.915459791768275e-05, "loss": 0.18626983165740968, "memory(GiB)": 122.96, "step": 3845, "token_acc": 0.9316979316979317, "train_speed(iter/s)": 0.237659 }, { "epoch": 0.2934674899001448, "grad_norm": 0.8788254857063293, "learning_rate": 9.915240402030429e-05, "loss": 0.20370087623596192, "memory(GiB)": 122.96, "step": 3850, "token_acc": 0.9231667445119103, "train_speed(iter/s)": 0.237747 }, { "epoch": 0.2938486165104048, "grad_norm": 1.5376348495483398, "learning_rate": 9.915020730426354e-05, "loss": 0.22718002796173095, "memory(GiB)": 122.96, "step": 3855, "token_acc": 0.9056115107913669, "train_speed(iter/s)": 0.237847 }, { "epoch": 0.2942297431206647, "grad_norm": 1.2390118837356567, "learning_rate": 9.914800776968649e-05, "loss": 0.1832464814186096, "memory(GiB)": 122.96, "step": 3860, "token_acc": 0.9295921924015337, "train_speed(iter/s)": 0.237947 }, { "epoch": 0.2946108697309246, "grad_norm": 1.525604009628296, "learning_rate": 9.914580541669926e-05, "loss": 0.24632763862609863, "memory(GiB)": 122.96, "step": 3865, "token_acc": 0.9128651973347002, "train_speed(iter/s)": 0.238002 }, { "epoch": 0.29499199634118456, "grad_norm": 0.7186420559883118, "learning_rate": 9.914360024542816e-05, "loss": 0.24633004665374755, "memory(GiB)": 122.96, "step": 3870, "token_acc": 0.9100570753788624, "train_speed(iter/s)": 0.238064 }, { "epoch": 0.29537312295144447, "grad_norm": 1.4764314889907837, "learning_rate": 9.914139225599963e-05, "loss": 0.2644335746765137, "memory(GiB)": 122.96, "step": 3875, "token_acc": 0.9050632911392406, "train_speed(iter/s)": 0.238137 }, { "epoch": 0.2957542495617044, "grad_norm": 0.9560542106628418, "learning_rate": 9.91391814485403e-05, "loss": 0.21223621368408202, "memory(GiB)": 122.96, "step": 3880, "token_acc": 0.912027199320017, "train_speed(iter/s)": 0.238195 }, { "epoch": 0.29613537617196434, "grad_norm": 1.5893220901489258, "learning_rate": 9.913696782317697e-05, "loss": 0.24758048057556153, "memory(GiB)": 122.96, "step": 3885, "token_acc": 0.9194982534137821, "train_speed(iter/s)": 0.23823 }, { "epoch": 0.29651650278222424, "grad_norm": 0.7844113111495972, "learning_rate": 9.913475138003654e-05, "loss": 0.2554394960403442, "memory(GiB)": 122.96, "step": 3890, "token_acc": 0.8995180722891566, "train_speed(iter/s)": 0.238256 }, { "epoch": 0.2968976293924842, "grad_norm": 0.8095011115074158, "learning_rate": 9.913253211924614e-05, "loss": 0.1777181386947632, "memory(GiB)": 122.96, "step": 3895, "token_acc": 0.9320575842696629, "train_speed(iter/s)": 0.238309 }, { "epoch": 0.2972787560027441, "grad_norm": 1.1218451261520386, "learning_rate": 9.913031004093301e-05, "loss": 0.1919344663619995, "memory(GiB)": 122.96, "step": 3900, "token_acc": 0.9157159884215903, "train_speed(iter/s)": 0.238375 }, { "epoch": 0.297659882613004, "grad_norm": 0.9153861999511719, "learning_rate": 9.91280851452246e-05, "loss": 0.16441253423690796, "memory(GiB)": 122.96, "step": 3905, "token_acc": 0.9219895287958115, "train_speed(iter/s)": 0.238489 }, { "epoch": 0.298041009223264, "grad_norm": 0.7768386006355286, "learning_rate": 9.912585743224849e-05, "loss": 0.18797452449798585, "memory(GiB)": 122.96, "step": 3910, "token_acc": 0.9125, "train_speed(iter/s)": 0.23857 }, { "epoch": 0.2984221358335239, "grad_norm": 1.6011404991149902, "learning_rate": 9.912362690213244e-05, "loss": 0.14880895614624023, "memory(GiB)": 122.96, "step": 3915, "token_acc": 0.9400981151562097, "train_speed(iter/s)": 0.238646 }, { "epoch": 0.2988032624437838, "grad_norm": 0.5681048035621643, "learning_rate": 9.912139355500434e-05, "loss": 0.1459873676300049, "memory(GiB)": 122.96, "step": 3920, "token_acc": 0.930111038536904, "train_speed(iter/s)": 0.23875 }, { "epoch": 0.29918438905404376, "grad_norm": 0.9166677594184875, "learning_rate": 9.911915739099227e-05, "loss": 0.15983959436416625, "memory(GiB)": 122.96, "step": 3925, "token_acc": 0.9332734217029881, "train_speed(iter/s)": 0.238843 }, { "epoch": 0.29956551566430367, "grad_norm": 1.5516111850738525, "learning_rate": 9.911691841022446e-05, "loss": 0.35452492237091066, "memory(GiB)": 122.96, "step": 3930, "token_acc": 0.8596247394023627, "train_speed(iter/s)": 0.238911 }, { "epoch": 0.29994664227456364, "grad_norm": 0.9620996117591858, "learning_rate": 9.911467661282931e-05, "loss": 0.2383397102355957, "memory(GiB)": 122.96, "step": 3935, "token_acc": 0.9054325955734407, "train_speed(iter/s)": 0.238962 }, { "epoch": 0.30032776888482354, "grad_norm": 1.067201852798462, "learning_rate": 9.911243199893537e-05, "loss": 0.2852769374847412, "memory(GiB)": 122.96, "step": 3940, "token_acc": 0.8894382022471911, "train_speed(iter/s)": 0.239044 }, { "epoch": 0.30070889549508345, "grad_norm": 1.2879979610443115, "learning_rate": 9.911018456867139e-05, "loss": 0.23722679615020753, "memory(GiB)": 122.96, "step": 3945, "token_acc": 0.9112492933860938, "train_speed(iter/s)": 0.239112 }, { "epoch": 0.3010900221053434, "grad_norm": 0.7311355471611023, "learning_rate": 9.910793432216618e-05, "loss": 0.1851056694984436, "memory(GiB)": 122.96, "step": 3950, "token_acc": 0.9236754353464246, "train_speed(iter/s)": 0.239212 }, { "epoch": 0.3014711487156033, "grad_norm": 0.6522814631462097, "learning_rate": 9.910568125954886e-05, "loss": 0.22263648509979247, "memory(GiB)": 122.96, "step": 3955, "token_acc": 0.9063520871143376, "train_speed(iter/s)": 0.239259 }, { "epoch": 0.30185227532586323, "grad_norm": 0.8051279783248901, "learning_rate": 9.91034253809486e-05, "loss": 0.17572932243347167, "memory(GiB)": 122.96, "step": 3960, "token_acc": 0.9254674077817079, "train_speed(iter/s)": 0.239332 }, { "epoch": 0.3022334019361232, "grad_norm": 1.3910726308822632, "learning_rate": 9.910116668649474e-05, "loss": 0.21699295043945313, "memory(GiB)": 122.96, "step": 3965, "token_acc": 0.9038563127311147, "train_speed(iter/s)": 0.239397 }, { "epoch": 0.3026145285463831, "grad_norm": 1.4917962551116943, "learning_rate": 9.909890517631684e-05, "loss": 0.2199930429458618, "memory(GiB)": 122.96, "step": 3970, "token_acc": 0.9206431535269709, "train_speed(iter/s)": 0.239477 }, { "epoch": 0.30299565515664306, "grad_norm": 1.0026804208755493, "learning_rate": 9.909664085054458e-05, "loss": 0.20893681049346924, "memory(GiB)": 122.96, "step": 3975, "token_acc": 0.9345747357825868, "train_speed(iter/s)": 0.23954 }, { "epoch": 0.30337678176690297, "grad_norm": 1.9620113372802734, "learning_rate": 9.90943737093078e-05, "loss": 0.22319746017456055, "memory(GiB)": 122.96, "step": 3980, "token_acc": 0.9361233480176211, "train_speed(iter/s)": 0.239639 }, { "epoch": 0.3037579083771629, "grad_norm": 0.5851895213127136, "learning_rate": 9.909210375273651e-05, "loss": 0.21058027744293212, "memory(GiB)": 122.96, "step": 3985, "token_acc": 0.8931222167243938, "train_speed(iter/s)": 0.239749 }, { "epoch": 0.30413903498742284, "grad_norm": 1.386438012123108, "learning_rate": 9.90898309809609e-05, "loss": 0.1244768500328064, "memory(GiB)": 122.96, "step": 3990, "token_acc": 0.9324258629071464, "train_speed(iter/s)": 0.23985 }, { "epoch": 0.30452016159768275, "grad_norm": 1.069390892982483, "learning_rate": 9.908755539411127e-05, "loss": 0.2152198076248169, "memory(GiB)": 122.96, "step": 3995, "token_acc": 0.9132169576059851, "train_speed(iter/s)": 0.239957 }, { "epoch": 0.30490128820794266, "grad_norm": 0.887519121170044, "learning_rate": 9.908527699231814e-05, "loss": 0.19948571920394897, "memory(GiB)": 122.96, "step": 4000, "token_acc": 0.9277817832251862, "train_speed(iter/s)": 0.239981 }, { "epoch": 0.30490128820794266, "eval_loss": 0.16043721139431, "eval_runtime": 182.6084, "eval_samples_per_second": 2.902, "eval_steps_per_second": 2.902, "eval_token_acc": 0.9189356062887778, "step": 4000 }, { "epoch": 0.3052824148182026, "grad_norm": 1.6303051710128784, "learning_rate": 9.908299577571214e-05, "loss": 0.23665983676910402, "memory(GiB)": 122.96, "step": 4005, "token_acc": 0.9190054856264991, "train_speed(iter/s)": 0.237416 }, { "epoch": 0.3056635414284625, "grad_norm": 1.4951375722885132, "learning_rate": 9.908071174442412e-05, "loss": 0.10649605989456176, "memory(GiB)": 122.96, "step": 4010, "token_acc": 0.9595216191352346, "train_speed(iter/s)": 0.237524 }, { "epoch": 0.3060446680387225, "grad_norm": 0.6984847187995911, "learning_rate": 9.907842489858506e-05, "loss": 0.22793295383453369, "memory(GiB)": 122.96, "step": 4015, "token_acc": 0.9122844827586207, "train_speed(iter/s)": 0.237557 }, { "epoch": 0.3064257946489824, "grad_norm": 1.0836694240570068, "learning_rate": 9.907613523832606e-05, "loss": 0.17504299879074098, "memory(GiB)": 122.96, "step": 4020, "token_acc": 0.9285714285714286, "train_speed(iter/s)": 0.237637 }, { "epoch": 0.3068069212592423, "grad_norm": 1.5219589471817017, "learning_rate": 9.907384276377845e-05, "loss": 0.20650815963745117, "memory(GiB)": 122.96, "step": 4025, "token_acc": 0.9077343421605717, "train_speed(iter/s)": 0.237711 }, { "epoch": 0.30718804786950227, "grad_norm": 1.3062764406204224, "learning_rate": 9.907154747507369e-05, "loss": 0.17307276725769044, "memory(GiB)": 122.96, "step": 4030, "token_acc": 0.9208571428571428, "train_speed(iter/s)": 0.237744 }, { "epoch": 0.3075691744797622, "grad_norm": 0.8394153714179993, "learning_rate": 9.906924937234341e-05, "loss": 0.21304497718811036, "memory(GiB)": 122.96, "step": 4035, "token_acc": 0.911119661408234, "train_speed(iter/s)": 0.237803 }, { "epoch": 0.3079503010900221, "grad_norm": 0.9296163320541382, "learning_rate": 9.906694845571938e-05, "loss": 0.21893436908721925, "memory(GiB)": 122.96, "step": 4040, "token_acc": 0.9153967419863374, "train_speed(iter/s)": 0.237869 }, { "epoch": 0.30833142770028205, "grad_norm": 1.2029460668563843, "learning_rate": 9.906464472533354e-05, "loss": 0.16982815265655518, "memory(GiB)": 122.96, "step": 4045, "token_acc": 0.9210423959548814, "train_speed(iter/s)": 0.237973 }, { "epoch": 0.30871255431054195, "grad_norm": 0.9618740677833557, "learning_rate": 9.906233818131804e-05, "loss": 0.22065658569335939, "memory(GiB)": 122.96, "step": 4050, "token_acc": 0.9222222222222223, "train_speed(iter/s)": 0.238056 }, { "epoch": 0.3090936809208019, "grad_norm": 0.8810694813728333, "learning_rate": 9.906002882380511e-05, "loss": 0.19825893640518188, "memory(GiB)": 122.96, "step": 4055, "token_acc": 0.9174265450861195, "train_speed(iter/s)": 0.238135 }, { "epoch": 0.3094748075310618, "grad_norm": 0.9672714471817017, "learning_rate": 9.905771665292718e-05, "loss": 0.26341843605041504, "memory(GiB)": 122.96, "step": 4060, "token_acc": 0.9019681584555598, "train_speed(iter/s)": 0.238152 }, { "epoch": 0.30985593414132173, "grad_norm": 1.217863917350769, "learning_rate": 9.905540166881688e-05, "loss": 0.23597030639648436, "memory(GiB)": 122.96, "step": 4065, "token_acc": 0.9129076352274946, "train_speed(iter/s)": 0.238204 }, { "epoch": 0.3102370607515817, "grad_norm": 1.4500690698623657, "learning_rate": 9.905308387160693e-05, "loss": 0.24470624923706055, "memory(GiB)": 122.96, "step": 4070, "token_acc": 0.8967345799609266, "train_speed(iter/s)": 0.23829 }, { "epoch": 0.3106181873618416, "grad_norm": 2.103872537612915, "learning_rate": 9.905076326143025e-05, "loss": 0.2063844919204712, "memory(GiB)": 122.96, "step": 4075, "token_acc": 0.9235822202758923, "train_speed(iter/s)": 0.23836 }, { "epoch": 0.3109993139721015, "grad_norm": 1.673287272453308, "learning_rate": 9.904843983841992e-05, "loss": 0.2196591854095459, "memory(GiB)": 122.96, "step": 4080, "token_acc": 0.912630579297246, "train_speed(iter/s)": 0.238451 }, { "epoch": 0.3113804405823615, "grad_norm": 1.0091794729232788, "learning_rate": 9.904611360270918e-05, "loss": 0.19457906484603882, "memory(GiB)": 122.96, "step": 4085, "token_acc": 0.9270530319982099, "train_speed(iter/s)": 0.23852 }, { "epoch": 0.3117615671926214, "grad_norm": 1.0252200365066528, "learning_rate": 9.904378455443142e-05, "loss": 0.23503849506378174, "memory(GiB)": 122.96, "step": 4090, "token_acc": 0.9040796019900498, "train_speed(iter/s)": 0.23859 }, { "epoch": 0.31214269380288134, "grad_norm": 1.831763744354248, "learning_rate": 9.904145269372023e-05, "loss": 0.14758527278900146, "memory(GiB)": 122.96, "step": 4095, "token_acc": 0.929811689899731, "train_speed(iter/s)": 0.238666 }, { "epoch": 0.31252382041314125, "grad_norm": 2.021737575531006, "learning_rate": 9.903911802070929e-05, "loss": 0.22610418796539306, "memory(GiB)": 122.96, "step": 4100, "token_acc": 0.9239896180941787, "train_speed(iter/s)": 0.238707 }, { "epoch": 0.31290494702340116, "grad_norm": 1.4990754127502441, "learning_rate": 9.90367805355325e-05, "loss": 0.2990562438964844, "memory(GiB)": 122.96, "step": 4105, "token_acc": 0.8825410360264336, "train_speed(iter/s)": 0.238781 }, { "epoch": 0.3132860736336611, "grad_norm": 0.8787904381752014, "learning_rate": 9.90344402383239e-05, "loss": 0.24256362915039062, "memory(GiB)": 122.96, "step": 4110, "token_acc": 0.8989755264655663, "train_speed(iter/s)": 0.238876 }, { "epoch": 0.31366720024392103, "grad_norm": 0.6961368918418884, "learning_rate": 9.903209712921771e-05, "loss": 0.1433349609375, "memory(GiB)": 122.96, "step": 4115, "token_acc": 0.9423076923076923, "train_speed(iter/s)": 0.238974 }, { "epoch": 0.31404832685418094, "grad_norm": 0.8254474401473999, "learning_rate": 9.902975120834827e-05, "loss": 0.18326587677001954, "memory(GiB)": 122.96, "step": 4120, "token_acc": 0.9200171086398631, "train_speed(iter/s)": 0.239051 }, { "epoch": 0.3144294534644409, "grad_norm": 0.6695936918258667, "learning_rate": 9.902740247585015e-05, "loss": 0.19578639268875123, "memory(GiB)": 122.96, "step": 4125, "token_acc": 0.9091247672253259, "train_speed(iter/s)": 0.239152 }, { "epoch": 0.3148105800747008, "grad_norm": 1.3632125854492188, "learning_rate": 9.902505093185801e-05, "loss": 0.20988306999206544, "memory(GiB)": 122.96, "step": 4130, "token_acc": 0.9130069930069931, "train_speed(iter/s)": 0.239227 }, { "epoch": 0.31519170668496077, "grad_norm": 0.6102461218833923, "learning_rate": 9.90226965765067e-05, "loss": 0.12283908128738404, "memory(GiB)": 122.96, "step": 4135, "token_acc": 0.9480621680110172, "train_speed(iter/s)": 0.239277 }, { "epoch": 0.3155728332952207, "grad_norm": 0.7741445302963257, "learning_rate": 9.90203394099312e-05, "loss": 0.22585663795471192, "memory(GiB)": 122.96, "step": 4140, "token_acc": 0.8958496476115897, "train_speed(iter/s)": 0.239382 }, { "epoch": 0.3159539599054806, "grad_norm": 1.4226940870285034, "learning_rate": 9.901797943226677e-05, "loss": 0.15612078905105592, "memory(GiB)": 122.96, "step": 4145, "token_acc": 0.9186152444579411, "train_speed(iter/s)": 0.239486 }, { "epoch": 0.31633508651574055, "grad_norm": 1.1313964128494263, "learning_rate": 9.901561664364866e-05, "loss": 0.21535608768463135, "memory(GiB)": 122.96, "step": 4150, "token_acc": 0.9009139009139009, "train_speed(iter/s)": 0.239568 }, { "epoch": 0.31671621312600046, "grad_norm": 1.2719677686691284, "learning_rate": 9.901325104421239e-05, "loss": 0.2122407913208008, "memory(GiB)": 122.96, "step": 4155, "token_acc": 0.9018162674387997, "train_speed(iter/s)": 0.239632 }, { "epoch": 0.31709733973626036, "grad_norm": 1.3929712772369385, "learning_rate": 9.901088263409364e-05, "loss": 0.2670113563537598, "memory(GiB)": 122.96, "step": 4160, "token_acc": 0.8989389459036354, "train_speed(iter/s)": 0.239675 }, { "epoch": 0.3174784663465203, "grad_norm": 1.2345364093780518, "learning_rate": 9.900851141342819e-05, "loss": 0.18694796562194824, "memory(GiB)": 122.96, "step": 4165, "token_acc": 0.9182292673497003, "train_speed(iter/s)": 0.239759 }, { "epoch": 0.31785959295678023, "grad_norm": 0.605176568031311, "learning_rate": 9.900613738235207e-05, "loss": 0.23480262756347656, "memory(GiB)": 122.96, "step": 4170, "token_acc": 0.9175672279467203, "train_speed(iter/s)": 0.239777 }, { "epoch": 0.3182407195670402, "grad_norm": 1.4108123779296875, "learning_rate": 9.900376054100135e-05, "loss": 0.18888258934020996, "memory(GiB)": 122.96, "step": 4175, "token_acc": 0.9247949746990054, "train_speed(iter/s)": 0.239841 }, { "epoch": 0.3186218461773001, "grad_norm": 0.6928850412368774, "learning_rate": 9.900138088951239e-05, "loss": 0.11539915800094605, "memory(GiB)": 122.96, "step": 4180, "token_acc": 0.9387063119457486, "train_speed(iter/s)": 0.239919 }, { "epoch": 0.31900297278756, "grad_norm": 0.8720833659172058, "learning_rate": 9.899899842802163e-05, "loss": 0.2850198745727539, "memory(GiB)": 122.96, "step": 4185, "token_acc": 0.8942042318307267, "train_speed(iter/s)": 0.240003 }, { "epoch": 0.31938409939782, "grad_norm": 1.1678751707077026, "learning_rate": 9.899661315666568e-05, "loss": 0.1919572353363037, "memory(GiB)": 122.96, "step": 4190, "token_acc": 0.926303175554224, "train_speed(iter/s)": 0.240084 }, { "epoch": 0.3197652260080799, "grad_norm": 1.3113218545913696, "learning_rate": 9.899422507558136e-05, "loss": 0.24949705600738525, "memory(GiB)": 122.96, "step": 4195, "token_acc": 0.9082515868436237, "train_speed(iter/s)": 0.240125 }, { "epoch": 0.3201463526183398, "grad_norm": 0.7104855179786682, "learning_rate": 9.899183418490559e-05, "loss": 0.17391358613967894, "memory(GiB)": 122.96, "step": 4200, "token_acc": 0.9365617433414044, "train_speed(iter/s)": 0.240128 }, { "epoch": 0.3201463526183398, "eval_loss": 0.15915754437446594, "eval_runtime": 171.6959, "eval_samples_per_second": 3.087, "eval_steps_per_second": 3.087, "eval_token_acc": 0.9213149810252395, "step": 4200 }, { "epoch": 0.32052747922859975, "grad_norm": 0.8591658473014832, "learning_rate": 9.898944048477546e-05, "loss": 0.2302011251449585, "memory(GiB)": 122.96, "step": 4205, "token_acc": 0.920938530493467, "train_speed(iter/s)": 0.237883 }, { "epoch": 0.32090860583885966, "grad_norm": 0.9804273843765259, "learning_rate": 9.898704397532827e-05, "loss": 0.20005967617034912, "memory(GiB)": 122.96, "step": 4210, "token_acc": 0.9204168640454761, "train_speed(iter/s)": 0.237981 }, { "epoch": 0.3212897324491196, "grad_norm": 1.5433894395828247, "learning_rate": 9.898464465670143e-05, "loss": 0.3182472467422485, "memory(GiB)": 122.96, "step": 4215, "token_acc": 0.882266833960686, "train_speed(iter/s)": 0.238061 }, { "epoch": 0.32167085905937953, "grad_norm": 4.612955570220947, "learning_rate": 9.898224252903254e-05, "loss": 0.2517979145050049, "memory(GiB)": 122.96, "step": 4220, "token_acc": 0.8992958899623383, "train_speed(iter/s)": 0.238113 }, { "epoch": 0.32205198566963944, "grad_norm": 1.3339136838912964, "learning_rate": 9.897983759245934e-05, "loss": 0.22427866458892823, "memory(GiB)": 122.96, "step": 4225, "token_acc": 0.9130039750141965, "train_speed(iter/s)": 0.238155 }, { "epoch": 0.3224331122798994, "grad_norm": 0.9820786118507385, "learning_rate": 9.897742984711976e-05, "loss": 0.22492904663085939, "memory(GiB)": 122.96, "step": 4230, "token_acc": 0.9160021265284424, "train_speed(iter/s)": 0.238238 }, { "epoch": 0.3228142388901593, "grad_norm": 0.8684553503990173, "learning_rate": 9.897501929315185e-05, "loss": 0.1517077922821045, "memory(GiB)": 122.96, "step": 4235, "token_acc": 0.9448187359640681, "train_speed(iter/s)": 0.238319 }, { "epoch": 0.3231953655004192, "grad_norm": 1.1157253980636597, "learning_rate": 9.897260593069384e-05, "loss": 0.23897197246551513, "memory(GiB)": 122.96, "step": 4240, "token_acc": 0.9075734927752865, "train_speed(iter/s)": 0.238398 }, { "epoch": 0.3235764921106792, "grad_norm": 0.6829708218574524, "learning_rate": 9.897018975988417e-05, "loss": 0.21777374744415284, "memory(GiB)": 122.96, "step": 4245, "token_acc": 0.9067015063018752, "train_speed(iter/s)": 0.238445 }, { "epoch": 0.3239576187209391, "grad_norm": 0.875093400478363, "learning_rate": 9.896777078086135e-05, "loss": 0.22612643241882324, "memory(GiB)": 122.96, "step": 4250, "token_acc": 0.9192169837331128, "train_speed(iter/s)": 0.238529 }, { "epoch": 0.32433874533119905, "grad_norm": 0.7694298028945923, "learning_rate": 9.896534899376413e-05, "loss": 0.21973307132720948, "memory(GiB)": 122.96, "step": 4255, "token_acc": 0.9201414468837483, "train_speed(iter/s)": 0.238564 }, { "epoch": 0.32471987194145896, "grad_norm": 1.1033087968826294, "learning_rate": 9.896292439873135e-05, "loss": 0.19122434854507447, "memory(GiB)": 122.96, "step": 4260, "token_acc": 0.9279538904899135, "train_speed(iter/s)": 0.23866 }, { "epoch": 0.32510099855171887, "grad_norm": 0.736356794834137, "learning_rate": 9.89604969959021e-05, "loss": 0.19516137838363648, "memory(GiB)": 122.96, "step": 4265, "token_acc": 0.9250207813798836, "train_speed(iter/s)": 0.238707 }, { "epoch": 0.32548212516197883, "grad_norm": 1.6150696277618408, "learning_rate": 9.895806678541553e-05, "loss": 0.20950119495391845, "memory(GiB)": 122.96, "step": 4270, "token_acc": 0.930245084837059, "train_speed(iter/s)": 0.238739 }, { "epoch": 0.32586325177223874, "grad_norm": 1.345765471458435, "learning_rate": 9.895563376741103e-05, "loss": 0.17677361965179444, "memory(GiB)": 122.96, "step": 4275, "token_acc": 0.9255093378607809, "train_speed(iter/s)": 0.238802 }, { "epoch": 0.32624437838249865, "grad_norm": 1.126879334449768, "learning_rate": 9.895319794202811e-05, "loss": 0.19305839538574218, "memory(GiB)": 122.96, "step": 4280, "token_acc": 0.9002966005019393, "train_speed(iter/s)": 0.23888 }, { "epoch": 0.3266255049927586, "grad_norm": 1.7144269943237305, "learning_rate": 9.895075930940647e-05, "loss": 0.2019503593444824, "memory(GiB)": 122.96, "step": 4285, "token_acc": 0.9059615384615385, "train_speed(iter/s)": 0.238955 }, { "epoch": 0.3270066316030185, "grad_norm": 0.8250619769096375, "learning_rate": 9.894831786968592e-05, "loss": 0.25740005970001223, "memory(GiB)": 122.96, "step": 4290, "token_acc": 0.9076077318673523, "train_speed(iter/s)": 0.239011 }, { "epoch": 0.3273877582132784, "grad_norm": 1.1889468431472778, "learning_rate": 9.894587362300652e-05, "loss": 0.21496527194976806, "memory(GiB)": 122.96, "step": 4295, "token_acc": 0.908001546192501, "train_speed(iter/s)": 0.239101 }, { "epoch": 0.3277688848235384, "grad_norm": 1.0302143096923828, "learning_rate": 9.894342656950839e-05, "loss": 0.22259387969970704, "memory(GiB)": 122.96, "step": 4300, "token_acc": 0.9224635222259925, "train_speed(iter/s)": 0.239151 }, { "epoch": 0.3281500114337983, "grad_norm": 1.4209058284759521, "learning_rate": 9.894097670933186e-05, "loss": 0.19236416816711427, "memory(GiB)": 122.96, "step": 4305, "token_acc": 0.9012096774193549, "train_speed(iter/s)": 0.239238 }, { "epoch": 0.32853113804405826, "grad_norm": 0.8729272484779358, "learning_rate": 9.893852404261743e-05, "loss": 0.2333930492401123, "memory(GiB)": 122.96, "step": 4310, "token_acc": 0.9048792508624939, "train_speed(iter/s)": 0.239328 }, { "epoch": 0.32891226465431816, "grad_norm": 0.7527387738227844, "learning_rate": 9.893606856950578e-05, "loss": 0.1638605237007141, "memory(GiB)": 122.96, "step": 4315, "token_acc": 0.9310784036229833, "train_speed(iter/s)": 0.239356 }, { "epoch": 0.3292933912645781, "grad_norm": 0.8294212818145752, "learning_rate": 9.893361029013766e-05, "loss": 0.1715386390686035, "memory(GiB)": 122.96, "step": 4320, "token_acc": 0.9263589743589744, "train_speed(iter/s)": 0.239419 }, { "epoch": 0.32967451787483804, "grad_norm": 0.8740025162696838, "learning_rate": 9.893114920465408e-05, "loss": 0.20924828052520753, "memory(GiB)": 122.96, "step": 4325, "token_acc": 0.909813407049067, "train_speed(iter/s)": 0.239509 }, { "epoch": 0.33005564448509794, "grad_norm": 0.07779546082019806, "learning_rate": 9.892868531319615e-05, "loss": 0.19981666803359985, "memory(GiB)": 122.96, "step": 4330, "token_acc": 0.9161918328584995, "train_speed(iter/s)": 0.239585 }, { "epoch": 0.33043677109535785, "grad_norm": 0.7961308360099792, "learning_rate": 9.892621861590517e-05, "loss": 0.1883603811264038, "memory(GiB)": 122.96, "step": 4335, "token_acc": 0.907776560788609, "train_speed(iter/s)": 0.239657 }, { "epoch": 0.3308178977056178, "grad_norm": 1.0699453353881836, "learning_rate": 9.892374911292261e-05, "loss": 0.16226965188980103, "memory(GiB)": 122.96, "step": 4340, "token_acc": 0.9333172263831844, "train_speed(iter/s)": 0.239729 }, { "epoch": 0.3311990243158777, "grad_norm": 1.3683314323425293, "learning_rate": 9.892127680439008e-05, "loss": 0.13513892889022827, "memory(GiB)": 122.96, "step": 4345, "token_acc": 0.9352839931153184, "train_speed(iter/s)": 0.239821 }, { "epoch": 0.3315801509261377, "grad_norm": 0.7663138508796692, "learning_rate": 9.891880169044934e-05, "loss": 0.2122182607650757, "memory(GiB)": 122.96, "step": 4350, "token_acc": 0.9037089871611983, "train_speed(iter/s)": 0.239896 }, { "epoch": 0.3319612775363976, "grad_norm": 1.2376339435577393, "learning_rate": 9.891632377124232e-05, "loss": 0.24187083244323732, "memory(GiB)": 122.96, "step": 4355, "token_acc": 0.9202274573517466, "train_speed(iter/s)": 0.239967 }, { "epoch": 0.3323424041466575, "grad_norm": 0.6914349794387817, "learning_rate": 9.891384304691115e-05, "loss": 0.18405020236968994, "memory(GiB)": 122.96, "step": 4360, "token_acc": 0.9383336555190411, "train_speed(iter/s)": 0.240036 }, { "epoch": 0.33272353075691746, "grad_norm": 1.0132614374160767, "learning_rate": 9.891135951759807e-05, "loss": 0.23341946601867675, "memory(GiB)": 122.96, "step": 4365, "token_acc": 0.9023277042446372, "train_speed(iter/s)": 0.240131 }, { "epoch": 0.33310465736717737, "grad_norm": 0.8744183778762817, "learning_rate": 9.890887318344548e-05, "loss": 0.2776163578033447, "memory(GiB)": 122.96, "step": 4370, "token_acc": 0.8745556119857796, "train_speed(iter/s)": 0.240221 }, { "epoch": 0.3334857839774373, "grad_norm": 0.8210816979408264, "learning_rate": 9.890638404459599e-05, "loss": 0.20449295043945312, "memory(GiB)": 122.96, "step": 4375, "token_acc": 0.9200333889816361, "train_speed(iter/s)": 0.24026 }, { "epoch": 0.33386691058769724, "grad_norm": 0.9798177480697632, "learning_rate": 9.890389210119233e-05, "loss": 0.19327890872955322, "memory(GiB)": 122.96, "step": 4380, "token_acc": 0.9149623250807319, "train_speed(iter/s)": 0.240345 }, { "epoch": 0.33424803719795715, "grad_norm": 11.674511909484863, "learning_rate": 9.89013973533774e-05, "loss": 0.24963245391845704, "memory(GiB)": 122.96, "step": 4385, "token_acc": 0.8872536136662287, "train_speed(iter/s)": 0.240429 }, { "epoch": 0.3346291638082171, "grad_norm": 1.0615803003311157, "learning_rate": 9.889889980129425e-05, "loss": 0.16933200359344483, "memory(GiB)": 122.96, "step": 4390, "token_acc": 0.9112554112554112, "train_speed(iter/s)": 0.240514 }, { "epoch": 0.335010290418477, "grad_norm": 1.074690580368042, "learning_rate": 9.889639944508614e-05, "loss": 0.23872504234313965, "memory(GiB)": 122.96, "step": 4395, "token_acc": 0.9106824925816024, "train_speed(iter/s)": 0.240552 }, { "epoch": 0.3353914170287369, "grad_norm": 0.880744218826294, "learning_rate": 9.88938962848964e-05, "loss": 0.20713512897491454, "memory(GiB)": 122.96, "step": 4400, "token_acc": 0.9092420212765957, "train_speed(iter/s)": 0.240626 }, { "epoch": 0.3353914170287369, "eval_loss": 0.15811727941036224, "eval_runtime": 173.9477, "eval_samples_per_second": 3.047, "eval_steps_per_second": 3.047, "eval_token_acc": 0.9216613457020661, "step": 4400 }, { "epoch": 0.3357725436389969, "grad_norm": 0.9664312601089478, "learning_rate": 9.889139032086863e-05, "loss": 0.24947676658630372, "memory(GiB)": 122.96, "step": 4405, "token_acc": 0.9212314639918365, "train_speed(iter/s)": 0.238438 }, { "epoch": 0.3361536702492568, "grad_norm": 1.9522829055786133, "learning_rate": 9.888888155314649e-05, "loss": 0.2606205463409424, "memory(GiB)": 122.96, "step": 4410, "token_acc": 0.9059621067269251, "train_speed(iter/s)": 0.238502 }, { "epoch": 0.3365347968595167, "grad_norm": 0.905967116355896, "learning_rate": 9.888636998187386e-05, "loss": 0.1794285297393799, "memory(GiB)": 122.96, "step": 4415, "token_acc": 0.9269461077844311, "train_speed(iter/s)": 0.238543 }, { "epoch": 0.33691592346977667, "grad_norm": 1.0438493490219116, "learning_rate": 9.888385560719479e-05, "loss": 0.18638668060302735, "memory(GiB)": 122.96, "step": 4420, "token_acc": 0.9282684630738522, "train_speed(iter/s)": 0.238552 }, { "epoch": 0.3372970500800366, "grad_norm": 1.1250914335250854, "learning_rate": 9.888133842925344e-05, "loss": 0.23789846897125244, "memory(GiB)": 122.96, "step": 4425, "token_acc": 0.9151047409040793, "train_speed(iter/s)": 0.238607 }, { "epoch": 0.33767817669029654, "grad_norm": 0.7370328307151794, "learning_rate": 9.887881844819417e-05, "loss": 0.2605221509933472, "memory(GiB)": 122.96, "step": 4430, "token_acc": 0.8993319511633264, "train_speed(iter/s)": 0.238674 }, { "epoch": 0.33805930330055645, "grad_norm": 0.794122040271759, "learning_rate": 9.88762956641615e-05, "loss": 0.26715242862701416, "memory(GiB)": 122.96, "step": 4435, "token_acc": 0.884843790343403, "train_speed(iter/s)": 0.238754 }, { "epoch": 0.33844042991081635, "grad_norm": 1.5435172319412231, "learning_rate": 9.887377007730006e-05, "loss": 0.22917850017547609, "memory(GiB)": 122.96, "step": 4440, "token_acc": 0.929726909027657, "train_speed(iter/s)": 0.238794 }, { "epoch": 0.3388215565210763, "grad_norm": 1.0698354244232178, "learning_rate": 9.887124168775473e-05, "loss": 0.24095613956451417, "memory(GiB)": 122.96, "step": 4445, "token_acc": 0.9094147582697201, "train_speed(iter/s)": 0.238858 }, { "epoch": 0.3392026831313362, "grad_norm": 3.2474963665008545, "learning_rate": 9.886871049567047e-05, "loss": 0.19846370220184326, "memory(GiB)": 122.96, "step": 4450, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.238954 }, { "epoch": 0.33958380974159613, "grad_norm": 0.889411985874176, "learning_rate": 9.886617650119246e-05, "loss": 0.2050023555755615, "memory(GiB)": 122.96, "step": 4455, "token_acc": 0.9200355397601067, "train_speed(iter/s)": 0.238975 }, { "epoch": 0.3399649363518561, "grad_norm": 0.8893768787384033, "learning_rate": 9.886363970446597e-05, "loss": 0.14530785083770753, "memory(GiB)": 122.96, "step": 4460, "token_acc": 0.9396281091523786, "train_speed(iter/s)": 0.239048 }, { "epoch": 0.340346062962116, "grad_norm": 1.1720106601715088, "learning_rate": 9.88611001056365e-05, "loss": 0.20404436588287353, "memory(GiB)": 122.96, "step": 4465, "token_acc": 0.9132143937103114, "train_speed(iter/s)": 0.239128 }, { "epoch": 0.34072718957237597, "grad_norm": 1.176992654800415, "learning_rate": 9.885855770484969e-05, "loss": 0.26153614521026614, "memory(GiB)": 122.96, "step": 4470, "token_acc": 0.9004935519821684, "train_speed(iter/s)": 0.239174 }, { "epoch": 0.3411083161826359, "grad_norm": 0.797675371170044, "learning_rate": 9.885601250225133e-05, "loss": 0.2723444700241089, "memory(GiB)": 122.96, "step": 4475, "token_acc": 0.9017990145525381, "train_speed(iter/s)": 0.23919 }, { "epoch": 0.3414894427928958, "grad_norm": 1.2383085489273071, "learning_rate": 9.885346449798735e-05, "loss": 0.18525922298431396, "memory(GiB)": 122.96, "step": 4480, "token_acc": 0.9296897238827213, "train_speed(iter/s)": 0.23927 }, { "epoch": 0.34187056940315574, "grad_norm": 1.2479459047317505, "learning_rate": 9.885091369220392e-05, "loss": 0.16029869318008422, "memory(GiB)": 122.96, "step": 4485, "token_acc": 0.9331254331254332, "train_speed(iter/s)": 0.239362 }, { "epoch": 0.34225169601341565, "grad_norm": 0.9152981638908386, "learning_rate": 9.884836008504727e-05, "loss": 0.1721025824546814, "memory(GiB)": 122.96, "step": 4490, "token_acc": 0.9274863857838922, "train_speed(iter/s)": 0.239439 }, { "epoch": 0.34263282262367556, "grad_norm": 1.1927741765975952, "learning_rate": 9.884580367666387e-05, "loss": 0.21854076385498047, "memory(GiB)": 122.96, "step": 4495, "token_acc": 0.9096690273160861, "train_speed(iter/s)": 0.239472 }, { "epoch": 0.3430139492339355, "grad_norm": 1.3850657939910889, "learning_rate": 9.884324446720028e-05, "loss": 0.18531676530838012, "memory(GiB)": 122.96, "step": 4500, "token_acc": 0.9246798350336445, "train_speed(iter/s)": 0.239523 }, { "epoch": 0.34339507584419543, "grad_norm": 0.8446358442306519, "learning_rate": 9.884068245680329e-05, "loss": 0.24971094131469726, "memory(GiB)": 122.96, "step": 4505, "token_acc": 0.9175972927241963, "train_speed(iter/s)": 0.239555 }, { "epoch": 0.3437762024544554, "grad_norm": 1.349091649055481, "learning_rate": 9.883811764561981e-05, "loss": 0.2597987413406372, "memory(GiB)": 122.96, "step": 4510, "token_acc": 0.9047956867196367, "train_speed(iter/s)": 0.239599 }, { "epoch": 0.3441573290647153, "grad_norm": 1.3688730001449585, "learning_rate": 9.883555003379691e-05, "loss": 0.19508850574493408, "memory(GiB)": 122.96, "step": 4515, "token_acc": 0.9025010597710894, "train_speed(iter/s)": 0.239671 }, { "epoch": 0.3445384556749752, "grad_norm": 0.7208614945411682, "learning_rate": 9.883297962148185e-05, "loss": 0.24026741981506347, "memory(GiB)": 122.96, "step": 4520, "token_acc": 0.904951709336195, "train_speed(iter/s)": 0.239726 }, { "epoch": 0.34491958228523517, "grad_norm": 0.9864040017127991, "learning_rate": 9.883040640882202e-05, "loss": 0.1955849289894104, "memory(GiB)": 122.96, "step": 4525, "token_acc": 0.9165900735294118, "train_speed(iter/s)": 0.239806 }, { "epoch": 0.3453007088954951, "grad_norm": 1.0644468069076538, "learning_rate": 9.882783039596497e-05, "loss": 0.2046663999557495, "memory(GiB)": 122.96, "step": 4530, "token_acc": 0.9186751441667899, "train_speed(iter/s)": 0.23986 }, { "epoch": 0.345681835505755, "grad_norm": 2.1824541091918945, "learning_rate": 9.882525158305845e-05, "loss": 0.25109865665435793, "memory(GiB)": 122.96, "step": 4535, "token_acc": 0.9078088823405599, "train_speed(iter/s)": 0.239924 }, { "epoch": 0.34606296211601495, "grad_norm": 0.762199342250824, "learning_rate": 9.882266997025034e-05, "loss": 0.2180487871170044, "memory(GiB)": 122.96, "step": 4540, "token_acc": 0.9176755447941889, "train_speed(iter/s)": 0.239992 }, { "epoch": 0.34644408872627486, "grad_norm": 0.21728816628456116, "learning_rate": 9.882008555768865e-05, "loss": 0.1035007357597351, "memory(GiB)": 122.96, "step": 4545, "token_acc": 0.9476744186046512, "train_speed(iter/s)": 0.240099 }, { "epoch": 0.3468252153365348, "grad_norm": 1.0724139213562012, "learning_rate": 9.881749834552161e-05, "loss": 0.128145968914032, "memory(GiB)": 122.96, "step": 4550, "token_acc": 0.9377940407736539, "train_speed(iter/s)": 0.240169 }, { "epoch": 0.3472063419467947, "grad_norm": 0.7069359421730042, "learning_rate": 9.881490833389759e-05, "loss": 0.1804369330406189, "memory(GiB)": 122.96, "step": 4555, "token_acc": 0.9181404749873674, "train_speed(iter/s)": 0.240261 }, { "epoch": 0.34758746855705464, "grad_norm": 0.948927104473114, "learning_rate": 9.88123155229651e-05, "loss": 0.18763837814331055, "memory(GiB)": 122.96, "step": 4560, "token_acc": 0.9078757225433526, "train_speed(iter/s)": 0.240355 }, { "epoch": 0.3479685951673146, "grad_norm": 0.6958962678909302, "learning_rate": 9.880971991287283e-05, "loss": 0.2074981927871704, "memory(GiB)": 122.96, "step": 4565, "token_acc": 0.9213695395513577, "train_speed(iter/s)": 0.240433 }, { "epoch": 0.3483497217775745, "grad_norm": 0.6374287605285645, "learning_rate": 9.880712150376963e-05, "loss": 0.1625828504562378, "memory(GiB)": 122.96, "step": 4570, "token_acc": 0.9349801910976462, "train_speed(iter/s)": 0.240499 }, { "epoch": 0.3487308483878344, "grad_norm": 0.7840526700019836, "learning_rate": 9.88045202958045e-05, "loss": 0.16879878044128419, "memory(GiB)": 122.96, "step": 4575, "token_acc": 0.9282359571670111, "train_speed(iter/s)": 0.240576 }, { "epoch": 0.3491119749980944, "grad_norm": 2.3380706310272217, "learning_rate": 9.880191628912662e-05, "loss": 0.1550325036048889, "memory(GiB)": 122.96, "step": 4580, "token_acc": 0.9312941176470588, "train_speed(iter/s)": 0.240674 }, { "epoch": 0.3494931016083543, "grad_norm": 0.8310628533363342, "learning_rate": 9.879930948388531e-05, "loss": 0.16306592226028443, "memory(GiB)": 122.96, "step": 4585, "token_acc": 0.9245426829268293, "train_speed(iter/s)": 0.240767 }, { "epoch": 0.34987422821861425, "grad_norm": 0.866337776184082, "learning_rate": 9.879669988023004e-05, "loss": 0.1873743176460266, "memory(GiB)": 122.96, "step": 4590, "token_acc": 0.9067193675889328, "train_speed(iter/s)": 0.240855 }, { "epoch": 0.35025535482887415, "grad_norm": 0.8613419532775879, "learning_rate": 9.879408747831049e-05, "loss": 0.23075032234191895, "memory(GiB)": 122.96, "step": 4595, "token_acc": 0.9196269982238011, "train_speed(iter/s)": 0.24089 }, { "epoch": 0.35063648143913406, "grad_norm": 1.0420233011245728, "learning_rate": 9.879147227827645e-05, "loss": 0.16877565383911133, "memory(GiB)": 122.96, "step": 4600, "token_acc": 0.8987108655616943, "train_speed(iter/s)": 0.240985 }, { "epoch": 0.35063648143913406, "eval_loss": 0.15667736530303955, "eval_runtime": 176.1338, "eval_samples_per_second": 3.009, "eval_steps_per_second": 3.009, "eval_token_acc": 0.9220077103788928, "step": 4600 }, { "epoch": 0.351017608049394, "grad_norm": 1.1197593212127686, "learning_rate": 9.878885428027791e-05, "loss": 0.22312352657318116, "memory(GiB)": 122.96, "step": 4605, "token_acc": 0.9216227344636033, "train_speed(iter/s)": 0.238835 }, { "epoch": 0.35139873465965393, "grad_norm": 1.0067657232284546, "learning_rate": 9.878623348446497e-05, "loss": 0.179567551612854, "memory(GiB)": 122.96, "step": 4610, "token_acc": 0.9369076631464933, "train_speed(iter/s)": 0.238887 }, { "epoch": 0.35177986126991384, "grad_norm": 1.0698491334915161, "learning_rate": 9.878360989098794e-05, "loss": 0.2074800491333008, "memory(GiB)": 122.96, "step": 4615, "token_acc": 0.9068599033816425, "train_speed(iter/s)": 0.238955 }, { "epoch": 0.3521609878801738, "grad_norm": 0.949504017829895, "learning_rate": 9.878098349999728e-05, "loss": 0.17751567363739013, "memory(GiB)": 122.96, "step": 4620, "token_acc": 0.9202069716775599, "train_speed(iter/s)": 0.239029 }, { "epoch": 0.3525421144904337, "grad_norm": 1.0140291452407837, "learning_rate": 9.877835431164358e-05, "loss": 0.19807982444763184, "memory(GiB)": 122.96, "step": 4625, "token_acc": 0.9290404575301843, "train_speed(iter/s)": 0.23908 }, { "epoch": 0.3529232411006937, "grad_norm": 1.6643022298812866, "learning_rate": 9.877572232607759e-05, "loss": 0.24102630615234374, "memory(GiB)": 122.96, "step": 4630, "token_acc": 0.9286092949284326, "train_speed(iter/s)": 0.239117 }, { "epoch": 0.3533043677109536, "grad_norm": 0.9363781809806824, "learning_rate": 9.877308754345031e-05, "loss": 0.23084421157836915, "memory(GiB)": 122.96, "step": 4635, "token_acc": 0.9161813117244872, "train_speed(iter/s)": 0.239192 }, { "epoch": 0.3536854943212135, "grad_norm": 1.3742951154708862, "learning_rate": 9.877044996391279e-05, "loss": 0.2624385833740234, "memory(GiB)": 122.96, "step": 4640, "token_acc": 0.9072710103871577, "train_speed(iter/s)": 0.239248 }, { "epoch": 0.35406662093147345, "grad_norm": 0.961320161819458, "learning_rate": 9.876780958761626e-05, "loss": 0.21875174045562745, "memory(GiB)": 122.96, "step": 4645, "token_acc": 0.928646105593309, "train_speed(iter/s)": 0.239328 }, { "epoch": 0.35444774754173336, "grad_norm": 1.5214426517486572, "learning_rate": 9.876516641471219e-05, "loss": 0.2157068967819214, "memory(GiB)": 122.96, "step": 4650, "token_acc": 0.9257488398256223, "train_speed(iter/s)": 0.239353 }, { "epoch": 0.35482887415199327, "grad_norm": 0.6670135855674744, "learning_rate": 9.87625204453521e-05, "loss": 0.22569336891174316, "memory(GiB)": 122.96, "step": 4655, "token_acc": 0.9131996037642397, "train_speed(iter/s)": 0.239393 }, { "epoch": 0.35521000076225323, "grad_norm": 0.799538791179657, "learning_rate": 9.875987167968775e-05, "loss": 0.19718537330627442, "memory(GiB)": 122.96, "step": 4660, "token_acc": 0.9202188940092166, "train_speed(iter/s)": 0.239418 }, { "epoch": 0.35559112737251314, "grad_norm": 0.9834874272346497, "learning_rate": 9.875722011787105e-05, "loss": 0.1284249544143677, "memory(GiB)": 122.96, "step": 4665, "token_acc": 0.9438490214352283, "train_speed(iter/s)": 0.239486 }, { "epoch": 0.3559722539827731, "grad_norm": 0.7410391569137573, "learning_rate": 9.875456576005402e-05, "loss": 0.16127818822860718, "memory(GiB)": 122.96, "step": 4670, "token_acc": 0.9308323563892146, "train_speed(iter/s)": 0.239548 }, { "epoch": 0.356353380593033, "grad_norm": 0.7877100110054016, "learning_rate": 9.875190860638892e-05, "loss": 0.13834574222564697, "memory(GiB)": 122.96, "step": 4675, "token_acc": 0.943090787716956, "train_speed(iter/s)": 0.2396 }, { "epoch": 0.3567345072032929, "grad_norm": 0.9883182048797607, "learning_rate": 9.874924865702807e-05, "loss": 0.1840498685836792, "memory(GiB)": 122.96, "step": 4680, "token_acc": 0.9261939218523878, "train_speed(iter/s)": 0.239636 }, { "epoch": 0.3571156338135529, "grad_norm": 0.9080489873886108, "learning_rate": 9.874658591212404e-05, "loss": 0.18100385665893554, "memory(GiB)": 122.96, "step": 4685, "token_acc": 0.9261433113230618, "train_speed(iter/s)": 0.239698 }, { "epoch": 0.3574967604238128, "grad_norm": 1.140531301498413, "learning_rate": 9.874392037182953e-05, "loss": 0.26099352836608886, "memory(GiB)": 122.96, "step": 4690, "token_acc": 0.8976491862567811, "train_speed(iter/s)": 0.239744 }, { "epoch": 0.3578778870340727, "grad_norm": 0.7949675917625427, "learning_rate": 9.874125203629737e-05, "loss": 0.21333322525024415, "memory(GiB)": 122.96, "step": 4695, "token_acc": 0.9264923532313765, "train_speed(iter/s)": 0.239798 }, { "epoch": 0.35825901364433266, "grad_norm": 1.2855316400527954, "learning_rate": 9.87385809056806e-05, "loss": 0.2975268840789795, "memory(GiB)": 122.96, "step": 4700, "token_acc": 0.9006211180124224, "train_speed(iter/s)": 0.239865 }, { "epoch": 0.35864014025459257, "grad_norm": 0.6822090744972229, "learning_rate": 9.873590698013239e-05, "loss": 0.1567288041114807, "memory(GiB)": 122.96, "step": 4705, "token_acc": 0.9332677165354331, "train_speed(iter/s)": 0.239918 }, { "epoch": 0.35902126686485253, "grad_norm": 1.307555079460144, "learning_rate": 9.873323025980609e-05, "loss": 0.2116835117340088, "memory(GiB)": 122.96, "step": 4710, "token_acc": 0.9261983572391811, "train_speed(iter/s)": 0.239942 }, { "epoch": 0.35940239347511244, "grad_norm": 1.60109281539917, "learning_rate": 9.873055074485517e-05, "loss": 0.1808045744895935, "memory(GiB)": 122.96, "step": 4715, "token_acc": 0.9267690619857378, "train_speed(iter/s)": 0.240004 }, { "epoch": 0.35978352008537234, "grad_norm": 0.9124451279640198, "learning_rate": 9.87278684354333e-05, "loss": 0.13485660552978515, "memory(GiB)": 122.96, "step": 4720, "token_acc": 0.941814334832055, "train_speed(iter/s)": 0.240082 }, { "epoch": 0.3601646466956323, "grad_norm": 1.0666978359222412, "learning_rate": 9.872518333169431e-05, "loss": 0.2680462598800659, "memory(GiB)": 122.96, "step": 4725, "token_acc": 0.8968119513322039, "train_speed(iter/s)": 0.240129 }, { "epoch": 0.3605457733058922, "grad_norm": 0.534623384475708, "learning_rate": 9.872249543379216e-05, "loss": 0.0836132287979126, "memory(GiB)": 122.96, "step": 4730, "token_acc": 0.9593716143011918, "train_speed(iter/s)": 0.24023 }, { "epoch": 0.3609268999161521, "grad_norm": 1.2304494380950928, "learning_rate": 9.8719804741881e-05, "loss": 0.16108639240264894, "memory(GiB)": 122.96, "step": 4735, "token_acc": 0.9252247988641742, "train_speed(iter/s)": 0.240264 }, { "epoch": 0.3613080265264121, "grad_norm": 1.151193380355835, "learning_rate": 9.871711125611513e-05, "loss": 0.15151898860931395, "memory(GiB)": 122.96, "step": 4740, "token_acc": 0.9314755596162632, "train_speed(iter/s)": 0.240323 }, { "epoch": 0.361689153136672, "grad_norm": 0.6995449662208557, "learning_rate": 9.8714414976649e-05, "loss": 0.21075305938720704, "memory(GiB)": 122.96, "step": 4745, "token_acc": 0.9283538186784326, "train_speed(iter/s)": 0.240336 }, { "epoch": 0.36207027974693196, "grad_norm": 0.6059656739234924, "learning_rate": 9.871171590363723e-05, "loss": 0.196136212348938, "memory(GiB)": 122.96, "step": 4750, "token_acc": 0.9203474403991868, "train_speed(iter/s)": 0.240392 }, { "epoch": 0.36245140635719186, "grad_norm": 1.7953662872314453, "learning_rate": 9.87090140372346e-05, "loss": 0.16549742221832275, "memory(GiB)": 122.96, "step": 4755, "token_acc": 0.9391695318698757, "train_speed(iter/s)": 0.240452 }, { "epoch": 0.36283253296745177, "grad_norm": 0.9908974170684814, "learning_rate": 9.870630937759606e-05, "loss": 0.23433010578155516, "memory(GiB)": 122.96, "step": 4760, "token_acc": 0.9260723463135007, "train_speed(iter/s)": 0.240514 }, { "epoch": 0.36321365957771173, "grad_norm": 0.9677805304527283, "learning_rate": 9.870360192487672e-05, "loss": 0.19182584285736085, "memory(GiB)": 122.96, "step": 4765, "token_acc": 0.9332323996971991, "train_speed(iter/s)": 0.240547 }, { "epoch": 0.36359478618797164, "grad_norm": 1.4979366064071655, "learning_rate": 9.87008916792318e-05, "loss": 0.20310027599334718, "memory(GiB)": 122.96, "step": 4770, "token_acc": 0.9084945883896359, "train_speed(iter/s)": 0.240633 }, { "epoch": 0.36397591279823155, "grad_norm": 1.0055533647537231, "learning_rate": 9.869817864081673e-05, "loss": 0.18223538398742675, "memory(GiB)": 122.96, "step": 4775, "token_acc": 0.9310900016100467, "train_speed(iter/s)": 0.240672 }, { "epoch": 0.3643570394084915, "grad_norm": 0.8194828033447266, "learning_rate": 9.869546280978712e-05, "loss": 0.1447862982749939, "memory(GiB)": 122.96, "step": 4780, "token_acc": 0.9344447584063295, "train_speed(iter/s)": 0.240742 }, { "epoch": 0.3647381660187514, "grad_norm": 0.8647611737251282, "learning_rate": 9.86927441862987e-05, "loss": 0.20292248725891113, "memory(GiB)": 122.96, "step": 4785, "token_acc": 0.9329323856387826, "train_speed(iter/s)": 0.240794 }, { "epoch": 0.3651192926290114, "grad_norm": 0.6766430735588074, "learning_rate": 9.869002277050734e-05, "loss": 0.205535888671875, "memory(GiB)": 122.96, "step": 4790, "token_acc": 0.9311546840958606, "train_speed(iter/s)": 0.240836 }, { "epoch": 0.3655004192392713, "grad_norm": 0.8801077604293823, "learning_rate": 9.868729856256914e-05, "loss": 0.20031869411468506, "memory(GiB)": 122.96, "step": 4795, "token_acc": 0.9225978647686833, "train_speed(iter/s)": 0.240872 }, { "epoch": 0.3658815458495312, "grad_norm": 0.8264265656471252, "learning_rate": 9.868457156264031e-05, "loss": 0.1959167718887329, "memory(GiB)": 122.96, "step": 4800, "token_acc": 0.9278169014084507, "train_speed(iter/s)": 0.240914 }, { "epoch": 0.3658815458495312, "eval_loss": 0.15245549380779266, "eval_runtime": 175.0853, "eval_samples_per_second": 3.027, "eval_steps_per_second": 3.027, "eval_token_acc": 0.924394614782242, "step": 4800 }, { "epoch": 0.36626267245979116, "grad_norm": 0.8004920482635498, "learning_rate": 9.86818417708772e-05, "loss": 0.1542346715927124, "memory(GiB)": 122.96, "step": 4805, "token_acc": 0.9247288408764212, "train_speed(iter/s)": 0.238873 }, { "epoch": 0.36664379907005107, "grad_norm": 1.1550219058990479, "learning_rate": 9.867910918743639e-05, "loss": 0.14435558319091796, "memory(GiB)": 122.96, "step": 4810, "token_acc": 0.9341142020497804, "train_speed(iter/s)": 0.238921 }, { "epoch": 0.367024925680311, "grad_norm": 0.44425806403160095, "learning_rate": 9.867637381247457e-05, "loss": 0.1131820797920227, "memory(GiB)": 122.96, "step": 4815, "token_acc": 0.9479166666666666, "train_speed(iter/s)": 0.23899 }, { "epoch": 0.36740605229057094, "grad_norm": 1.2150404453277588, "learning_rate": 9.86736356461486e-05, "loss": 0.22952535152435302, "memory(GiB)": 122.96, "step": 4820, "token_acc": 0.913777455073516, "train_speed(iter/s)": 0.239038 }, { "epoch": 0.36778717890083085, "grad_norm": 0.8951210379600525, "learning_rate": 9.86708946886155e-05, "loss": 0.19099726676940917, "memory(GiB)": 122.96, "step": 4825, "token_acc": 0.9361970057366727, "train_speed(iter/s)": 0.239058 }, { "epoch": 0.3681683055110908, "grad_norm": 0.7850123643875122, "learning_rate": 9.866815094003243e-05, "loss": 0.14610296487808228, "memory(GiB)": 122.96, "step": 4830, "token_acc": 0.9388773388773389, "train_speed(iter/s)": 0.239149 }, { "epoch": 0.3685494321213507, "grad_norm": 0.8659340739250183, "learning_rate": 9.866540440055677e-05, "loss": 0.20278141498565674, "memory(GiB)": 122.96, "step": 4835, "token_acc": 0.9188505331646485, "train_speed(iter/s)": 0.239204 }, { "epoch": 0.3689305587316106, "grad_norm": 0.8424884080886841, "learning_rate": 9.8662655070346e-05, "loss": 0.19950945377349855, "memory(GiB)": 122.96, "step": 4840, "token_acc": 0.9230896460469732, "train_speed(iter/s)": 0.239243 }, { "epoch": 0.3693116853418706, "grad_norm": 1.6758841276168823, "learning_rate": 9.865990294955778e-05, "loss": 0.1673001766204834, "memory(GiB)": 122.96, "step": 4845, "token_acc": 0.9209849320102903, "train_speed(iter/s)": 0.239326 }, { "epoch": 0.3696928119521305, "grad_norm": 0.8280849456787109, "learning_rate": 9.865714803834994e-05, "loss": 0.20846846103668212, "memory(GiB)": 122.96, "step": 4850, "token_acc": 0.9346005883684091, "train_speed(iter/s)": 0.239386 }, { "epoch": 0.3700739385623904, "grad_norm": 0.8610410094261169, "learning_rate": 9.865439033688046e-05, "loss": 0.17613157033920288, "memory(GiB)": 122.96, "step": 4855, "token_acc": 0.9181010024588614, "train_speed(iter/s)": 0.239446 }, { "epoch": 0.37045506517265037, "grad_norm": 0.6586319208145142, "learning_rate": 9.865162984530748e-05, "loss": 0.16918354034423827, "memory(GiB)": 122.96, "step": 4860, "token_acc": 0.9284346577309718, "train_speed(iter/s)": 0.23951 }, { "epoch": 0.3708361917829103, "grad_norm": 0.9983252882957458, "learning_rate": 9.86488665637893e-05, "loss": 0.19900877475738527, "memory(GiB)": 122.96, "step": 4865, "token_acc": 0.9201865057597367, "train_speed(iter/s)": 0.239574 }, { "epoch": 0.37121731839317024, "grad_norm": 0.739035427570343, "learning_rate": 9.864610049248435e-05, "loss": 0.24607391357421876, "memory(GiB)": 122.96, "step": 4870, "token_acc": 0.9066073697585769, "train_speed(iter/s)": 0.239629 }, { "epoch": 0.37159844500343014, "grad_norm": 0.5734931826591492, "learning_rate": 9.86433316315513e-05, "loss": 0.22616519927978515, "memory(GiB)": 122.96, "step": 4875, "token_acc": 0.9158447684391081, "train_speed(iter/s)": 0.239653 }, { "epoch": 0.37197957161369005, "grad_norm": 0.7829099297523499, "learning_rate": 9.864055998114893e-05, "loss": 0.21596689224243165, "memory(GiB)": 122.96, "step": 4880, "token_acc": 0.90715667311412, "train_speed(iter/s)": 0.239729 }, { "epoch": 0.37236069822395, "grad_norm": 1.5658643245697021, "learning_rate": 9.863778554143615e-05, "loss": 0.21100916862487792, "memory(GiB)": 122.96, "step": 4885, "token_acc": 0.9205479452054794, "train_speed(iter/s)": 0.239788 }, { "epoch": 0.3727418248342099, "grad_norm": 0.8149398565292358, "learning_rate": 9.863500831257209e-05, "loss": 0.20950567722320557, "memory(GiB)": 122.96, "step": 4890, "token_acc": 0.9169837426496023, "train_speed(iter/s)": 0.239875 }, { "epoch": 0.37312295144446983, "grad_norm": 1.0802415609359741, "learning_rate": 9.863222829471599e-05, "loss": 0.19631320238113403, "memory(GiB)": 122.96, "step": 4895, "token_acc": 0.905577216477369, "train_speed(iter/s)": 0.23992 }, { "epoch": 0.3735040780547298, "grad_norm": 0.9780798554420471, "learning_rate": 9.862944548802727e-05, "loss": 0.19555288553237915, "memory(GiB)": 122.96, "step": 4900, "token_acc": 0.9283824015878267, "train_speed(iter/s)": 0.239951 }, { "epoch": 0.3738852046649897, "grad_norm": 1.3320714235305786, "learning_rate": 9.862665989266554e-05, "loss": 0.31761305332183837, "memory(GiB)": 122.96, "step": 4905, "token_acc": 0.8839078406858367, "train_speed(iter/s)": 0.240009 }, { "epoch": 0.37426633127524966, "grad_norm": 0.8094413876533508, "learning_rate": 9.862387150879049e-05, "loss": 0.23569085597991943, "memory(GiB)": 122.96, "step": 4910, "token_acc": 0.9163506580414901, "train_speed(iter/s)": 0.240061 }, { "epoch": 0.37464745788550957, "grad_norm": 0.7483161687850952, "learning_rate": 9.862108033656207e-05, "loss": 0.16934081315994262, "memory(GiB)": 122.96, "step": 4915, "token_acc": 0.934181240063593, "train_speed(iter/s)": 0.240111 }, { "epoch": 0.3750285844957695, "grad_norm": 1.1448999643325806, "learning_rate": 9.861828637614031e-05, "loss": 0.16175122261047364, "memory(GiB)": 122.96, "step": 4920, "token_acc": 0.9397898324339676, "train_speed(iter/s)": 0.240184 }, { "epoch": 0.37540971110602944, "grad_norm": 1.6378339529037476, "learning_rate": 9.861548962768546e-05, "loss": 0.21855635643005372, "memory(GiB)": 122.96, "step": 4925, "token_acc": 0.9170589553911679, "train_speed(iter/s)": 0.240249 }, { "epoch": 0.37579083771628935, "grad_norm": 1.7824002504348755, "learning_rate": 9.861269009135787e-05, "loss": 0.23356881141662597, "memory(GiB)": 122.96, "step": 4930, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 0.240294 }, { "epoch": 0.37617196432654926, "grad_norm": 0.23918123543262482, "learning_rate": 9.86098877673181e-05, "loss": 0.1736738920211792, "memory(GiB)": 122.96, "step": 4935, "token_acc": 0.9240387621131604, "train_speed(iter/s)": 0.240337 }, { "epoch": 0.3765530909368092, "grad_norm": 0.5896286368370056, "learning_rate": 9.860708265572684e-05, "loss": 0.21452882289886474, "memory(GiB)": 122.96, "step": 4940, "token_acc": 0.9186519465427078, "train_speed(iter/s)": 0.24043 }, { "epoch": 0.37693421754706913, "grad_norm": 1.4239826202392578, "learning_rate": 9.860427475674496e-05, "loss": 0.2024442195892334, "memory(GiB)": 122.96, "step": 4945, "token_acc": 0.9324826560951437, "train_speed(iter/s)": 0.240463 }, { "epoch": 0.3773153441573291, "grad_norm": 1.3806772232055664, "learning_rate": 9.860146407053347e-05, "loss": 0.1630520701408386, "memory(GiB)": 122.96, "step": 4950, "token_acc": 0.9201619901648829, "train_speed(iter/s)": 0.240533 }, { "epoch": 0.377696470767589, "grad_norm": 0.4779732823371887, "learning_rate": 9.859865059725355e-05, "loss": 0.16299794912338256, "memory(GiB)": 122.96, "step": 4955, "token_acc": 0.9257203277821835, "train_speed(iter/s)": 0.240597 }, { "epoch": 0.3780775973778489, "grad_norm": 1.8744924068450928, "learning_rate": 9.859583433706654e-05, "loss": 0.22518444061279297, "memory(GiB)": 122.96, "step": 4960, "token_acc": 0.9078498293515358, "train_speed(iter/s)": 0.240679 }, { "epoch": 0.37845872398810887, "grad_norm": 1.910704493522644, "learning_rate": 9.859301529013396e-05, "loss": 0.19942010641098024, "memory(GiB)": 122.96, "step": 4965, "token_acc": 0.9268059320682507, "train_speed(iter/s)": 0.240717 }, { "epoch": 0.3788398505983688, "grad_norm": 1.0774117708206177, "learning_rate": 9.859019345661744e-05, "loss": 0.16888455152511597, "memory(GiB)": 122.96, "step": 4970, "token_acc": 0.9153371075656201, "train_speed(iter/s)": 0.240795 }, { "epoch": 0.3792209772086287, "grad_norm": 1.585683822631836, "learning_rate": 9.858736883667882e-05, "loss": 0.28023395538330076, "memory(GiB)": 122.96, "step": 4975, "token_acc": 0.909070796460177, "train_speed(iter/s)": 0.240807 }, { "epoch": 0.37960210381888865, "grad_norm": 0.4177914261817932, "learning_rate": 9.858454143048006e-05, "loss": 0.16453282833099364, "memory(GiB)": 122.96, "step": 4980, "token_acc": 0.9195509822263798, "train_speed(iter/s)": 0.240857 }, { "epoch": 0.37998323042914856, "grad_norm": 0.8462322354316711, "learning_rate": 9.858171123818332e-05, "loss": 0.1851676106452942, "memory(GiB)": 122.96, "step": 4985, "token_acc": 0.9130505709624797, "train_speed(iter/s)": 0.240913 }, { "epoch": 0.3803643570394085, "grad_norm": 1.0969089269638062, "learning_rate": 9.857887825995088e-05, "loss": 0.2045442819595337, "memory(GiB)": 122.96, "step": 4990, "token_acc": 0.9264555669050051, "train_speed(iter/s)": 0.24096 }, { "epoch": 0.3807454836496684, "grad_norm": 1.1775494813919067, "learning_rate": 9.857604249594521e-05, "loss": 0.1960723042488098, "memory(GiB)": 122.96, "step": 4995, "token_acc": 0.9226856561546287, "train_speed(iter/s)": 0.241053 }, { "epoch": 0.38112661025992833, "grad_norm": 0.6412020325660706, "learning_rate": 9.857320394632892e-05, "loss": 0.1684521198272705, "memory(GiB)": 122.96, "step": 5000, "token_acc": 0.9192764053686053, "train_speed(iter/s)": 0.241116 }, { "epoch": 0.38112661025992833, "eval_loss": 0.15154628455638885, "eval_runtime": 172.6344, "eval_samples_per_second": 3.07, "eval_steps_per_second": 3.07, "eval_token_acc": 0.924394614782242, "step": 5000 }, { "epoch": 0.3815077368701883, "grad_norm": 1.034816026687622, "learning_rate": 9.857036261126477e-05, "loss": 0.18690760135650636, "memory(GiB)": 122.96, "step": 5005, "token_acc": 0.9242719881744272, "train_speed(iter/s)": 0.239201 }, { "epoch": 0.3818888634804482, "grad_norm": 0.5642911791801453, "learning_rate": 9.856751849091575e-05, "loss": 0.17478065490722655, "memory(GiB)": 122.96, "step": 5010, "token_acc": 0.9284806102987921, "train_speed(iter/s)": 0.239272 }, { "epoch": 0.3822699900907081, "grad_norm": 0.7116358876228333, "learning_rate": 9.856467158544492e-05, "loss": 0.22216565608978273, "memory(GiB)": 122.96, "step": 5015, "token_acc": 0.9060933281912842, "train_speed(iter/s)": 0.239309 }, { "epoch": 0.3826511167009681, "grad_norm": 0.9811486601829529, "learning_rate": 9.856182189501553e-05, "loss": 0.2557823657989502, "memory(GiB)": 122.96, "step": 5020, "token_acc": 0.899803536345776, "train_speed(iter/s)": 0.23937 }, { "epoch": 0.383032243311228, "grad_norm": 1.153630256652832, "learning_rate": 9.855896941979101e-05, "loss": 0.1496975064277649, "memory(GiB)": 122.96, "step": 5025, "token_acc": 0.946751863684771, "train_speed(iter/s)": 0.239411 }, { "epoch": 0.38341336992148795, "grad_norm": 1.405582308769226, "learning_rate": 9.855611415993496e-05, "loss": 0.22639055252075196, "memory(GiB)": 122.96, "step": 5030, "token_acc": 0.9081059390048154, "train_speed(iter/s)": 0.239468 }, { "epoch": 0.38379449653174785, "grad_norm": 1.1265263557434082, "learning_rate": 9.855325611561106e-05, "loss": 0.17762513160705568, "memory(GiB)": 122.96, "step": 5035, "token_acc": 0.9357460808980066, "train_speed(iter/s)": 0.239521 }, { "epoch": 0.38417562314200776, "grad_norm": 1.0546566247940063, "learning_rate": 9.855039528698325e-05, "loss": 0.23240838050842286, "memory(GiB)": 122.96, "step": 5040, "token_acc": 0.9301343570057582, "train_speed(iter/s)": 0.239558 }, { "epoch": 0.3845567497522677, "grad_norm": 0.6104698181152344, "learning_rate": 9.854753167421556e-05, "loss": 0.20711357593536378, "memory(GiB)": 122.96, "step": 5045, "token_acc": 0.9281578947368421, "train_speed(iter/s)": 0.239561 }, { "epoch": 0.38493787636252763, "grad_norm": 0.8007349371910095, "learning_rate": 9.854466527747223e-05, "loss": 0.21583545207977295, "memory(GiB)": 122.96, "step": 5050, "token_acc": 0.9161153119092628, "train_speed(iter/s)": 0.239617 }, { "epoch": 0.38531900297278754, "grad_norm": 0.9082942605018616, "learning_rate": 9.854179609691761e-05, "loss": 0.19706590175628663, "memory(GiB)": 122.96, "step": 5055, "token_acc": 0.9141799172293618, "train_speed(iter/s)": 0.239681 }, { "epoch": 0.3857001295830475, "grad_norm": 0.1592966765165329, "learning_rate": 9.853892413271626e-05, "loss": 0.20413472652435302, "memory(GiB)": 122.96, "step": 5060, "token_acc": 0.9066198224852071, "train_speed(iter/s)": 0.23973 }, { "epoch": 0.3860812561933074, "grad_norm": 1.2667183876037598, "learning_rate": 9.853604938503285e-05, "loss": 0.185267174243927, "memory(GiB)": 122.96, "step": 5065, "token_acc": 0.9276177090653549, "train_speed(iter/s)": 0.239764 }, { "epoch": 0.3864623828035674, "grad_norm": 1.1256556510925293, "learning_rate": 9.853317185403224e-05, "loss": 0.29414184093475343, "memory(GiB)": 122.96, "step": 5070, "token_acc": 0.891542614101096, "train_speed(iter/s)": 0.239813 }, { "epoch": 0.3868435094138273, "grad_norm": 1.1578857898712158, "learning_rate": 9.853029153987944e-05, "loss": 0.1848887324333191, "memory(GiB)": 122.96, "step": 5075, "token_acc": 0.9254627313656828, "train_speed(iter/s)": 0.239897 }, { "epoch": 0.3872246360240872, "grad_norm": 1.2722505331039429, "learning_rate": 9.852740844273965e-05, "loss": 0.24428200721740723, "memory(GiB)": 122.96, "step": 5080, "token_acc": 0.9094471658502449, "train_speed(iter/s)": 0.239933 }, { "epoch": 0.38760576263434715, "grad_norm": 0.6826249361038208, "learning_rate": 9.852452256277816e-05, "loss": 0.19916834831237792, "memory(GiB)": 122.96, "step": 5085, "token_acc": 0.9225108225108225, "train_speed(iter/s)": 0.239971 }, { "epoch": 0.38798688924460706, "grad_norm": 1.3301763534545898, "learning_rate": 9.85216339001605e-05, "loss": 0.20142030715942383, "memory(GiB)": 122.96, "step": 5090, "token_acc": 0.9220360824742269, "train_speed(iter/s)": 0.240064 }, { "epoch": 0.38836801585486697, "grad_norm": 1.4179213047027588, "learning_rate": 9.851874245505228e-05, "loss": 0.24690508842468262, "memory(GiB)": 122.96, "step": 5095, "token_acc": 0.9008833922261484, "train_speed(iter/s)": 0.240117 }, { "epoch": 0.38874914246512693, "grad_norm": 1.3954585790634155, "learning_rate": 9.851584822761934e-05, "loss": 0.20048537254333496, "memory(GiB)": 122.96, "step": 5100, "token_acc": 0.9238725693007861, "train_speed(iter/s)": 0.240131 }, { "epoch": 0.38913026907538684, "grad_norm": 0.7777506709098816, "learning_rate": 9.851295121802767e-05, "loss": 0.12991485595703126, "memory(GiB)": 122.96, "step": 5105, "token_acc": 0.943502824858757, "train_speed(iter/s)": 0.240198 }, { "epoch": 0.3895113956856468, "grad_norm": 0.8535301685333252, "learning_rate": 9.851005142644336e-05, "loss": 0.18277162313461304, "memory(GiB)": 122.96, "step": 5110, "token_acc": 0.9193882840850182, "train_speed(iter/s)": 0.24027 }, { "epoch": 0.3898925222959067, "grad_norm": 1.0179109573364258, "learning_rate": 9.850714885303272e-05, "loss": 0.14999103546142578, "memory(GiB)": 122.96, "step": 5115, "token_acc": 0.9343832020997376, "train_speed(iter/s)": 0.240356 }, { "epoch": 0.3902736489061666, "grad_norm": 0.9158710241317749, "learning_rate": 9.850424349796217e-05, "loss": 0.1818784475326538, "memory(GiB)": 122.96, "step": 5120, "token_acc": 0.9316655694535879, "train_speed(iter/s)": 0.240384 }, { "epoch": 0.3906547755164266, "grad_norm": 0.7674322724342346, "learning_rate": 9.850133536139836e-05, "loss": 0.16328256130218505, "memory(GiB)": 122.96, "step": 5125, "token_acc": 0.9364666981577704, "train_speed(iter/s)": 0.240439 }, { "epoch": 0.3910359021266865, "grad_norm": 0.95793217420578, "learning_rate": 9.849842444350805e-05, "loss": 0.18238660097122192, "memory(GiB)": 122.96, "step": 5130, "token_acc": 0.932292765239401, "train_speed(iter/s)": 0.240482 }, { "epoch": 0.3914170287369464, "grad_norm": 0.8844788670539856, "learning_rate": 9.849551074445816e-05, "loss": 0.14913604259490967, "memory(GiB)": 122.96, "step": 5135, "token_acc": 0.9365645046329294, "train_speed(iter/s)": 0.240549 }, { "epoch": 0.39179815534720636, "grad_norm": 0.8037666082382202, "learning_rate": 9.849259426441574e-05, "loss": 0.22845752239227296, "memory(GiB)": 122.96, "step": 5140, "token_acc": 0.9165394402035624, "train_speed(iter/s)": 0.240598 }, { "epoch": 0.39217928195746626, "grad_norm": 0.8414119482040405, "learning_rate": 9.84896750035481e-05, "loss": 0.28097584247589114, "memory(GiB)": 122.96, "step": 5145, "token_acc": 0.9069906096288568, "train_speed(iter/s)": 0.240623 }, { "epoch": 0.39256040856772617, "grad_norm": 1.1328070163726807, "learning_rate": 9.848675296202263e-05, "loss": 0.15029506683349608, "memory(GiB)": 122.96, "step": 5150, "token_acc": 0.9345902270103886, "train_speed(iter/s)": 0.240641 }, { "epoch": 0.39294153517798613, "grad_norm": 0.819054126739502, "learning_rate": 9.848382814000685e-05, "loss": 0.2713578224182129, "memory(GiB)": 122.96, "step": 5155, "token_acc": 0.8682385575589459, "train_speed(iter/s)": 0.240719 }, { "epoch": 0.39332266178824604, "grad_norm": 0.8173410296440125, "learning_rate": 9.848090053766854e-05, "loss": 0.1613088369369507, "memory(GiB)": 122.96, "step": 5160, "token_acc": 0.9259491030454735, "train_speed(iter/s)": 0.240772 }, { "epoch": 0.393703788398506, "grad_norm": 1.2266457080841064, "learning_rate": 9.847797015517557e-05, "loss": 0.1606640100479126, "memory(GiB)": 122.96, "step": 5165, "token_acc": 0.9453361246801582, "train_speed(iter/s)": 0.240825 }, { "epoch": 0.3940849150087659, "grad_norm": 0.7925116419792175, "learning_rate": 9.847503699269596e-05, "loss": 0.2361292362213135, "memory(GiB)": 122.96, "step": 5170, "token_acc": 0.9101425325346003, "train_speed(iter/s)": 0.240874 }, { "epoch": 0.3944660416190258, "grad_norm": 0.9408239722251892, "learning_rate": 9.847210105039795e-05, "loss": 0.20423665046691894, "memory(GiB)": 122.96, "step": 5175, "token_acc": 0.9258560258829873, "train_speed(iter/s)": 0.240942 }, { "epoch": 0.3948471682292858, "grad_norm": 1.3070120811462402, "learning_rate": 9.846916232844986e-05, "loss": 0.1816067337989807, "memory(GiB)": 122.96, "step": 5180, "token_acc": 0.9263588544710696, "train_speed(iter/s)": 0.240999 }, { "epoch": 0.3952282948395457, "grad_norm": 0.7797096967697144, "learning_rate": 9.846622082702023e-05, "loss": 0.15843185186386108, "memory(GiB)": 122.96, "step": 5185, "token_acc": 0.9379239162488942, "train_speed(iter/s)": 0.241042 }, { "epoch": 0.3956094214498056, "grad_norm": 0.8089981079101562, "learning_rate": 9.846327654627778e-05, "loss": 0.20733649730682374, "memory(GiB)": 122.96, "step": 5190, "token_acc": 0.918095508006235, "train_speed(iter/s)": 0.241083 }, { "epoch": 0.39599054806006556, "grad_norm": 1.2698543071746826, "learning_rate": 9.84603294863913e-05, "loss": 0.19331855773925782, "memory(GiB)": 122.96, "step": 5195, "token_acc": 0.906701030927835, "train_speed(iter/s)": 0.241151 }, { "epoch": 0.39637167467032547, "grad_norm": 0.7953601479530334, "learning_rate": 9.845737964752979e-05, "loss": 0.2619925975799561, "memory(GiB)": 122.96, "step": 5200, "token_acc": 0.892271662763466, "train_speed(iter/s)": 0.241226 }, { "epoch": 0.39637167467032547, "eval_loss": 0.14741991460323334, "eval_runtime": 178.1932, "eval_samples_per_second": 2.974, "eval_steps_per_second": 2.974, "eval_token_acc": 0.9263673875067767, "step": 5200 }, { "epoch": 0.39675280128058543, "grad_norm": 1.3512409925460815, "learning_rate": 9.845442702986246e-05, "loss": 0.21658389568328856, "memory(GiB)": 122.96, "step": 5205, "token_acc": 0.9258857766460531, "train_speed(iter/s)": 0.239303 }, { "epoch": 0.39713392789084534, "grad_norm": 2.059621572494507, "learning_rate": 9.845147163355857e-05, "loss": 0.2163167953491211, "memory(GiB)": 122.96, "step": 5210, "token_acc": 0.9094025465230167, "train_speed(iter/s)": 0.239388 }, { "epoch": 0.39751505450110525, "grad_norm": 0.6279622316360474, "learning_rate": 9.844851345878763e-05, "loss": 0.14883772134780884, "memory(GiB)": 122.96, "step": 5215, "token_acc": 0.9389353747552074, "train_speed(iter/s)": 0.239422 }, { "epoch": 0.3978961811113652, "grad_norm": 0.8725217580795288, "learning_rate": 9.844555250571927e-05, "loss": 0.20449295043945312, "memory(GiB)": 122.96, "step": 5220, "token_acc": 0.917702948162638, "train_speed(iter/s)": 0.239444 }, { "epoch": 0.3982773077216251, "grad_norm": 0.9944246411323547, "learning_rate": 9.844258877452331e-05, "loss": 0.18624258041381836, "memory(GiB)": 122.96, "step": 5225, "token_acc": 0.9261171156638167, "train_speed(iter/s)": 0.239481 }, { "epoch": 0.398658434331885, "grad_norm": 0.8348492980003357, "learning_rate": 9.843962226536964e-05, "loss": 0.19931304454803467, "memory(GiB)": 122.96, "step": 5230, "token_acc": 0.9164865395952053, "train_speed(iter/s)": 0.239532 }, { "epoch": 0.399039560942145, "grad_norm": 0.9587819576263428, "learning_rate": 9.843665297842845e-05, "loss": 0.22849133014678955, "memory(GiB)": 122.96, "step": 5235, "token_acc": 0.9147507104107466, "train_speed(iter/s)": 0.239556 }, { "epoch": 0.3994206875524049, "grad_norm": 0.7294626832008362, "learning_rate": 9.843368091386999e-05, "loss": 0.1740816831588745, "memory(GiB)": 122.96, "step": 5240, "token_acc": 0.920144371757275, "train_speed(iter/s)": 0.239611 }, { "epoch": 0.39980181416266486, "grad_norm": 1.3237510919570923, "learning_rate": 9.843070607186469e-05, "loss": 0.1999659538269043, "memory(GiB)": 122.96, "step": 5245, "token_acc": 0.9246531139077122, "train_speed(iter/s)": 0.239651 }, { "epoch": 0.40018294077292477, "grad_norm": 0.6153662204742432, "learning_rate": 9.842772845258314e-05, "loss": 0.14377148151397706, "memory(GiB)": 122.96, "step": 5250, "token_acc": 0.9165585819282317, "train_speed(iter/s)": 0.239701 }, { "epoch": 0.4005640673831847, "grad_norm": 0.8737653493881226, "learning_rate": 9.842474805619608e-05, "loss": 0.17032755613327027, "memory(GiB)": 122.96, "step": 5255, "token_acc": 0.9356333250599818, "train_speed(iter/s)": 0.239704 }, { "epoch": 0.40094519399344464, "grad_norm": 1.7577463388442993, "learning_rate": 9.842176488287444e-05, "loss": 0.17543500661849976, "memory(GiB)": 122.96, "step": 5260, "token_acc": 0.9301225542894002, "train_speed(iter/s)": 0.239755 }, { "epoch": 0.40132632060370454, "grad_norm": 0.9472769498825073, "learning_rate": 9.84187789327893e-05, "loss": 0.21139540672302246, "memory(GiB)": 122.96, "step": 5265, "token_acc": 0.9142472601422803, "train_speed(iter/s)": 0.23981 }, { "epoch": 0.40170744721396445, "grad_norm": 0.9857455492019653, "learning_rate": 9.841579020611187e-05, "loss": 0.21232914924621582, "memory(GiB)": 122.96, "step": 5270, "token_acc": 0.9140083217753121, "train_speed(iter/s)": 0.239853 }, { "epoch": 0.4020885738242244, "grad_norm": 0.5359882116317749, "learning_rate": 9.841279870301356e-05, "loss": 0.17319568395614623, "memory(GiB)": 122.96, "step": 5275, "token_acc": 0.9370416493704165, "train_speed(iter/s)": 0.23989 }, { "epoch": 0.4024697004344843, "grad_norm": 0.645533561706543, "learning_rate": 9.84098044236659e-05, "loss": 0.15964758396148682, "memory(GiB)": 122.96, "step": 5280, "token_acc": 0.930802415875755, "train_speed(iter/s)": 0.239937 }, { "epoch": 0.4028508270447443, "grad_norm": 0.8209419846534729, "learning_rate": 9.840680736824061e-05, "loss": 0.2068192481994629, "memory(GiB)": 122.96, "step": 5285, "token_acc": 0.9106409851594569, "train_speed(iter/s)": 0.240003 }, { "epoch": 0.4032319536550042, "grad_norm": 0.9682230353355408, "learning_rate": 9.840380753690955e-05, "loss": 0.1989890456199646, "memory(GiB)": 122.96, "step": 5290, "token_acc": 0.9144602851323829, "train_speed(iter/s)": 0.240038 }, { "epoch": 0.4036130802652641, "grad_norm": 1.123693823814392, "learning_rate": 9.840080492984475e-05, "loss": 0.1998154640197754, "memory(GiB)": 122.96, "step": 5295, "token_acc": 0.9291177970423253, "train_speed(iter/s)": 0.240035 }, { "epoch": 0.40399420687552406, "grad_norm": 0.9300686717033386, "learning_rate": 9.839779954721839e-05, "loss": 0.1598773717880249, "memory(GiB)": 122.96, "step": 5300, "token_acc": 0.9208994708994709, "train_speed(iter/s)": 0.240094 }, { "epoch": 0.40437533348578397, "grad_norm": 0.9222313165664673, "learning_rate": 9.839479138920282e-05, "loss": 0.09797217845916747, "memory(GiB)": 122.96, "step": 5305, "token_acc": 0.9442874444893016, "train_speed(iter/s)": 0.240181 }, { "epoch": 0.4047564600960439, "grad_norm": 2.0927202701568604, "learning_rate": 9.839178045597053e-05, "loss": 0.24593062400817872, "memory(GiB)": 122.96, "step": 5310, "token_acc": 0.8996802557953637, "train_speed(iter/s)": 0.24023 }, { "epoch": 0.40513758670630384, "grad_norm": 2.071713447570801, "learning_rate": 9.838876674769422e-05, "loss": 0.20978541374206544, "memory(GiB)": 122.96, "step": 5315, "token_acc": 0.908028956893715, "train_speed(iter/s)": 0.240282 }, { "epoch": 0.40551871331656375, "grad_norm": 1.0447150468826294, "learning_rate": 9.838575026454668e-05, "loss": 0.2163994789123535, "memory(GiB)": 122.96, "step": 5320, "token_acc": 0.9239093676022996, "train_speed(iter/s)": 0.240346 }, { "epoch": 0.4058998399268237, "grad_norm": 1.0375614166259766, "learning_rate": 9.83827310067009e-05, "loss": 0.1837661623954773, "memory(GiB)": 122.96, "step": 5325, "token_acc": 0.9215513442044954, "train_speed(iter/s)": 0.240404 }, { "epoch": 0.4062809665370836, "grad_norm": 2.1557810306549072, "learning_rate": 9.837970897433002e-05, "loss": 0.13580844402313233, "memory(GiB)": 122.96, "step": 5330, "token_acc": 0.948006379585327, "train_speed(iter/s)": 0.240477 }, { "epoch": 0.40666209314734353, "grad_norm": 0.8253194093704224, "learning_rate": 9.837668416760733e-05, "loss": 0.16056121587753297, "memory(GiB)": 122.96, "step": 5335, "token_acc": 0.9431243680485338, "train_speed(iter/s)": 0.240536 }, { "epoch": 0.4070432197576035, "grad_norm": 0.6279621124267578, "learning_rate": 9.837365658670631e-05, "loss": 0.14908971786499023, "memory(GiB)": 122.96, "step": 5340, "token_acc": 0.9344520188778186, "train_speed(iter/s)": 0.240551 }, { "epoch": 0.4074243463678634, "grad_norm": 0.7085779309272766, "learning_rate": 9.837062623180056e-05, "loss": 0.1973907470703125, "memory(GiB)": 122.96, "step": 5345, "token_acc": 0.9024808862537057, "train_speed(iter/s)": 0.240603 }, { "epoch": 0.4078054729781233, "grad_norm": 1.2273069620132446, "learning_rate": 9.836759310306387e-05, "loss": 0.19592317342758178, "memory(GiB)": 122.96, "step": 5350, "token_acc": 0.9265925809985913, "train_speed(iter/s)": 0.240636 }, { "epoch": 0.40818659958838327, "grad_norm": 1.9406726360321045, "learning_rate": 9.836455720067015e-05, "loss": 0.20247371196746827, "memory(GiB)": 122.96, "step": 5355, "token_acc": 0.9269434269434269, "train_speed(iter/s)": 0.240675 }, { "epoch": 0.4085677261986432, "grad_norm": 0.7291033864021301, "learning_rate": 9.836151852479354e-05, "loss": 0.1576331615447998, "memory(GiB)": 122.96, "step": 5360, "token_acc": 0.9397466621020198, "train_speed(iter/s)": 0.240745 }, { "epoch": 0.40894885280890314, "grad_norm": 0.8367064595222473, "learning_rate": 9.835847707560825e-05, "loss": 0.261029052734375, "memory(GiB)": 122.96, "step": 5365, "token_acc": 0.9036259541984732, "train_speed(iter/s)": 0.240776 }, { "epoch": 0.40932997941916305, "grad_norm": 1.0701203346252441, "learning_rate": 9.835543285328871e-05, "loss": 0.17764976024627685, "memory(GiB)": 122.96, "step": 5370, "token_acc": 0.9421512966088691, "train_speed(iter/s)": 0.240789 }, { "epoch": 0.40971110602942296, "grad_norm": 1.3458871841430664, "learning_rate": 9.83523858580095e-05, "loss": 0.19765472412109375, "memory(GiB)": 122.96, "step": 5375, "token_acc": 0.9074401008827239, "train_speed(iter/s)": 0.240851 }, { "epoch": 0.4100922326396829, "grad_norm": 1.6801986694335938, "learning_rate": 9.834933608994535e-05, "loss": 0.17113955020904542, "memory(GiB)": 122.96, "step": 5380, "token_acc": 0.946779303062302, "train_speed(iter/s)": 0.240901 }, { "epoch": 0.4104733592499428, "grad_norm": 1.5063880681991577, "learning_rate": 9.834628354927112e-05, "loss": 0.190246057510376, "memory(GiB)": 122.96, "step": 5385, "token_acc": 0.9225031081641111, "train_speed(iter/s)": 0.240972 }, { "epoch": 0.41085448586020273, "grad_norm": 1.1311261653900146, "learning_rate": 9.83432282361619e-05, "loss": 0.20709993839263915, "memory(GiB)": 122.96, "step": 5390, "token_acc": 0.9025718257645968, "train_speed(iter/s)": 0.241002 }, { "epoch": 0.4112356124704627, "grad_norm": 0.6697658896446228, "learning_rate": 9.834017015079288e-05, "loss": 0.25129971504211424, "memory(GiB)": 122.96, "step": 5395, "token_acc": 0.8971907633475511, "train_speed(iter/s)": 0.241035 }, { "epoch": 0.4116167390807226, "grad_norm": 0.9794591069221497, "learning_rate": 9.833710929333941e-05, "loss": 0.2303314685821533, "memory(GiB)": 122.96, "step": 5400, "token_acc": 0.916406858924396, "train_speed(iter/s)": 0.241071 }, { "epoch": 0.4116167390807226, "eval_loss": 0.14790448546409607, "eval_runtime": 176.1839, "eval_samples_per_second": 3.008, "eval_steps_per_second": 3.008, "eval_token_acc": 0.9280163845551472, "step": 5400 }, { "epoch": 0.41199786569098257, "grad_norm": 0.657495379447937, "learning_rate": 9.833404566397706e-05, "loss": 0.17798929214477538, "memory(GiB)": 122.96, "step": 5405, "token_acc": 0.9279552900924932, "train_speed(iter/s)": 0.239241 }, { "epoch": 0.4123789923012425, "grad_norm": 1.5333033800125122, "learning_rate": 9.833097926288148e-05, "loss": 0.2844879627227783, "memory(GiB)": 122.96, "step": 5410, "token_acc": 0.8903926234384295, "train_speed(iter/s)": 0.239272 }, { "epoch": 0.4127601189115024, "grad_norm": 0.8072861433029175, "learning_rate": 9.832791009022852e-05, "loss": 0.2016896963119507, "memory(GiB)": 122.96, "step": 5415, "token_acc": 0.9292307692307692, "train_speed(iter/s)": 0.239325 }, { "epoch": 0.41314124552176235, "grad_norm": 1.2160632610321045, "learning_rate": 9.832483814619417e-05, "loss": 0.214288592338562, "memory(GiB)": 122.96, "step": 5420, "token_acc": 0.9241358213520955, "train_speed(iter/s)": 0.239392 }, { "epoch": 0.41352237213202225, "grad_norm": 1.2003717422485352, "learning_rate": 9.832176343095463e-05, "loss": 0.18633885383605958, "memory(GiB)": 122.96, "step": 5425, "token_acc": 0.9387064676616915, "train_speed(iter/s)": 0.239445 }, { "epoch": 0.41390349874228216, "grad_norm": 0.943347692489624, "learning_rate": 9.831868594468619e-05, "loss": 0.1506732940673828, "memory(GiB)": 122.96, "step": 5430, "token_acc": 0.9436459046315309, "train_speed(iter/s)": 0.239495 }, { "epoch": 0.4142846253525421, "grad_norm": 1.0078617334365845, "learning_rate": 9.831560568756534e-05, "loss": 0.19785492420196532, "memory(GiB)": 122.96, "step": 5435, "token_acc": 0.9235331497564075, "train_speed(iter/s)": 0.239549 }, { "epoch": 0.41466575196280203, "grad_norm": 1.5747328996658325, "learning_rate": 9.831252265976871e-05, "loss": 0.20800106525421141, "memory(GiB)": 122.96, "step": 5440, "token_acc": 0.9152360515021459, "train_speed(iter/s)": 0.239611 }, { "epoch": 0.415046878573062, "grad_norm": 0.6138991117477417, "learning_rate": 9.83094368614731e-05, "loss": 0.1543560266494751, "memory(GiB)": 122.96, "step": 5445, "token_acc": 0.9343582656762826, "train_speed(iter/s)": 0.239643 }, { "epoch": 0.4154280051833219, "grad_norm": 1.5423082113265991, "learning_rate": 9.830634829285547e-05, "loss": 0.16219027042388917, "memory(GiB)": 122.96, "step": 5450, "token_acc": 0.940232156087923, "train_speed(iter/s)": 0.239695 }, { "epoch": 0.4158091317935818, "grad_norm": 1.222476601600647, "learning_rate": 9.830325695409294e-05, "loss": 0.1920068621635437, "memory(GiB)": 122.96, "step": 5455, "token_acc": 0.927784112504751, "train_speed(iter/s)": 0.239767 }, { "epoch": 0.4161902584038418, "grad_norm": 0.9314525127410889, "learning_rate": 9.830016284536278e-05, "loss": 0.22992796897888185, "memory(GiB)": 122.96, "step": 5460, "token_acc": 0.9054682955206516, "train_speed(iter/s)": 0.239832 }, { "epoch": 0.4165713850141017, "grad_norm": 1.1477999687194824, "learning_rate": 9.829706596684243e-05, "loss": 0.28290157318115233, "memory(GiB)": 122.96, "step": 5465, "token_acc": 0.8916195625460801, "train_speed(iter/s)": 0.239882 }, { "epoch": 0.4169525116243616, "grad_norm": 1.4484931230545044, "learning_rate": 9.829396631870945e-05, "loss": 0.22822000980377197, "memory(GiB)": 122.96, "step": 5470, "token_acc": 0.9029600244125725, "train_speed(iter/s)": 0.239948 }, { "epoch": 0.41733363823462155, "grad_norm": 1.0545072555541992, "learning_rate": 9.829086390114162e-05, "loss": 0.1859412431716919, "memory(GiB)": 122.96, "step": 5475, "token_acc": 0.9237463738085371, "train_speed(iter/s)": 0.240027 }, { "epoch": 0.41771476484488146, "grad_norm": 1.4397956132888794, "learning_rate": 9.828775871431685e-05, "loss": 0.19536244869232178, "memory(GiB)": 122.96, "step": 5480, "token_acc": 0.9314069673237915, "train_speed(iter/s)": 0.240041 }, { "epoch": 0.4180958914551414, "grad_norm": 1.3345972299575806, "learning_rate": 9.82846507584132e-05, "loss": 0.1251181960105896, "memory(GiB)": 122.96, "step": 5485, "token_acc": 0.9505766062602965, "train_speed(iter/s)": 0.240097 }, { "epoch": 0.41847701806540133, "grad_norm": 1.1314537525177002, "learning_rate": 9.828154003360889e-05, "loss": 0.2102029800415039, "memory(GiB)": 122.96, "step": 5490, "token_acc": 0.9150293002412961, "train_speed(iter/s)": 0.240136 }, { "epoch": 0.41885814467566124, "grad_norm": 0.7623291015625, "learning_rate": 9.827842654008232e-05, "loss": 0.19914332628250123, "memory(GiB)": 122.96, "step": 5495, "token_acc": 0.9112611905135857, "train_speed(iter/s)": 0.240173 }, { "epoch": 0.4192392712859212, "grad_norm": 1.0895283222198486, "learning_rate": 9.827531027801203e-05, "loss": 0.2347170114517212, "memory(GiB)": 122.96, "step": 5500, "token_acc": 0.9161451814768461, "train_speed(iter/s)": 0.24023 }, { "epoch": 0.4196203978961811, "grad_norm": 1.3167550563812256, "learning_rate": 9.827219124757669e-05, "loss": 0.25099241733551025, "memory(GiB)": 122.96, "step": 5505, "token_acc": 0.9105042016806723, "train_speed(iter/s)": 0.240273 }, { "epoch": 0.420001524506441, "grad_norm": 0.9127209782600403, "learning_rate": 9.826906944895522e-05, "loss": 0.18462662696838378, "memory(GiB)": 122.96, "step": 5510, "token_acc": 0.9113368103211639, "train_speed(iter/s)": 0.240343 }, { "epoch": 0.420382651116701, "grad_norm": 1.505408525466919, "learning_rate": 9.82659448823266e-05, "loss": 0.2526489973068237, "memory(GiB)": 122.96, "step": 5515, "token_acc": 0.8872702046479362, "train_speed(iter/s)": 0.24041 }, { "epoch": 0.4207637777269609, "grad_norm": 0.45108914375305176, "learning_rate": 9.826281754787005e-05, "loss": 0.1522266983985901, "memory(GiB)": 122.96, "step": 5520, "token_acc": 0.9354995150339476, "train_speed(iter/s)": 0.240452 }, { "epoch": 0.42114490433722085, "grad_norm": 0.9531474113464355, "learning_rate": 9.825968744576486e-05, "loss": 0.23550994396209718, "memory(GiB)": 122.96, "step": 5525, "token_acc": 0.9097222222222222, "train_speed(iter/s)": 0.240517 }, { "epoch": 0.42152603094748076, "grad_norm": 0.9390043020248413, "learning_rate": 9.825655457619054e-05, "loss": 0.24609601497650146, "memory(GiB)": 122.96, "step": 5530, "token_acc": 0.9194769442532691, "train_speed(iter/s)": 0.240566 }, { "epoch": 0.42190715755774066, "grad_norm": 2.0566720962524414, "learning_rate": 9.825341893932676e-05, "loss": 0.1472208619117737, "memory(GiB)": 122.96, "step": 5535, "token_acc": 0.9293662312530238, "train_speed(iter/s)": 0.240639 }, { "epoch": 0.4222882841680006, "grad_norm": 0.6809375882148743, "learning_rate": 9.825028053535333e-05, "loss": 0.20836026668548585, "memory(GiB)": 122.96, "step": 5540, "token_acc": 0.9198329023274319, "train_speed(iter/s)": 0.240644 }, { "epoch": 0.42266941077826053, "grad_norm": 0.895395815372467, "learning_rate": 9.824713936445022e-05, "loss": 0.24791197776794432, "memory(GiB)": 122.96, "step": 5545, "token_acc": 0.9143182928815768, "train_speed(iter/s)": 0.240682 }, { "epoch": 0.42305053738852044, "grad_norm": 0.607596218585968, "learning_rate": 9.824399542679756e-05, "loss": 0.14024617671966552, "memory(GiB)": 122.96, "step": 5550, "token_acc": 0.9470338983050848, "train_speed(iter/s)": 0.240705 }, { "epoch": 0.4234316639987804, "grad_norm": 1.3289097547531128, "learning_rate": 9.824084872257564e-05, "loss": 0.18848674297332763, "memory(GiB)": 122.96, "step": 5555, "token_acc": 0.9279661016949152, "train_speed(iter/s)": 0.240743 }, { "epoch": 0.4238127906090403, "grad_norm": 1.2601559162139893, "learning_rate": 9.823769925196491e-05, "loss": 0.20795786380767822, "memory(GiB)": 122.96, "step": 5560, "token_acc": 0.9223394055608821, "train_speed(iter/s)": 0.240816 }, { "epoch": 0.4241939172193003, "grad_norm": 0.9429551959037781, "learning_rate": 9.823454701514598e-05, "loss": 0.17151944637298583, "memory(GiB)": 122.96, "step": 5565, "token_acc": 0.9276847290640394, "train_speed(iter/s)": 0.240858 }, { "epoch": 0.4245750438295602, "grad_norm": 2.367288589477539, "learning_rate": 9.823139201229962e-05, "loss": 0.1971184253692627, "memory(GiB)": 122.96, "step": 5570, "token_acc": 0.9214466712739, "train_speed(iter/s)": 0.240914 }, { "epoch": 0.4249561704398201, "grad_norm": 1.402206301689148, "learning_rate": 9.822823424360674e-05, "loss": 0.24548025131225587, "memory(GiB)": 122.96, "step": 5575, "token_acc": 0.9088960342979635, "train_speed(iter/s)": 0.240977 }, { "epoch": 0.42533729705008005, "grad_norm": 1.0822885036468506, "learning_rate": 9.822507370924844e-05, "loss": 0.21088643074035646, "memory(GiB)": 122.96, "step": 5580, "token_acc": 0.920458212326557, "train_speed(iter/s)": 0.241013 }, { "epoch": 0.42571842366033996, "grad_norm": 1.0587328672409058, "learning_rate": 9.822191040940595e-05, "loss": 0.1941753387451172, "memory(GiB)": 122.96, "step": 5585, "token_acc": 0.9225445834442374, "train_speed(iter/s)": 0.241086 }, { "epoch": 0.42609955027059987, "grad_norm": 0.8872730135917664, "learning_rate": 9.821874434426067e-05, "loss": 0.2454383373260498, "memory(GiB)": 122.96, "step": 5590, "token_acc": 0.9032258064516129, "train_speed(iter/s)": 0.24115 }, { "epoch": 0.42648067688085983, "grad_norm": 1.2904049158096313, "learning_rate": 9.821557551399418e-05, "loss": 0.14410037994384767, "memory(GiB)": 122.96, "step": 5595, "token_acc": 0.9159460394327222, "train_speed(iter/s)": 0.241227 }, { "epoch": 0.42686180349111974, "grad_norm": 1.4907950162887573, "learning_rate": 9.821240391878816e-05, "loss": 0.2021566390991211, "memory(GiB)": 122.96, "step": 5600, "token_acc": 0.898493455174117, "train_speed(iter/s)": 0.241281 }, { "epoch": 0.42686180349111974, "eval_loss": 0.14358288049697876, "eval_runtime": 173.6481, "eval_samples_per_second": 3.052, "eval_steps_per_second": 3.052, "eval_token_acc": 0.9279410878862719, "step": 5600 }, { "epoch": 0.4272429301013797, "grad_norm": 1.2165998220443726, "learning_rate": 9.820922955882453e-05, "loss": 0.20213518142700196, "memory(GiB)": 122.96, "step": 5605, "token_acc": 0.9275284486762657, "train_speed(iter/s)": 0.239537 }, { "epoch": 0.4276240567116396, "grad_norm": 1.302382230758667, "learning_rate": 9.82060524342853e-05, "loss": 0.1432071328163147, "memory(GiB)": 122.96, "step": 5610, "token_acc": 0.9221386460662736, "train_speed(iter/s)": 0.239585 }, { "epoch": 0.4280051833218995, "grad_norm": 1.8272764682769775, "learning_rate": 9.820287254535265e-05, "loss": 0.19002441167831421, "memory(GiB)": 122.96, "step": 5615, "token_acc": 0.9255610290093049, "train_speed(iter/s)": 0.239659 }, { "epoch": 0.4283863099321595, "grad_norm": 1.4285753965377808, "learning_rate": 9.819968989220895e-05, "loss": 0.2552043437957764, "memory(GiB)": 122.96, "step": 5620, "token_acc": 0.8996229971724788, "train_speed(iter/s)": 0.239703 }, { "epoch": 0.4287674365424194, "grad_norm": 0.7161190509796143, "learning_rate": 9.819650447503672e-05, "loss": 0.18307424783706666, "memory(GiB)": 122.96, "step": 5625, "token_acc": 0.9272574245521221, "train_speed(iter/s)": 0.239715 }, { "epoch": 0.4291485631526793, "grad_norm": 1.7430254220962524, "learning_rate": 9.81933162940186e-05, "loss": 0.2075108528137207, "memory(GiB)": 122.96, "step": 5630, "token_acc": 0.92487597448618, "train_speed(iter/s)": 0.239771 }, { "epoch": 0.42952968976293926, "grad_norm": 0.792046308517456, "learning_rate": 9.819012534933747e-05, "loss": 0.2114635229110718, "memory(GiB)": 122.96, "step": 5635, "token_acc": 0.9172521467603435, "train_speed(iter/s)": 0.23982 }, { "epoch": 0.42991081637319917, "grad_norm": 1.4989656209945679, "learning_rate": 9.818693164117624e-05, "loss": 0.22358734607696534, "memory(GiB)": 122.96, "step": 5640, "token_acc": 0.9021915584415584, "train_speed(iter/s)": 0.239895 }, { "epoch": 0.43029194298345913, "grad_norm": 1.0632662773132324, "learning_rate": 9.818373516971813e-05, "loss": 0.18430403470993043, "memory(GiB)": 122.96, "step": 5645, "token_acc": 0.928936264712414, "train_speed(iter/s)": 0.239947 }, { "epoch": 0.43067306959371904, "grad_norm": 0.8415993452072144, "learning_rate": 9.818053593514637e-05, "loss": 0.2075711488723755, "memory(GiB)": 122.96, "step": 5650, "token_acc": 0.9209650582362728, "train_speed(iter/s)": 0.240017 }, { "epoch": 0.43105419620397895, "grad_norm": 0.8282695412635803, "learning_rate": 9.81773339376445e-05, "loss": 0.13644307851791382, "memory(GiB)": 122.96, "step": 5655, "token_acc": 0.9398326898326899, "train_speed(iter/s)": 0.240081 }, { "epoch": 0.4314353228142389, "grad_norm": 0.8056726455688477, "learning_rate": 9.817412917739607e-05, "loss": 0.18360655307769774, "memory(GiB)": 122.96, "step": 5660, "token_acc": 0.8999142734676382, "train_speed(iter/s)": 0.240135 }, { "epoch": 0.4318164494244988, "grad_norm": 1.007015585899353, "learning_rate": 9.81709216545849e-05, "loss": 0.18366410732269287, "memory(GiB)": 122.96, "step": 5665, "token_acc": 0.9255405956752346, "train_speed(iter/s)": 0.240188 }, { "epoch": 0.4321975760347587, "grad_norm": 0.9485337734222412, "learning_rate": 9.81677113693949e-05, "loss": 0.20480496883392335, "memory(GiB)": 122.96, "step": 5670, "token_acc": 0.9271661783172875, "train_speed(iter/s)": 0.240242 }, { "epoch": 0.4325787026450187, "grad_norm": 1.1508936882019043, "learning_rate": 9.81644983220102e-05, "loss": 0.2174469232559204, "memory(GiB)": 122.96, "step": 5675, "token_acc": 0.914601331256196, "train_speed(iter/s)": 0.240287 }, { "epoch": 0.4329598292552786, "grad_norm": 2.330214023590088, "learning_rate": 9.816128251261502e-05, "loss": 0.160527503490448, "memory(GiB)": 122.96, "step": 5680, "token_acc": 0.9397488119484046, "train_speed(iter/s)": 0.240313 }, { "epoch": 0.43334095586553856, "grad_norm": 0.8409305214881897, "learning_rate": 9.815806394139378e-05, "loss": 0.16370599269866942, "memory(GiB)": 122.96, "step": 5685, "token_acc": 0.9375946682823387, "train_speed(iter/s)": 0.240342 }, { "epoch": 0.43372208247579846, "grad_norm": 1.1967177391052246, "learning_rate": 9.815484260853107e-05, "loss": 0.15654292106628417, "memory(GiB)": 122.96, "step": 5690, "token_acc": 0.933003204194582, "train_speed(iter/s)": 0.240403 }, { "epoch": 0.4341032090860584, "grad_norm": 1.5668656826019287, "learning_rate": 9.815161851421158e-05, "loss": 0.1910780429840088, "memory(GiB)": 122.96, "step": 5695, "token_acc": 0.9326254119370194, "train_speed(iter/s)": 0.240469 }, { "epoch": 0.43448433569631834, "grad_norm": 1.0367659330368042, "learning_rate": 9.814839165862023e-05, "loss": 0.13768833875656128, "memory(GiB)": 122.96, "step": 5700, "token_acc": 0.9392446633825944, "train_speed(iter/s)": 0.240519 }, { "epoch": 0.43486546230657824, "grad_norm": 1.1778161525726318, "learning_rate": 9.814516204194207e-05, "loss": 0.15689183473587037, "memory(GiB)": 122.96, "step": 5705, "token_acc": 0.941594317284925, "train_speed(iter/s)": 0.24055 }, { "epoch": 0.43524658891683815, "grad_norm": 1.0071607828140259, "learning_rate": 9.814192966436225e-05, "loss": 0.1788067936897278, "memory(GiB)": 122.96, "step": 5710, "token_acc": 0.9265949269792467, "train_speed(iter/s)": 0.240592 }, { "epoch": 0.4356277155270981, "grad_norm": 2.2545182704925537, "learning_rate": 9.813869452606619e-05, "loss": 0.2364574432373047, "memory(GiB)": 122.96, "step": 5715, "token_acc": 0.9165201199958622, "train_speed(iter/s)": 0.240605 }, { "epoch": 0.436008842137358, "grad_norm": 1.589725375175476, "learning_rate": 9.813545662723939e-05, "loss": 0.16898825168609619, "memory(GiB)": 122.96, "step": 5720, "token_acc": 0.9426534910092338, "train_speed(iter/s)": 0.240637 }, { "epoch": 0.436389968747618, "grad_norm": 1.1484736204147339, "learning_rate": 9.813221596806754e-05, "loss": 0.17856762409210206, "memory(GiB)": 122.96, "step": 5725, "token_acc": 0.9348637015781922, "train_speed(iter/s)": 0.240701 }, { "epoch": 0.4367710953578779, "grad_norm": 1.6593124866485596, "learning_rate": 9.812897254873646e-05, "loss": 0.13222430944442748, "memory(GiB)": 122.96, "step": 5730, "token_acc": 0.9312141740489839, "train_speed(iter/s)": 0.240763 }, { "epoch": 0.4371522219681378, "grad_norm": 0.7942419052124023, "learning_rate": 9.812572636943213e-05, "loss": 0.19443143606185914, "memory(GiB)": 122.96, "step": 5735, "token_acc": 0.9239925182792043, "train_speed(iter/s)": 0.240817 }, { "epoch": 0.43753334857839776, "grad_norm": 0.8438810110092163, "learning_rate": 9.812247743034072e-05, "loss": 0.20177702903747557, "memory(GiB)": 122.96, "step": 5740, "token_acc": 0.9247989910137159, "train_speed(iter/s)": 0.240857 }, { "epoch": 0.43791447518865767, "grad_norm": 0.7885280251502991, "learning_rate": 9.811922573164858e-05, "loss": 0.20316269397735595, "memory(GiB)": 122.96, "step": 5745, "token_acc": 0.9099661722612542, "train_speed(iter/s)": 0.240913 }, { "epoch": 0.4382956017989176, "grad_norm": 1.4747692346572876, "learning_rate": 9.811597127354212e-05, "loss": 0.21400582790374756, "memory(GiB)": 122.96, "step": 5750, "token_acc": 0.9121644017235665, "train_speed(iter/s)": 0.24098 }, { "epoch": 0.43867672840917754, "grad_norm": 1.2477864027023315, "learning_rate": 9.811271405620801e-05, "loss": 0.2548990249633789, "memory(GiB)": 122.96, "step": 5755, "token_acc": 0.8939986953685584, "train_speed(iter/s)": 0.241048 }, { "epoch": 0.43905785501943745, "grad_norm": 1.4822957515716553, "learning_rate": 9.810945407983301e-05, "loss": 0.18732240200042724, "memory(GiB)": 122.96, "step": 5760, "token_acc": 0.92830626450116, "train_speed(iter/s)": 0.24111 }, { "epoch": 0.4394389816296974, "grad_norm": 0.2816905975341797, "learning_rate": 9.810619134460406e-05, "loss": 0.16111267805099488, "memory(GiB)": 122.96, "step": 5765, "token_acc": 0.9260869565217391, "train_speed(iter/s)": 0.241155 }, { "epoch": 0.4398201082399573, "grad_norm": 1.3071860074996948, "learning_rate": 9.810292585070829e-05, "loss": 0.16373000144958497, "memory(GiB)": 122.96, "step": 5770, "token_acc": 0.9322115384615385, "train_speed(iter/s)": 0.241194 }, { "epoch": 0.4402012348502172, "grad_norm": 1.3119615316390991, "learning_rate": 9.809965759833295e-05, "loss": 0.24101948738098145, "memory(GiB)": 122.96, "step": 5775, "token_acc": 0.9156074182121275, "train_speed(iter/s)": 0.241237 }, { "epoch": 0.4405823614604772, "grad_norm": 0.7399266362190247, "learning_rate": 9.809638658766545e-05, "loss": 0.23722944259643555, "memory(GiB)": 122.96, "step": 5780, "token_acc": 0.891213389121339, "train_speed(iter/s)": 0.241298 }, { "epoch": 0.4409634880707371, "grad_norm": 0.7628763914108276, "learning_rate": 9.809311281889337e-05, "loss": 0.18381640911102295, "memory(GiB)": 122.96, "step": 5785, "token_acc": 0.9190140845070423, "train_speed(iter/s)": 0.241351 }, { "epoch": 0.441344614680997, "grad_norm": 0.7859430313110352, "learning_rate": 9.808983629220446e-05, "loss": 0.1581476092338562, "memory(GiB)": 122.96, "step": 5790, "token_acc": 0.933290050316507, "train_speed(iter/s)": 0.241379 }, { "epoch": 0.44172574129125697, "grad_norm": 1.3257243633270264, "learning_rate": 9.80865570077866e-05, "loss": 0.2195117712020874, "memory(GiB)": 122.96, "step": 5795, "token_acc": 0.9146567717996289, "train_speed(iter/s)": 0.241383 }, { "epoch": 0.4421068679015169, "grad_norm": 1.1205910444259644, "learning_rate": 9.808327496582785e-05, "loss": 0.1879459023475647, "memory(GiB)": 122.96, "step": 5800, "token_acc": 0.9262930257704091, "train_speed(iter/s)": 0.241442 }, { "epoch": 0.4421068679015169, "eval_loss": 0.14410848915576935, "eval_runtime": 175.7521, "eval_samples_per_second": 3.016, "eval_steps_per_second": 3.016, "eval_token_acc": 0.9296126739353051, "step": 5800 }, { "epoch": 0.44248799451177684, "grad_norm": 0.7860932946205139, "learning_rate": 9.80799901665164e-05, "loss": 0.15974504947662355, "memory(GiB)": 122.96, "step": 5805, "token_acc": 0.9297647803082586, "train_speed(iter/s)": 0.239724 }, { "epoch": 0.44286912112203675, "grad_norm": 1.0670915842056274, "learning_rate": 9.807670261004064e-05, "loss": 0.1702812910079956, "memory(GiB)": 122.96, "step": 5810, "token_acc": 0.9422572178477691, "train_speed(iter/s)": 0.239764 }, { "epoch": 0.44325024773229665, "grad_norm": 1.1350347995758057, "learning_rate": 9.807341229658909e-05, "loss": 0.2053752899169922, "memory(GiB)": 122.96, "step": 5815, "token_acc": 0.9192162255070471, "train_speed(iter/s)": 0.239833 }, { "epoch": 0.4436313743425566, "grad_norm": 0.7824772000312805, "learning_rate": 9.807011922635043e-05, "loss": 0.1516520142555237, "memory(GiB)": 122.96, "step": 5820, "token_acc": 0.9405, "train_speed(iter/s)": 0.239908 }, { "epoch": 0.4440125009528165, "grad_norm": 0.7652502655982971, "learning_rate": 9.80668233995135e-05, "loss": 0.1715189814567566, "memory(GiB)": 122.96, "step": 5825, "token_acc": 0.9367866549604916, "train_speed(iter/s)": 0.239949 }, { "epoch": 0.44439362756307643, "grad_norm": 1.0123487710952759, "learning_rate": 9.806352481626731e-05, "loss": 0.16164605617523192, "memory(GiB)": 122.96, "step": 5830, "token_acc": 0.9404517453798767, "train_speed(iter/s)": 0.240006 }, { "epoch": 0.4447747541733364, "grad_norm": 0.8436840772628784, "learning_rate": 9.806022347680102e-05, "loss": 0.20975229740142823, "memory(GiB)": 122.96, "step": 5835, "token_acc": 0.9284684164984136, "train_speed(iter/s)": 0.240068 }, { "epoch": 0.4451558807835963, "grad_norm": 0.815699577331543, "learning_rate": 9.805691938130393e-05, "loss": 0.1857348084449768, "memory(GiB)": 122.96, "step": 5840, "token_acc": 0.9213706340009932, "train_speed(iter/s)": 0.24011 }, { "epoch": 0.44553700739385627, "grad_norm": 1.0349758863449097, "learning_rate": 9.805361252996553e-05, "loss": 0.3212902545928955, "memory(GiB)": 122.96, "step": 5845, "token_acc": 0.8871435476075399, "train_speed(iter/s)": 0.240169 }, { "epoch": 0.4459181340041162, "grad_norm": 1.031275749206543, "learning_rate": 9.805030292297545e-05, "loss": 0.21832432746887206, "memory(GiB)": 122.96, "step": 5850, "token_acc": 0.9196081062944571, "train_speed(iter/s)": 0.24018 }, { "epoch": 0.4462992606143761, "grad_norm": 0.8122202157974243, "learning_rate": 9.804699056052346e-05, "loss": 0.17147910594940186, "memory(GiB)": 122.96, "step": 5855, "token_acc": 0.9186902133922001, "train_speed(iter/s)": 0.240242 }, { "epoch": 0.44668038722463604, "grad_norm": 0.9089215993881226, "learning_rate": 9.804367544279955e-05, "loss": 0.2476818323135376, "memory(GiB)": 122.96, "step": 5860, "token_acc": 0.903143585386576, "train_speed(iter/s)": 0.240301 }, { "epoch": 0.44706151383489595, "grad_norm": 1.6330487728118896, "learning_rate": 9.804035756999379e-05, "loss": 0.12944949865341188, "memory(GiB)": 122.96, "step": 5865, "token_acc": 0.9380781574476118, "train_speed(iter/s)": 0.240334 }, { "epoch": 0.44744264044515586, "grad_norm": 0.898857593536377, "learning_rate": 9.803703694229647e-05, "loss": 0.1967276453971863, "memory(GiB)": 122.96, "step": 5870, "token_acc": 0.9038013964313422, "train_speed(iter/s)": 0.240406 }, { "epoch": 0.4478237670554158, "grad_norm": 0.8442309498786926, "learning_rate": 9.803371355989799e-05, "loss": 0.19658491611480713, "memory(GiB)": 122.96, "step": 5875, "token_acc": 0.924646265218822, "train_speed(iter/s)": 0.24044 }, { "epoch": 0.44820489366567573, "grad_norm": 0.9960586428642273, "learning_rate": 9.803038742298895e-05, "loss": 0.17091988325119017, "memory(GiB)": 122.96, "step": 5880, "token_acc": 0.9299492385786802, "train_speed(iter/s)": 0.240506 }, { "epoch": 0.4485860202759357, "grad_norm": 1.6009372472763062, "learning_rate": 9.802705853176006e-05, "loss": 0.2259000062942505, "memory(GiB)": 122.96, "step": 5885, "token_acc": 0.9284649776453056, "train_speed(iter/s)": 0.240546 }, { "epoch": 0.4489671468861956, "grad_norm": 0.6210229396820068, "learning_rate": 9.802372688640226e-05, "loss": 0.13849483728408812, "memory(GiB)": 122.96, "step": 5890, "token_acc": 0.9373254189944135, "train_speed(iter/s)": 0.240596 }, { "epoch": 0.4493482734964555, "grad_norm": 0.9559767842292786, "learning_rate": 9.802039248710658e-05, "loss": 0.2212167501449585, "memory(GiB)": 122.96, "step": 5895, "token_acc": 0.9153702551337897, "train_speed(iter/s)": 0.240631 }, { "epoch": 0.44972940010671547, "grad_norm": 1.0159063339233398, "learning_rate": 9.801705533406421e-05, "loss": 0.19691673517227173, "memory(GiB)": 122.96, "step": 5900, "token_acc": 0.9273869346733669, "train_speed(iter/s)": 0.240681 }, { "epoch": 0.4501105267169754, "grad_norm": 1.3927607536315918, "learning_rate": 9.801371542746656e-05, "loss": 0.20128841400146485, "memory(GiB)": 122.96, "step": 5905, "token_acc": 0.9102515243902439, "train_speed(iter/s)": 0.240716 }, { "epoch": 0.4504916533272353, "grad_norm": 0.8727665543556213, "learning_rate": 9.801037276750513e-05, "loss": 0.19798550605773926, "memory(GiB)": 122.96, "step": 5910, "token_acc": 0.9239709443099273, "train_speed(iter/s)": 0.240774 }, { "epoch": 0.45087277993749525, "grad_norm": 0.8969955444335938, "learning_rate": 9.800702735437163e-05, "loss": 0.19158585071563722, "memory(GiB)": 122.96, "step": 5915, "token_acc": 0.9345447440406409, "train_speed(iter/s)": 0.240826 }, { "epoch": 0.45125390654775516, "grad_norm": 1.0912129878997803, "learning_rate": 9.800367918825788e-05, "loss": 0.19772069454193114, "memory(GiB)": 122.96, "step": 5920, "token_acc": 0.914179104477612, "train_speed(iter/s)": 0.240883 }, { "epoch": 0.4516350331580151, "grad_norm": 1.689038634300232, "learning_rate": 9.80003282693559e-05, "loss": 0.13649771213531495, "memory(GiB)": 122.96, "step": 5925, "token_acc": 0.9254468085106383, "train_speed(iter/s)": 0.240927 }, { "epoch": 0.452016159768275, "grad_norm": 2.212923765182495, "learning_rate": 9.799697459785784e-05, "loss": 0.15688211917877198, "memory(GiB)": 122.96, "step": 5930, "token_acc": 0.9460495283018868, "train_speed(iter/s)": 0.240976 }, { "epoch": 0.45239728637853494, "grad_norm": 0.4153003990650177, "learning_rate": 9.799361817395603e-05, "loss": 0.17155764102935792, "memory(GiB)": 122.96, "step": 5935, "token_acc": 0.9236372180451128, "train_speed(iter/s)": 0.241025 }, { "epoch": 0.4527784129887949, "grad_norm": 0.7915443778038025, "learning_rate": 9.799025899784291e-05, "loss": 0.21812257766723633, "memory(GiB)": 122.96, "step": 5940, "token_acc": 0.9091167282022152, "train_speed(iter/s)": 0.241084 }, { "epoch": 0.4531595395990548, "grad_norm": 1.0526578426361084, "learning_rate": 9.798689706971116e-05, "loss": 0.1444568157196045, "memory(GiB)": 122.96, "step": 5945, "token_acc": 0.9345335515548282, "train_speed(iter/s)": 0.241105 }, { "epoch": 0.4535406662093147, "grad_norm": 0.9020674824714661, "learning_rate": 9.798353238975354e-05, "loss": 0.2149744987487793, "memory(GiB)": 122.96, "step": 5950, "token_acc": 0.9049147937762411, "train_speed(iter/s)": 0.24117 }, { "epoch": 0.4539217928195747, "grad_norm": 1.3054205179214478, "learning_rate": 9.798016495816301e-05, "loss": 0.19787540435791015, "memory(GiB)": 122.96, "step": 5955, "token_acc": 0.9326708578381667, "train_speed(iter/s)": 0.24121 }, { "epoch": 0.4543029194298346, "grad_norm": 1.297041416168213, "learning_rate": 9.797679477513268e-05, "loss": 0.23945987224578857, "memory(GiB)": 122.96, "step": 5960, "token_acc": 0.9138098318240621, "train_speed(iter/s)": 0.241242 }, { "epoch": 0.4546840460400945, "grad_norm": 1.0065598487854004, "learning_rate": 9.79734218408558e-05, "loss": 0.2548996925354004, "memory(GiB)": 122.96, "step": 5965, "token_acc": 0.9067090115125049, "train_speed(iter/s)": 0.241292 }, { "epoch": 0.45506517265035445, "grad_norm": 1.2521307468414307, "learning_rate": 9.797004615552581e-05, "loss": 0.23904087543487548, "memory(GiB)": 122.96, "step": 5970, "token_acc": 0.9077245718280321, "train_speed(iter/s)": 0.241304 }, { "epoch": 0.45544629926061436, "grad_norm": 1.3853578567504883, "learning_rate": 9.796666771933627e-05, "loss": 0.18243277072906494, "memory(GiB)": 122.96, "step": 5975, "token_acc": 0.9320049813200498, "train_speed(iter/s)": 0.241345 }, { "epoch": 0.4558274258708743, "grad_norm": 0.6620686650276184, "learning_rate": 9.796328653248095e-05, "loss": 0.1856495499610901, "memory(GiB)": 122.96, "step": 5980, "token_acc": 0.919170381856949, "train_speed(iter/s)": 0.241397 }, { "epoch": 0.45620855248113423, "grad_norm": 1.2593777179718018, "learning_rate": 9.79599025951537e-05, "loss": 0.19147990942001342, "memory(GiB)": 122.96, "step": 5985, "token_acc": 0.9288125191776618, "train_speed(iter/s)": 0.241422 }, { "epoch": 0.45658967909139414, "grad_norm": 1.281211495399475, "learning_rate": 9.795651590754862e-05, "loss": 0.2085340738296509, "memory(GiB)": 122.96, "step": 5990, "token_acc": 0.91320293398533, "train_speed(iter/s)": 0.241494 }, { "epoch": 0.4569708057016541, "grad_norm": 1.3750882148742676, "learning_rate": 9.795312646985988e-05, "loss": 0.1084937572479248, "memory(GiB)": 122.96, "step": 5995, "token_acc": 0.9557425018288223, "train_speed(iter/s)": 0.241534 }, { "epoch": 0.457351932311914, "grad_norm": 0.5986596345901489, "learning_rate": 9.794973428228186e-05, "loss": 0.18103621006011963, "memory(GiB)": 122.96, "step": 6000, "token_acc": 0.9323658634003461, "train_speed(iter/s)": 0.241551 }, { "epoch": 0.457351932311914, "eval_loss": 0.13991156220436096, "eval_runtime": 175.2782, "eval_samples_per_second": 3.024, "eval_steps_per_second": 3.024, "eval_token_acc": 0.9302602252876333, "step": 6000 }, { "epoch": 0.4577330589221739, "grad_norm": 1.8880329132080078, "learning_rate": 9.794633934500912e-05, "loss": 0.193447482585907, "memory(GiB)": 122.96, "step": 6005, "token_acc": 0.9300073248148449, "train_speed(iter/s)": 0.239922 }, { "epoch": 0.4581141855324339, "grad_norm": 1.0271570682525635, "learning_rate": 9.79429416582363e-05, "loss": 0.20707921981811522, "memory(GiB)": 122.96, "step": 6010, "token_acc": 0.9063872930043934, "train_speed(iter/s)": 0.239982 }, { "epoch": 0.4584953121426938, "grad_norm": 1.417992115020752, "learning_rate": 9.793954122215828e-05, "loss": 0.19072439670562744, "memory(GiB)": 122.96, "step": 6015, "token_acc": 0.9238711453744494, "train_speed(iter/s)": 0.240018 }, { "epoch": 0.45887643875295375, "grad_norm": 0.9639965295791626, "learning_rate": 9.793613803697e-05, "loss": 0.23551995754241944, "memory(GiB)": 122.96, "step": 6020, "token_acc": 0.9222369291859696, "train_speed(iter/s)": 0.240037 }, { "epoch": 0.45925756536321366, "grad_norm": 1.2461316585540771, "learning_rate": 9.793273210286668e-05, "loss": 0.17370550632476806, "memory(GiB)": 122.96, "step": 6025, "token_acc": 0.9396623304732908, "train_speed(iter/s)": 0.240091 }, { "epoch": 0.45963869197347357, "grad_norm": 0.9793689846992493, "learning_rate": 9.79293234200436e-05, "loss": 0.22397143840789796, "memory(GiB)": 122.96, "step": 6030, "token_acc": 0.9127338963404538, "train_speed(iter/s)": 0.240127 }, { "epoch": 0.46001981858373353, "grad_norm": 0.8239403963088989, "learning_rate": 9.792591198869624e-05, "loss": 0.18809986114501953, "memory(GiB)": 122.96, "step": 6035, "token_acc": 0.9366605214643139, "train_speed(iter/s)": 0.240142 }, { "epoch": 0.46040094519399344, "grad_norm": 1.0634331703186035, "learning_rate": 9.792249780902023e-05, "loss": 0.15295557975769042, "memory(GiB)": 122.96, "step": 6040, "token_acc": 0.9329516069449575, "train_speed(iter/s)": 0.240171 }, { "epoch": 0.46078207180425335, "grad_norm": 1.485556960105896, "learning_rate": 9.791908088121136e-05, "loss": 0.13063251972198486, "memory(GiB)": 122.96, "step": 6045, "token_acc": 0.9381800197823936, "train_speed(iter/s)": 0.240217 }, { "epoch": 0.4611631984145133, "grad_norm": 2.4607880115509033, "learning_rate": 9.791566120546558e-05, "loss": 0.28245253562927247, "memory(GiB)": 122.96, "step": 6050, "token_acc": 0.8954781319495922, "train_speed(iter/s)": 0.240273 }, { "epoch": 0.4615443250247732, "grad_norm": 1.2964967489242554, "learning_rate": 9.791223878197898e-05, "loss": 0.18415912389755248, "memory(GiB)": 122.96, "step": 6055, "token_acc": 0.9296092184368737, "train_speed(iter/s)": 0.240329 }, { "epoch": 0.4619254516350332, "grad_norm": 0.771730363368988, "learning_rate": 9.790881361094783e-05, "loss": 0.17172136306762695, "memory(GiB)": 122.96, "step": 6060, "token_acc": 0.9167776298268975, "train_speed(iter/s)": 0.240388 }, { "epoch": 0.4623065782452931, "grad_norm": 1.3884042501449585, "learning_rate": 9.790538569256855e-05, "loss": 0.14651261568069457, "memory(GiB)": 122.96, "step": 6065, "token_acc": 0.944647201946472, "train_speed(iter/s)": 0.240446 }, { "epoch": 0.462687704855553, "grad_norm": 1.105020523071289, "learning_rate": 9.79019550270377e-05, "loss": 0.15576455593109131, "memory(GiB)": 122.96, "step": 6070, "token_acc": 0.9360902255639098, "train_speed(iter/s)": 0.240518 }, { "epoch": 0.46306883146581296, "grad_norm": 0.15740063786506653, "learning_rate": 9.789852161455201e-05, "loss": 0.13167004585266112, "memory(GiB)": 122.96, "step": 6075, "token_acc": 0.9384745048461862, "train_speed(iter/s)": 0.240573 }, { "epoch": 0.46344995807607287, "grad_norm": 0.2486758530139923, "learning_rate": 9.78950854553084e-05, "loss": 0.16045811176300048, "memory(GiB)": 122.96, "step": 6080, "token_acc": 0.9302813017098731, "train_speed(iter/s)": 0.240588 }, { "epoch": 0.4638310846863328, "grad_norm": 1.312091588973999, "learning_rate": 9.789164654950389e-05, "loss": 0.13247040510177613, "memory(GiB)": 122.96, "step": 6085, "token_acc": 0.941397445529677, "train_speed(iter/s)": 0.240664 }, { "epoch": 0.46421221129659274, "grad_norm": 1.0914182662963867, "learning_rate": 9.788820489733569e-05, "loss": 0.20113954544067383, "memory(GiB)": 122.96, "step": 6090, "token_acc": 0.9264722830471361, "train_speed(iter/s)": 0.240659 }, { "epoch": 0.46459333790685264, "grad_norm": 0.9428834915161133, "learning_rate": 9.788476049900118e-05, "loss": 0.1981325626373291, "memory(GiB)": 122.96, "step": 6095, "token_acc": 0.9212381244253754, "train_speed(iter/s)": 0.240686 }, { "epoch": 0.4649744645171126, "grad_norm": 1.2574355602264404, "learning_rate": 9.788131335469786e-05, "loss": 0.1840927243232727, "memory(GiB)": 122.96, "step": 6100, "token_acc": 0.9277456647398844, "train_speed(iter/s)": 0.240714 }, { "epoch": 0.4653555911273725, "grad_norm": 1.0052565336227417, "learning_rate": 9.787786346462341e-05, "loss": 0.22618136405944825, "memory(GiB)": 122.96, "step": 6105, "token_acc": 0.8960548074651548, "train_speed(iter/s)": 0.240772 }, { "epoch": 0.4657367177376324, "grad_norm": 0.7051888108253479, "learning_rate": 9.787441082897569e-05, "loss": 0.20226945877075195, "memory(GiB)": 122.96, "step": 6110, "token_acc": 0.9337738399159032, "train_speed(iter/s)": 0.240799 }, { "epoch": 0.4661178443478924, "grad_norm": 0.5594191551208496, "learning_rate": 9.787095544795264e-05, "loss": 0.15390095710754395, "memory(GiB)": 122.96, "step": 6115, "token_acc": 0.9313614811469858, "train_speed(iter/s)": 0.240861 }, { "epoch": 0.4664989709581523, "grad_norm": 1.0632768869400024, "learning_rate": 9.786749732175248e-05, "loss": 0.2082515001296997, "memory(GiB)": 122.96, "step": 6120, "token_acc": 0.918763479511143, "train_speed(iter/s)": 0.240888 }, { "epoch": 0.4668800975684122, "grad_norm": 0.570037305355072, "learning_rate": 9.786403645057346e-05, "loss": 0.21178140640258789, "memory(GiB)": 122.96, "step": 6125, "token_acc": 0.9279760632613806, "train_speed(iter/s)": 0.240936 }, { "epoch": 0.46726122417867216, "grad_norm": 0.5749973058700562, "learning_rate": 9.786057283461407e-05, "loss": 0.17590250968933105, "memory(GiB)": 122.96, "step": 6130, "token_acc": 0.9260831122900088, "train_speed(iter/s)": 0.240981 }, { "epoch": 0.46764235078893207, "grad_norm": 1.78145432472229, "learning_rate": 9.785710647407293e-05, "loss": 0.17096209526062012, "memory(GiB)": 122.96, "step": 6135, "token_acc": 0.9377641183250096, "train_speed(iter/s)": 0.241046 }, { "epoch": 0.46802347739919203, "grad_norm": 1.1055325269699097, "learning_rate": 9.78536373691488e-05, "loss": 0.18834007978439332, "memory(GiB)": 122.96, "step": 6140, "token_acc": 0.9413496376811594, "train_speed(iter/s)": 0.241098 }, { "epoch": 0.46840460400945194, "grad_norm": 0.19333411753177643, "learning_rate": 9.785016552004067e-05, "loss": 0.20599629878997802, "memory(GiB)": 122.96, "step": 6145, "token_acc": 0.9043166898746767, "train_speed(iter/s)": 0.241143 }, { "epoch": 0.46878573061971185, "grad_norm": 0.502638041973114, "learning_rate": 9.784669092694758e-05, "loss": 0.1009904384613037, "memory(GiB)": 122.96, "step": 6150, "token_acc": 0.9367192299815688, "train_speed(iter/s)": 0.241174 }, { "epoch": 0.4691668572299718, "grad_norm": 1.298183798789978, "learning_rate": 9.78432135900688e-05, "loss": 0.19954068660736085, "memory(GiB)": 122.96, "step": 6155, "token_acc": 0.9176623376623376, "train_speed(iter/s)": 0.241226 }, { "epoch": 0.4695479838402317, "grad_norm": 1.2767728567123413, "learning_rate": 9.783973350960375e-05, "loss": 0.18137484788894653, "memory(GiB)": 122.96, "step": 6160, "token_acc": 0.9456163007605108, "train_speed(iter/s)": 0.241246 }, { "epoch": 0.4699291104504916, "grad_norm": 0.7384737730026245, "learning_rate": 9.783625068575197e-05, "loss": 0.1530519485473633, "memory(GiB)": 122.96, "step": 6165, "token_acc": 0.945615404631798, "train_speed(iter/s)": 0.2413 }, { "epoch": 0.4703102370607516, "grad_norm": 1.1244175434112549, "learning_rate": 9.783276511871323e-05, "loss": 0.20192952156066896, "memory(GiB)": 122.96, "step": 6170, "token_acc": 0.9290009250693803, "train_speed(iter/s)": 0.241346 }, { "epoch": 0.4706913636710115, "grad_norm": 0.7620112895965576, "learning_rate": 9.782927680868734e-05, "loss": 0.16360985040664672, "memory(GiB)": 122.96, "step": 6175, "token_acc": 0.9380905832518736, "train_speed(iter/s)": 0.241377 }, { "epoch": 0.47107249028127146, "grad_norm": 1.8818175792694092, "learning_rate": 9.782578575587441e-05, "loss": 0.17476404905319215, "memory(GiB)": 122.96, "step": 6180, "token_acc": 0.9248736273313578, "train_speed(iter/s)": 0.241401 }, { "epoch": 0.47145361689153137, "grad_norm": 0.5214102864265442, "learning_rate": 9.78222919604746e-05, "loss": 0.1421543002128601, "memory(GiB)": 122.96, "step": 6185, "token_acc": 0.9367930805056554, "train_speed(iter/s)": 0.241457 }, { "epoch": 0.4718347435017913, "grad_norm": 1.2178155183792114, "learning_rate": 9.781879542268826e-05, "loss": 0.16018153429031373, "memory(GiB)": 122.96, "step": 6190, "token_acc": 0.9360016677089847, "train_speed(iter/s)": 0.241499 }, { "epoch": 0.47221587011205124, "grad_norm": 1.4756392240524292, "learning_rate": 9.781529614271591e-05, "loss": 0.18035542964935303, "memory(GiB)": 122.96, "step": 6195, "token_acc": 0.9304424389170152, "train_speed(iter/s)": 0.241546 }, { "epoch": 0.47259699672231115, "grad_norm": 0.8919601440429688, "learning_rate": 9.781179412075822e-05, "loss": 0.2081521987915039, "memory(GiB)": 122.96, "step": 6200, "token_acc": 0.9164727178691492, "train_speed(iter/s)": 0.241592 }, { "epoch": 0.47259699672231115, "eval_loss": 0.1436389684677124, "eval_runtime": 177.2465, "eval_samples_per_second": 2.99, "eval_steps_per_second": 2.99, "eval_token_acc": 0.9298686826094814, "step": 6200 }, { "epoch": 0.47297812333257105, "grad_norm": 0.9285817742347717, "learning_rate": 9.780828935701602e-05, "loss": 0.1946765422821045, "memory(GiB)": 122.96, "step": 6205, "token_acc": 0.9297743407888459, "train_speed(iter/s)": 0.239965 }, { "epoch": 0.473359249942831, "grad_norm": 0.5768506526947021, "learning_rate": 9.780478185169027e-05, "loss": 0.2110511302947998, "memory(GiB)": 122.96, "step": 6210, "token_acc": 0.9335609617299758, "train_speed(iter/s)": 0.239989 }, { "epoch": 0.4737403765530909, "grad_norm": 1.3330050706863403, "learning_rate": 9.780127160498214e-05, "loss": 0.18445327281951904, "memory(GiB)": 122.96, "step": 6215, "token_acc": 0.9310850439882697, "train_speed(iter/s)": 0.240016 }, { "epoch": 0.4741215031633509, "grad_norm": 0.8476679921150208, "learning_rate": 9.779775861709288e-05, "loss": 0.21979308128356934, "memory(GiB)": 122.96, "step": 6220, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 0.240051 }, { "epoch": 0.4745026297736108, "grad_norm": 0.828378438949585, "learning_rate": 9.7794242888224e-05, "loss": 0.16875656843185424, "memory(GiB)": 122.96, "step": 6225, "token_acc": 0.9388928828181164, "train_speed(iter/s)": 0.240108 }, { "epoch": 0.4748837563838707, "grad_norm": 1.2499819993972778, "learning_rate": 9.779072441857706e-05, "loss": 0.17264634370803833, "memory(GiB)": 122.96, "step": 6230, "token_acc": 0.9228723404255319, "train_speed(iter/s)": 0.240169 }, { "epoch": 0.47526488299413067, "grad_norm": 0.8028970956802368, "learning_rate": 9.778720320835386e-05, "loss": 0.1622241497039795, "memory(GiB)": 122.96, "step": 6235, "token_acc": 0.9402938901778809, "train_speed(iter/s)": 0.2402 }, { "epoch": 0.4756460096043906, "grad_norm": 0.8611127138137817, "learning_rate": 9.778367925775632e-05, "loss": 0.18411366939544677, "memory(GiB)": 122.96, "step": 6240, "token_acc": 0.9138078902229846, "train_speed(iter/s)": 0.240265 }, { "epoch": 0.4760271362146505, "grad_norm": 1.0704749822616577, "learning_rate": 9.778015256698651e-05, "loss": 0.1830517292022705, "memory(GiB)": 122.96, "step": 6245, "token_acc": 0.9132610508757297, "train_speed(iter/s)": 0.240314 }, { "epoch": 0.47640826282491044, "grad_norm": 1.0482726097106934, "learning_rate": 9.77766231362467e-05, "loss": 0.12964816093444825, "memory(GiB)": 122.96, "step": 6250, "token_acc": 0.9480249480249481, "train_speed(iter/s)": 0.240318 }, { "epoch": 0.47678938943517035, "grad_norm": 0.9440543055534363, "learning_rate": 9.777309096573923e-05, "loss": 0.20174243450164794, "memory(GiB)": 122.96, "step": 6255, "token_acc": 0.9307603353570396, "train_speed(iter/s)": 0.240351 }, { "epoch": 0.4771705160454303, "grad_norm": 0.863926887512207, "learning_rate": 9.776955605566671e-05, "loss": 0.19633069038391113, "memory(GiB)": 122.96, "step": 6260, "token_acc": 0.9244060475161987, "train_speed(iter/s)": 0.240371 }, { "epoch": 0.4775516426556902, "grad_norm": 0.986570417881012, "learning_rate": 9.776601840623182e-05, "loss": 0.23889293670654296, "memory(GiB)": 122.96, "step": 6265, "token_acc": 0.900495867768595, "train_speed(iter/s)": 0.240432 }, { "epoch": 0.47793276926595013, "grad_norm": 0.7743133902549744, "learning_rate": 9.776247801763744e-05, "loss": 0.1475817084312439, "memory(GiB)": 122.96, "step": 6270, "token_acc": 0.9521544487968663, "train_speed(iter/s)": 0.240493 }, { "epoch": 0.4783138958762101, "grad_norm": 0.9538258910179138, "learning_rate": 9.775893489008658e-05, "loss": 0.12543011903762818, "memory(GiB)": 122.96, "step": 6275, "token_acc": 0.9501424501424501, "train_speed(iter/s)": 0.240541 }, { "epoch": 0.47869502248647, "grad_norm": 0.6504844427108765, "learning_rate": 9.775538902378245e-05, "loss": 0.13363256454467773, "memory(GiB)": 122.96, "step": 6280, "token_acc": 0.9299855142443264, "train_speed(iter/s)": 0.240597 }, { "epoch": 0.4790761490967299, "grad_norm": 0.8694522380828857, "learning_rate": 9.775184041892836e-05, "loss": 0.18660924434661866, "memory(GiB)": 122.96, "step": 6285, "token_acc": 0.929216220088838, "train_speed(iter/s)": 0.240639 }, { "epoch": 0.47945727570698987, "grad_norm": 0.43452000617980957, "learning_rate": 9.774828907572782e-05, "loss": 0.16778392791748048, "memory(GiB)": 122.96, "step": 6290, "token_acc": 0.9315789473684211, "train_speed(iter/s)": 0.240689 }, { "epoch": 0.4798384023172498, "grad_norm": 0.9831333756446838, "learning_rate": 9.774473499438448e-05, "loss": 0.17738786935806275, "memory(GiB)": 122.96, "step": 6295, "token_acc": 0.9327010760532555, "train_speed(iter/s)": 0.240724 }, { "epoch": 0.48021952892750974, "grad_norm": 1.3676713705062866, "learning_rate": 9.774117817510214e-05, "loss": 0.17861173152923585, "memory(GiB)": 122.96, "step": 6300, "token_acc": 0.9323705040966532, "train_speed(iter/s)": 0.240743 }, { "epoch": 0.48060065553776965, "grad_norm": 0.8645610809326172, "learning_rate": 9.77376186180848e-05, "loss": 0.24651014804840088, "memory(GiB)": 122.96, "step": 6305, "token_acc": 0.9055238095238095, "train_speed(iter/s)": 0.240785 }, { "epoch": 0.48098178214802956, "grad_norm": 0.525933563709259, "learning_rate": 9.773405632353654e-05, "loss": 0.15430068969726562, "memory(GiB)": 122.96, "step": 6310, "token_acc": 0.9443252114631991, "train_speed(iter/s)": 0.240803 }, { "epoch": 0.4813629087582895, "grad_norm": 1.2553365230560303, "learning_rate": 9.773049129166168e-05, "loss": 0.22492051124572754, "memory(GiB)": 122.96, "step": 6315, "token_acc": 0.9024506305020223, "train_speed(iter/s)": 0.240853 }, { "epoch": 0.48174403536854943, "grad_norm": 1.5549840927124023, "learning_rate": 9.772692352266464e-05, "loss": 0.22755539417266846, "memory(GiB)": 122.96, "step": 6320, "token_acc": 0.9074550128534704, "train_speed(iter/s)": 0.240884 }, { "epoch": 0.48212516197880934, "grad_norm": 0.7906176447868347, "learning_rate": 9.772335301675e-05, "loss": 0.18742153644561768, "memory(GiB)": 122.96, "step": 6325, "token_acc": 0.933064173395665, "train_speed(iter/s)": 0.240935 }, { "epoch": 0.4825062885890693, "grad_norm": 0.809965968132019, "learning_rate": 9.771977977412254e-05, "loss": 0.18241484165191652, "memory(GiB)": 122.96, "step": 6330, "token_acc": 0.9310298826040555, "train_speed(iter/s)": 0.240962 }, { "epoch": 0.4828874151993292, "grad_norm": 1.0171654224395752, "learning_rate": 9.771620379498716e-05, "loss": 0.13738546371459961, "memory(GiB)": 122.96, "step": 6335, "token_acc": 0.9460628785693683, "train_speed(iter/s)": 0.241019 }, { "epoch": 0.48326854180958917, "grad_norm": 0.8749512434005737, "learning_rate": 9.77126250795489e-05, "loss": 0.19574857950210572, "memory(GiB)": 122.96, "step": 6340, "token_acc": 0.9178541492036881, "train_speed(iter/s)": 0.241055 }, { "epoch": 0.4836496684198491, "grad_norm": 0.9702167510986328, "learning_rate": 9.770904362801304e-05, "loss": 0.16193156242370604, "memory(GiB)": 122.96, "step": 6345, "token_acc": 0.9309173272933182, "train_speed(iter/s)": 0.241106 }, { "epoch": 0.484030795030109, "grad_norm": 0.942905843257904, "learning_rate": 9.77054594405849e-05, "loss": 0.18963894844055176, "memory(GiB)": 122.96, "step": 6350, "token_acc": 0.9279203477773104, "train_speed(iter/s)": 0.241133 }, { "epoch": 0.48441192164036895, "grad_norm": 0.6703991293907166, "learning_rate": 9.770187251747005e-05, "loss": 0.196355938911438, "memory(GiB)": 122.96, "step": 6355, "token_acc": 0.921037210665104, "train_speed(iter/s)": 0.241174 }, { "epoch": 0.48479304825062886, "grad_norm": 0.9493531584739685, "learning_rate": 9.769828285887417e-05, "loss": 0.18587944507598878, "memory(GiB)": 122.96, "step": 6360, "token_acc": 0.9198664440734557, "train_speed(iter/s)": 0.241234 }, { "epoch": 0.48517417486088876, "grad_norm": 0.8998671770095825, "learning_rate": 9.769469046500313e-05, "loss": 0.17140390872955322, "memory(GiB)": 122.96, "step": 6365, "token_acc": 0.9224291911201837, "train_speed(iter/s)": 0.24128 }, { "epoch": 0.4855553014711487, "grad_norm": 0.641680121421814, "learning_rate": 9.76910953360629e-05, "loss": 0.16919420957565307, "memory(GiB)": 122.96, "step": 6370, "token_acc": 0.9428067523305619, "train_speed(iter/s)": 0.241335 }, { "epoch": 0.48593642808140863, "grad_norm": 2.2604432106018066, "learning_rate": 9.768749747225968e-05, "loss": 0.19004164934158324, "memory(GiB)": 122.96, "step": 6375, "token_acc": 0.924864446165763, "train_speed(iter/s)": 0.241391 }, { "epoch": 0.4863175546916686, "grad_norm": 0.9189158082008362, "learning_rate": 9.768389687379978e-05, "loss": 0.11925365924835205, "memory(GiB)": 122.96, "step": 6380, "token_acc": 0.9522058823529411, "train_speed(iter/s)": 0.241458 }, { "epoch": 0.4866986813019285, "grad_norm": 1.0211783647537231, "learning_rate": 9.768029354088966e-05, "loss": 0.21883459091186525, "memory(GiB)": 122.96, "step": 6385, "token_acc": 0.8939354161197165, "train_speed(iter/s)": 0.241508 }, { "epoch": 0.4870798079121884, "grad_norm": 0.6961852312088013, "learning_rate": 9.767668747373597e-05, "loss": 0.1395933747291565, "memory(GiB)": 122.96, "step": 6390, "token_acc": 0.9345101048861602, "train_speed(iter/s)": 0.241515 }, { "epoch": 0.4874609345224484, "grad_norm": 1.1393874883651733, "learning_rate": 9.767307867254551e-05, "loss": 0.15481575727462768, "memory(GiB)": 122.96, "step": 6395, "token_acc": 0.9504249291784702, "train_speed(iter/s)": 0.241542 }, { "epoch": 0.4878420611327083, "grad_norm": 0.7874759435653687, "learning_rate": 9.766946713752523e-05, "loss": 0.1861614465713501, "memory(GiB)": 122.96, "step": 6400, "token_acc": 0.9349048050770625, "train_speed(iter/s)": 0.241567 }, { "epoch": 0.4878420611327083, "eval_loss": 0.1394554227590561, "eval_runtime": 195.3227, "eval_samples_per_second": 2.713, "eval_steps_per_second": 2.713, "eval_token_acc": 0.9307421239684356, "step": 6400 }, { "epoch": 0.4882231877429682, "grad_norm": 1.203920602798462, "learning_rate": 9.76658528688822e-05, "loss": 0.2062145948410034, "memory(GiB)": 122.96, "step": 6405, "token_acc": 0.9301010902444726, "train_speed(iter/s)": 0.239814 }, { "epoch": 0.48860431435322815, "grad_norm": 0.9845380783081055, "learning_rate": 9.766223586682371e-05, "loss": 0.17991364002227783, "memory(GiB)": 122.96, "step": 6410, "token_acc": 0.9276005921141165, "train_speed(iter/s)": 0.239833 }, { "epoch": 0.48898544096348806, "grad_norm": 1.248170256614685, "learning_rate": 9.765861613155718e-05, "loss": 0.14771295785903932, "memory(GiB)": 122.96, "step": 6415, "token_acc": 0.9436025321312105, "train_speed(iter/s)": 0.239864 }, { "epoch": 0.489366567573748, "grad_norm": 1.397081732749939, "learning_rate": 9.765499366329018e-05, "loss": 0.23156981468200682, "memory(GiB)": 122.96, "step": 6420, "token_acc": 0.907040841894262, "train_speed(iter/s)": 0.239917 }, { "epoch": 0.48974769418400793, "grad_norm": 0.7324181795120239, "learning_rate": 9.765136846223043e-05, "loss": 0.2054084300994873, "memory(GiB)": 122.96, "step": 6425, "token_acc": 0.9163768574138477, "train_speed(iter/s)": 0.239952 }, { "epoch": 0.49012882079426784, "grad_norm": 0.7780849933624268, "learning_rate": 9.764774052858583e-05, "loss": 0.21540656089782714, "memory(GiB)": 122.96, "step": 6430, "token_acc": 0.9053576285014958, "train_speed(iter/s)": 0.240011 }, { "epoch": 0.4905099474045278, "grad_norm": 0.8740113377571106, "learning_rate": 9.764410986256443e-05, "loss": 0.1772850513458252, "memory(GiB)": 122.96, "step": 6435, "token_acc": 0.9204599904168663, "train_speed(iter/s)": 0.240062 }, { "epoch": 0.4908910740147877, "grad_norm": 0.44009310007095337, "learning_rate": 9.764047646437442e-05, "loss": 0.21341395378112793, "memory(GiB)": 122.96, "step": 6440, "token_acc": 0.9203208556149732, "train_speed(iter/s)": 0.240089 }, { "epoch": 0.4912722006250476, "grad_norm": 1.2725169658660889, "learning_rate": 9.763684033422416e-05, "loss": 0.15246518850326538, "memory(GiB)": 122.96, "step": 6445, "token_acc": 0.933291770573566, "train_speed(iter/s)": 0.240149 }, { "epoch": 0.4916533272353076, "grad_norm": 0.807914137840271, "learning_rate": 9.763320147232217e-05, "loss": 0.2149585723876953, "memory(GiB)": 122.96, "step": 6450, "token_acc": 0.9195455243359435, "train_speed(iter/s)": 0.240178 }, { "epoch": 0.4920344538455675, "grad_norm": 1.5816762447357178, "learning_rate": 9.762955987887713e-05, "loss": 0.1282161593437195, "memory(GiB)": 122.96, "step": 6455, "token_acc": 0.9499354005167958, "train_speed(iter/s)": 0.240233 }, { "epoch": 0.49241558045582745, "grad_norm": 1.6350860595703125, "learning_rate": 9.762591555409786e-05, "loss": 0.18068366050720214, "memory(GiB)": 122.96, "step": 6460, "token_acc": 0.9322374429223744, "train_speed(iter/s)": 0.240277 }, { "epoch": 0.49279670706608736, "grad_norm": 0.6775110960006714, "learning_rate": 9.762226849819333e-05, "loss": 0.24361398220062255, "memory(GiB)": 122.96, "step": 6465, "token_acc": 0.911071978868589, "train_speed(iter/s)": 0.240286 }, { "epoch": 0.49317783367634727, "grad_norm": 0.8510713577270508, "learning_rate": 9.76186187113727e-05, "loss": 0.15374698638916015, "memory(GiB)": 122.96, "step": 6470, "token_acc": 0.9357287449392713, "train_speed(iter/s)": 0.240325 }, { "epoch": 0.49355896028660723, "grad_norm": 0.6601843237876892, "learning_rate": 9.761496619384527e-05, "loss": 0.1845821738243103, "memory(GiB)": 122.96, "step": 6475, "token_acc": 0.9324884231050451, "train_speed(iter/s)": 0.240376 }, { "epoch": 0.49394008689686714, "grad_norm": 0.8834421634674072, "learning_rate": 9.76113109458205e-05, "loss": 0.215891695022583, "memory(GiB)": 122.96, "step": 6480, "token_acc": 0.9156400083524744, "train_speed(iter/s)": 0.24043 }, { "epoch": 0.49432121350712704, "grad_norm": 1.073808193206787, "learning_rate": 9.760765296750796e-05, "loss": 0.15068576335906983, "memory(GiB)": 122.96, "step": 6485, "token_acc": 0.9489429023443118, "train_speed(iter/s)": 0.240462 }, { "epoch": 0.494702340117387, "grad_norm": 0.5935428142547607, "learning_rate": 9.760399225911748e-05, "loss": 0.17301468849182128, "memory(GiB)": 122.96, "step": 6490, "token_acc": 0.9283176270662753, "train_speed(iter/s)": 0.2405 }, { "epoch": 0.4950834667276469, "grad_norm": 1.0851857662200928, "learning_rate": 9.760032882085892e-05, "loss": 0.11099461317062378, "memory(GiB)": 122.96, "step": 6495, "token_acc": 0.9421613394216134, "train_speed(iter/s)": 0.240561 }, { "epoch": 0.4954645933379069, "grad_norm": 1.2804749011993408, "learning_rate": 9.759666265294242e-05, "loss": 0.21786072254180908, "memory(GiB)": 122.96, "step": 6500, "token_acc": 0.9078469104126576, "train_speed(iter/s)": 0.240601 }, { "epoch": 0.4958457199481668, "grad_norm": 0.6460931301116943, "learning_rate": 9.759299375557819e-05, "loss": 0.20640697479248046, "memory(GiB)": 122.96, "step": 6505, "token_acc": 0.938498003992016, "train_speed(iter/s)": 0.240594 }, { "epoch": 0.4962268465584267, "grad_norm": 1.8107999563217163, "learning_rate": 9.758932212897663e-05, "loss": 0.1988211154937744, "memory(GiB)": 122.96, "step": 6510, "token_acc": 0.9185185185185185, "train_speed(iter/s)": 0.240649 }, { "epoch": 0.49660797316868666, "grad_norm": 0.5852643251419067, "learning_rate": 9.758564777334826e-05, "loss": 0.20852484703063964, "memory(GiB)": 122.96, "step": 6515, "token_acc": 0.9229998747965444, "train_speed(iter/s)": 0.240678 }, { "epoch": 0.49698909977894656, "grad_norm": 1.001548171043396, "learning_rate": 9.758197068890385e-05, "loss": 0.14101855754852294, "memory(GiB)": 122.96, "step": 6520, "token_acc": 0.9397394136807817, "train_speed(iter/s)": 0.240718 }, { "epoch": 0.49737022638920647, "grad_norm": 0.6867068409919739, "learning_rate": 9.757829087585421e-05, "loss": 0.1366589307785034, "memory(GiB)": 122.96, "step": 6525, "token_acc": 0.9455106237148732, "train_speed(iter/s)": 0.240782 }, { "epoch": 0.49775135299946643, "grad_norm": 0.8611955642700195, "learning_rate": 9.757460833441039e-05, "loss": 0.15103321075439452, "memory(GiB)": 122.96, "step": 6530, "token_acc": 0.9488117001828154, "train_speed(iter/s)": 0.240844 }, { "epoch": 0.49813247960972634, "grad_norm": 1.3610928058624268, "learning_rate": 9.757092306478353e-05, "loss": 0.1536560297012329, "memory(GiB)": 122.96, "step": 6535, "token_acc": 0.9383786316776007, "train_speed(iter/s)": 0.240887 }, { "epoch": 0.4985136062199863, "grad_norm": 1.139328956604004, "learning_rate": 9.756723506718502e-05, "loss": 0.16389427185058594, "memory(GiB)": 122.96, "step": 6540, "token_acc": 0.9367864693446089, "train_speed(iter/s)": 0.240927 }, { "epoch": 0.4988947328302462, "grad_norm": 1.0638970136642456, "learning_rate": 9.75635443418263e-05, "loss": 0.1960911989212036, "memory(GiB)": 122.96, "step": 6545, "token_acc": 0.9129049389272438, "train_speed(iter/s)": 0.240961 }, { "epoch": 0.4992758594405061, "grad_norm": 0.7460038065910339, "learning_rate": 9.755985088891904e-05, "loss": 0.19907000064849853, "memory(GiB)": 122.96, "step": 6550, "token_acc": 0.9223195825539037, "train_speed(iter/s)": 0.240978 }, { "epoch": 0.4996569860507661, "grad_norm": 1.9872097969055176, "learning_rate": 9.755615470867503e-05, "loss": 0.2410585403442383, "memory(GiB)": 122.96, "step": 6555, "token_acc": 0.8963782696177063, "train_speed(iter/s)": 0.241038 }, { "epoch": 0.500038112661026, "grad_norm": 1.1022579669952393, "learning_rate": 9.755245580130623e-05, "loss": 0.17727997303009033, "memory(GiB)": 122.96, "step": 6560, "token_acc": 0.9186585800927578, "train_speed(iter/s)": 0.241094 }, { "epoch": 0.500419239271286, "grad_norm": 0.7179242372512817, "learning_rate": 9.754875416702476e-05, "loss": 0.1432894229888916, "memory(GiB)": 122.96, "step": 6565, "token_acc": 0.9332871652816251, "train_speed(iter/s)": 0.241136 }, { "epoch": 0.5008003658815459, "grad_norm": 0.7568638920783997, "learning_rate": 9.75450498060429e-05, "loss": 0.16202276945114136, "memory(GiB)": 122.96, "step": 6570, "token_acc": 0.9189082278481012, "train_speed(iter/s)": 0.241171 }, { "epoch": 0.5011814924918058, "grad_norm": 1.9364062547683716, "learning_rate": 9.754134271857308e-05, "loss": 0.22362132072448732, "memory(GiB)": 122.96, "step": 6575, "token_acc": 0.9005658852061439, "train_speed(iter/s)": 0.241235 }, { "epoch": 0.5015626191020657, "grad_norm": 0.8721165657043457, "learning_rate": 9.753763290482787e-05, "loss": 0.1431664228439331, "memory(GiB)": 122.96, "step": 6580, "token_acc": 0.9497340881797907, "train_speed(iter/s)": 0.241272 }, { "epoch": 0.5019437457123256, "grad_norm": 0.7225767970085144, "learning_rate": 9.753392036502e-05, "loss": 0.19342904090881347, "memory(GiB)": 122.96, "step": 6585, "token_acc": 0.925073457394711, "train_speed(iter/s)": 0.241291 }, { "epoch": 0.5023248723225856, "grad_norm": 0.8878169655799866, "learning_rate": 9.753020509936239e-05, "loss": 0.22300024032592775, "memory(GiB)": 122.96, "step": 6590, "token_acc": 0.9205658324265505, "train_speed(iter/s)": 0.241335 }, { "epoch": 0.5027059989328455, "grad_norm": 0.12067373842000961, "learning_rate": 9.752648710806809e-05, "loss": 0.1109616756439209, "memory(GiB)": 122.96, "step": 6595, "token_acc": 0.9531960417223857, "train_speed(iter/s)": 0.241388 }, { "epoch": 0.5030871255431054, "grad_norm": 1.0187158584594727, "learning_rate": 9.752276639135029e-05, "loss": 0.21343884468078614, "memory(GiB)": 122.96, "step": 6600, "token_acc": 0.9299876084262702, "train_speed(iter/s)": 0.241455 }, { "epoch": 0.5030871255431054, "eval_loss": 0.14037811756134033, "eval_runtime": 176.6244, "eval_samples_per_second": 3.001, "eval_steps_per_second": 3.001, "eval_token_acc": 0.9300795132823324, "step": 6600 }, { "epoch": 0.5034682521533653, "grad_norm": 1.2730140686035156, "learning_rate": 9.751904294942238e-05, "loss": 0.19893765449523926, "memory(GiB)": 122.96, "step": 6605, "token_acc": 0.929830121236688, "train_speed(iter/s)": 0.239958 }, { "epoch": 0.5038493787636252, "grad_norm": 0.8214978575706482, "learning_rate": 9.751531678249786e-05, "loss": 0.1650695323944092, "memory(GiB)": 122.96, "step": 6610, "token_acc": 0.9110152075626798, "train_speed(iter/s)": 0.240002 }, { "epoch": 0.5042305053738853, "grad_norm": 0.631286084651947, "learning_rate": 9.751158789079045e-05, "loss": 0.089615398645401, "memory(GiB)": 122.96, "step": 6615, "token_acc": 0.9601930036188179, "train_speed(iter/s)": 0.240073 }, { "epoch": 0.5046116319841452, "grad_norm": 0.39522644877433777, "learning_rate": 9.750785627451393e-05, "loss": 0.21457579135894775, "memory(GiB)": 122.96, "step": 6620, "token_acc": 0.9062258313998454, "train_speed(iter/s)": 0.240113 }, { "epoch": 0.5049927585944051, "grad_norm": 1.2108502388000488, "learning_rate": 9.750412193388232e-05, "loss": 0.14984385967254638, "memory(GiB)": 122.96, "step": 6625, "token_acc": 0.9485205677171037, "train_speed(iter/s)": 0.240155 }, { "epoch": 0.505373885204665, "grad_norm": 1.0687733888626099, "learning_rate": 9.750038486910977e-05, "loss": 0.2204139471054077, "memory(GiB)": 122.96, "step": 6630, "token_acc": 0.9115543030565793, "train_speed(iter/s)": 0.240204 }, { "epoch": 0.5057550118149249, "grad_norm": 1.0686695575714111, "learning_rate": 9.749664508041056e-05, "loss": 0.21006531715393068, "memory(GiB)": 122.96, "step": 6635, "token_acc": 0.9204886841578209, "train_speed(iter/s)": 0.240253 }, { "epoch": 0.5061361384251849, "grad_norm": 1.0463881492614746, "learning_rate": 9.749290256799917e-05, "loss": 0.20622293949127196, "memory(GiB)": 122.96, "step": 6640, "token_acc": 0.8971048513302035, "train_speed(iter/s)": 0.240314 }, { "epoch": 0.5065172650354448, "grad_norm": 0.8689031004905701, "learning_rate": 9.748915733209023e-05, "loss": 0.19636025428771972, "memory(GiB)": 122.96, "step": 6645, "token_acc": 0.9223097112860892, "train_speed(iter/s)": 0.240362 }, { "epoch": 0.5068983916457047, "grad_norm": 2.0423810482025146, "learning_rate": 9.748540937289849e-05, "loss": 0.16106393337249755, "memory(GiB)": 122.96, "step": 6650, "token_acc": 0.9433404940923737, "train_speed(iter/s)": 0.240409 }, { "epoch": 0.5072795182559646, "grad_norm": 0.8366822600364685, "learning_rate": 9.748165869063887e-05, "loss": 0.21754240989685059, "memory(GiB)": 122.96, "step": 6655, "token_acc": 0.9207818930041153, "train_speed(iter/s)": 0.240411 }, { "epoch": 0.5076606448662245, "grad_norm": 1.6380388736724854, "learning_rate": 9.747790528552648e-05, "loss": 0.2030207872390747, "memory(GiB)": 122.96, "step": 6660, "token_acc": 0.9142857142857143, "train_speed(iter/s)": 0.240476 }, { "epoch": 0.5080417714764844, "grad_norm": 0.9587275981903076, "learning_rate": 9.747414915777653e-05, "loss": 0.20045702457427977, "memory(GiB)": 122.96, "step": 6665, "token_acc": 0.9154210718191761, "train_speed(iter/s)": 0.240541 }, { "epoch": 0.5084228980867445, "grad_norm": 1.0902820825576782, "learning_rate": 9.747039030760443e-05, "loss": 0.16659259796142578, "memory(GiB)": 122.96, "step": 6670, "token_acc": 0.9385146363287741, "train_speed(iter/s)": 0.240577 }, { "epoch": 0.5088040246970044, "grad_norm": 1.103403091430664, "learning_rate": 9.746662873522574e-05, "loss": 0.21079261302948, "memory(GiB)": 122.96, "step": 6675, "token_acc": 0.9303400462198745, "train_speed(iter/s)": 0.240634 }, { "epoch": 0.5091851513072643, "grad_norm": 0.8276931643486023, "learning_rate": 9.746286444085617e-05, "loss": 0.17114672660827637, "memory(GiB)": 122.96, "step": 6680, "token_acc": 0.9077736890524379, "train_speed(iter/s)": 0.240677 }, { "epoch": 0.5095662779175242, "grad_norm": 0.6306222677230835, "learning_rate": 9.745909742471157e-05, "loss": 0.17284415960311889, "memory(GiB)": 122.96, "step": 6685, "token_acc": 0.9377659130724952, "train_speed(iter/s)": 0.240671 }, { "epoch": 0.5099474045277841, "grad_norm": 0.7365657687187195, "learning_rate": 9.745532768700799e-05, "loss": 0.1917936086654663, "memory(GiB)": 122.96, "step": 6690, "token_acc": 0.9203187250996016, "train_speed(iter/s)": 0.240709 }, { "epoch": 0.5103285311380441, "grad_norm": 0.6565676927566528, "learning_rate": 9.745155522796157e-05, "loss": 0.21131303310394287, "memory(GiB)": 122.96, "step": 6695, "token_acc": 0.9250952179432924, "train_speed(iter/s)": 0.240736 }, { "epoch": 0.510709657748304, "grad_norm": 0.828711748123169, "learning_rate": 9.744778004778867e-05, "loss": 0.1512755870819092, "memory(GiB)": 122.96, "step": 6700, "token_acc": 0.9411373461922181, "train_speed(iter/s)": 0.240787 }, { "epoch": 0.5110907843585639, "grad_norm": 0.8197252154350281, "learning_rate": 9.744400214670576e-05, "loss": 0.20275349617004396, "memory(GiB)": 122.96, "step": 6705, "token_acc": 0.9256722899615835, "train_speed(iter/s)": 0.240822 }, { "epoch": 0.5114719109688238, "grad_norm": 1.40071439743042, "learning_rate": 9.744022152492949e-05, "loss": 0.1810696840286255, "memory(GiB)": 122.96, "step": 6710, "token_acc": 0.9432742054693274, "train_speed(iter/s)": 0.240847 }, { "epoch": 0.5118530375790837, "grad_norm": 0.7259822487831116, "learning_rate": 9.743643818267669e-05, "loss": 0.26820394992828367, "memory(GiB)": 122.96, "step": 6715, "token_acc": 0.8901453957996769, "train_speed(iter/s)": 0.240904 }, { "epoch": 0.5122341641893438, "grad_norm": 1.1993129253387451, "learning_rate": 9.743265212016426e-05, "loss": 0.22229306697845458, "memory(GiB)": 122.96, "step": 6720, "token_acc": 0.911701671667381, "train_speed(iter/s)": 0.240952 }, { "epoch": 0.5126152907996037, "grad_norm": 1.1718672513961792, "learning_rate": 9.742886333760937e-05, "loss": 0.21621415615081788, "memory(GiB)": 122.96, "step": 6725, "token_acc": 0.9207245354034345, "train_speed(iter/s)": 0.241005 }, { "epoch": 0.5129964174098636, "grad_norm": 0.8755703568458557, "learning_rate": 9.742507183522925e-05, "loss": 0.17542717456817628, "memory(GiB)": 122.96, "step": 6730, "token_acc": 0.9281108192022928, "train_speed(iter/s)": 0.241048 }, { "epoch": 0.5133775440201235, "grad_norm": 0.8469975590705872, "learning_rate": 9.742127761324133e-05, "loss": 0.22634415626525878, "memory(GiB)": 122.96, "step": 6735, "token_acc": 0.9194991055456172, "train_speed(iter/s)": 0.241092 }, { "epoch": 0.5137586706303834, "grad_norm": 0.8017536401748657, "learning_rate": 9.741748067186323e-05, "loss": 0.24060122966766356, "memory(GiB)": 122.96, "step": 6740, "token_acc": 0.9208387516254877, "train_speed(iter/s)": 0.241121 }, { "epoch": 0.5141397972406433, "grad_norm": 1.1938374042510986, "learning_rate": 9.741368101131263e-05, "loss": 0.208384370803833, "memory(GiB)": 122.96, "step": 6745, "token_acc": 0.9120828538550058, "train_speed(iter/s)": 0.241179 }, { "epoch": 0.5145209238509033, "grad_norm": 1.8300169706344604, "learning_rate": 9.740987863180746e-05, "loss": 0.20161654949188232, "memory(GiB)": 122.96, "step": 6750, "token_acc": 0.9201331114808652, "train_speed(iter/s)": 0.241222 }, { "epoch": 0.5149020504611632, "grad_norm": 0.8090299367904663, "learning_rate": 9.740607353356576e-05, "loss": 0.16628828048706054, "memory(GiB)": 122.96, "step": 6755, "token_acc": 0.9244198424526293, "train_speed(iter/s)": 0.241261 }, { "epoch": 0.5152831770714231, "grad_norm": 0.7446261048316956, "learning_rate": 9.740226571680574e-05, "loss": 0.1339216113090515, "memory(GiB)": 122.96, "step": 6760, "token_acc": 0.9383309306423194, "train_speed(iter/s)": 0.241273 }, { "epoch": 0.515664303681683, "grad_norm": 0.736966609954834, "learning_rate": 9.739845518174575e-05, "loss": 0.1555694341659546, "memory(GiB)": 122.96, "step": 6765, "token_acc": 0.9504221304113526, "train_speed(iter/s)": 0.2413 }, { "epoch": 0.5160454302919429, "grad_norm": 2.817692756652832, "learning_rate": 9.739464192860432e-05, "loss": 0.1545361638069153, "memory(GiB)": 122.96, "step": 6770, "token_acc": 0.9354149986655992, "train_speed(iter/s)": 0.241348 }, { "epoch": 0.516426556902203, "grad_norm": 0.7395541667938232, "learning_rate": 9.73908259576001e-05, "loss": 0.16989350318908691, "memory(GiB)": 122.96, "step": 6775, "token_acc": 0.9358925143953934, "train_speed(iter/s)": 0.241375 }, { "epoch": 0.5168076835124629, "grad_norm": 0.9538095593452454, "learning_rate": 9.738700726895194e-05, "loss": 0.15133534669876098, "memory(GiB)": 122.96, "step": 6780, "token_acc": 0.9377740655669242, "train_speed(iter/s)": 0.241419 }, { "epoch": 0.5171888101227228, "grad_norm": 0.6673028469085693, "learning_rate": 9.73831858628788e-05, "loss": 0.16896359920501708, "memory(GiB)": 122.96, "step": 6785, "token_acc": 0.9381267738883633, "train_speed(iter/s)": 0.241418 }, { "epoch": 0.5175699367329827, "grad_norm": 1.8202033042907715, "learning_rate": 9.737936173959985e-05, "loss": 0.18470910787582398, "memory(GiB)": 122.96, "step": 6790, "token_acc": 0.9275396085740913, "train_speed(iter/s)": 0.241465 }, { "epoch": 0.5179510633432426, "grad_norm": 1.574926495552063, "learning_rate": 9.737553489933436e-05, "loss": 0.22680885791778566, "memory(GiB)": 122.96, "step": 6795, "token_acc": 0.9053914480479239, "train_speed(iter/s)": 0.241502 }, { "epoch": 0.5183321899535026, "grad_norm": 1.0017858743667603, "learning_rate": 9.73717053423018e-05, "loss": 0.20320556163787842, "memory(GiB)": 122.96, "step": 6800, "token_acc": 0.9252225519287833, "train_speed(iter/s)": 0.241546 }, { "epoch": 0.5183321899535026, "eval_loss": 0.13594388961791992, "eval_runtime": 177.7803, "eval_samples_per_second": 2.981, "eval_steps_per_second": 2.981, "eval_token_acc": 0.9329558460333714, "step": 6800 }, { "epoch": 0.5187133165637625, "grad_norm": 1.628959059715271, "learning_rate": 9.736787306872177e-05, "loss": 0.1233770489692688, "memory(GiB)": 122.96, "step": 6805, "token_acc": 0.9333250322278214, "train_speed(iter/s)": 0.240079 }, { "epoch": 0.5190944431740224, "grad_norm": 1.333674430847168, "learning_rate": 9.736403807881404e-05, "loss": 0.12556332349777222, "memory(GiB)": 122.96, "step": 6810, "token_acc": 0.9502405498281787, "train_speed(iter/s)": 0.240099 }, { "epoch": 0.5194755697842823, "grad_norm": 1.6312485933303833, "learning_rate": 9.736020037279852e-05, "loss": 0.206437087059021, "memory(GiB)": 122.96, "step": 6815, "token_acc": 0.9377194321477637, "train_speed(iter/s)": 0.240132 }, { "epoch": 0.5198566963945422, "grad_norm": 0.8180206418037415, "learning_rate": 9.735635995089528e-05, "loss": 0.1499392032623291, "memory(GiB)": 122.96, "step": 6820, "token_acc": 0.9305753685211603, "train_speed(iter/s)": 0.24017 }, { "epoch": 0.5202378230048021, "grad_norm": 1.1971153020858765, "learning_rate": 9.735251681332456e-05, "loss": 0.19402761459350587, "memory(GiB)": 122.96, "step": 6825, "token_acc": 0.9319799630899025, "train_speed(iter/s)": 0.240224 }, { "epoch": 0.5206189496150622, "grad_norm": 1.278516173362732, "learning_rate": 9.734867096030674e-05, "loss": 0.16840741634368897, "memory(GiB)": 122.96, "step": 6830, "token_acc": 0.9409850613814524, "train_speed(iter/s)": 0.240249 }, { "epoch": 0.5210000762253221, "grad_norm": 1.0187299251556396, "learning_rate": 9.734482239206238e-05, "loss": 0.21811699867248535, "memory(GiB)": 122.96, "step": 6835, "token_acc": 0.9201240791004265, "train_speed(iter/s)": 0.240289 }, { "epoch": 0.521381202835582, "grad_norm": 0.8793230652809143, "learning_rate": 9.734097110881215e-05, "loss": 0.18455482721328736, "memory(GiB)": 122.96, "step": 6840, "token_acc": 0.9256270447110142, "train_speed(iter/s)": 0.240331 }, { "epoch": 0.5217623294458419, "grad_norm": 1.0267198085784912, "learning_rate": 9.733711711077691e-05, "loss": 0.24943172931671143, "memory(GiB)": 122.96, "step": 6845, "token_acc": 0.9002849002849003, "train_speed(iter/s)": 0.240386 }, { "epoch": 0.5221434560561018, "grad_norm": 1.106573224067688, "learning_rate": 9.733326039817768e-05, "loss": 0.17839913368225097, "memory(GiB)": 122.96, "step": 6850, "token_acc": 0.9378519710378117, "train_speed(iter/s)": 0.24042 }, { "epoch": 0.5225245826663618, "grad_norm": 1.0248230695724487, "learning_rate": 9.732940097123561e-05, "loss": 0.17860398292541504, "memory(GiB)": 122.96, "step": 6855, "token_acc": 0.9328914664457332, "train_speed(iter/s)": 0.24046 }, { "epoch": 0.5229057092766217, "grad_norm": 1.3686844110488892, "learning_rate": 9.732553883017206e-05, "loss": 0.20544672012329102, "memory(GiB)": 122.96, "step": 6860, "token_acc": 0.9195219123505977, "train_speed(iter/s)": 0.24051 }, { "epoch": 0.5232868358868816, "grad_norm": 1.2865279912948608, "learning_rate": 9.732167397520845e-05, "loss": 0.1440887928009033, "memory(GiB)": 122.96, "step": 6865, "token_acc": 0.9329224075416969, "train_speed(iter/s)": 0.240568 }, { "epoch": 0.5236679624971415, "grad_norm": 1.1384451389312744, "learning_rate": 9.731780640656644e-05, "loss": 0.1782839775085449, "memory(GiB)": 122.96, "step": 6870, "token_acc": 0.9406906906906907, "train_speed(iter/s)": 0.240628 }, { "epoch": 0.5240490891074014, "grad_norm": 0.6938775777816772, "learning_rate": 9.731393612446781e-05, "loss": 0.14369645118713378, "memory(GiB)": 122.96, "step": 6875, "token_acc": 0.9343909126517822, "train_speed(iter/s)": 0.240676 }, { "epoch": 0.5244302157176614, "grad_norm": 0.773326575756073, "learning_rate": 9.731006312913453e-05, "loss": 0.19155839681625367, "memory(GiB)": 122.96, "step": 6880, "token_acc": 0.9281499479347449, "train_speed(iter/s)": 0.240709 }, { "epoch": 0.5248113423279214, "grad_norm": 1.0415959358215332, "learning_rate": 9.730618742078865e-05, "loss": 0.16096376180648803, "memory(GiB)": 122.96, "step": 6885, "token_acc": 0.9411320754716981, "train_speed(iter/s)": 0.240755 }, { "epoch": 0.5251924689381813, "grad_norm": 1.0389882326126099, "learning_rate": 9.730230899965247e-05, "loss": 0.19486793279647827, "memory(GiB)": 122.96, "step": 6890, "token_acc": 0.9188342967244701, "train_speed(iter/s)": 0.240797 }, { "epoch": 0.5255735955484412, "grad_norm": 1.70291006565094, "learning_rate": 9.729842786594836e-05, "loss": 0.2795358419418335, "memory(GiB)": 122.96, "step": 6895, "token_acc": 0.9125775521714607, "train_speed(iter/s)": 0.240827 }, { "epoch": 0.5259547221587011, "grad_norm": 1.003394365310669, "learning_rate": 9.72945440198989e-05, "loss": 0.19661734104156495, "memory(GiB)": 122.96, "step": 6900, "token_acc": 0.9329531442663379, "train_speed(iter/s)": 0.240853 }, { "epoch": 0.526335848768961, "grad_norm": 0.7300508618354797, "learning_rate": 9.729065746172684e-05, "loss": 0.21375226974487305, "memory(GiB)": 122.96, "step": 6905, "token_acc": 0.9206519792226402, "train_speed(iter/s)": 0.240887 }, { "epoch": 0.526716975379221, "grad_norm": 1.0847630500793457, "learning_rate": 9.728676819165501e-05, "loss": 0.19867029190063476, "memory(GiB)": 122.96, "step": 6910, "token_acc": 0.9305614183199662, "train_speed(iter/s)": 0.240939 }, { "epoch": 0.5270981019894809, "grad_norm": 0.3016345202922821, "learning_rate": 9.728287620990646e-05, "loss": 0.17728137969970703, "memory(GiB)": 122.96, "step": 6915, "token_acc": 0.9229534510433387, "train_speed(iter/s)": 0.240977 }, { "epoch": 0.5274792285997408, "grad_norm": 0.7678334712982178, "learning_rate": 9.727898151670438e-05, "loss": 0.23979830741882324, "memory(GiB)": 122.96, "step": 6920, "token_acc": 0.9059357768407396, "train_speed(iter/s)": 0.241031 }, { "epoch": 0.5278603552100007, "grad_norm": 0.8401669263839722, "learning_rate": 9.727508411227211e-05, "loss": 0.20578312873840332, "memory(GiB)": 122.96, "step": 6925, "token_acc": 0.9212280347404564, "train_speed(iter/s)": 0.241075 }, { "epoch": 0.5282414818202606, "grad_norm": 0.540363609790802, "learning_rate": 9.727118399683318e-05, "loss": 0.14745962619781494, "memory(GiB)": 122.96, "step": 6930, "token_acc": 0.9318264014466546, "train_speed(iter/s)": 0.241108 }, { "epoch": 0.5286226084305207, "grad_norm": 1.309598445892334, "learning_rate": 9.726728117061117e-05, "loss": 0.15799858570098876, "memory(GiB)": 122.96, "step": 6935, "token_acc": 0.904862579281184, "train_speed(iter/s)": 0.241169 }, { "epoch": 0.5290037350407806, "grad_norm": 0.7275777459144592, "learning_rate": 9.726337563382994e-05, "loss": 0.1521458864212036, "memory(GiB)": 122.96, "step": 6940, "token_acc": 0.9382347452883001, "train_speed(iter/s)": 0.241207 }, { "epoch": 0.5293848616510405, "grad_norm": 0.8559795618057251, "learning_rate": 9.725946738671346e-05, "loss": 0.24810147285461426, "memory(GiB)": 122.96, "step": 6945, "token_acc": 0.8955717118307998, "train_speed(iter/s)": 0.241261 }, { "epoch": 0.5297659882613004, "grad_norm": 0.7258918285369873, "learning_rate": 9.725555642948584e-05, "loss": 0.18241209983825685, "memory(GiB)": 122.96, "step": 6950, "token_acc": 0.9366937884500893, "train_speed(iter/s)": 0.241299 }, { "epoch": 0.5301471148715603, "grad_norm": 0.8279014825820923, "learning_rate": 9.725164276237134e-05, "loss": 0.23742976188659667, "memory(GiB)": 122.96, "step": 6955, "token_acc": 0.920378399684667, "train_speed(iter/s)": 0.241312 }, { "epoch": 0.5305282414818202, "grad_norm": 0.8959754705429077, "learning_rate": 9.72477263855944e-05, "loss": 0.16767842769622804, "memory(GiB)": 122.96, "step": 6960, "token_acc": 0.9406689874375662, "train_speed(iter/s)": 0.241335 }, { "epoch": 0.5309093680920802, "grad_norm": 1.2941001653671265, "learning_rate": 9.724380729937961e-05, "loss": 0.15441770553588868, "memory(GiB)": 122.96, "step": 6965, "token_acc": 0.9496499730748519, "train_speed(iter/s)": 0.241385 }, { "epoch": 0.5312904947023401, "grad_norm": 0.8814386129379272, "learning_rate": 9.723988550395172e-05, "loss": 0.21450495719909668, "memory(GiB)": 122.96, "step": 6970, "token_acc": 0.9226545714967773, "train_speed(iter/s)": 0.241422 }, { "epoch": 0.5316716213126, "grad_norm": 1.1711746454238892, "learning_rate": 9.723596099953562e-05, "loss": 0.20270662307739257, "memory(GiB)": 122.96, "step": 6975, "token_acc": 0.9156048320370677, "train_speed(iter/s)": 0.241458 }, { "epoch": 0.53205274792286, "grad_norm": 0.0902424231171608, "learning_rate": 9.723203378635634e-05, "loss": 0.1703126311302185, "memory(GiB)": 122.96, "step": 6980, "token_acc": 0.9409652971679298, "train_speed(iter/s)": 0.241515 }, { "epoch": 0.5324338745331199, "grad_norm": 1.8017958402633667, "learning_rate": 9.722810386463911e-05, "loss": 0.17908949851989747, "memory(GiB)": 122.96, "step": 6985, "token_acc": 0.9327755337564916, "train_speed(iter/s)": 0.241566 }, { "epoch": 0.5328150011433799, "grad_norm": 0.5711160898208618, "learning_rate": 9.72241712346093e-05, "loss": 0.1333256959915161, "memory(GiB)": 122.96, "step": 6990, "token_acc": 0.9565330896749772, "train_speed(iter/s)": 0.241558 }, { "epoch": 0.5331961277536398, "grad_norm": 1.4494740962982178, "learning_rate": 9.722023589649241e-05, "loss": 0.1545030117034912, "memory(GiB)": 122.96, "step": 6995, "token_acc": 0.9428347689898199, "train_speed(iter/s)": 0.241616 }, { "epoch": 0.5335772543638997, "grad_norm": 0.95125812292099, "learning_rate": 9.721629785051412e-05, "loss": 0.186778724193573, "memory(GiB)": 122.96, "step": 7000, "token_acc": 0.9283048211508553, "train_speed(iter/s)": 0.241646 }, { "epoch": 0.5335772543638997, "eval_loss": 0.13523255288600922, "eval_runtime": 178.7585, "eval_samples_per_second": 2.965, "eval_steps_per_second": 2.965, "eval_token_acc": 0.9342584784049154, "step": 7000 }, { "epoch": 0.5339583809741596, "grad_norm": 0.9552993774414062, "learning_rate": 9.721235709690024e-05, "loss": 0.2177253007888794, "memory(GiB)": 122.96, "step": 7005, "token_acc": 0.9337681273927441, "train_speed(iter/s)": 0.240185 }, { "epoch": 0.5343395075844195, "grad_norm": 1.4895660877227783, "learning_rate": 9.720841363587679e-05, "loss": 0.1499803900718689, "memory(GiB)": 122.96, "step": 7010, "token_acc": 0.9291635267520724, "train_speed(iter/s)": 0.240221 }, { "epoch": 0.5347206341946795, "grad_norm": 0.9676122665405273, "learning_rate": 9.720446746766989e-05, "loss": 0.1713352918624878, "memory(GiB)": 122.96, "step": 7015, "token_acc": 0.9412935323383085, "train_speed(iter/s)": 0.240243 }, { "epoch": 0.5351017608049394, "grad_norm": 1.143479347229004, "learning_rate": 9.720051859250584e-05, "loss": 0.1898650646209717, "memory(GiB)": 122.96, "step": 7020, "token_acc": 0.9213943950786057, "train_speed(iter/s)": 0.240297 }, { "epoch": 0.5354828874151993, "grad_norm": 0.2252119481563568, "learning_rate": 9.719656701061108e-05, "loss": 0.1349409341812134, "memory(GiB)": 122.96, "step": 7025, "token_acc": 0.9419542083198968, "train_speed(iter/s)": 0.240349 }, { "epoch": 0.5358640140254592, "grad_norm": 0.7449554204940796, "learning_rate": 9.719261272221223e-05, "loss": 0.1890088438987732, "memory(GiB)": 122.96, "step": 7030, "token_acc": 0.9361417991987374, "train_speed(iter/s)": 0.240358 }, { "epoch": 0.5362451406357192, "grad_norm": 0.7858633995056152, "learning_rate": 9.718865572753604e-05, "loss": 0.17039281129837036, "memory(GiB)": 122.96, "step": 7035, "token_acc": 0.9373773066352572, "train_speed(iter/s)": 0.24039 }, { "epoch": 0.5366262672459791, "grad_norm": 0.9247518181800842, "learning_rate": 9.718469602680941e-05, "loss": 0.18226996660232545, "memory(GiB)": 122.96, "step": 7040, "token_acc": 0.9276232616940582, "train_speed(iter/s)": 0.24044 }, { "epoch": 0.5370073938562391, "grad_norm": 0.7346091270446777, "learning_rate": 9.718073362025943e-05, "loss": 0.14166916608810426, "memory(GiB)": 122.96, "step": 7045, "token_acc": 0.9400544959128065, "train_speed(iter/s)": 0.240478 }, { "epoch": 0.537388520466499, "grad_norm": 0.6862905621528625, "learning_rate": 9.717676850811334e-05, "loss": 0.17694406509399413, "memory(GiB)": 122.96, "step": 7050, "token_acc": 0.9281859692818597, "train_speed(iter/s)": 0.240518 }, { "epoch": 0.5377696470767589, "grad_norm": 1.0654163360595703, "learning_rate": 9.717280069059848e-05, "loss": 0.17104512453079224, "memory(GiB)": 122.96, "step": 7055, "token_acc": 0.9354445797807551, "train_speed(iter/s)": 0.240575 }, { "epoch": 0.5381507736870188, "grad_norm": 1.9396964311599731, "learning_rate": 9.716883016794242e-05, "loss": 0.18441424369812012, "memory(GiB)": 122.96, "step": 7060, "token_acc": 0.932460577209164, "train_speed(iter/s)": 0.240626 }, { "epoch": 0.5385319002972787, "grad_norm": 0.522526204586029, "learning_rate": 9.716485694037285e-05, "loss": 0.1816539764404297, "memory(GiB)": 122.96, "step": 7065, "token_acc": 0.9558945908460471, "train_speed(iter/s)": 0.24065 }, { "epoch": 0.5389130269075387, "grad_norm": 2.046234369277954, "learning_rate": 9.71608810081176e-05, "loss": 0.20994975566864013, "memory(GiB)": 122.96, "step": 7070, "token_acc": 0.9214801444043321, "train_speed(iter/s)": 0.240705 }, { "epoch": 0.5392941535177986, "grad_norm": 0.6658031940460205, "learning_rate": 9.715690237140468e-05, "loss": 0.15250284671783448, "memory(GiB)": 122.96, "step": 7075, "token_acc": 0.9342105263157895, "train_speed(iter/s)": 0.240742 }, { "epoch": 0.5396752801280585, "grad_norm": 1.0810900926589966, "learning_rate": 9.715292103046223e-05, "loss": 0.23550853729248047, "memory(GiB)": 122.96, "step": 7080, "token_acc": 0.9036144578313253, "train_speed(iter/s)": 0.24079 }, { "epoch": 0.5400564067383185, "grad_norm": 1.0171676874160767, "learning_rate": 9.714893698551859e-05, "loss": 0.1596773624420166, "memory(GiB)": 122.96, "step": 7085, "token_acc": 0.9355742296918768, "train_speed(iter/s)": 0.240846 }, { "epoch": 0.5404375333485784, "grad_norm": 0.9134028553962708, "learning_rate": 9.714495023680221e-05, "loss": 0.18215417861938477, "memory(GiB)": 122.96, "step": 7090, "token_acc": 0.9417241379310345, "train_speed(iter/s)": 0.240901 }, { "epoch": 0.5408186599588384, "grad_norm": 0.8627896904945374, "learning_rate": 9.714096078454171e-05, "loss": 0.15136098861694336, "memory(GiB)": 122.96, "step": 7095, "token_acc": 0.935319582378406, "train_speed(iter/s)": 0.240953 }, { "epoch": 0.5411997865690983, "grad_norm": 1.037477970123291, "learning_rate": 9.713696862896587e-05, "loss": 0.1281415343284607, "memory(GiB)": 122.96, "step": 7100, "token_acc": 0.9440231628291741, "train_speed(iter/s)": 0.240959 }, { "epoch": 0.5415809131793582, "grad_norm": 0.7001175284385681, "learning_rate": 9.713297377030361e-05, "loss": 0.16154402494430542, "memory(GiB)": 122.96, "step": 7105, "token_acc": 0.9390651085141903, "train_speed(iter/s)": 0.240981 }, { "epoch": 0.5419620397896181, "grad_norm": 1.0888545513153076, "learning_rate": 9.712897620878404e-05, "loss": 0.1571489930152893, "memory(GiB)": 122.96, "step": 7110, "token_acc": 0.9354906054279749, "train_speed(iter/s)": 0.241023 }, { "epoch": 0.542343166399878, "grad_norm": 1.9328068494796753, "learning_rate": 9.712497594463639e-05, "loss": 0.1785590648651123, "memory(GiB)": 122.96, "step": 7115, "token_acc": 0.9262114537444934, "train_speed(iter/s)": 0.241061 }, { "epoch": 0.5427242930101379, "grad_norm": 1.490885615348816, "learning_rate": 9.712097297809006e-05, "loss": 0.12409394979476929, "memory(GiB)": 122.96, "step": 7120, "token_acc": 0.9401599174619552, "train_speed(iter/s)": 0.241112 }, { "epoch": 0.5431054196203979, "grad_norm": 0.722251832485199, "learning_rate": 9.711696730937459e-05, "loss": 0.23386116027832032, "memory(GiB)": 122.96, "step": 7125, "token_acc": 0.8962097059865392, "train_speed(iter/s)": 0.241163 }, { "epoch": 0.5434865462306578, "grad_norm": 0.5960565805435181, "learning_rate": 9.711295893871969e-05, "loss": 0.16658614873886107, "memory(GiB)": 122.96, "step": 7130, "token_acc": 0.9522868435911914, "train_speed(iter/s)": 0.2412 }, { "epoch": 0.5438676728409177, "grad_norm": 1.311477780342102, "learning_rate": 9.710894786635522e-05, "loss": 0.16592382192611693, "memory(GiB)": 122.96, "step": 7135, "token_acc": 0.9273404750815091, "train_speed(iter/s)": 0.241249 }, { "epoch": 0.5442487994511777, "grad_norm": 0.5246511101722717, "learning_rate": 9.710493409251122e-05, "loss": 0.14395551681518554, "memory(GiB)": 122.96, "step": 7140, "token_acc": 0.9367462466158011, "train_speed(iter/s)": 0.24129 }, { "epoch": 0.5446299260614376, "grad_norm": 1.039299726486206, "learning_rate": 9.710091761741784e-05, "loss": 0.17862125635147094, "memory(GiB)": 122.96, "step": 7145, "token_acc": 0.9372332015810276, "train_speed(iter/s)": 0.241308 }, { "epoch": 0.5450110526716976, "grad_norm": 0.694965660572052, "learning_rate": 9.709689844130541e-05, "loss": 0.14991508722305297, "memory(GiB)": 122.96, "step": 7150, "token_acc": 0.9357773615199357, "train_speed(iter/s)": 0.241355 }, { "epoch": 0.5453921792819575, "grad_norm": 1.1957526206970215, "learning_rate": 9.70928765644044e-05, "loss": 0.20826640129089355, "memory(GiB)": 122.96, "step": 7155, "token_acc": 0.9243818805269807, "train_speed(iter/s)": 0.241386 }, { "epoch": 0.5457733058922174, "grad_norm": 0.6676205396652222, "learning_rate": 9.708885198694547e-05, "loss": 0.162802255153656, "memory(GiB)": 122.96, "step": 7160, "token_acc": 0.9230311446141638, "train_speed(iter/s)": 0.241426 }, { "epoch": 0.5461544325024773, "grad_norm": 1.0624045133590698, "learning_rate": 9.70848247091594e-05, "loss": 0.15277191400527954, "memory(GiB)": 122.96, "step": 7165, "token_acc": 0.9469608073222249, "train_speed(iter/s)": 0.24147 }, { "epoch": 0.5465355591127372, "grad_norm": 0.5598741173744202, "learning_rate": 9.708079473127711e-05, "loss": 0.18649954795837403, "memory(GiB)": 122.96, "step": 7170, "token_acc": 0.925827226293582, "train_speed(iter/s)": 0.24151 }, { "epoch": 0.5469166857229972, "grad_norm": 1.1659364700317383, "learning_rate": 9.707676205352975e-05, "loss": 0.2174985885620117, "memory(GiB)": 122.96, "step": 7175, "token_acc": 0.9007666098807495, "train_speed(iter/s)": 0.241567 }, { "epoch": 0.5472978123332571, "grad_norm": 0.7786362767219543, "learning_rate": 9.707272667614853e-05, "loss": 0.16082971096038817, "memory(GiB)": 122.96, "step": 7180, "token_acc": 0.9322686439469556, "train_speed(iter/s)": 0.241595 }, { "epoch": 0.547678938943517, "grad_norm": 1.5186296701431274, "learning_rate": 9.706868859936489e-05, "loss": 0.2307065725326538, "memory(GiB)": 122.96, "step": 7185, "token_acc": 0.9102621057307864, "train_speed(iter/s)": 0.241647 }, { "epoch": 0.548060065553777, "grad_norm": 1.5418174266815186, "learning_rate": 9.706464782341039e-05, "loss": 0.23250169754028321, "memory(GiB)": 122.96, "step": 7190, "token_acc": 0.9338828166995162, "train_speed(iter/s)": 0.241674 }, { "epoch": 0.5484411921640369, "grad_norm": 0.9822309017181396, "learning_rate": 9.706060434851673e-05, "loss": 0.20433480739593507, "memory(GiB)": 122.96, "step": 7195, "token_acc": 0.9261044176706827, "train_speed(iter/s)": 0.241707 }, { "epoch": 0.5488223187742968, "grad_norm": 0.6674410104751587, "learning_rate": 9.70565581749158e-05, "loss": 0.1459787130355835, "memory(GiB)": 122.96, "step": 7200, "token_acc": 0.945746214852199, "train_speed(iter/s)": 0.241742 }, { "epoch": 0.5488223187742968, "eval_loss": 0.13749712705612183, "eval_runtime": 174.503, "eval_samples_per_second": 3.037, "eval_steps_per_second": 3.037, "eval_token_acc": 0.9328127823625083, "step": 7200 }, { "epoch": 0.5492034453845568, "grad_norm": 0.7893772721290588, "learning_rate": 9.705250930283963e-05, "loss": 0.238517427444458, "memory(GiB)": 122.96, "step": 7205, "token_acc": 0.9324229589601285, "train_speed(iter/s)": 0.240384 }, { "epoch": 0.5495845719948167, "grad_norm": 0.9873624444007874, "learning_rate": 9.704845773252041e-05, "loss": 0.1969143867492676, "memory(GiB)": 122.96, "step": 7210, "token_acc": 0.9305645684620376, "train_speed(iter/s)": 0.240434 }, { "epoch": 0.5499656986050766, "grad_norm": 1.5839787721633911, "learning_rate": 9.704440346419046e-05, "loss": 0.206299090385437, "memory(GiB)": 122.96, "step": 7215, "token_acc": 0.9169646404449742, "train_speed(iter/s)": 0.24049 }, { "epoch": 0.5503468252153365, "grad_norm": 1.184552550315857, "learning_rate": 9.70403464980823e-05, "loss": 0.17557740211486816, "memory(GiB)": 122.96, "step": 7220, "token_acc": 0.923785839672323, "train_speed(iter/s)": 0.24052 }, { "epoch": 0.5507279518255964, "grad_norm": 2.0773868560791016, "learning_rate": 9.703628683442853e-05, "loss": 0.17515619993209838, "memory(GiB)": 122.96, "step": 7225, "token_acc": 0.9381915299504006, "train_speed(iter/s)": 0.240576 }, { "epoch": 0.5511090784358564, "grad_norm": 0.9168033003807068, "learning_rate": 9.703222447346201e-05, "loss": 0.1855842351913452, "memory(GiB)": 122.96, "step": 7230, "token_acc": 0.9129206267659903, "train_speed(iter/s)": 0.240631 }, { "epoch": 0.5514902050461163, "grad_norm": 0.9931985139846802, "learning_rate": 9.702815941541566e-05, "loss": 0.17217323780059815, "memory(GiB)": 122.96, "step": 7235, "token_acc": 0.9426104621635348, "train_speed(iter/s)": 0.240676 }, { "epoch": 0.5518713316563763, "grad_norm": 1.077362060546875, "learning_rate": 9.702409166052262e-05, "loss": 0.23195300102233887, "memory(GiB)": 122.96, "step": 7240, "token_acc": 0.9067441860465116, "train_speed(iter/s)": 0.240722 }, { "epoch": 0.5522524582666362, "grad_norm": 1.0164830684661865, "learning_rate": 9.702002120901613e-05, "loss": 0.1731897473335266, "memory(GiB)": 122.96, "step": 7245, "token_acc": 0.9357282502443792, "train_speed(iter/s)": 0.24076 }, { "epoch": 0.5526335848768961, "grad_norm": 0.7468012571334839, "learning_rate": 9.701594806112963e-05, "loss": 0.18579742908477784, "memory(GiB)": 122.96, "step": 7250, "token_acc": 0.9308318264014467, "train_speed(iter/s)": 0.240789 }, { "epoch": 0.5530147114871561, "grad_norm": 0.8685094118118286, "learning_rate": 9.70118722170967e-05, "loss": 0.08958526253700257, "memory(GiB)": 122.96, "step": 7255, "token_acc": 0.9609094535301157, "train_speed(iter/s)": 0.240841 }, { "epoch": 0.553395838097416, "grad_norm": 1.1702519655227661, "learning_rate": 9.700779367715102e-05, "loss": 0.1861223816871643, "memory(GiB)": 122.96, "step": 7260, "token_acc": 0.9358874120406567, "train_speed(iter/s)": 0.240839 }, { "epoch": 0.5537769647076759, "grad_norm": 0.7052550315856934, "learning_rate": 9.700371244152656e-05, "loss": 0.19272793531417848, "memory(GiB)": 122.96, "step": 7265, "token_acc": 0.9331482272533106, "train_speed(iter/s)": 0.24088 }, { "epoch": 0.5541580913179358, "grad_norm": 1.1246980428695679, "learning_rate": 9.69996285104573e-05, "loss": 0.2241297721862793, "memory(GiB)": 122.96, "step": 7270, "token_acc": 0.9172885572139303, "train_speed(iter/s)": 0.240918 }, { "epoch": 0.5545392179281957, "grad_norm": 1.7306162118911743, "learning_rate": 9.699554188417744e-05, "loss": 0.16886777877807618, "memory(GiB)": 122.96, "step": 7275, "token_acc": 0.9282231324361225, "train_speed(iter/s)": 0.240963 }, { "epoch": 0.5549203445384556, "grad_norm": 1.2219594717025757, "learning_rate": 9.699145256292135e-05, "loss": 0.16877036094665526, "memory(GiB)": 122.96, "step": 7280, "token_acc": 0.9227323628219485, "train_speed(iter/s)": 0.241009 }, { "epoch": 0.5553014711487156, "grad_norm": 1.4795225858688354, "learning_rate": 9.69873605469235e-05, "loss": 0.19342910051345824, "memory(GiB)": 122.96, "step": 7285, "token_acc": 0.9231961425865335, "train_speed(iter/s)": 0.241041 }, { "epoch": 0.5556825977589755, "grad_norm": 1.164156198501587, "learning_rate": 9.69832658364186e-05, "loss": 0.1637222647666931, "memory(GiB)": 122.96, "step": 7290, "token_acc": 0.9376037172253567, "train_speed(iter/s)": 0.241099 }, { "epoch": 0.5560637243692355, "grad_norm": 0.31161683797836304, "learning_rate": 9.697916843164143e-05, "loss": 0.12819610834121703, "memory(GiB)": 122.96, "step": 7295, "token_acc": 0.9502521008403362, "train_speed(iter/s)": 0.241134 }, { "epoch": 0.5564448509794954, "grad_norm": 1.4558924436569214, "learning_rate": 9.697506833282694e-05, "loss": 0.13326963186264038, "memory(GiB)": 122.96, "step": 7300, "token_acc": 0.9349466562581317, "train_speed(iter/s)": 0.241188 }, { "epoch": 0.5568259775897553, "grad_norm": 0.6215888261795044, "learning_rate": 9.69709655402103e-05, "loss": 0.2098093032836914, "memory(GiB)": 122.96, "step": 7305, "token_acc": 0.9286191685603529, "train_speed(iter/s)": 0.24122 }, { "epoch": 0.5572071042000153, "grad_norm": 0.5341887474060059, "learning_rate": 9.696686005402673e-05, "loss": 0.14716250896453859, "memory(GiB)": 122.96, "step": 7310, "token_acc": 0.9255510204081633, "train_speed(iter/s)": 0.241263 }, { "epoch": 0.5575882308102752, "grad_norm": 0.6500809192657471, "learning_rate": 9.696275187451172e-05, "loss": 0.15772172212600707, "memory(GiB)": 122.96, "step": 7315, "token_acc": 0.9388797116684187, "train_speed(iter/s)": 0.241279 }, { "epoch": 0.5579693574205351, "grad_norm": 0.9627507328987122, "learning_rate": 9.69586410019008e-05, "loss": 0.18280308246612548, "memory(GiB)": 122.96, "step": 7320, "token_acc": 0.9396951623591783, "train_speed(iter/s)": 0.241326 }, { "epoch": 0.558350484030795, "grad_norm": 1.1206635236740112, "learning_rate": 9.695452743642973e-05, "loss": 0.1852535605430603, "memory(GiB)": 122.96, "step": 7325, "token_acc": 0.9211590296495957, "train_speed(iter/s)": 0.241365 }, { "epoch": 0.5587316106410549, "grad_norm": 0.5266935229301453, "learning_rate": 9.695041117833442e-05, "loss": 0.1526786208152771, "memory(GiB)": 122.96, "step": 7330, "token_acc": 0.9300808395972202, "train_speed(iter/s)": 0.241395 }, { "epoch": 0.5591127372513149, "grad_norm": 0.7123040556907654, "learning_rate": 9.69462922278509e-05, "loss": 0.19264662265777588, "memory(GiB)": 122.96, "step": 7335, "token_acc": 0.9275487321028117, "train_speed(iter/s)": 0.241438 }, { "epoch": 0.5594938638615748, "grad_norm": 0.8570783138275146, "learning_rate": 9.694217058521538e-05, "loss": 0.11813973188400269, "memory(GiB)": 122.96, "step": 7340, "token_acc": 0.9491145483075544, "train_speed(iter/s)": 0.241486 }, { "epoch": 0.5598749904718348, "grad_norm": 0.7099019289016724, "learning_rate": 9.693804625066421e-05, "loss": 0.1413159489631653, "memory(GiB)": 122.96, "step": 7345, "token_acc": 0.9392980437284235, "train_speed(iter/s)": 0.241523 }, { "epoch": 0.5602561170820947, "grad_norm": 1.0060694217681885, "learning_rate": 9.693391922443392e-05, "loss": 0.1669630765914917, "memory(GiB)": 122.96, "step": 7350, "token_acc": 0.9374866652442927, "train_speed(iter/s)": 0.241555 }, { "epoch": 0.5606372436923546, "grad_norm": 0.9519914388656616, "learning_rate": 9.692978950676115e-05, "loss": 0.20133748054504394, "memory(GiB)": 122.96, "step": 7355, "token_acc": 0.9160950709842081, "train_speed(iter/s)": 0.241586 }, { "epoch": 0.5610183703026145, "grad_norm": 0.13615615665912628, "learning_rate": 9.692565709788274e-05, "loss": 0.1258029341697693, "memory(GiB)": 122.96, "step": 7360, "token_acc": 0.9387691346454233, "train_speed(iter/s)": 0.24163 }, { "epoch": 0.5613994969128745, "grad_norm": 0.7518936991691589, "learning_rate": 9.692152199803566e-05, "loss": 0.09808213710784912, "memory(GiB)": 122.96, "step": 7365, "token_acc": 0.9624105011933174, "train_speed(iter/s)": 0.241677 }, { "epoch": 0.5617806235231344, "grad_norm": 0.9012131690979004, "learning_rate": 9.691738420745702e-05, "loss": 0.20080718994140626, "memory(GiB)": 122.96, "step": 7370, "token_acc": 0.9180409795102449, "train_speed(iter/s)": 0.241722 }, { "epoch": 0.5621617501333943, "grad_norm": 1.7895857095718384, "learning_rate": 9.691324372638413e-05, "loss": 0.19270722866058348, "memory(GiB)": 122.96, "step": 7375, "token_acc": 0.9286151960784313, "train_speed(iter/s)": 0.24177 }, { "epoch": 0.5625428767436542, "grad_norm": 1.196455478668213, "learning_rate": 9.690910055505443e-05, "loss": 0.1613546133041382, "memory(GiB)": 122.96, "step": 7380, "token_acc": 0.9454022988505747, "train_speed(iter/s)": 0.241822 }, { "epoch": 0.5629240033539141, "grad_norm": 0.9167526960372925, "learning_rate": 9.690495469370546e-05, "loss": 0.1532915472984314, "memory(GiB)": 122.96, "step": 7385, "token_acc": 0.9504734158776402, "train_speed(iter/s)": 0.241839 }, { "epoch": 0.5633051299641741, "grad_norm": 1.4132252931594849, "learning_rate": 9.690080614257504e-05, "loss": 0.11016379594802857, "memory(GiB)": 122.96, "step": 7390, "token_acc": 0.9448979591836735, "train_speed(iter/s)": 0.241887 }, { "epoch": 0.563686256574434, "grad_norm": 0.5560929179191589, "learning_rate": 9.689665490190101e-05, "loss": 0.13651907444000244, "memory(GiB)": 122.96, "step": 7395, "token_acc": 0.9302367612506778, "train_speed(iter/s)": 0.241927 }, { "epoch": 0.564067383184694, "grad_norm": 0.9663979411125183, "learning_rate": 9.689250097192146e-05, "loss": 0.16572701930999756, "memory(GiB)": 122.96, "step": 7400, "token_acc": 0.927118949073241, "train_speed(iter/s)": 0.241967 }, { "epoch": 0.564067383184694, "eval_loss": 0.13419906795024872, "eval_runtime": 200.2745, "eval_samples_per_second": 2.646, "eval_steps_per_second": 2.646, "eval_token_acc": 0.9346123727486296, "step": 7400 }, { "epoch": 0.5644485097949539, "grad_norm": 0.14313562214374542, "learning_rate": 9.68883443528746e-05, "loss": 0.09302259087562562, "memory(GiB)": 122.96, "step": 7405, "token_acc": 0.93522943509144, "train_speed(iter/s)": 0.240432 }, { "epoch": 0.5648296364052138, "grad_norm": 0.6787572503089905, "learning_rate": 9.688418504499875e-05, "loss": 0.24929468631744384, "memory(GiB)": 122.96, "step": 7410, "token_acc": 0.9085324232081912, "train_speed(iter/s)": 0.240466 }, { "epoch": 0.5652107630154738, "grad_norm": 1.024576187133789, "learning_rate": 9.688002304853248e-05, "loss": 0.15870609283447265, "memory(GiB)": 122.96, "step": 7415, "token_acc": 0.938478439989014, "train_speed(iter/s)": 0.240512 }, { "epoch": 0.5655918896257337, "grad_norm": 1.241624116897583, "learning_rate": 9.687585836371444e-05, "loss": 0.13587572574615478, "memory(GiB)": 122.96, "step": 7420, "token_acc": 0.9355492501013376, "train_speed(iter/s)": 0.240541 }, { "epoch": 0.5659730162359936, "grad_norm": 0.8461443781852722, "learning_rate": 9.687169099078343e-05, "loss": 0.15884044170379638, "memory(GiB)": 122.96, "step": 7425, "token_acc": 0.9453755431409062, "train_speed(iter/s)": 0.240575 }, { "epoch": 0.5663541428462535, "grad_norm": 1.0135505199432373, "learning_rate": 9.686752092997847e-05, "loss": 0.16462674140930175, "memory(GiB)": 122.96, "step": 7430, "token_acc": 0.9199684604770353, "train_speed(iter/s)": 0.240606 }, { "epoch": 0.5667352694565134, "grad_norm": 0.6886172890663147, "learning_rate": 9.686334818153868e-05, "loss": 0.13394465446472167, "memory(GiB)": 122.96, "step": 7435, "token_acc": 0.9478917326459215, "train_speed(iter/s)": 0.240638 }, { "epoch": 0.5671163960667733, "grad_norm": 1.5112619400024414, "learning_rate": 9.685917274570334e-05, "loss": 0.16913990974426268, "memory(GiB)": 122.96, "step": 7440, "token_acc": 0.9388122375524895, "train_speed(iter/s)": 0.240652 }, { "epoch": 0.5674975226770334, "grad_norm": 1.155969262123108, "learning_rate": 9.685499462271189e-05, "loss": 0.16341168880462648, "memory(GiB)": 122.96, "step": 7445, "token_acc": 0.927, "train_speed(iter/s)": 0.240694 }, { "epoch": 0.5678786492872933, "grad_norm": 1.5673954486846924, "learning_rate": 9.685081381280394e-05, "loss": 0.16689846515655518, "memory(GiB)": 122.96, "step": 7450, "token_acc": 0.9266435986159169, "train_speed(iter/s)": 0.240739 }, { "epoch": 0.5682597758975532, "grad_norm": 0.8962209224700928, "learning_rate": 9.684663031621924e-05, "loss": 0.2130706787109375, "memory(GiB)": 122.96, "step": 7455, "token_acc": 0.924688862886927, "train_speed(iter/s)": 0.240745 }, { "epoch": 0.5686409025078131, "grad_norm": 0.9303638339042664, "learning_rate": 9.684244413319765e-05, "loss": 0.12676362991333007, "memory(GiB)": 122.96, "step": 7460, "token_acc": 0.9426710097719869, "train_speed(iter/s)": 0.240759 }, { "epoch": 0.569022029118073, "grad_norm": 0.6107320189476013, "learning_rate": 9.683825526397929e-05, "loss": 0.17388373613357544, "memory(GiB)": 122.96, "step": 7465, "token_acc": 0.9382443007825791, "train_speed(iter/s)": 0.240797 }, { "epoch": 0.569403155728333, "grad_norm": 0.5643892884254456, "learning_rate": 9.683406370880436e-05, "loss": 0.16611562967300414, "memory(GiB)": 122.96, "step": 7470, "token_acc": 0.9354144241119483, "train_speed(iter/s)": 0.240831 }, { "epoch": 0.5697842823385929, "grad_norm": 1.325543999671936, "learning_rate": 9.68298694679132e-05, "loss": 0.18226802349090576, "memory(GiB)": 122.96, "step": 7475, "token_acc": 0.9326303456356181, "train_speed(iter/s)": 0.240855 }, { "epoch": 0.5701654089488528, "grad_norm": 1.0153743028640747, "learning_rate": 9.682567254154633e-05, "loss": 0.20334067344665527, "memory(GiB)": 122.96, "step": 7480, "token_acc": 0.9232035106966539, "train_speed(iter/s)": 0.240909 }, { "epoch": 0.5705465355591127, "grad_norm": 0.853122353553772, "learning_rate": 9.682147292994446e-05, "loss": 0.1760540246963501, "memory(GiB)": 122.96, "step": 7485, "token_acc": 0.9269853709508882, "train_speed(iter/s)": 0.240926 }, { "epoch": 0.5709276621693726, "grad_norm": 1.0168591737747192, "learning_rate": 9.681727063334838e-05, "loss": 0.2216787099838257, "memory(GiB)": 122.96, "step": 7490, "token_acc": 0.9181818181818182, "train_speed(iter/s)": 0.240964 }, { "epoch": 0.5713087887796326, "grad_norm": 0.9210538864135742, "learning_rate": 9.68130656519991e-05, "loss": 0.19910807609558107, "memory(GiB)": 122.96, "step": 7495, "token_acc": 0.9251219922598014, "train_speed(iter/s)": 0.240993 }, { "epoch": 0.5716899153898926, "grad_norm": 0.8071500062942505, "learning_rate": 9.680885798613773e-05, "loss": 0.14987853765487671, "memory(GiB)": 122.96, "step": 7500, "token_acc": 0.9377981301278382, "train_speed(iter/s)": 0.241027 }, { "epoch": 0.5720710420001525, "grad_norm": 0.7605462074279785, "learning_rate": 9.680464763600559e-05, "loss": 0.2036132335662842, "memory(GiB)": 122.96, "step": 7505, "token_acc": 0.9153372008701958, "train_speed(iter/s)": 0.241055 }, { "epoch": 0.5724521686104124, "grad_norm": 0.9473944306373596, "learning_rate": 9.680043460184409e-05, "loss": 0.15068873167037963, "memory(GiB)": 122.96, "step": 7510, "token_acc": 0.9403337531486146, "train_speed(iter/s)": 0.241076 }, { "epoch": 0.5728332952206723, "grad_norm": 1.0704567432403564, "learning_rate": 9.679621888389485e-05, "loss": 0.1542802333831787, "memory(GiB)": 122.96, "step": 7515, "token_acc": 0.9285435376805935, "train_speed(iter/s)": 0.241128 }, { "epoch": 0.5732144218309322, "grad_norm": 1.499940276145935, "learning_rate": 9.679200048239962e-05, "loss": 0.19009582996368407, "memory(GiB)": 122.96, "step": 7520, "token_acc": 0.9291153009427121, "train_speed(iter/s)": 0.241161 }, { "epoch": 0.5735955484411922, "grad_norm": 1.206945538520813, "learning_rate": 9.678777939760033e-05, "loss": 0.126200532913208, "memory(GiB)": 122.96, "step": 7525, "token_acc": 0.9363147466742145, "train_speed(iter/s)": 0.241205 }, { "epoch": 0.5739766750514521, "grad_norm": 0.7059234380722046, "learning_rate": 9.678355562973898e-05, "loss": 0.19405065774917601, "memory(GiB)": 122.96, "step": 7530, "token_acc": 0.9342120611221276, "train_speed(iter/s)": 0.241216 }, { "epoch": 0.574357801661712, "grad_norm": 1.00224769115448, "learning_rate": 9.677932917905783e-05, "loss": 0.2124797821044922, "memory(GiB)": 122.96, "step": 7535, "token_acc": 0.9263420724094882, "train_speed(iter/s)": 0.241257 }, { "epoch": 0.5747389282719719, "grad_norm": 1.31934654712677, "learning_rate": 9.677510004579922e-05, "loss": 0.14778281450271608, "memory(GiB)": 122.96, "step": 7540, "token_acc": 0.9440690690690691, "train_speed(iter/s)": 0.241308 }, { "epoch": 0.5751200548822318, "grad_norm": 1.319337248802185, "learning_rate": 9.67708682302057e-05, "loss": 0.1965206027030945, "memory(GiB)": 122.96, "step": 7545, "token_acc": 0.914251781472684, "train_speed(iter/s)": 0.24135 }, { "epoch": 0.5755011814924919, "grad_norm": 0.7344614267349243, "learning_rate": 9.676663373251993e-05, "loss": 0.11911581754684449, "memory(GiB)": 122.96, "step": 7550, "token_acc": 0.9473020675743823, "train_speed(iter/s)": 0.241388 }, { "epoch": 0.5758823081027518, "grad_norm": 1.1485555171966553, "learning_rate": 9.676239655298474e-05, "loss": 0.18304580450057983, "memory(GiB)": 122.96, "step": 7555, "token_acc": 0.9351415094339622, "train_speed(iter/s)": 0.241416 }, { "epoch": 0.5762634347130117, "grad_norm": 1.1993964910507202, "learning_rate": 9.67581566918431e-05, "loss": 0.17165684700012207, "memory(GiB)": 122.96, "step": 7560, "token_acc": 0.9263410728582866, "train_speed(iter/s)": 0.241448 }, { "epoch": 0.5766445613232716, "grad_norm": 1.4518812894821167, "learning_rate": 9.675391414933816e-05, "loss": 0.17632282972335817, "memory(GiB)": 122.96, "step": 7565, "token_acc": 0.9362594352809617, "train_speed(iter/s)": 0.241491 }, { "epoch": 0.5770256879335315, "grad_norm": 0.6603596210479736, "learning_rate": 9.674966892571322e-05, "loss": 0.14193686246871948, "memory(GiB)": 122.96, "step": 7570, "token_acc": 0.9522497704315886, "train_speed(iter/s)": 0.241511 }, { "epoch": 0.5774068145437915, "grad_norm": 0.8104476928710938, "learning_rate": 9.674542102121172e-05, "loss": 0.15146863460540771, "memory(GiB)": 122.96, "step": 7575, "token_acc": 0.941064235743057, "train_speed(iter/s)": 0.241543 }, { "epoch": 0.5777879411540514, "grad_norm": 1.0258283615112305, "learning_rate": 9.674117043607723e-05, "loss": 0.11542356014251709, "memory(GiB)": 122.96, "step": 7580, "token_acc": 0.9261727762112278, "train_speed(iter/s)": 0.241587 }, { "epoch": 0.5781690677643113, "grad_norm": 0.9134455323219299, "learning_rate": 9.673691717055352e-05, "loss": 0.1304369330406189, "memory(GiB)": 122.96, "step": 7585, "token_acc": 0.9425587467362925, "train_speed(iter/s)": 0.241607 }, { "epoch": 0.5785501943745712, "grad_norm": 0.8701491355895996, "learning_rate": 9.673266122488452e-05, "loss": 0.18219377994537353, "memory(GiB)": 122.96, "step": 7590, "token_acc": 0.9176212527418962, "train_speed(iter/s)": 0.241639 }, { "epoch": 0.5789313209848311, "grad_norm": 0.7204287648200989, "learning_rate": 9.672840259931424e-05, "loss": 0.15706671476364137, "memory(GiB)": 122.96, "step": 7595, "token_acc": 0.9355893097481254, "train_speed(iter/s)": 0.241681 }, { "epoch": 0.579312447595091, "grad_norm": 2.1629350185394287, "learning_rate": 9.672414129408692e-05, "loss": 0.20680177211761475, "memory(GiB)": 122.96, "step": 7600, "token_acc": 0.9207729468599034, "train_speed(iter/s)": 0.241728 }, { "epoch": 0.579312447595091, "eval_loss": 0.1314232051372528, "eval_runtime": 174.8925, "eval_samples_per_second": 3.03, "eval_steps_per_second": 3.03, "eval_token_acc": 0.9345295464128667, "step": 7600 }, { "epoch": 0.5796935742053511, "grad_norm": 0.8661823272705078, "learning_rate": 9.671987730944694e-05, "loss": 0.172832715511322, "memory(GiB)": 122.96, "step": 7605, "token_acc": 0.9341598075585439, "train_speed(iter/s)": 0.240418 }, { "epoch": 0.580074700815611, "grad_norm": 1.6881942749023438, "learning_rate": 9.671561064563879e-05, "loss": 0.15058627128601074, "memory(GiB)": 122.96, "step": 7610, "token_acc": 0.9387894521109441, "train_speed(iter/s)": 0.240442 }, { "epoch": 0.5804558274258709, "grad_norm": 2.240844964981079, "learning_rate": 9.671134130290715e-05, "loss": 0.18901643753051758, "memory(GiB)": 122.96, "step": 7615, "token_acc": 0.9326524098308331, "train_speed(iter/s)": 0.240494 }, { "epoch": 0.5808369540361308, "grad_norm": 1.1816797256469727, "learning_rate": 9.670706928149686e-05, "loss": 0.143050479888916, "memory(GiB)": 122.96, "step": 7620, "token_acc": 0.9468230160894464, "train_speed(iter/s)": 0.240537 }, { "epoch": 0.5812180806463907, "grad_norm": 0.7439324855804443, "learning_rate": 9.67027945816529e-05, "loss": 0.20845270156860352, "memory(GiB)": 122.96, "step": 7625, "token_acc": 0.9161425576519916, "train_speed(iter/s)": 0.240584 }, { "epoch": 0.5815992072566507, "grad_norm": 0.6814951300621033, "learning_rate": 9.66985172036204e-05, "loss": 0.15073174238204956, "memory(GiB)": 122.96, "step": 7630, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.240616 }, { "epoch": 0.5819803338669106, "grad_norm": 0.7176348567008972, "learning_rate": 9.669423714764463e-05, "loss": 0.1744018316268921, "memory(GiB)": 122.96, "step": 7635, "token_acc": 0.9290436315557514, "train_speed(iter/s)": 0.240647 }, { "epoch": 0.5823614604771705, "grad_norm": 1.4978859424591064, "learning_rate": 9.668995441397107e-05, "loss": 0.1665830135345459, "memory(GiB)": 122.96, "step": 7640, "token_acc": 0.9419152276295133, "train_speed(iter/s)": 0.24071 }, { "epoch": 0.5827425870874304, "grad_norm": 0.45738324522972107, "learning_rate": 9.668566900284525e-05, "loss": 0.15068110227584838, "memory(GiB)": 122.96, "step": 7645, "token_acc": 0.9379823967501693, "train_speed(iter/s)": 0.240745 }, { "epoch": 0.5831237136976903, "grad_norm": 0.819275975227356, "learning_rate": 9.6681380914513e-05, "loss": 0.15307831764221191, "memory(GiB)": 122.96, "step": 7650, "token_acc": 0.9429140993113447, "train_speed(iter/s)": 0.240779 }, { "epoch": 0.5835048403079504, "grad_norm": 0.29432886838912964, "learning_rate": 9.667709014922017e-05, "loss": 0.10569866895675659, "memory(GiB)": 122.96, "step": 7655, "token_acc": 0.9602076124567474, "train_speed(iter/s)": 0.240843 }, { "epoch": 0.5838859669182103, "grad_norm": 1.4956772327423096, "learning_rate": 9.667279670721283e-05, "loss": 0.19539698362350463, "memory(GiB)": 122.96, "step": 7660, "token_acc": 0.9123711340206185, "train_speed(iter/s)": 0.240892 }, { "epoch": 0.5842670935284702, "grad_norm": 1.064120888710022, "learning_rate": 9.66685005887372e-05, "loss": 0.14177179336547852, "memory(GiB)": 122.96, "step": 7665, "token_acc": 0.9409473356185728, "train_speed(iter/s)": 0.240926 }, { "epoch": 0.5846482201387301, "grad_norm": 0.9683123230934143, "learning_rate": 9.666420179403962e-05, "loss": 0.13923001289367676, "memory(GiB)": 122.96, "step": 7670, "token_acc": 0.9499565595134666, "train_speed(iter/s)": 0.24095 }, { "epoch": 0.58502934674899, "grad_norm": 1.5154054164886475, "learning_rate": 9.66599003233666e-05, "loss": 0.18237149715423584, "memory(GiB)": 122.96, "step": 7675, "token_acc": 0.9219535551831458, "train_speed(iter/s)": 0.240996 }, { "epoch": 0.5854104733592499, "grad_norm": 1.4648553133010864, "learning_rate": 9.665559617696485e-05, "loss": 0.17306165695190429, "memory(GiB)": 122.96, "step": 7680, "token_acc": 0.948938611589214, "train_speed(iter/s)": 0.241036 }, { "epoch": 0.5857915999695099, "grad_norm": 0.9786561131477356, "learning_rate": 9.665128935508115e-05, "loss": 0.17935343980789184, "memory(GiB)": 122.96, "step": 7685, "token_acc": 0.9170506912442397, "train_speed(iter/s)": 0.24108 }, { "epoch": 0.5861727265797698, "grad_norm": 0.97187340259552, "learning_rate": 9.664697985796249e-05, "loss": 0.21822142601013184, "memory(GiB)": 122.96, "step": 7690, "token_acc": 0.9170681348933242, "train_speed(iter/s)": 0.241106 }, { "epoch": 0.5865538531900297, "grad_norm": 1.012854814529419, "learning_rate": 9.664266768585601e-05, "loss": 0.1480696201324463, "memory(GiB)": 122.96, "step": 7695, "token_acc": 0.9462025316455697, "train_speed(iter/s)": 0.241151 }, { "epoch": 0.5869349798002896, "grad_norm": 0.8676527738571167, "learning_rate": 9.663835283900899e-05, "loss": 0.1725999116897583, "memory(GiB)": 122.96, "step": 7700, "token_acc": 0.9398532227185705, "train_speed(iter/s)": 0.241188 }, { "epoch": 0.5873161064105495, "grad_norm": 0.7988055944442749, "learning_rate": 9.663403531766887e-05, "loss": 0.1398878812789917, "memory(GiB)": 122.96, "step": 7705, "token_acc": 0.9496844902025905, "train_speed(iter/s)": 0.241213 }, { "epoch": 0.5876972330208096, "grad_norm": 1.9096754789352417, "learning_rate": 9.662971512208323e-05, "loss": 0.14847960472106933, "memory(GiB)": 122.96, "step": 7710, "token_acc": 0.9306253770359944, "train_speed(iter/s)": 0.241255 }, { "epoch": 0.5880783596310695, "grad_norm": 0.7272353172302246, "learning_rate": 9.66253922524998e-05, "loss": 0.171694016456604, "memory(GiB)": 122.96, "step": 7715, "token_acc": 0.9302825552825553, "train_speed(iter/s)": 0.241296 }, { "epoch": 0.5884594862413294, "grad_norm": 1.2555022239685059, "learning_rate": 9.66210667091665e-05, "loss": 0.20573174953460693, "memory(GiB)": 122.96, "step": 7720, "token_acc": 0.9235801367826345, "train_speed(iter/s)": 0.241342 }, { "epoch": 0.5888406128515893, "grad_norm": 1.1017787456512451, "learning_rate": 9.661673849233139e-05, "loss": 0.16938560009002684, "memory(GiB)": 122.96, "step": 7725, "token_acc": 0.9296995973985754, "train_speed(iter/s)": 0.241394 }, { "epoch": 0.5892217394618492, "grad_norm": 0.6680178642272949, "learning_rate": 9.661240760224264e-05, "loss": 0.14452893733978273, "memory(GiB)": 122.96, "step": 7730, "token_acc": 0.9395299145299145, "train_speed(iter/s)": 0.241431 }, { "epoch": 0.5896028660721091, "grad_norm": 0.5053827166557312, "learning_rate": 9.660807403914863e-05, "loss": 0.17860064506530762, "memory(GiB)": 122.96, "step": 7735, "token_acc": 0.9355971896955504, "train_speed(iter/s)": 0.241468 }, { "epoch": 0.5899839926823691, "grad_norm": 0.874459445476532, "learning_rate": 9.660373780329785e-05, "loss": 0.19879472255706787, "memory(GiB)": 122.96, "step": 7740, "token_acc": 0.9262396694214876, "train_speed(iter/s)": 0.241509 }, { "epoch": 0.590365119292629, "grad_norm": 1.2819390296936035, "learning_rate": 9.659939889493897e-05, "loss": 0.15277471542358398, "memory(GiB)": 122.96, "step": 7745, "token_acc": 0.9369610466845079, "train_speed(iter/s)": 0.241554 }, { "epoch": 0.5907462459028889, "grad_norm": 0.9887340068817139, "learning_rate": 9.659505731432083e-05, "loss": 0.13909441232681274, "memory(GiB)": 122.96, "step": 7750, "token_acc": 0.9354932021984379, "train_speed(iter/s)": 0.241599 }, { "epoch": 0.5911273725131488, "grad_norm": 0.8788447380065918, "learning_rate": 9.659071306169236e-05, "loss": 0.1114037275314331, "memory(GiB)": 122.96, "step": 7755, "token_acc": 0.927102238354507, "train_speed(iter/s)": 0.241651 }, { "epoch": 0.5915084991234087, "grad_norm": 1.8797560930252075, "learning_rate": 9.658636613730271e-05, "loss": 0.14730302095413209, "memory(GiB)": 122.96, "step": 7760, "token_acc": 0.9441595441595442, "train_speed(iter/s)": 0.241687 }, { "epoch": 0.5918896257336688, "grad_norm": 0.9251717925071716, "learning_rate": 9.658201654140116e-05, "loss": 0.14082696437835693, "memory(GiB)": 122.96, "step": 7765, "token_acc": 0.9441519368183527, "train_speed(iter/s)": 0.241722 }, { "epoch": 0.5922707523439287, "grad_norm": 0.8361734747886658, "learning_rate": 9.657766427423713e-05, "loss": 0.23255736827850343, "memory(GiB)": 122.96, "step": 7770, "token_acc": 0.9120903454384411, "train_speed(iter/s)": 0.241756 }, { "epoch": 0.5926518789541886, "grad_norm": 1.2354007959365845, "learning_rate": 9.65733093360602e-05, "loss": 0.1532285213470459, "memory(GiB)": 122.96, "step": 7775, "token_acc": 0.9359342130426468, "train_speed(iter/s)": 0.241792 }, { "epoch": 0.5930330055644485, "grad_norm": 0.7249266505241394, "learning_rate": 9.656895172712009e-05, "loss": 0.19975624084472657, "memory(GiB)": 122.96, "step": 7780, "token_acc": 0.9169244365885992, "train_speed(iter/s)": 0.241829 }, { "epoch": 0.5934141321747084, "grad_norm": 0.6983355283737183, "learning_rate": 9.656459144766671e-05, "loss": 0.1816406726837158, "memory(GiB)": 122.96, "step": 7785, "token_acc": 0.9277124712876639, "train_speed(iter/s)": 0.241853 }, { "epoch": 0.5937952587849684, "grad_norm": 0.7544824481010437, "learning_rate": 9.65602284979501e-05, "loss": 0.12894355058670043, "memory(GiB)": 122.96, "step": 7790, "token_acc": 0.9368944099378882, "train_speed(iter/s)": 0.241895 }, { "epoch": 0.5941763853952283, "grad_norm": 0.7422509789466858, "learning_rate": 9.655586287822045e-05, "loss": 0.18926039934158326, "memory(GiB)": 122.96, "step": 7795, "token_acc": 0.9182325308279506, "train_speed(iter/s)": 0.241924 }, { "epoch": 0.5945575120054882, "grad_norm": 1.2811518907546997, "learning_rate": 9.655149458872812e-05, "loss": 0.16706838607788085, "memory(GiB)": 122.96, "step": 7800, "token_acc": 0.9291871921182266, "train_speed(iter/s)": 0.241975 }, { "epoch": 0.5945575120054882, "eval_loss": 0.13117600977420807, "eval_runtime": 172.1513, "eval_samples_per_second": 3.079, "eval_steps_per_second": 3.079, "eval_token_acc": 0.9363743148003132, "step": 7800 }, { "epoch": 0.5949386386157481, "grad_norm": 1.3139721155166626, "learning_rate": 9.654712362972359e-05, "loss": 0.23185529708862304, "memory(GiB)": 122.96, "step": 7805, "token_acc": 0.9358196326715197, "train_speed(iter/s)": 0.240726 }, { "epoch": 0.595319765226008, "grad_norm": 1.1425786018371582, "learning_rate": 9.654275000145753e-05, "loss": 0.1876257061958313, "memory(GiB)": 122.96, "step": 7810, "token_acc": 0.9300682834942312, "train_speed(iter/s)": 0.240778 }, { "epoch": 0.595700891836268, "grad_norm": 1.4417964220046997, "learning_rate": 9.653837370418074e-05, "loss": 0.16797358989715577, "memory(GiB)": 122.96, "step": 7815, "token_acc": 0.9380551127305853, "train_speed(iter/s)": 0.240812 }, { "epoch": 0.596082018446528, "grad_norm": 0.6133943796157837, "learning_rate": 9.653399473814417e-05, "loss": 0.19549913406372071, "memory(GiB)": 122.96, "step": 7820, "token_acc": 0.9250446162998215, "train_speed(iter/s)": 0.24087 }, { "epoch": 0.5964631450567879, "grad_norm": 1.1524935960769653, "learning_rate": 9.652961310359896e-05, "loss": 0.18271559476852417, "memory(GiB)": 122.96, "step": 7825, "token_acc": 0.9293619025194255, "train_speed(iter/s)": 0.240913 }, { "epoch": 0.5968442716670478, "grad_norm": 1.1153091192245483, "learning_rate": 9.652522880079637e-05, "loss": 0.1810246706008911, "memory(GiB)": 122.96, "step": 7830, "token_acc": 0.9281529448041936, "train_speed(iter/s)": 0.240935 }, { "epoch": 0.5972253982773077, "grad_norm": 0.9788456559181213, "learning_rate": 9.652084182998779e-05, "loss": 0.20587754249572754, "memory(GiB)": 122.96, "step": 7835, "token_acc": 0.925007944073721, "train_speed(iter/s)": 0.240964 }, { "epoch": 0.5976065248875676, "grad_norm": 1.002518892288208, "learning_rate": 9.651645219142483e-05, "loss": 0.22603607177734375, "memory(GiB)": 122.96, "step": 7840, "token_acc": 0.9084863837872071, "train_speed(iter/s)": 0.241017 }, { "epoch": 0.5979876514978276, "grad_norm": 1.1552704572677612, "learning_rate": 9.651205988535919e-05, "loss": 0.1674239754676819, "memory(GiB)": 122.96, "step": 7845, "token_acc": 0.9442746719396009, "train_speed(iter/s)": 0.241039 }, { "epoch": 0.5983687781080875, "grad_norm": 0.8209718465805054, "learning_rate": 9.650766491204277e-05, "loss": 0.1773926019668579, "memory(GiB)": 122.96, "step": 7850, "token_acc": 0.9268342115930279, "train_speed(iter/s)": 0.241086 }, { "epoch": 0.5987499047183474, "grad_norm": 1.494126319885254, "learning_rate": 9.650326727172758e-05, "loss": 0.17339322566986085, "memory(GiB)": 122.96, "step": 7855, "token_acc": 0.9263836646963998, "train_speed(iter/s)": 0.241147 }, { "epoch": 0.5991310313286073, "grad_norm": 1.327548623085022, "learning_rate": 9.649886696466581e-05, "loss": 0.15323047637939452, "memory(GiB)": 122.96, "step": 7860, "token_acc": 0.9318181818181818, "train_speed(iter/s)": 0.241191 }, { "epoch": 0.5995121579388673, "grad_norm": 0.7013490796089172, "learning_rate": 9.649446399110982e-05, "loss": 0.16441717147827148, "memory(GiB)": 122.96, "step": 7865, "token_acc": 0.9357724509691479, "train_speed(iter/s)": 0.241218 }, { "epoch": 0.5998932845491273, "grad_norm": 1.173106074333191, "learning_rate": 9.649005835131206e-05, "loss": 0.17687045335769652, "memory(GiB)": 122.96, "step": 7870, "token_acc": 0.9316129032258065, "train_speed(iter/s)": 0.241229 }, { "epoch": 0.6002744111593872, "grad_norm": 0.6432010531425476, "learning_rate": 9.648565004552522e-05, "loss": 0.15458996295928956, "memory(GiB)": 122.96, "step": 7875, "token_acc": 0.9373620777457138, "train_speed(iter/s)": 0.241255 }, { "epoch": 0.6006555377696471, "grad_norm": 1.331567645072937, "learning_rate": 9.648123907400204e-05, "loss": 0.18623952865600585, "memory(GiB)": 122.96, "step": 7880, "token_acc": 0.9336827252570047, "train_speed(iter/s)": 0.241284 }, { "epoch": 0.601036664379907, "grad_norm": 0.9318958520889282, "learning_rate": 9.64768254369955e-05, "loss": 0.1972717046737671, "memory(GiB)": 122.96, "step": 7885, "token_acc": 0.9313154831199069, "train_speed(iter/s)": 0.241302 }, { "epoch": 0.6014177909901669, "grad_norm": 1.2176384925842285, "learning_rate": 9.647240913475871e-05, "loss": 0.13848457336425782, "memory(GiB)": 122.96, "step": 7890, "token_acc": 0.933203125, "train_speed(iter/s)": 0.241357 }, { "epoch": 0.6017989176004268, "grad_norm": 1.1331379413604736, "learning_rate": 9.64679901675449e-05, "loss": 0.25974676609039304, "memory(GiB)": 122.96, "step": 7895, "token_acc": 0.9138576779026217, "train_speed(iter/s)": 0.241383 }, { "epoch": 0.6021800442106868, "grad_norm": 0.4685252904891968, "learning_rate": 9.646356853560752e-05, "loss": 0.1462315559387207, "memory(GiB)": 122.96, "step": 7900, "token_acc": 0.9355575868372943, "train_speed(iter/s)": 0.241419 }, { "epoch": 0.6025611708209467, "grad_norm": 1.5465182065963745, "learning_rate": 9.645914423920008e-05, "loss": 0.21618452072143554, "memory(GiB)": 122.96, "step": 7905, "token_acc": 0.907625786163522, "train_speed(iter/s)": 0.241471 }, { "epoch": 0.6029422974312066, "grad_norm": 0.840312123298645, "learning_rate": 9.645471727857633e-05, "loss": 0.16696090698242189, "memory(GiB)": 122.96, "step": 7910, "token_acc": 0.9385840935371964, "train_speed(iter/s)": 0.24147 }, { "epoch": 0.6033234240414665, "grad_norm": 1.3208403587341309, "learning_rate": 9.645028765399012e-05, "loss": 0.24054486751556398, "memory(GiB)": 122.96, "step": 7915, "token_acc": 0.9050297816015883, "train_speed(iter/s)": 0.241515 }, { "epoch": 0.6037045506517265, "grad_norm": 0.7623984217643738, "learning_rate": 9.644585536569546e-05, "loss": 0.18515775203704835, "memory(GiB)": 122.96, "step": 7920, "token_acc": 0.9365043240782885, "train_speed(iter/s)": 0.241541 }, { "epoch": 0.6040856772619865, "grad_norm": 0.9161081314086914, "learning_rate": 9.644142041394653e-05, "loss": 0.13962360620498657, "memory(GiB)": 122.96, "step": 7925, "token_acc": 0.9414389291689905, "train_speed(iter/s)": 0.241567 }, { "epoch": 0.6044668038722464, "grad_norm": 0.9703963398933411, "learning_rate": 9.643698279899764e-05, "loss": 0.2150402307510376, "memory(GiB)": 122.96, "step": 7930, "token_acc": 0.9262391817466562, "train_speed(iter/s)": 0.24159 }, { "epoch": 0.6048479304825063, "grad_norm": 1.1734962463378906, "learning_rate": 9.643254252110329e-05, "loss": 0.21015410423278807, "memory(GiB)": 122.96, "step": 7935, "token_acc": 0.9268443893366398, "train_speed(iter/s)": 0.24163 }, { "epoch": 0.6052290570927662, "grad_norm": 1.4418681859970093, "learning_rate": 9.642809958051811e-05, "loss": 0.17848646640777588, "memory(GiB)": 122.96, "step": 7940, "token_acc": 0.9235867446393762, "train_speed(iter/s)": 0.24168 }, { "epoch": 0.6056101837030261, "grad_norm": 0.8545412421226501, "learning_rate": 9.642365397749688e-05, "loss": 0.12784312963485717, "memory(GiB)": 122.96, "step": 7945, "token_acc": 0.9490161001788909, "train_speed(iter/s)": 0.241722 }, { "epoch": 0.6059913103132861, "grad_norm": 0.8447553515434265, "learning_rate": 9.64192057122945e-05, "loss": 0.1770368218421936, "memory(GiB)": 122.96, "step": 7950, "token_acc": 0.920125786163522, "train_speed(iter/s)": 0.241766 }, { "epoch": 0.606372436923546, "grad_norm": 0.86783367395401, "learning_rate": 9.64147547851661e-05, "loss": 0.1523826837539673, "memory(GiB)": 122.96, "step": 7955, "token_acc": 0.9378681360440814, "train_speed(iter/s)": 0.241788 }, { "epoch": 0.6067535635338059, "grad_norm": 0.6750895977020264, "learning_rate": 9.64103011963669e-05, "loss": 0.11448771953582763, "memory(GiB)": 122.96, "step": 7960, "token_acc": 0.9375757575757576, "train_speed(iter/s)": 0.241828 }, { "epoch": 0.6071346901440658, "grad_norm": 0.9182257652282715, "learning_rate": 9.640584494615231e-05, "loss": 0.1699825644493103, "memory(GiB)": 122.96, "step": 7965, "token_acc": 0.9289617486338798, "train_speed(iter/s)": 0.241871 }, { "epoch": 0.6075158167543258, "grad_norm": 1.0282111167907715, "learning_rate": 9.640138603477783e-05, "loss": 0.1932484745979309, "memory(GiB)": 122.96, "step": 7970, "token_acc": 0.9180645161290323, "train_speed(iter/s)": 0.241906 }, { "epoch": 0.6078969433645857, "grad_norm": 0.7538350224494934, "learning_rate": 9.639692446249922e-05, "loss": 0.1876423716545105, "memory(GiB)": 122.96, "step": 7975, "token_acc": 0.9247138769670958, "train_speed(iter/s)": 0.241931 }, { "epoch": 0.6082780699748457, "grad_norm": 0.5646142959594727, "learning_rate": 9.639246022957229e-05, "loss": 0.14887337684631347, "memory(GiB)": 122.96, "step": 7980, "token_acc": 0.941722972972973, "train_speed(iter/s)": 0.241989 }, { "epoch": 0.6086591965851056, "grad_norm": 2.0593976974487305, "learning_rate": 9.638799333625305e-05, "loss": 0.16283581256866456, "memory(GiB)": 122.96, "step": 7985, "token_acc": 0.9440847557386698, "train_speed(iter/s)": 0.242029 }, { "epoch": 0.6090403231953655, "grad_norm": 0.3530195653438568, "learning_rate": 9.638352378279765e-05, "loss": 0.15430002212524413, "memory(GiB)": 122.96, "step": 7990, "token_acc": 0.9292753623188406, "train_speed(iter/s)": 0.242078 }, { "epoch": 0.6094214498056254, "grad_norm": 1.5503309965133667, "learning_rate": 9.637905156946243e-05, "loss": 0.22461538314819335, "memory(GiB)": 122.96, "step": 7995, "token_acc": 0.9164294088092447, "train_speed(iter/s)": 0.242105 }, { "epoch": 0.6098025764158853, "grad_norm": 1.5756884813308716, "learning_rate": 9.63745766965038e-05, "loss": 0.1813715934753418, "memory(GiB)": 122.96, "step": 8000, "token_acc": 0.9185963237049415, "train_speed(iter/s)": 0.242141 }, { "epoch": 0.6098025764158853, "eval_loss": 0.13106344640254974, "eval_runtime": 189.8956, "eval_samples_per_second": 2.791, "eval_steps_per_second": 2.791, "eval_token_acc": 0.9363366664658755, "step": 8000 }, { "epoch": 0.6101837030261453, "grad_norm": 1.428428292274475, "learning_rate": 9.637009916417843e-05, "loss": 0.20544662475585937, "memory(GiB)": 122.96, "step": 8005, "token_acc": 0.9358545229522952, "train_speed(iter/s)": 0.24077 }, { "epoch": 0.6105648296364052, "grad_norm": 0.12143536657094955, "learning_rate": 9.636561897274303e-05, "loss": 0.12355036735534668, "memory(GiB)": 122.96, "step": 8010, "token_acc": 0.9426017874875869, "train_speed(iter/s)": 0.240799 }, { "epoch": 0.6109459562466651, "grad_norm": 0.2754247486591339, "learning_rate": 9.636113612245457e-05, "loss": 0.16331074237823487, "memory(GiB)": 122.96, "step": 8015, "token_acc": 0.9207863544376987, "train_speed(iter/s)": 0.240845 }, { "epoch": 0.611327082856925, "grad_norm": 0.7716213464736938, "learning_rate": 9.635665061357007e-05, "loss": 0.19358372688293457, "memory(GiB)": 122.96, "step": 8020, "token_acc": 0.9422433527029407, "train_speed(iter/s)": 0.240863 }, { "epoch": 0.611708209467185, "grad_norm": 1.4119852781295776, "learning_rate": 9.63521624463468e-05, "loss": 0.19928573369979857, "memory(GiB)": 122.96, "step": 8025, "token_acc": 0.9114077669902912, "train_speed(iter/s)": 0.240907 }, { "epoch": 0.612089336077445, "grad_norm": 1.0648599863052368, "learning_rate": 9.63476716210421e-05, "loss": 0.24821348190307618, "memory(GiB)": 122.96, "step": 8030, "token_acc": 0.8982778415614237, "train_speed(iter/s)": 0.240949 }, { "epoch": 0.6124704626877049, "grad_norm": 0.8465040922164917, "learning_rate": 9.634317813791351e-05, "loss": 0.10390888452529908, "memory(GiB)": 122.96, "step": 8035, "token_acc": 0.9552734375, "train_speed(iter/s)": 0.240969 }, { "epoch": 0.6128515892979648, "grad_norm": 1.271389365196228, "learning_rate": 9.633868199721872e-05, "loss": 0.1562321424484253, "memory(GiB)": 122.96, "step": 8040, "token_acc": 0.93179694799128, "train_speed(iter/s)": 0.241017 }, { "epoch": 0.6132327159082247, "grad_norm": 0.5480735301971436, "learning_rate": 9.633418319921555e-05, "loss": 0.18682260513305665, "memory(GiB)": 122.96, "step": 8045, "token_acc": 0.9414935577175914, "train_speed(iter/s)": 0.241039 }, { "epoch": 0.6136138425184846, "grad_norm": 0.9608625769615173, "learning_rate": 9.6329681744162e-05, "loss": 0.17028530836105346, "memory(GiB)": 122.96, "step": 8050, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.241045 }, { "epoch": 0.6139949691287445, "grad_norm": 1.0030444860458374, "learning_rate": 9.632517763231619e-05, "loss": 0.19159598350524903, "memory(GiB)": 122.96, "step": 8055, "token_acc": 0.9214901477832512, "train_speed(iter/s)": 0.241097 }, { "epoch": 0.6143760957390045, "grad_norm": 0.7449002265930176, "learning_rate": 9.632067086393642e-05, "loss": 0.16223336458206178, "memory(GiB)": 122.96, "step": 8060, "token_acc": 0.93519882179676, "train_speed(iter/s)": 0.241138 }, { "epoch": 0.6147572223492644, "grad_norm": 1.9718002080917358, "learning_rate": 9.631616143928112e-05, "loss": 0.21803746223449708, "memory(GiB)": 122.96, "step": 8065, "token_acc": 0.9145274212368728, "train_speed(iter/s)": 0.24118 }, { "epoch": 0.6151383489595244, "grad_norm": 1.1464660167694092, "learning_rate": 9.631164935860892e-05, "loss": 0.1557462692260742, "memory(GiB)": 122.96, "step": 8070, "token_acc": 0.9365896719140797, "train_speed(iter/s)": 0.241192 }, { "epoch": 0.6155194755697843, "grad_norm": 0.9512729644775391, "learning_rate": 9.630713462217853e-05, "loss": 0.16249715089797973, "memory(GiB)": 122.96, "step": 8075, "token_acc": 0.933440440670186, "train_speed(iter/s)": 0.241232 }, { "epoch": 0.6159006021800442, "grad_norm": 0.874847948551178, "learning_rate": 9.630261723024885e-05, "loss": 0.16679707765579224, "memory(GiB)": 122.96, "step": 8080, "token_acc": 0.9261477045908184, "train_speed(iter/s)": 0.241277 }, { "epoch": 0.6162817287903042, "grad_norm": 0.9450111985206604, "learning_rate": 9.629809718307895e-05, "loss": 0.12746152877807618, "memory(GiB)": 122.96, "step": 8085, "token_acc": 0.9326716738197425, "train_speed(iter/s)": 0.241318 }, { "epoch": 0.6166628554005641, "grad_norm": 1.5321428775787354, "learning_rate": 9.629357448092803e-05, "loss": 0.20208725929260254, "memory(GiB)": 122.96, "step": 8090, "token_acc": 0.9392151481888035, "train_speed(iter/s)": 0.241323 }, { "epoch": 0.617043982010824, "grad_norm": 1.7236593961715698, "learning_rate": 9.628904912405544e-05, "loss": 0.2043161153793335, "memory(GiB)": 122.96, "step": 8095, "token_acc": 0.9239969135802469, "train_speed(iter/s)": 0.241352 }, { "epoch": 0.6174251086210839, "grad_norm": 0.9077289700508118, "learning_rate": 9.628452111272069e-05, "loss": 0.14441049098968506, "memory(GiB)": 122.96, "step": 8100, "token_acc": 0.9339187705817783, "train_speed(iter/s)": 0.241388 }, { "epoch": 0.6178062352313438, "grad_norm": 0.9347286224365234, "learning_rate": 9.627999044718343e-05, "loss": 0.14001425504684448, "memory(GiB)": 122.96, "step": 8105, "token_acc": 0.947243627741553, "train_speed(iter/s)": 0.241383 }, { "epoch": 0.6181873618416038, "grad_norm": 1.159408688545227, "learning_rate": 9.62754571277035e-05, "loss": 0.18200641870498657, "memory(GiB)": 122.96, "step": 8110, "token_acc": 0.9196542893725992, "train_speed(iter/s)": 0.241434 }, { "epoch": 0.6185684884518637, "grad_norm": 0.8021283745765686, "learning_rate": 9.627092115454084e-05, "loss": 0.1689950108528137, "memory(GiB)": 122.96, "step": 8115, "token_acc": 0.93241323320479, "train_speed(iter/s)": 0.241464 }, { "epoch": 0.6189496150621236, "grad_norm": 0.8328059315681458, "learning_rate": 9.626638252795556e-05, "loss": 0.18900158405303955, "memory(GiB)": 122.96, "step": 8120, "token_acc": 0.9310391001606856, "train_speed(iter/s)": 0.241476 }, { "epoch": 0.6193307416723836, "grad_norm": 0.6116961240768433, "learning_rate": 9.626184124820797e-05, "loss": 0.15584073066711426, "memory(GiB)": 122.96, "step": 8125, "token_acc": 0.9136435331230284, "train_speed(iter/s)": 0.241512 }, { "epoch": 0.6197118682826435, "grad_norm": 1.0864230394363403, "learning_rate": 9.625729731555845e-05, "loss": 0.19368693828582764, "memory(GiB)": 122.96, "step": 8130, "token_acc": 0.9302884615384616, "train_speed(iter/s)": 0.241539 }, { "epoch": 0.6200929948929034, "grad_norm": 1.4659103155136108, "learning_rate": 9.62527507302676e-05, "loss": 0.1807619094848633, "memory(GiB)": 122.96, "step": 8135, "token_acc": 0.9343296148179331, "train_speed(iter/s)": 0.241575 }, { "epoch": 0.6204741215031634, "grad_norm": 0.9813858866691589, "learning_rate": 9.624820149259612e-05, "loss": 0.1562959671020508, "memory(GiB)": 122.96, "step": 8140, "token_acc": 0.9450072358900145, "train_speed(iter/s)": 0.241621 }, { "epoch": 0.6208552481134233, "grad_norm": 0.6769987940788269, "learning_rate": 9.624364960280492e-05, "loss": 0.16451846361160277, "memory(GiB)": 122.96, "step": 8145, "token_acc": 0.9341119435874835, "train_speed(iter/s)": 0.241658 }, { "epoch": 0.6212363747236832, "grad_norm": 1.4193699359893799, "learning_rate": 9.6239095061155e-05, "loss": 0.2132810592651367, "memory(GiB)": 122.96, "step": 8150, "token_acc": 0.9216338880484115, "train_speed(iter/s)": 0.241706 }, { "epoch": 0.6216175013339431, "grad_norm": 2.2731168270111084, "learning_rate": 9.623453786790755e-05, "loss": 0.17738310098648072, "memory(GiB)": 122.96, "step": 8155, "token_acc": 0.9500672172076051, "train_speed(iter/s)": 0.241702 }, { "epoch": 0.621998627944203, "grad_norm": 0.8574259877204895, "learning_rate": 9.622997802332392e-05, "loss": 0.15275660753250123, "memory(GiB)": 122.96, "step": 8160, "token_acc": 0.9302240176276166, "train_speed(iter/s)": 0.241754 }, { "epoch": 0.622379754554463, "grad_norm": 1.5105760097503662, "learning_rate": 9.622541552766557e-05, "loss": 0.18044419288635255, "memory(GiB)": 122.96, "step": 8165, "token_acc": 0.911191662890802, "train_speed(iter/s)": 0.241807 }, { "epoch": 0.622760881164723, "grad_norm": 0.9093693494796753, "learning_rate": 9.622085038119417e-05, "loss": 0.12229187488555908, "memory(GiB)": 122.96, "step": 8170, "token_acc": 0.9341389728096676, "train_speed(iter/s)": 0.241851 }, { "epoch": 0.6231420077749829, "grad_norm": 0.7494461536407471, "learning_rate": 9.621628258417148e-05, "loss": 0.21604681015014648, "memory(GiB)": 122.96, "step": 8175, "token_acc": 0.9219075052008321, "train_speed(iter/s)": 0.241873 }, { "epoch": 0.6235231343852428, "grad_norm": 1.2331860065460205, "learning_rate": 9.621171213685944e-05, "loss": 0.1693800926208496, "memory(GiB)": 122.96, "step": 8180, "token_acc": 0.9256071906674316, "train_speed(iter/s)": 0.241919 }, { "epoch": 0.6239042609955027, "grad_norm": 0.7395375967025757, "learning_rate": 9.62071390395202e-05, "loss": 0.1824579954147339, "memory(GiB)": 122.96, "step": 8185, "token_acc": 0.9351615152219381, "train_speed(iter/s)": 0.241964 }, { "epoch": 0.6242853876057627, "grad_norm": 0.9445475339889526, "learning_rate": 9.620256329241591e-05, "loss": 0.21112446784973143, "memory(GiB)": 122.96, "step": 8190, "token_acc": 0.9307549791570171, "train_speed(iter/s)": 0.24198 }, { "epoch": 0.6246665142160226, "grad_norm": 1.097900629043579, "learning_rate": 9.619798489580905e-05, "loss": 0.17850127220153808, "memory(GiB)": 122.96, "step": 8195, "token_acc": 0.9275023386342376, "train_speed(iter/s)": 0.242014 }, { "epoch": 0.6250476408262825, "grad_norm": 0.8079699873924255, "learning_rate": 9.619340384996214e-05, "loss": 0.14135751724243165, "memory(GiB)": 122.96, "step": 8200, "token_acc": 0.9340305711987128, "train_speed(iter/s)": 0.242057 }, { "epoch": 0.6250476408262825, "eval_loss": 0.1273099184036255, "eval_runtime": 171.4553, "eval_samples_per_second": 3.091, "eval_steps_per_second": 3.091, "eval_token_acc": 0.9369465694837661, "step": 8200 }, { "epoch": 0.6254287674365424, "grad_norm": 1.4148907661437988, "learning_rate": 9.618882015513788e-05, "loss": 0.1710277318954468, "memory(GiB)": 122.96, "step": 8205, "token_acc": 0.9372786871258124, "train_speed(iter/s)": 0.240858 }, { "epoch": 0.6258098940468023, "grad_norm": 0.8310389518737793, "learning_rate": 9.618423381159914e-05, "loss": 0.23213915824890136, "memory(GiB)": 122.96, "step": 8210, "token_acc": 0.9021739130434783, "train_speed(iter/s)": 0.240887 }, { "epoch": 0.6261910206570622, "grad_norm": 0.9546253085136414, "learning_rate": 9.617964481960888e-05, "loss": 0.14814151525497438, "memory(GiB)": 122.96, "step": 8215, "token_acc": 0.9305822696275573, "train_speed(iter/s)": 0.240917 }, { "epoch": 0.6265721472673222, "grad_norm": 0.8442847728729248, "learning_rate": 9.617505317943032e-05, "loss": 0.20111682415008544, "memory(GiB)": 122.96, "step": 8220, "token_acc": 0.9201833110559481, "train_speed(iter/s)": 0.240943 }, { "epoch": 0.6269532738775822, "grad_norm": 0.5700371861457825, "learning_rate": 9.617045889132672e-05, "loss": 0.13059465885162352, "memory(GiB)": 122.96, "step": 8225, "token_acc": 0.9474576271186441, "train_speed(iter/s)": 0.240978 }, { "epoch": 0.6273344004878421, "grad_norm": 0.7776800394058228, "learning_rate": 9.616586195556157e-05, "loss": 0.18340636491775514, "memory(GiB)": 122.96, "step": 8230, "token_acc": 0.9301075268817204, "train_speed(iter/s)": 0.241028 }, { "epoch": 0.627715527098102, "grad_norm": 1.103229284286499, "learning_rate": 9.616126237239847e-05, "loss": 0.1375953435897827, "memory(GiB)": 122.96, "step": 8235, "token_acc": 0.9381225475399939, "train_speed(iter/s)": 0.241068 }, { "epoch": 0.6280966537083619, "grad_norm": 1.4373728036880493, "learning_rate": 9.615666014210119e-05, "loss": 0.17645232677459716, "memory(GiB)": 122.96, "step": 8240, "token_acc": 0.9376840039254171, "train_speed(iter/s)": 0.241074 }, { "epoch": 0.6284777803186219, "grad_norm": 0.907810628414154, "learning_rate": 9.615205526493363e-05, "loss": 0.2072843551635742, "memory(GiB)": 122.96, "step": 8245, "token_acc": 0.922324398356486, "train_speed(iter/s)": 0.241109 }, { "epoch": 0.6288589069288818, "grad_norm": 1.4297528266906738, "learning_rate": 9.614744774115989e-05, "loss": 0.11596149206161499, "memory(GiB)": 122.96, "step": 8250, "token_acc": 0.9546319796954315, "train_speed(iter/s)": 0.241155 }, { "epoch": 0.6292400335391417, "grad_norm": 0.8754181861877441, "learning_rate": 9.614283757104417e-05, "loss": 0.11538888216018676, "memory(GiB)": 122.96, "step": 8255, "token_acc": 0.935546875, "train_speed(iter/s)": 0.241193 }, { "epoch": 0.6296211601494016, "grad_norm": 0.6612743735313416, "learning_rate": 9.613822475485083e-05, "loss": 0.15533294677734374, "memory(GiB)": 122.96, "step": 8260, "token_acc": 0.9368171021377673, "train_speed(iter/s)": 0.241214 }, { "epoch": 0.6300022867596615, "grad_norm": 1.3262935876846313, "learning_rate": 9.613360929284442e-05, "loss": 0.1915029764175415, "memory(GiB)": 122.96, "step": 8265, "token_acc": 0.9227589208006962, "train_speed(iter/s)": 0.24125 }, { "epoch": 0.6303834133699215, "grad_norm": 1.2241666316986084, "learning_rate": 9.612899118528959e-05, "loss": 0.1591894030570984, "memory(GiB)": 122.96, "step": 8270, "token_acc": 0.940260403369926, "train_speed(iter/s)": 0.24129 }, { "epoch": 0.6307645399801814, "grad_norm": 1.1499006748199463, "learning_rate": 9.61243704324512e-05, "loss": 0.1767812490463257, "memory(GiB)": 122.96, "step": 8275, "token_acc": 0.9336655986709387, "train_speed(iter/s)": 0.241304 }, { "epoch": 0.6311456665904414, "grad_norm": 1.2670087814331055, "learning_rate": 9.61197470345942e-05, "loss": 0.18676252365112306, "memory(GiB)": 122.96, "step": 8280, "token_acc": 0.9348773217152029, "train_speed(iter/s)": 0.241345 }, { "epoch": 0.6315267932007013, "grad_norm": 0.9649338722229004, "learning_rate": 9.611512099198372e-05, "loss": 0.17513937950134278, "memory(GiB)": 122.96, "step": 8285, "token_acc": 0.926073926073926, "train_speed(iter/s)": 0.241388 }, { "epoch": 0.6319079198109612, "grad_norm": 1.201296091079712, "learning_rate": 9.611049230488506e-05, "loss": 0.2051142692565918, "memory(GiB)": 122.96, "step": 8290, "token_acc": 0.923993676273866, "train_speed(iter/s)": 0.241392 }, { "epoch": 0.6322890464212211, "grad_norm": 0.8939799666404724, "learning_rate": 9.610586097356365e-05, "loss": 0.17460932731628417, "memory(GiB)": 122.96, "step": 8295, "token_acc": 0.9313154831199069, "train_speed(iter/s)": 0.241411 }, { "epoch": 0.6326701730314811, "grad_norm": 0.15668103098869324, "learning_rate": 9.610122699828507e-05, "loss": 0.1212655782699585, "memory(GiB)": 122.96, "step": 8300, "token_acc": 0.949376652814507, "train_speed(iter/s)": 0.241459 }, { "epoch": 0.633051299641741, "grad_norm": 0.9086278080940247, "learning_rate": 9.609659037931504e-05, "loss": 0.17887375354766846, "memory(GiB)": 122.96, "step": 8305, "token_acc": 0.9343821155040364, "train_speed(iter/s)": 0.241492 }, { "epoch": 0.6334324262520009, "grad_norm": 0.7159310579299927, "learning_rate": 9.609195111691949e-05, "loss": 0.17997028827667236, "memory(GiB)": 122.96, "step": 8310, "token_acc": 0.927650618258353, "train_speed(iter/s)": 0.241537 }, { "epoch": 0.6338135528622608, "grad_norm": 1.4166866540908813, "learning_rate": 9.608730921136442e-05, "loss": 0.12555792331695556, "memory(GiB)": 122.96, "step": 8315, "token_acc": 0.9483708484180702, "train_speed(iter/s)": 0.241562 }, { "epoch": 0.6341946794725207, "grad_norm": 1.9215995073318481, "learning_rate": 9.608266466291605e-05, "loss": 0.153370463848114, "memory(GiB)": 122.96, "step": 8320, "token_acc": 0.941318522096112, "train_speed(iter/s)": 0.2416 }, { "epoch": 0.6345758060827807, "grad_norm": 0.7461358308792114, "learning_rate": 9.60780174718407e-05, "loss": 0.1777164578437805, "memory(GiB)": 122.96, "step": 8325, "token_acc": 0.9164391043145822, "train_speed(iter/s)": 0.241641 }, { "epoch": 0.6349569326930407, "grad_norm": 0.8541620969772339, "learning_rate": 9.60733676384049e-05, "loss": 0.15266958475112916, "memory(GiB)": 122.96, "step": 8330, "token_acc": 0.9271899886234357, "train_speed(iter/s)": 0.241678 }, { "epoch": 0.6353380593033006, "grad_norm": 0.5619824528694153, "learning_rate": 9.606871516287524e-05, "loss": 0.13736329078674317, "memory(GiB)": 122.96, "step": 8335, "token_acc": 0.9475948978023667, "train_speed(iter/s)": 0.241707 }, { "epoch": 0.6357191859135605, "grad_norm": 0.7591145038604736, "learning_rate": 9.606406004551856e-05, "loss": 0.1611067533493042, "memory(GiB)": 122.96, "step": 8340, "token_acc": 0.9353582554517134, "train_speed(iter/s)": 0.241723 }, { "epoch": 0.6361003125238204, "grad_norm": 0.8863813877105713, "learning_rate": 9.60594022866018e-05, "loss": 0.20532302856445311, "memory(GiB)": 122.96, "step": 8345, "token_acc": 0.9146280579131303, "train_speed(iter/s)": 0.241758 }, { "epoch": 0.6364814391340804, "grad_norm": 0.6345336437225342, "learning_rate": 9.605474188639208e-05, "loss": 0.1616098999977112, "memory(GiB)": 122.96, "step": 8350, "token_acc": 0.9428836777339216, "train_speed(iter/s)": 0.24179 }, { "epoch": 0.6368625657443403, "grad_norm": 0.7842341661453247, "learning_rate": 9.60500788451566e-05, "loss": 0.18547506332397462, "memory(GiB)": 122.96, "step": 8355, "token_acc": 0.9306521019529957, "train_speed(iter/s)": 0.241817 }, { "epoch": 0.6372436923546002, "grad_norm": 1.428398847579956, "learning_rate": 9.604541316316283e-05, "loss": 0.17235245704650878, "memory(GiB)": 122.96, "step": 8360, "token_acc": 0.9445856634468734, "train_speed(iter/s)": 0.241844 }, { "epoch": 0.6376248189648601, "grad_norm": 0.8446768522262573, "learning_rate": 9.604074484067827e-05, "loss": 0.17958345413208007, "memory(GiB)": 122.96, "step": 8365, "token_acc": 0.9331285444234405, "train_speed(iter/s)": 0.241878 }, { "epoch": 0.63800594557512, "grad_norm": 1.2249500751495361, "learning_rate": 9.603607387797065e-05, "loss": 0.20288915634155275, "memory(GiB)": 122.96, "step": 8370, "token_acc": 0.9316254111130344, "train_speed(iter/s)": 0.241898 }, { "epoch": 0.6383870721853799, "grad_norm": 0.8047518730163574, "learning_rate": 9.603140027530783e-05, "loss": 0.20430231094360352, "memory(GiB)": 122.96, "step": 8375, "token_acc": 0.9308029950942421, "train_speed(iter/s)": 0.241931 }, { "epoch": 0.63876819879564, "grad_norm": 1.311505675315857, "learning_rate": 9.602672403295782e-05, "loss": 0.15162469148635865, "memory(GiB)": 122.96, "step": 8380, "token_acc": 0.9390831390831391, "train_speed(iter/s)": 0.241947 }, { "epoch": 0.6391493254058999, "grad_norm": 0.35626307129859924, "learning_rate": 9.602204515118876e-05, "loss": 0.1750964641571045, "memory(GiB)": 122.96, "step": 8385, "token_acc": 0.9267333809864189, "train_speed(iter/s)": 0.24199 }, { "epoch": 0.6395304520161598, "grad_norm": 1.098136305809021, "learning_rate": 9.6017363630269e-05, "loss": 0.19403172731399537, "memory(GiB)": 122.96, "step": 8390, "token_acc": 0.9216266173752311, "train_speed(iter/s)": 0.242036 }, { "epoch": 0.6399115786264197, "grad_norm": 1.4040976762771606, "learning_rate": 9.601267947046697e-05, "loss": 0.22104952335357667, "memory(GiB)": 122.96, "step": 8395, "token_acc": 0.9206942590120161, "train_speed(iter/s)": 0.24208 }, { "epoch": 0.6402927052366796, "grad_norm": 1.5646140575408936, "learning_rate": 9.60079926720513e-05, "loss": 0.15823090076446533, "memory(GiB)": 122.96, "step": 8400, "token_acc": 0.9271961492178099, "train_speed(iter/s)": 0.24213 }, { "epoch": 0.6402927052366796, "eval_loss": 0.12958687543869019, "eval_runtime": 174.435, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "eval_token_acc": 0.9374435274983435, "step": 8400 }, { "epoch": 0.6406738318469396, "grad_norm": 0.38185179233551025, "learning_rate": 9.600330323529077e-05, "loss": 0.142455792427063, "memory(GiB)": 122.96, "step": 8405, "token_acc": 0.9377693401572189, "train_speed(iter/s)": 0.240942 }, { "epoch": 0.6410549584571995, "grad_norm": 1.0085633993148804, "learning_rate": 9.599861116045426e-05, "loss": 0.1553672432899475, "memory(GiB)": 122.96, "step": 8410, "token_acc": 0.9388057246257608, "train_speed(iter/s)": 0.240966 }, { "epoch": 0.6414360850674594, "grad_norm": 0.8816835880279541, "learning_rate": 9.599391644781086e-05, "loss": 0.15746684074401857, "memory(GiB)": 122.96, "step": 8415, "token_acc": 0.9401725431357839, "train_speed(iter/s)": 0.240993 }, { "epoch": 0.6418172116777193, "grad_norm": 0.8446353673934937, "learning_rate": 9.59892190976298e-05, "loss": 0.1980975389480591, "memory(GiB)": 122.96, "step": 8420, "token_acc": 0.9022608695652173, "train_speed(iter/s)": 0.24104 }, { "epoch": 0.6421983382879792, "grad_norm": 0.7561236023902893, "learning_rate": 9.598451911018044e-05, "loss": 0.1818060278892517, "memory(GiB)": 122.96, "step": 8425, "token_acc": 0.9297912713472486, "train_speed(iter/s)": 0.241086 }, { "epoch": 0.6425794648982392, "grad_norm": 1.3481884002685547, "learning_rate": 9.597981648573229e-05, "loss": 0.17112646102905274, "memory(GiB)": 122.96, "step": 8430, "token_acc": 0.926605504587156, "train_speed(iter/s)": 0.241115 }, { "epoch": 0.6429605915084992, "grad_norm": 1.0952452421188354, "learning_rate": 9.597511122455505e-05, "loss": 0.16461453437805176, "memory(GiB)": 122.96, "step": 8435, "token_acc": 0.9383355614973262, "train_speed(iter/s)": 0.241141 }, { "epoch": 0.6433417181187591, "grad_norm": 1.0959644317626953, "learning_rate": 9.597040332691854e-05, "loss": 0.19042859077453614, "memory(GiB)": 122.96, "step": 8440, "token_acc": 0.9300491336953711, "train_speed(iter/s)": 0.241149 }, { "epoch": 0.643722844729019, "grad_norm": 1.0455317497253418, "learning_rate": 9.59656927930927e-05, "loss": 0.19325727224349976, "memory(GiB)": 122.96, "step": 8445, "token_acc": 0.918869828456105, "train_speed(iter/s)": 0.241184 }, { "epoch": 0.6441039713392789, "grad_norm": 0.5525988340377808, "learning_rate": 9.596097962334771e-05, "loss": 0.13968425989151, "memory(GiB)": 122.96, "step": 8450, "token_acc": 0.9417139256458728, "train_speed(iter/s)": 0.241233 }, { "epoch": 0.6444850979495388, "grad_norm": 1.1237283945083618, "learning_rate": 9.595626381795381e-05, "loss": 0.1856292486190796, "memory(GiB)": 122.96, "step": 8455, "token_acc": 0.9213863060016906, "train_speed(iter/s)": 0.241274 }, { "epoch": 0.6448662245597988, "grad_norm": 0.9030933380126953, "learning_rate": 9.595154537718145e-05, "loss": 0.27615833282470703, "memory(GiB)": 122.96, "step": 8460, "token_acc": 0.9105993904503894, "train_speed(iter/s)": 0.24132 }, { "epoch": 0.6452473511700587, "grad_norm": 1.223082184791565, "learning_rate": 9.59468243013012e-05, "loss": 0.18834749460220337, "memory(GiB)": 122.96, "step": 8465, "token_acc": 0.9293639053254438, "train_speed(iter/s)": 0.241346 }, { "epoch": 0.6456284777803186, "grad_norm": 0.8325604200363159, "learning_rate": 9.594210059058379e-05, "loss": 0.10805976390838623, "memory(GiB)": 122.96, "step": 8470, "token_acc": 0.9526328444337581, "train_speed(iter/s)": 0.241379 }, { "epoch": 0.6460096043905785, "grad_norm": 2.5277583599090576, "learning_rate": 9.593737424530013e-05, "loss": 0.19726216793060303, "memory(GiB)": 122.96, "step": 8475, "token_acc": 0.9306829765545361, "train_speed(iter/s)": 0.241419 }, { "epoch": 0.6463907310008384, "grad_norm": 0.8176725506782532, "learning_rate": 9.593264526572122e-05, "loss": 0.20717239379882812, "memory(GiB)": 122.96, "step": 8480, "token_acc": 0.9232407317781435, "train_speed(iter/s)": 0.241433 }, { "epoch": 0.6467718576110985, "grad_norm": 0.9424812197685242, "learning_rate": 9.592791365211825e-05, "loss": 0.2273397207260132, "memory(GiB)": 122.96, "step": 8485, "token_acc": 0.9210587959709534, "train_speed(iter/s)": 0.241465 }, { "epoch": 0.6471529842213584, "grad_norm": 1.3571972846984863, "learning_rate": 9.592317940476258e-05, "loss": 0.1594170331954956, "memory(GiB)": 122.96, "step": 8490, "token_acc": 0.9413751181953263, "train_speed(iter/s)": 0.241467 }, { "epoch": 0.6475341108316183, "grad_norm": 0.8461487293243408, "learning_rate": 9.591844252392566e-05, "loss": 0.1853726863861084, "memory(GiB)": 122.96, "step": 8495, "token_acc": 0.9233050009883376, "train_speed(iter/s)": 0.241504 }, { "epoch": 0.6479152374418782, "grad_norm": 0.760529100894928, "learning_rate": 9.591370300987917e-05, "loss": 0.18645672798156737, "memory(GiB)": 122.96, "step": 8500, "token_acc": 0.9280104712041884, "train_speed(iter/s)": 0.241523 }, { "epoch": 0.6482963640521381, "grad_norm": 1.5562944412231445, "learning_rate": 9.590896086289486e-05, "loss": 0.15756999254226683, "memory(GiB)": 122.96, "step": 8505, "token_acc": 0.946322336398053, "train_speed(iter/s)": 0.241537 }, { "epoch": 0.6486774906623981, "grad_norm": 0.937748372554779, "learning_rate": 9.590421608324469e-05, "loss": 0.23643884658813477, "memory(GiB)": 122.96, "step": 8510, "token_acc": 0.92109375, "train_speed(iter/s)": 0.241563 }, { "epoch": 0.649058617272658, "grad_norm": 0.9871267676353455, "learning_rate": 9.589946867120076e-05, "loss": 0.20934031009674073, "memory(GiB)": 122.96, "step": 8515, "token_acc": 0.9365773646801531, "train_speed(iter/s)": 0.24158 }, { "epoch": 0.6494397438829179, "grad_norm": 1.434014081954956, "learning_rate": 9.58947186270353e-05, "loss": 0.1735867142677307, "memory(GiB)": 122.96, "step": 8520, "token_acc": 0.9406152327685474, "train_speed(iter/s)": 0.241603 }, { "epoch": 0.6498208704931778, "grad_norm": 0.746117889881134, "learning_rate": 9.58899659510207e-05, "loss": 0.1511477828025818, "memory(GiB)": 122.96, "step": 8525, "token_acc": 0.9349690803909835, "train_speed(iter/s)": 0.24163 }, { "epoch": 0.6502019971034377, "grad_norm": 1.4615434408187866, "learning_rate": 9.58852106434295e-05, "loss": 0.13082759380340575, "memory(GiB)": 122.96, "step": 8530, "token_acc": 0.953125, "train_speed(iter/s)": 0.241657 }, { "epoch": 0.6505831237136976, "grad_norm": 0.811945915222168, "learning_rate": 9.588045270453442e-05, "loss": 0.18232908248901367, "memory(GiB)": 122.96, "step": 8535, "token_acc": 0.9222222222222223, "train_speed(iter/s)": 0.241693 }, { "epoch": 0.6509642503239577, "grad_norm": 0.814946711063385, "learning_rate": 9.587569213460828e-05, "loss": 0.15288931131362915, "memory(GiB)": 122.96, "step": 8540, "token_acc": 0.9336523819882532, "train_speed(iter/s)": 0.241726 }, { "epoch": 0.6513453769342176, "grad_norm": 0.9621989727020264, "learning_rate": 9.587092893392409e-05, "loss": 0.2379392147064209, "memory(GiB)": 122.96, "step": 8545, "token_acc": 0.8843537414965986, "train_speed(iter/s)": 0.241771 }, { "epoch": 0.6517265035444775, "grad_norm": 0.9783580899238586, "learning_rate": 9.586616310275498e-05, "loss": 0.16908464431762696, "memory(GiB)": 122.96, "step": 8550, "token_acc": 0.9246058944482523, "train_speed(iter/s)": 0.241811 }, { "epoch": 0.6521076301547374, "grad_norm": 0.9110943675041199, "learning_rate": 9.586139464137426e-05, "loss": 0.14428446292877198, "memory(GiB)": 122.96, "step": 8555, "token_acc": 0.946236559139785, "train_speed(iter/s)": 0.241842 }, { "epoch": 0.6524887567649973, "grad_norm": 1.1597049236297607, "learning_rate": 9.58566235500554e-05, "loss": 0.15822073221206664, "memory(GiB)": 122.96, "step": 8560, "token_acc": 0.9395315053568186, "train_speed(iter/s)": 0.241869 }, { "epoch": 0.6528698833752573, "grad_norm": 0.99576735496521, "learning_rate": 9.585184982907196e-05, "loss": 0.16016092300415039, "memory(GiB)": 122.96, "step": 8565, "token_acc": 0.9336173233936812, "train_speed(iter/s)": 0.241894 }, { "epoch": 0.6532510099855172, "grad_norm": 0.5846201777458191, "learning_rate": 9.584707347869773e-05, "loss": 0.19196685552597045, "memory(GiB)": 122.96, "step": 8570, "token_acc": 0.9252013808975834, "train_speed(iter/s)": 0.241927 }, { "epoch": 0.6536321365957771, "grad_norm": 0.6323632597923279, "learning_rate": 9.584229449920659e-05, "loss": 0.1647853136062622, "memory(GiB)": 122.96, "step": 8575, "token_acc": 0.9167635433620088, "train_speed(iter/s)": 0.241959 }, { "epoch": 0.654013263206037, "grad_norm": 0.9840952754020691, "learning_rate": 9.583751289087257e-05, "loss": 0.15533188581466675, "memory(GiB)": 122.96, "step": 8580, "token_acc": 0.9332344213649851, "train_speed(iter/s)": 0.242011 }, { "epoch": 0.6543943898162969, "grad_norm": 1.1417115926742554, "learning_rate": 9.583272865396993e-05, "loss": 0.1474178671836853, "memory(GiB)": 122.96, "step": 8585, "token_acc": 0.9445531637312459, "train_speed(iter/s)": 0.24205 }, { "epoch": 0.6547755164265568, "grad_norm": 0.7346475124359131, "learning_rate": 9.582794178877297e-05, "loss": 0.14976317882537843, "memory(GiB)": 122.96, "step": 8590, "token_acc": 0.9408752327746741, "train_speed(iter/s)": 0.242065 }, { "epoch": 0.6551566430368169, "grad_norm": 0.7473490238189697, "learning_rate": 9.582315229555623e-05, "loss": 0.16570782661437988, "memory(GiB)": 122.96, "step": 8595, "token_acc": 0.9331460674157304, "train_speed(iter/s)": 0.242092 }, { "epoch": 0.6555377696470768, "grad_norm": 0.9431344270706177, "learning_rate": 9.581836017459433e-05, "loss": 0.19766793251037598, "memory(GiB)": 122.96, "step": 8600, "token_acc": 0.913049918530588, "train_speed(iter/s)": 0.242123 }, { "epoch": 0.6555377696470768, "eval_loss": 0.12851114571094513, "eval_runtime": 171.0549, "eval_samples_per_second": 3.098, "eval_steps_per_second": 3.098, "eval_token_acc": 0.9384073248599482, "step": 8600 }, { "epoch": 0.6559188962573367, "grad_norm": 1.3170934915542603, "learning_rate": 9.581356542616211e-05, "loss": 0.15280789136886597, "memory(GiB)": 122.96, "step": 8605, "token_acc": 0.9384182464454977, "train_speed(iter/s)": 0.241014 }, { "epoch": 0.6563000228675966, "grad_norm": 1.1195261478424072, "learning_rate": 9.580876805053452e-05, "loss": 0.15156619548797606, "memory(GiB)": 122.96, "step": 8610, "token_acc": 0.9429179566563467, "train_speed(iter/s)": 0.24105 }, { "epoch": 0.6566811494778565, "grad_norm": 1.047929048538208, "learning_rate": 9.580396804798666e-05, "loss": 0.15759152173995972, "memory(GiB)": 122.96, "step": 8615, "token_acc": 0.9396963123644252, "train_speed(iter/s)": 0.24107 }, { "epoch": 0.6570622760881165, "grad_norm": 1.3274258375167847, "learning_rate": 9.579916541879378e-05, "loss": 0.20240907669067382, "memory(GiB)": 122.96, "step": 8620, "token_acc": 0.919089245781663, "train_speed(iter/s)": 0.241108 }, { "epoch": 0.6574434026983764, "grad_norm": 1.9867382049560547, "learning_rate": 9.579436016323131e-05, "loss": 0.22017102241516112, "memory(GiB)": 122.96, "step": 8625, "token_acc": 0.9193030719853278, "train_speed(iter/s)": 0.241138 }, { "epoch": 0.6578245293086363, "grad_norm": 1.5564203262329102, "learning_rate": 9.578955228157478e-05, "loss": 0.14391725063323973, "memory(GiB)": 122.96, "step": 8630, "token_acc": 0.9433287950987066, "train_speed(iter/s)": 0.241165 }, { "epoch": 0.6582056559188962, "grad_norm": 0.45271944999694824, "learning_rate": 9.578474177409992e-05, "loss": 0.22846317291259766, "memory(GiB)": 122.96, "step": 8635, "token_acc": 0.926254997778765, "train_speed(iter/s)": 0.24118 }, { "epoch": 0.6585867825291561, "grad_norm": 0.39930853247642517, "learning_rate": 9.57799286410826e-05, "loss": 0.13359334468841552, "memory(GiB)": 122.96, "step": 8640, "token_acc": 0.9412148922273024, "train_speed(iter/s)": 0.241226 }, { "epoch": 0.6589679091394162, "grad_norm": 1.244123935699463, "learning_rate": 9.577511288279881e-05, "loss": 0.15227952003479003, "memory(GiB)": 122.96, "step": 8645, "token_acc": 0.946162998215348, "train_speed(iter/s)": 0.241272 }, { "epoch": 0.6593490357496761, "grad_norm": 1.2357984781265259, "learning_rate": 9.577029449952471e-05, "loss": 0.14404083490371705, "memory(GiB)": 122.96, "step": 8650, "token_acc": 0.94362292051756, "train_speed(iter/s)": 0.241276 }, { "epoch": 0.659730162359936, "grad_norm": 0.6363328695297241, "learning_rate": 9.576547349153664e-05, "loss": 0.2556620597839355, "memory(GiB)": 122.96, "step": 8655, "token_acc": 0.9197975770587333, "train_speed(iter/s)": 0.241307 }, { "epoch": 0.6601112889701959, "grad_norm": 0.6645667552947998, "learning_rate": 9.576064985911101e-05, "loss": 0.12260277271270752, "memory(GiB)": 122.96, "step": 8660, "token_acc": 0.9377148902406771, "train_speed(iter/s)": 0.24134 }, { "epoch": 0.6604924155804558, "grad_norm": 0.7569519281387329, "learning_rate": 9.575582360252451e-05, "loss": 0.12180485725402831, "memory(GiB)": 122.96, "step": 8665, "token_acc": 0.954295154185022, "train_speed(iter/s)": 0.241358 }, { "epoch": 0.6608735421907157, "grad_norm": 1.0172390937805176, "learning_rate": 9.575099472205383e-05, "loss": 0.17168790102005005, "memory(GiB)": 122.96, "step": 8670, "token_acc": 0.9368475991649269, "train_speed(iter/s)": 0.241378 }, { "epoch": 0.6612546688009757, "grad_norm": 1.1671638488769531, "learning_rate": 9.574616321797592e-05, "loss": 0.15445761680603026, "memory(GiB)": 122.96, "step": 8675, "token_acc": 0.9321253954558527, "train_speed(iter/s)": 0.241416 }, { "epoch": 0.6616357954112356, "grad_norm": 1.1544653177261353, "learning_rate": 9.574132909056783e-05, "loss": 0.18331100940704345, "memory(GiB)": 122.96, "step": 8680, "token_acc": 0.9309290953545232, "train_speed(iter/s)": 0.241457 }, { "epoch": 0.6620169220214955, "grad_norm": 0.5532286167144775, "learning_rate": 9.573649234010679e-05, "loss": 0.13530209064483642, "memory(GiB)": 122.96, "step": 8685, "token_acc": 0.9339246119733925, "train_speed(iter/s)": 0.241506 }, { "epoch": 0.6623980486317554, "grad_norm": 1.3802263736724854, "learning_rate": 9.573165296687016e-05, "loss": 0.1668811559677124, "memory(GiB)": 122.96, "step": 8690, "token_acc": 0.9258397932816538, "train_speed(iter/s)": 0.241545 }, { "epoch": 0.6627791752420154, "grad_norm": 1.3633880615234375, "learning_rate": 9.572681097113544e-05, "loss": 0.1721900224685669, "memory(GiB)": 122.96, "step": 8695, "token_acc": 0.943401287553648, "train_speed(iter/s)": 0.241558 }, { "epoch": 0.6631603018522754, "grad_norm": 1.5130645036697388, "learning_rate": 9.572196635318032e-05, "loss": 0.27863690853118894, "memory(GiB)": 122.96, "step": 8700, "token_acc": 0.8944376776289078, "train_speed(iter/s)": 0.2416 }, { "epoch": 0.6635414284625353, "grad_norm": 0.8275206685066223, "learning_rate": 9.571711911328261e-05, "loss": 0.12863738536834718, "memory(GiB)": 122.96, "step": 8705, "token_acc": 0.958993165527588, "train_speed(iter/s)": 0.241613 }, { "epoch": 0.6639225550727952, "grad_norm": 1.5509916543960571, "learning_rate": 9.571226925172027e-05, "loss": 0.13788354396820068, "memory(GiB)": 122.96, "step": 8710, "token_acc": 0.9416506717850288, "train_speed(iter/s)": 0.241662 }, { "epoch": 0.6643036816830551, "grad_norm": 0.9834699630737305, "learning_rate": 9.57074167687714e-05, "loss": 0.17092932462692262, "memory(GiB)": 122.96, "step": 8715, "token_acc": 0.9350145489815713, "train_speed(iter/s)": 0.241699 }, { "epoch": 0.664684808293315, "grad_norm": 1.0568833351135254, "learning_rate": 9.570256166471432e-05, "loss": 0.1065927267074585, "memory(GiB)": 122.96, "step": 8720, "token_acc": 0.9558321132259638, "train_speed(iter/s)": 0.24173 }, { "epoch": 0.665065934903575, "grad_norm": 1.631014347076416, "learning_rate": 9.569770393982738e-05, "loss": 0.16662125587463378, "memory(GiB)": 122.96, "step": 8725, "token_acc": 0.9376803231390652, "train_speed(iter/s)": 0.241754 }, { "epoch": 0.6654470615138349, "grad_norm": 0.6505218744277954, "learning_rate": 9.569284359438921e-05, "loss": 0.1329158902168274, "memory(GiB)": 122.96, "step": 8730, "token_acc": 0.9490716180371352, "train_speed(iter/s)": 0.241786 }, { "epoch": 0.6658281881240948, "grad_norm": 0.8702318072319031, "learning_rate": 9.568798062867849e-05, "loss": 0.10045559406280517, "memory(GiB)": 122.96, "step": 8735, "token_acc": 0.950937950937951, "train_speed(iter/s)": 0.241819 }, { "epoch": 0.6662093147343547, "grad_norm": 0.7056555151939392, "learning_rate": 9.568311504297409e-05, "loss": 0.15316239595413209, "memory(GiB)": 122.96, "step": 8740, "token_acc": 0.9529939535790911, "train_speed(iter/s)": 0.241848 }, { "epoch": 0.6665904413446146, "grad_norm": 1.7238225936889648, "learning_rate": 9.567824683755505e-05, "loss": 0.164631450176239, "memory(GiB)": 122.96, "step": 8745, "token_acc": 0.938570205479452, "train_speed(iter/s)": 0.241876 }, { "epoch": 0.6669715679548746, "grad_norm": 1.4183790683746338, "learning_rate": 9.567337601270053e-05, "loss": 0.1579514503479004, "memory(GiB)": 122.96, "step": 8750, "token_acc": 0.9435760579489134, "train_speed(iter/s)": 0.241904 }, { "epoch": 0.6673526945651346, "grad_norm": 1.1541823148727417, "learning_rate": 9.566850256868984e-05, "loss": 0.21323838233947753, "memory(GiB)": 122.96, "step": 8755, "token_acc": 0.9202320522117476, "train_speed(iter/s)": 0.241947 }, { "epoch": 0.6677338211753945, "grad_norm": 0.6535806655883789, "learning_rate": 9.566362650580245e-05, "loss": 0.15494590997695923, "memory(GiB)": 122.96, "step": 8760, "token_acc": 0.9462190352020861, "train_speed(iter/s)": 0.24199 }, { "epoch": 0.6681149477856544, "grad_norm": 1.1694241762161255, "learning_rate": 9.565874782431798e-05, "loss": 0.19834680557250978, "memory(GiB)": 122.96, "step": 8765, "token_acc": 0.9168311944718658, "train_speed(iter/s)": 0.242028 }, { "epoch": 0.6684960743959143, "grad_norm": 0.9871301054954529, "learning_rate": 9.565386652451622e-05, "loss": 0.17736284732818602, "memory(GiB)": 122.96, "step": 8770, "token_acc": 0.9304359007329305, "train_speed(iter/s)": 0.242047 }, { "epoch": 0.6688772010061742, "grad_norm": 1.199397325515747, "learning_rate": 9.564898260667707e-05, "loss": 0.15914329290390014, "memory(GiB)": 122.96, "step": 8775, "token_acc": 0.9309958970533383, "train_speed(iter/s)": 0.24209 }, { "epoch": 0.6692583276164342, "grad_norm": 0.4279326796531677, "learning_rate": 9.56440960710806e-05, "loss": 0.164863920211792, "memory(GiB)": 122.96, "step": 8780, "token_acc": 0.942070512021926, "train_speed(iter/s)": 0.242101 }, { "epoch": 0.6696394542266941, "grad_norm": 0.5897040367126465, "learning_rate": 9.563920691800706e-05, "loss": 0.21215009689331055, "memory(GiB)": 122.96, "step": 8785, "token_acc": 0.8963181148748159, "train_speed(iter/s)": 0.242147 }, { "epoch": 0.670020580836954, "grad_norm": 0.5419905781745911, "learning_rate": 9.563431514773675e-05, "loss": 0.11364173889160156, "memory(GiB)": 122.96, "step": 8790, "token_acc": 0.9449855947317877, "train_speed(iter/s)": 0.242175 }, { "epoch": 0.670401707447214, "grad_norm": 2.000537395477295, "learning_rate": 9.562942076055026e-05, "loss": 0.203667950630188, "memory(GiB)": 122.96, "step": 8795, "token_acc": 0.9189620758483034, "train_speed(iter/s)": 0.242221 }, { "epoch": 0.6707828340574739, "grad_norm": 1.3072203397750854, "learning_rate": 9.562452375672823e-05, "loss": 0.22212111949920654, "memory(GiB)": 122.96, "step": 8800, "token_acc": 0.9124151140518892, "train_speed(iter/s)": 0.242253 }, { "epoch": 0.6707828340574739, "eval_loss": 0.13469699025154114, "eval_runtime": 175.1444, "eval_samples_per_second": 3.026, "eval_steps_per_second": 3.026, "eval_token_acc": 0.937142340822842, "step": 8800 }, { "epoch": 0.6711639606677339, "grad_norm": 2.536484956741333, "learning_rate": 9.56196241365515e-05, "loss": 0.1705387830734253, "memory(GiB)": 122.96, "step": 8805, "token_acc": 0.9369145241777803, "train_speed(iter/s)": 0.241124 }, { "epoch": 0.6715450872779938, "grad_norm": 1.401249647140503, "learning_rate": 9.561472190030102e-05, "loss": 0.17152541875839233, "memory(GiB)": 122.96, "step": 8810, "token_acc": 0.9259558427571352, "train_speed(iter/s)": 0.241163 }, { "epoch": 0.6719262138882537, "grad_norm": 0.8801302313804626, "learning_rate": 9.560981704825791e-05, "loss": 0.141107439994812, "memory(GiB)": 122.96, "step": 8815, "token_acc": 0.9467579387716296, "train_speed(iter/s)": 0.24119 }, { "epoch": 0.6723073404985136, "grad_norm": 0.912456750869751, "learning_rate": 9.560490958070346e-05, "loss": 0.17345608472824098, "memory(GiB)": 122.96, "step": 8820, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.241228 }, { "epoch": 0.6726884671087735, "grad_norm": 1.5818183422088623, "learning_rate": 9.559999949791907e-05, "loss": 0.22027831077575682, "memory(GiB)": 122.96, "step": 8825, "token_acc": 0.9044585987261147, "train_speed(iter/s)": 0.241268 }, { "epoch": 0.6730695937190334, "grad_norm": 0.6276481747627258, "learning_rate": 9.559508680018632e-05, "loss": 0.1497356653213501, "memory(GiB)": 122.96, "step": 8830, "token_acc": 0.9286152587441194, "train_speed(iter/s)": 0.241297 }, { "epoch": 0.6734507203292934, "grad_norm": 1.1506242752075195, "learning_rate": 9.559017148778693e-05, "loss": 0.21606216430664063, "memory(GiB)": 122.96, "step": 8835, "token_acc": 0.918335635359116, "train_speed(iter/s)": 0.241327 }, { "epoch": 0.6738318469395533, "grad_norm": 0.6084758043289185, "learning_rate": 9.558525356100276e-05, "loss": 0.13313487768173218, "memory(GiB)": 122.96, "step": 8840, "token_acc": 0.9542372881355933, "train_speed(iter/s)": 0.241367 }, { "epoch": 0.6742129735498132, "grad_norm": 1.8422175645828247, "learning_rate": 9.558033302011584e-05, "loss": 0.21463360786437988, "memory(GiB)": 122.96, "step": 8845, "token_acc": 0.9329668005920914, "train_speed(iter/s)": 0.241399 }, { "epoch": 0.6745941001600732, "grad_norm": 0.6583683490753174, "learning_rate": 9.557540986540836e-05, "loss": 0.22564201354980468, "memory(GiB)": 122.96, "step": 8850, "token_acc": 0.9087281795511222, "train_speed(iter/s)": 0.241438 }, { "epoch": 0.6749752267703331, "grad_norm": 0.7961545586585999, "learning_rate": 9.55704840971626e-05, "loss": 0.16976985931396485, "memory(GiB)": 122.96, "step": 8855, "token_acc": 0.9276350288411117, "train_speed(iter/s)": 0.241473 }, { "epoch": 0.6753563533805931, "grad_norm": 1.2208166122436523, "learning_rate": 9.556555571566105e-05, "loss": 0.16795276403427123, "memory(GiB)": 122.96, "step": 8860, "token_acc": 0.9380205306992059, "train_speed(iter/s)": 0.241502 }, { "epoch": 0.675737479990853, "grad_norm": 0.6625256538391113, "learning_rate": 9.556062472118635e-05, "loss": 0.16901130676269532, "memory(GiB)": 122.96, "step": 8865, "token_acc": 0.9376804211241297, "train_speed(iter/s)": 0.241518 }, { "epoch": 0.6761186066011129, "grad_norm": 1.3168303966522217, "learning_rate": 9.555569111402123e-05, "loss": 0.18368821144104003, "memory(GiB)": 122.96, "step": 8870, "token_acc": 0.9378407851690295, "train_speed(iter/s)": 0.24153 }, { "epoch": 0.6764997332113728, "grad_norm": 1.4147961139678955, "learning_rate": 9.555075489444865e-05, "loss": 0.19685887098312377, "memory(GiB)": 122.96, "step": 8875, "token_acc": 0.9314310051107325, "train_speed(iter/s)": 0.241569 }, { "epoch": 0.6768808598216327, "grad_norm": 0.6231678128242493, "learning_rate": 9.554581606275164e-05, "loss": 0.126328444480896, "memory(GiB)": 122.96, "step": 8880, "token_acc": 0.9509788060184436, "train_speed(iter/s)": 0.241586 }, { "epoch": 0.6772619864318927, "grad_norm": 1.1861693859100342, "learning_rate": 9.554087461921344e-05, "loss": 0.20332281589508056, "memory(GiB)": 122.96, "step": 8885, "token_acc": 0.9291544740489758, "train_speed(iter/s)": 0.241612 }, { "epoch": 0.6776431130421526, "grad_norm": 1.1347607374191284, "learning_rate": 9.553593056411741e-05, "loss": 0.16985955238342285, "memory(GiB)": 122.96, "step": 8890, "token_acc": 0.9355909694555112, "train_speed(iter/s)": 0.241656 }, { "epoch": 0.6780242396524125, "grad_norm": 1.5179601907730103, "learning_rate": 9.553098389774708e-05, "loss": 0.178075110912323, "memory(GiB)": 122.96, "step": 8895, "token_acc": 0.9278499278499278, "train_speed(iter/s)": 0.241694 }, { "epoch": 0.6784053662626724, "grad_norm": 1.1453924179077148, "learning_rate": 9.552603462038611e-05, "loss": 0.1451705813407898, "memory(GiB)": 122.96, "step": 8900, "token_acc": 0.9355423672931091, "train_speed(iter/s)": 0.241717 }, { "epoch": 0.6787864928729324, "grad_norm": 1.2617045640945435, "learning_rate": 9.552108273231832e-05, "loss": 0.14203604459762573, "memory(GiB)": 122.96, "step": 8905, "token_acc": 0.9513624377380604, "train_speed(iter/s)": 0.241758 }, { "epoch": 0.6791676194831923, "grad_norm": 1.0436880588531494, "learning_rate": 9.551612823382769e-05, "loss": 0.15114080905914307, "memory(GiB)": 122.96, "step": 8910, "token_acc": 0.9223436410137177, "train_speed(iter/s)": 0.241798 }, { "epoch": 0.6795487460934523, "grad_norm": 0.3451308608055115, "learning_rate": 9.551117112519832e-05, "loss": 0.13928529024124145, "memory(GiB)": 122.96, "step": 8915, "token_acc": 0.9386036403151318, "train_speed(iter/s)": 0.241843 }, { "epoch": 0.6799298727037122, "grad_norm": 0.6288059949874878, "learning_rate": 9.55062114067145e-05, "loss": 0.1583176851272583, "memory(GiB)": 122.96, "step": 8920, "token_acc": 0.9367149758454106, "train_speed(iter/s)": 0.241877 }, { "epoch": 0.6803109993139721, "grad_norm": 0.8597826361656189, "learning_rate": 9.55012490786606e-05, "loss": 0.14547147750854492, "memory(GiB)": 122.96, "step": 8925, "token_acc": 0.9392673074244323, "train_speed(iter/s)": 0.241892 }, { "epoch": 0.680692125924232, "grad_norm": 0.6857344508171082, "learning_rate": 9.549628414132124e-05, "loss": 0.10918498039245605, "memory(GiB)": 122.96, "step": 8930, "token_acc": 0.9560311929649908, "train_speed(iter/s)": 0.241916 }, { "epoch": 0.6810732525344919, "grad_norm": 1.0298407077789307, "learning_rate": 9.549131659498109e-05, "loss": 0.1594465970993042, "memory(GiB)": 122.96, "step": 8935, "token_acc": 0.9315220404768506, "train_speed(iter/s)": 0.241957 }, { "epoch": 0.6814543791447519, "grad_norm": 1.1644285917282104, "learning_rate": 9.548634643992507e-05, "loss": 0.19863269329071045, "memory(GiB)": 122.96, "step": 8940, "token_acc": 0.9338138925294889, "train_speed(iter/s)": 0.241983 }, { "epoch": 0.6818355057550118, "grad_norm": 0.5354268550872803, "learning_rate": 9.548137367643814e-05, "loss": 0.13427571058273316, "memory(GiB)": 122.96, "step": 8945, "token_acc": 0.959456056408965, "train_speed(iter/s)": 0.242017 }, { "epoch": 0.6822166323652717, "grad_norm": 4.319709300994873, "learning_rate": 9.54763983048055e-05, "loss": 0.11590391397476196, "memory(GiB)": 122.96, "step": 8950, "token_acc": 0.9361313868613139, "train_speed(iter/s)": 0.242061 }, { "epoch": 0.6825977589755317, "grad_norm": 0.9975879788398743, "learning_rate": 9.547142032531245e-05, "loss": 0.22108159065246583, "memory(GiB)": 122.96, "step": 8955, "token_acc": 0.9218507859198584, "train_speed(iter/s)": 0.242092 }, { "epoch": 0.6829788855857916, "grad_norm": 1.5780223608016968, "learning_rate": 9.546643973824444e-05, "loss": 0.2179497718811035, "memory(GiB)": 122.96, "step": 8960, "token_acc": 0.9248660655951915, "train_speed(iter/s)": 0.242106 }, { "epoch": 0.6833600121960516, "grad_norm": 1.0401691198349, "learning_rate": 9.546145654388714e-05, "loss": 0.18097102642059326, "memory(GiB)": 122.96, "step": 8965, "token_acc": 0.9299655568312285, "train_speed(iter/s)": 0.242144 }, { "epoch": 0.6837411388063115, "grad_norm": 0.8187171816825867, "learning_rate": 9.545647074252625e-05, "loss": 0.21135964393615722, "memory(GiB)": 122.96, "step": 8970, "token_acc": 0.9091251175917215, "train_speed(iter/s)": 0.242176 }, { "epoch": 0.6841222654165714, "grad_norm": 0.7409788966178894, "learning_rate": 9.545148233444771e-05, "loss": 0.18899887800216675, "memory(GiB)": 122.96, "step": 8975, "token_acc": 0.932745909009171, "train_speed(iter/s)": 0.242198 }, { "epoch": 0.6845033920268313, "grad_norm": 1.0722646713256836, "learning_rate": 9.544649131993757e-05, "loss": 0.14227595329284667, "memory(GiB)": 122.96, "step": 8980, "token_acc": 0.9453159041394336, "train_speed(iter/s)": 0.242229 }, { "epoch": 0.6848845186370912, "grad_norm": 0.6809432506561279, "learning_rate": 9.544149769928205e-05, "loss": 0.15148912668228148, "memory(GiB)": 122.96, "step": 8985, "token_acc": 0.943134229667181, "train_speed(iter/s)": 0.242264 }, { "epoch": 0.6852656452473511, "grad_norm": 0.8471017479896545, "learning_rate": 9.543650147276753e-05, "loss": 0.15777790546417236, "memory(GiB)": 122.96, "step": 8990, "token_acc": 0.9369369369369369, "train_speed(iter/s)": 0.242278 }, { "epoch": 0.6856467718576111, "grad_norm": 1.049741268157959, "learning_rate": 9.54315026406805e-05, "loss": 0.18837494850158693, "memory(GiB)": 122.96, "step": 8995, "token_acc": 0.9266227657572906, "train_speed(iter/s)": 0.242324 }, { "epoch": 0.686027898467871, "grad_norm": 0.9582337737083435, "learning_rate": 9.542650120330761e-05, "loss": 0.16907622814178466, "memory(GiB)": 122.96, "step": 9000, "token_acc": 0.9318869828456104, "train_speed(iter/s)": 0.242353 }, { "epoch": 0.686027898467871, "eval_loss": 0.12857121229171753, "eval_runtime": 175.5949, "eval_samples_per_second": 3.018, "eval_steps_per_second": 3.018, "eval_token_acc": 0.9386181555327993, "step": 9000 }, { "epoch": 0.686409025078131, "grad_norm": 0.5060502886772156, "learning_rate": 9.542149716093568e-05, "loss": 0.16513874530792236, "memory(GiB)": 122.96, "step": 9005, "token_acc": 0.938493368308272, "train_speed(iter/s)": 0.241252 }, { "epoch": 0.6867901516883909, "grad_norm": 1.1009718179702759, "learning_rate": 9.541649051385167e-05, "loss": 0.18938639163970947, "memory(GiB)": 122.96, "step": 9010, "token_acc": 0.9269157349331879, "train_speed(iter/s)": 0.241293 }, { "epoch": 0.6871712782986508, "grad_norm": 0.732887327671051, "learning_rate": 9.541148126234269e-05, "loss": 0.11380200386047364, "memory(GiB)": 122.96, "step": 9015, "token_acc": 0.9514489069649212, "train_speed(iter/s)": 0.241334 }, { "epoch": 0.6875524049089108, "grad_norm": 0.22346508502960205, "learning_rate": 9.5406469406696e-05, "loss": 0.14297108650207518, "memory(GiB)": 122.96, "step": 9020, "token_acc": 0.9321570576540755, "train_speed(iter/s)": 0.241371 }, { "epoch": 0.6879335315191707, "grad_norm": 1.7149039506912231, "learning_rate": 9.5401454947199e-05, "loss": 0.1342632293701172, "memory(GiB)": 122.96, "step": 9025, "token_acc": 0.9445718654434251, "train_speed(iter/s)": 0.241416 }, { "epoch": 0.6883146581294306, "grad_norm": 0.9419713020324707, "learning_rate": 9.539643788413923e-05, "loss": 0.20138678550720215, "memory(GiB)": 122.96, "step": 9030, "token_acc": 0.9299401197604791, "train_speed(iter/s)": 0.241428 }, { "epoch": 0.6886957847396905, "grad_norm": 0.6921876072883606, "learning_rate": 9.539141821780444e-05, "loss": 0.1344299793243408, "memory(GiB)": 122.96, "step": 9035, "token_acc": 0.9403168743270266, "train_speed(iter/s)": 0.241458 }, { "epoch": 0.6890769113499504, "grad_norm": 0.8287458419799805, "learning_rate": 9.538639594848244e-05, "loss": 0.1378612756729126, "memory(GiB)": 122.96, "step": 9040, "token_acc": 0.9367967440746947, "train_speed(iter/s)": 0.241498 }, { "epoch": 0.6894580379602104, "grad_norm": 0.5757372975349426, "learning_rate": 9.538137107646125e-05, "loss": 0.1407497763633728, "memory(GiB)": 122.96, "step": 9045, "token_acc": 0.9246487867177522, "train_speed(iter/s)": 0.241536 }, { "epoch": 0.6898391645704703, "grad_norm": 0.7121516466140747, "learning_rate": 9.537634360202903e-05, "loss": 0.16111423969268798, "memory(GiB)": 122.96, "step": 9050, "token_acc": 0.9290633608815427, "train_speed(iter/s)": 0.241592 }, { "epoch": 0.6902202911807302, "grad_norm": 0.5916762351989746, "learning_rate": 9.537131352547409e-05, "loss": 0.17431943416595458, "memory(GiB)": 122.96, "step": 9055, "token_acc": 0.9391008174386921, "train_speed(iter/s)": 0.241602 }, { "epoch": 0.6906014177909902, "grad_norm": 1.150350570678711, "learning_rate": 9.536628084708483e-05, "loss": 0.2074373960494995, "memory(GiB)": 122.96, "step": 9060, "token_acc": 0.9168180167090447, "train_speed(iter/s)": 0.241646 }, { "epoch": 0.6909825444012501, "grad_norm": 0.9240394830703735, "learning_rate": 9.536124556714992e-05, "loss": 0.1914812445640564, "memory(GiB)": 122.96, "step": 9065, "token_acc": 0.9369243091710052, "train_speed(iter/s)": 0.241668 }, { "epoch": 0.69136367101151, "grad_norm": 0.9513451457023621, "learning_rate": 9.535620768595807e-05, "loss": 0.1695890784263611, "memory(GiB)": 122.96, "step": 9070, "token_acc": 0.9394941634241245, "train_speed(iter/s)": 0.2417 }, { "epoch": 0.69174479762177, "grad_norm": 1.1648385524749756, "learning_rate": 9.535116720379819e-05, "loss": 0.1434258460998535, "memory(GiB)": 122.96, "step": 9075, "token_acc": 0.9369349005424955, "train_speed(iter/s)": 0.241738 }, { "epoch": 0.6921259242320299, "grad_norm": 0.8391145467758179, "learning_rate": 9.534612412095931e-05, "loss": 0.17801239490509033, "memory(GiB)": 122.96, "step": 9080, "token_acc": 0.9343617962071338, "train_speed(iter/s)": 0.241742 }, { "epoch": 0.6925070508422898, "grad_norm": 0.9537278413772583, "learning_rate": 9.534107843773066e-05, "loss": 0.1734191060066223, "memory(GiB)": 122.96, "step": 9085, "token_acc": 0.9314606741573034, "train_speed(iter/s)": 0.241783 }, { "epoch": 0.6928881774525497, "grad_norm": 0.7163766026496887, "learning_rate": 9.533603015440158e-05, "loss": 0.16781280040740967, "memory(GiB)": 122.96, "step": 9090, "token_acc": 0.9270752521334368, "train_speed(iter/s)": 0.241831 }, { "epoch": 0.6932693040628096, "grad_norm": 1.8151538372039795, "learning_rate": 9.533097927126153e-05, "loss": 0.21151726245880126, "memory(GiB)": 122.96, "step": 9095, "token_acc": 0.926256254078747, "train_speed(iter/s)": 0.241864 }, { "epoch": 0.6936504306730696, "grad_norm": 0.7945890426635742, "learning_rate": 9.53259257886002e-05, "loss": 0.12327859401702881, "memory(GiB)": 122.96, "step": 9100, "token_acc": 0.9404814004376367, "train_speed(iter/s)": 0.241915 }, { "epoch": 0.6940315572833295, "grad_norm": 1.2090681791305542, "learning_rate": 9.532086970670736e-05, "loss": 0.21699943542480468, "memory(GiB)": 122.96, "step": 9105, "token_acc": 0.9195285215366705, "train_speed(iter/s)": 0.241928 }, { "epoch": 0.6944126838935895, "grad_norm": 0.818718433380127, "learning_rate": 9.531581102587294e-05, "loss": 0.1805219054222107, "memory(GiB)": 122.96, "step": 9110, "token_acc": 0.9303778637310325, "train_speed(iter/s)": 0.241969 }, { "epoch": 0.6947938105038494, "grad_norm": 0.670319139957428, "learning_rate": 9.531074974638708e-05, "loss": 0.12614606618881224, "memory(GiB)": 122.96, "step": 9115, "token_acc": 0.9405272838749233, "train_speed(iter/s)": 0.241991 }, { "epoch": 0.6951749371141093, "grad_norm": 0.38572072982788086, "learning_rate": 9.530568586853996e-05, "loss": 0.15102009773254393, "memory(GiB)": 122.96, "step": 9120, "token_acc": 0.9331191002209279, "train_speed(iter/s)": 0.242016 }, { "epoch": 0.6955560637243693, "grad_norm": 0.7704436182975769, "learning_rate": 9.5300619392622e-05, "loss": 0.15617527961730956, "memory(GiB)": 122.96, "step": 9125, "token_acc": 0.9230597181838515, "train_speed(iter/s)": 0.242055 }, { "epoch": 0.6959371903346292, "grad_norm": 0.34445956349372864, "learning_rate": 9.529555031892376e-05, "loss": 0.2003537654876709, "memory(GiB)": 122.96, "step": 9130, "token_acc": 0.9003436426116839, "train_speed(iter/s)": 0.242098 }, { "epoch": 0.6963183169448891, "grad_norm": 1.1476998329162598, "learning_rate": 9.52904786477359e-05, "loss": 0.178894305229187, "memory(GiB)": 122.96, "step": 9135, "token_acc": 0.9200308562612497, "train_speed(iter/s)": 0.242136 }, { "epoch": 0.696699443555149, "grad_norm": 0.6060994267463684, "learning_rate": 9.528540437934925e-05, "loss": 0.13092939853668212, "memory(GiB)": 122.96, "step": 9140, "token_acc": 0.9360399183488319, "train_speed(iter/s)": 0.242173 }, { "epoch": 0.6970805701654089, "grad_norm": 1.7710938453674316, "learning_rate": 9.528032751405483e-05, "loss": 0.1916312336921692, "memory(GiB)": 122.96, "step": 9145, "token_acc": 0.9309944911568571, "train_speed(iter/s)": 0.242213 }, { "epoch": 0.6974616967756688, "grad_norm": 0.5953335762023926, "learning_rate": 9.527524805214374e-05, "loss": 0.1354650378227234, "memory(GiB)": 122.96, "step": 9150, "token_acc": 0.9455841699403463, "train_speed(iter/s)": 0.242239 }, { "epoch": 0.6978428233859288, "grad_norm": 0.8541198968887329, "learning_rate": 9.527016599390727e-05, "loss": 0.1727538824081421, "memory(GiB)": 122.96, "step": 9155, "token_acc": 0.9399711861693613, "train_speed(iter/s)": 0.242265 }, { "epoch": 0.6982239499961888, "grad_norm": 1.7090308666229248, "learning_rate": 9.526508133963688e-05, "loss": 0.147752046585083, "memory(GiB)": 122.96, "step": 9160, "token_acc": 0.9479127491538172, "train_speed(iter/s)": 0.242294 }, { "epoch": 0.6986050766064487, "grad_norm": 1.2373900413513184, "learning_rate": 9.52599940896241e-05, "loss": 0.16596381664276122, "memory(GiB)": 122.96, "step": 9165, "token_acc": 0.9392314566577301, "train_speed(iter/s)": 0.242318 }, { "epoch": 0.6989862032167086, "grad_norm": 0.767850399017334, "learning_rate": 9.525490424416072e-05, "loss": 0.18876746892929078, "memory(GiB)": 122.96, "step": 9170, "token_acc": 0.935822200024972, "train_speed(iter/s)": 0.242327 }, { "epoch": 0.6993673298269685, "grad_norm": 0.9806342720985413, "learning_rate": 9.524981180353859e-05, "loss": 0.27291035652160645, "memory(GiB)": 122.96, "step": 9175, "token_acc": 0.9012487676634899, "train_speed(iter/s)": 0.242347 }, { "epoch": 0.6997484564372285, "grad_norm": 1.3313506841659546, "learning_rate": 9.524471676804971e-05, "loss": 0.13511433601379394, "memory(GiB)": 122.96, "step": 9180, "token_acc": 0.9422304603027495, "train_speed(iter/s)": 0.242389 }, { "epoch": 0.7001295830474884, "grad_norm": 0.5960055589675903, "learning_rate": 9.52396191379863e-05, "loss": 0.13694289922714234, "memory(GiB)": 122.96, "step": 9185, "token_acc": 0.9469122426868906, "train_speed(iter/s)": 0.24242 }, { "epoch": 0.7005107096577483, "grad_norm": 0.5328623652458191, "learning_rate": 9.523451891364068e-05, "loss": 0.1686814546585083, "memory(GiB)": 122.96, "step": 9190, "token_acc": 0.9456869009584664, "train_speed(iter/s)": 0.24246 }, { "epoch": 0.7008918362680082, "grad_norm": 0.9396845698356628, "learning_rate": 9.52294160953053e-05, "loss": 0.19919424057006835, "memory(GiB)": 122.96, "step": 9195, "token_acc": 0.9125506072874494, "train_speed(iter/s)": 0.242497 }, { "epoch": 0.7012729628782681, "grad_norm": 0.6883851289749146, "learning_rate": 9.52243106832728e-05, "loss": 0.15298905372619628, "memory(GiB)": 122.96, "step": 9200, "token_acc": 0.9370526076582206, "train_speed(iter/s)": 0.242501 }, { "epoch": 0.7012729628782681, "eval_loss": 0.13086062669754028, "eval_runtime": 185.3798, "eval_samples_per_second": 2.859, "eval_steps_per_second": 2.859, "eval_token_acc": 0.9381739051864345, "step": 9200 }, { "epoch": 0.7016540894885281, "grad_norm": 0.9017201662063599, "learning_rate": 9.521920267783595e-05, "loss": 0.1146062970161438, "memory(GiB)": 122.96, "step": 9205, "token_acc": 0.9388045506875371, "train_speed(iter/s)": 0.241322 }, { "epoch": 0.702035216098788, "grad_norm": 0.9677888751029968, "learning_rate": 9.521409207928768e-05, "loss": 0.17072107791900634, "memory(GiB)": 122.96, "step": 9210, "token_acc": 0.9385238358181484, "train_speed(iter/s)": 0.241354 }, { "epoch": 0.702416342709048, "grad_norm": 0.6450486183166504, "learning_rate": 9.520897888792104e-05, "loss": 0.17181146144866943, "memory(GiB)": 122.96, "step": 9215, "token_acc": 0.9460006224712108, "train_speed(iter/s)": 0.241376 }, { "epoch": 0.7027974693193079, "grad_norm": 1.513820767402649, "learning_rate": 9.520386310402925e-05, "loss": 0.1502668857574463, "memory(GiB)": 122.96, "step": 9220, "token_acc": 0.9331018095412174, "train_speed(iter/s)": 0.241407 }, { "epoch": 0.7031785959295678, "grad_norm": 0.9984436631202698, "learning_rate": 9.519874472790569e-05, "loss": 0.13592061996459961, "memory(GiB)": 122.96, "step": 9225, "token_acc": 0.9400906735751295, "train_speed(iter/s)": 0.241426 }, { "epoch": 0.7035597225398277, "grad_norm": 1.0146212577819824, "learning_rate": 9.519362375984386e-05, "loss": 0.20855324268341063, "memory(GiB)": 122.96, "step": 9230, "token_acc": 0.9197015455675964, "train_speed(iter/s)": 0.241445 }, { "epoch": 0.7039408491500877, "grad_norm": 0.7764794826507568, "learning_rate": 9.518850020013744e-05, "loss": 0.14185711145401, "memory(GiB)": 122.96, "step": 9235, "token_acc": 0.9375239555385205, "train_speed(iter/s)": 0.241488 }, { "epoch": 0.7043219757603476, "grad_norm": 1.4391552209854126, "learning_rate": 9.518337404908022e-05, "loss": 0.14879271984100342, "memory(GiB)": 122.96, "step": 9240, "token_acc": 0.9346200660754651, "train_speed(iter/s)": 0.24151 }, { "epoch": 0.7047031023706075, "grad_norm": 0.967044472694397, "learning_rate": 9.517824530696619e-05, "loss": 0.131601881980896, "memory(GiB)": 122.96, "step": 9245, "token_acc": 0.9465505062396986, "train_speed(iter/s)": 0.241534 }, { "epoch": 0.7050842289808674, "grad_norm": 0.6186357736587524, "learning_rate": 9.517311397408945e-05, "loss": 0.10536360740661621, "memory(GiB)": 122.96, "step": 9250, "token_acc": 0.9493827160493827, "train_speed(iter/s)": 0.241563 }, { "epoch": 0.7054653555911273, "grad_norm": 0.4821854531764984, "learning_rate": 9.516798005074423e-05, "loss": 0.22609519958496094, "memory(GiB)": 122.96, "step": 9255, "token_acc": 0.9139840775354794, "train_speed(iter/s)": 0.241589 }, { "epoch": 0.7058464822013873, "grad_norm": 0.4596918523311615, "learning_rate": 9.516284353722498e-05, "loss": 0.123163902759552, "memory(GiB)": 122.96, "step": 9260, "token_acc": 0.9469420818144998, "train_speed(iter/s)": 0.241613 }, { "epoch": 0.7062276088116473, "grad_norm": 0.7561137080192566, "learning_rate": 9.515770443382622e-05, "loss": 0.12428357601165771, "memory(GiB)": 122.96, "step": 9265, "token_acc": 0.9425622849994788, "train_speed(iter/s)": 0.241625 }, { "epoch": 0.7066087354219072, "grad_norm": 0.995267391204834, "learning_rate": 9.515256274084268e-05, "loss": 0.1576755166053772, "memory(GiB)": 122.96, "step": 9270, "token_acc": 0.9382673942701227, "train_speed(iter/s)": 0.241662 }, { "epoch": 0.7069898620321671, "grad_norm": 0.7541844248771667, "learning_rate": 9.514741845856918e-05, "loss": 0.2116389274597168, "memory(GiB)": 122.96, "step": 9275, "token_acc": 0.9186567164179105, "train_speed(iter/s)": 0.241683 }, { "epoch": 0.707370988642427, "grad_norm": 0.6140597462654114, "learning_rate": 9.514227158730076e-05, "loss": 0.07625975012779236, "memory(GiB)": 122.96, "step": 9280, "token_acc": 0.9641943734015346, "train_speed(iter/s)": 0.241731 }, { "epoch": 0.707752115252687, "grad_norm": 2.147630214691162, "learning_rate": 9.513712212733255e-05, "loss": 0.15799405574798583, "memory(GiB)": 122.96, "step": 9285, "token_acc": 0.9294072511030117, "train_speed(iter/s)": 0.241763 }, { "epoch": 0.7081332418629469, "grad_norm": 0.9567639827728271, "learning_rate": 9.513197007895984e-05, "loss": 0.18859422206878662, "memory(GiB)": 122.96, "step": 9290, "token_acc": 0.9298525308888003, "train_speed(iter/s)": 0.241787 }, { "epoch": 0.7085143684732068, "grad_norm": 1.3304308652877808, "learning_rate": 9.512681544247809e-05, "loss": 0.2021576166152954, "memory(GiB)": 122.96, "step": 9295, "token_acc": 0.9238696808510638, "train_speed(iter/s)": 0.241825 }, { "epoch": 0.7088954950834667, "grad_norm": 1.1353472471237183, "learning_rate": 9.512165821818288e-05, "loss": 0.2955919742584229, "memory(GiB)": 122.96, "step": 9300, "token_acc": 0.8938511326860842, "train_speed(iter/s)": 0.241865 }, { "epoch": 0.7092766216937266, "grad_norm": 1.061025857925415, "learning_rate": 9.511649840636997e-05, "loss": 0.13640637397766114, "memory(GiB)": 122.96, "step": 9305, "token_acc": 0.9401926001013685, "train_speed(iter/s)": 0.241902 }, { "epoch": 0.7096577483039865, "grad_norm": 1.5271340608596802, "learning_rate": 9.511133600733524e-05, "loss": 0.14563467502593994, "memory(GiB)": 122.96, "step": 9310, "token_acc": 0.9430226943505553, "train_speed(iter/s)": 0.241952 }, { "epoch": 0.7100388749142466, "grad_norm": 0.812310516834259, "learning_rate": 9.510617102137474e-05, "loss": 0.13970211744308472, "memory(GiB)": 122.96, "step": 9315, "token_acc": 0.9331290052939537, "train_speed(iter/s)": 0.241991 }, { "epoch": 0.7104200015245065, "grad_norm": 1.0962315797805786, "learning_rate": 9.510100344878463e-05, "loss": 0.14172837734222413, "memory(GiB)": 122.96, "step": 9320, "token_acc": 0.941320293398533, "train_speed(iter/s)": 0.242004 }, { "epoch": 0.7108011281347664, "grad_norm": 0.9021138548851013, "learning_rate": 9.509583328986126e-05, "loss": 0.1299859404563904, "memory(GiB)": 122.96, "step": 9325, "token_acc": 0.9495633576902512, "train_speed(iter/s)": 0.242026 }, { "epoch": 0.7111822547450263, "grad_norm": 0.9577571153640747, "learning_rate": 9.509066054490115e-05, "loss": 0.1417333483695984, "memory(GiB)": 122.96, "step": 9330, "token_acc": 0.9496973491964099, "train_speed(iter/s)": 0.242052 }, { "epoch": 0.7115633813552862, "grad_norm": 1.4673559665679932, "learning_rate": 9.508548521420089e-05, "loss": 0.13561688661575316, "memory(GiB)": 122.96, "step": 9335, "token_acc": 0.9363571598195203, "train_speed(iter/s)": 0.242091 }, { "epoch": 0.7119445079655462, "grad_norm": 1.1084150075912476, "learning_rate": 9.508030729805728e-05, "loss": 0.20392985343933107, "memory(GiB)": 122.96, "step": 9340, "token_acc": 0.9213788932567282, "train_speed(iter/s)": 0.242129 }, { "epoch": 0.7123256345758061, "grad_norm": 0.8382878303527832, "learning_rate": 9.507512679676724e-05, "loss": 0.252334451675415, "memory(GiB)": 122.96, "step": 9345, "token_acc": 0.8906018381262971, "train_speed(iter/s)": 0.242166 }, { "epoch": 0.712706761186066, "grad_norm": 1.254881739616394, "learning_rate": 9.506994371062787e-05, "loss": 0.1874903917312622, "memory(GiB)": 122.96, "step": 9350, "token_acc": 0.9323593073593074, "train_speed(iter/s)": 0.242215 }, { "epoch": 0.7130878877963259, "grad_norm": 1.7376272678375244, "learning_rate": 9.506475803993635e-05, "loss": 0.19393815994262695, "memory(GiB)": 122.96, "step": 9355, "token_acc": 0.9255390086685931, "train_speed(iter/s)": 0.242249 }, { "epoch": 0.7134690144065858, "grad_norm": 0.9269551634788513, "learning_rate": 9.50595697849901e-05, "loss": 0.13960931301116944, "memory(GiB)": 122.96, "step": 9360, "token_acc": 0.939868804664723, "train_speed(iter/s)": 0.242295 }, { "epoch": 0.7138501410168459, "grad_norm": 0.7581728100776672, "learning_rate": 9.505437894608662e-05, "loss": 0.15177395343780517, "memory(GiB)": 122.96, "step": 9365, "token_acc": 0.9414946619217082, "train_speed(iter/s)": 0.242303 }, { "epoch": 0.7142312676271058, "grad_norm": 1.1978108882904053, "learning_rate": 9.504918552352359e-05, "loss": 0.14161267280578613, "memory(GiB)": 122.96, "step": 9370, "token_acc": 0.9457682826622843, "train_speed(iter/s)": 0.242333 }, { "epoch": 0.7146123942373657, "grad_norm": 1.2492308616638184, "learning_rate": 9.50439895175988e-05, "loss": 0.152770471572876, "memory(GiB)": 122.96, "step": 9375, "token_acc": 0.9309632079371641, "train_speed(iter/s)": 0.242363 }, { "epoch": 0.7149935208476256, "grad_norm": 0.9446655511856079, "learning_rate": 9.503879092861028e-05, "loss": 0.13427114486694336, "memory(GiB)": 122.96, "step": 9380, "token_acc": 0.9409550693954364, "train_speed(iter/s)": 0.242393 }, { "epoch": 0.7153746474578855, "grad_norm": 0.9060647487640381, "learning_rate": 9.503358975685607e-05, "loss": 0.16234104633331298, "memory(GiB)": 122.96, "step": 9385, "token_acc": 0.9401289009497965, "train_speed(iter/s)": 0.242418 }, { "epoch": 0.7157557740681454, "grad_norm": 1.012534499168396, "learning_rate": 9.502838600263449e-05, "loss": 0.22716121673583983, "memory(GiB)": 122.96, "step": 9390, "token_acc": 0.9105678712420285, "train_speed(iter/s)": 0.242433 }, { "epoch": 0.7161369006784054, "grad_norm": 2.1363651752471924, "learning_rate": 9.502317966624393e-05, "loss": 0.16765108108520507, "memory(GiB)": 122.96, "step": 9395, "token_acc": 0.9295953547450554, "train_speed(iter/s)": 0.242458 }, { "epoch": 0.7165180272886653, "grad_norm": 0.9273198246955872, "learning_rate": 9.501797074798295e-05, "loss": 0.1417910099029541, "memory(GiB)": 122.96, "step": 9400, "token_acc": 0.947547974413646, "train_speed(iter/s)": 0.242469 }, { "epoch": 0.7165180272886653, "eval_loss": 0.12554779648780823, "eval_runtime": 173.7211, "eval_samples_per_second": 3.051, "eval_steps_per_second": 3.051, "eval_token_acc": 0.9396798385639419, "step": 9400 }, { "epoch": 0.7168991538989252, "grad_norm": 0.9080906510353088, "learning_rate": 9.501275924815025e-05, "loss": 0.16858606338500975, "memory(GiB)": 122.96, "step": 9405, "token_acc": 0.93959511398561, "train_speed(iter/s)": 0.241409 }, { "epoch": 0.7172802805091851, "grad_norm": 0.6607199311256409, "learning_rate": 9.50075451670447e-05, "loss": 0.1373058080673218, "memory(GiB)": 122.96, "step": 9410, "token_acc": 0.9470670568774653, "train_speed(iter/s)": 0.241424 }, { "epoch": 0.717661407119445, "grad_norm": 0.7100223302841187, "learning_rate": 9.500232850496528e-05, "loss": 0.20568475723266602, "memory(GiB)": 122.96, "step": 9415, "token_acc": 0.9378869170449855, "train_speed(iter/s)": 0.24145 }, { "epoch": 0.7180425337297051, "grad_norm": 1.002092957496643, "learning_rate": 9.499710926221116e-05, "loss": 0.11031844615936279, "memory(GiB)": 122.96, "step": 9420, "token_acc": 0.9440449438202247, "train_speed(iter/s)": 0.241478 }, { "epoch": 0.718423660339965, "grad_norm": 0.7908276915550232, "learning_rate": 9.499188743908164e-05, "loss": 0.154939603805542, "memory(GiB)": 122.96, "step": 9425, "token_acc": 0.936569895096365, "train_speed(iter/s)": 0.241508 }, { "epoch": 0.7188047869502249, "grad_norm": 0.7121626138687134, "learning_rate": 9.498666303587616e-05, "loss": 0.16566227674484252, "memory(GiB)": 122.96, "step": 9430, "token_acc": 0.9372707263389581, "train_speed(iter/s)": 0.241552 }, { "epoch": 0.7191859135604848, "grad_norm": 0.6913105249404907, "learning_rate": 9.498143605289433e-05, "loss": 0.23364462852478027, "memory(GiB)": 122.96, "step": 9435, "token_acc": 0.9195697919102175, "train_speed(iter/s)": 0.241581 }, { "epoch": 0.7195670401707447, "grad_norm": 0.8680779933929443, "learning_rate": 9.497620649043587e-05, "loss": 0.1495967388153076, "memory(GiB)": 122.96, "step": 9440, "token_acc": 0.936176935229068, "train_speed(iter/s)": 0.241621 }, { "epoch": 0.7199481667810046, "grad_norm": 1.2803620100021362, "learning_rate": 9.497097434880069e-05, "loss": 0.1381125569343567, "memory(GiB)": 122.96, "step": 9445, "token_acc": 0.9335423197492163, "train_speed(iter/s)": 0.241657 }, { "epoch": 0.7203292933912646, "grad_norm": 0.8265624642372131, "learning_rate": 9.496573962828881e-05, "loss": 0.16256260871887207, "memory(GiB)": 122.96, "step": 9450, "token_acc": 0.9412811387900356, "train_speed(iter/s)": 0.241689 }, { "epoch": 0.7207104200015245, "grad_norm": 0.7400648593902588, "learning_rate": 9.496050232920044e-05, "loss": 0.20339486598968506, "memory(GiB)": 122.96, "step": 9455, "token_acc": 0.8913672036348617, "train_speed(iter/s)": 0.241734 }, { "epoch": 0.7210915466117844, "grad_norm": 0.9505528211593628, "learning_rate": 9.49552624518359e-05, "loss": 0.15558651685714722, "memory(GiB)": 122.96, "step": 9460, "token_acc": 0.9371092313350496, "train_speed(iter/s)": 0.241777 }, { "epoch": 0.7214726732220443, "grad_norm": 1.6561814546585083, "learning_rate": 9.495001999649569e-05, "loss": 0.1974816679954529, "memory(GiB)": 122.96, "step": 9465, "token_acc": 0.9179174484052532, "train_speed(iter/s)": 0.24181 }, { "epoch": 0.7218537998323042, "grad_norm": 0.7471816539764404, "learning_rate": 9.49447749634804e-05, "loss": 0.15923697948455812, "memory(GiB)": 122.96, "step": 9470, "token_acc": 0.9396566094349058, "train_speed(iter/s)": 0.241831 }, { "epoch": 0.7222349264425643, "grad_norm": 1.4712625741958618, "learning_rate": 9.493952735309085e-05, "loss": 0.1591506004333496, "memory(GiB)": 122.96, "step": 9475, "token_acc": 0.9229904440697021, "train_speed(iter/s)": 0.241872 }, { "epoch": 0.7226160530528242, "grad_norm": 1.2755671739578247, "learning_rate": 9.493427716562796e-05, "loss": 0.21880991458892823, "memory(GiB)": 122.96, "step": 9480, "token_acc": 0.9199611147116008, "train_speed(iter/s)": 0.241911 }, { "epoch": 0.7229971796630841, "grad_norm": 0.5972647666931152, "learning_rate": 9.492902440139277e-05, "loss": 0.1862488269805908, "memory(GiB)": 122.96, "step": 9485, "token_acc": 0.927975196756499, "train_speed(iter/s)": 0.241949 }, { "epoch": 0.723378306273344, "grad_norm": 0.6670304536819458, "learning_rate": 9.492376906068654e-05, "loss": 0.1362619400024414, "memory(GiB)": 122.96, "step": 9490, "token_acc": 0.955973972033781, "train_speed(iter/s)": 0.241959 }, { "epoch": 0.7237594328836039, "grad_norm": 1.2728389501571655, "learning_rate": 9.491851114381063e-05, "loss": 0.15555206537246705, "memory(GiB)": 122.96, "step": 9495, "token_acc": 0.9410029498525073, "train_speed(iter/s)": 0.24198 }, { "epoch": 0.7241405594938639, "grad_norm": 1.068898320198059, "learning_rate": 9.491325065106656e-05, "loss": 0.19695690870285035, "memory(GiB)": 122.96, "step": 9500, "token_acc": 0.9226334387204761, "train_speed(iter/s)": 0.241999 }, { "epoch": 0.7245216861041238, "grad_norm": 0.8497505187988281, "learning_rate": 9.490798758275598e-05, "loss": 0.14586708545684815, "memory(GiB)": 122.96, "step": 9505, "token_acc": 0.939119170984456, "train_speed(iter/s)": 0.242019 }, { "epoch": 0.7249028127143837, "grad_norm": 1.040644645690918, "learning_rate": 9.49027219391807e-05, "loss": 0.17108550071716308, "memory(GiB)": 122.96, "step": 9510, "token_acc": 0.9277486910994764, "train_speed(iter/s)": 0.242059 }, { "epoch": 0.7252839393246436, "grad_norm": 0.6566997766494751, "learning_rate": 9.48974537206427e-05, "loss": 0.13256853818893433, "memory(GiB)": 122.96, "step": 9515, "token_acc": 0.9459363957597173, "train_speed(iter/s)": 0.242062 }, { "epoch": 0.7256650659349035, "grad_norm": 0.9990972876548767, "learning_rate": 9.489218292744408e-05, "loss": 0.1183309555053711, "memory(GiB)": 122.96, "step": 9520, "token_acc": 0.9384344766930519, "train_speed(iter/s)": 0.242106 }, { "epoch": 0.7260461925451634, "grad_norm": 1.0642836093902588, "learning_rate": 9.488690955988711e-05, "loss": 0.1532878279685974, "memory(GiB)": 122.96, "step": 9525, "token_acc": 0.9389814597512322, "train_speed(iter/s)": 0.24214 }, { "epoch": 0.7264273191554235, "grad_norm": 1.2906914949417114, "learning_rate": 9.488163361827416e-05, "loss": 0.1754160761833191, "memory(GiB)": 122.96, "step": 9530, "token_acc": 0.9368943512172063, "train_speed(iter/s)": 0.242176 }, { "epoch": 0.7268084457656834, "grad_norm": 1.346134066581726, "learning_rate": 9.48763551029078e-05, "loss": 0.157304847240448, "memory(GiB)": 122.96, "step": 9535, "token_acc": 0.931729055258467, "train_speed(iter/s)": 0.2422 }, { "epoch": 0.7271895723759433, "grad_norm": 1.4403769969940186, "learning_rate": 9.487107401409072e-05, "loss": 0.159693706035614, "memory(GiB)": 122.96, "step": 9540, "token_acc": 0.9316239316239316, "train_speed(iter/s)": 0.242241 }, { "epoch": 0.7275706989862032, "grad_norm": 1.439956545829773, "learning_rate": 9.486579035212577e-05, "loss": 0.1705371379852295, "memory(GiB)": 122.96, "step": 9545, "token_acc": 0.9354916646532979, "train_speed(iter/s)": 0.242273 }, { "epoch": 0.7279518255964631, "grad_norm": 1.6187372207641602, "learning_rate": 9.486050411731596e-05, "loss": 0.1458192825317383, "memory(GiB)": 122.96, "step": 9550, "token_acc": 0.938885560215698, "train_speed(iter/s)": 0.2423 }, { "epoch": 0.7283329522067231, "grad_norm": 1.4867513179779053, "learning_rate": 9.485521530996442e-05, "loss": 0.14240323305130004, "memory(GiB)": 122.96, "step": 9555, "token_acc": 0.939350388042541, "train_speed(iter/s)": 0.242336 }, { "epoch": 0.728714078816983, "grad_norm": 0.6910136342048645, "learning_rate": 9.484992393037441e-05, "loss": 0.1394789695739746, "memory(GiB)": 122.96, "step": 9560, "token_acc": 0.9460321074803598, "train_speed(iter/s)": 0.242341 }, { "epoch": 0.7290952054272429, "grad_norm": 0.9604949951171875, "learning_rate": 9.484462997884942e-05, "loss": 0.21459486484527587, "memory(GiB)": 122.96, "step": 9565, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 0.242374 }, { "epoch": 0.7294763320375028, "grad_norm": 0.7432778477668762, "learning_rate": 9.4839333455693e-05, "loss": 0.18734811544418334, "memory(GiB)": 122.96, "step": 9570, "token_acc": 0.9286615024319942, "train_speed(iter/s)": 0.242404 }, { "epoch": 0.7298574586477627, "grad_norm": 0.8178945183753967, "learning_rate": 9.483403436120887e-05, "loss": 0.17016313076019288, "memory(GiB)": 122.96, "step": 9575, "token_acc": 0.9339912635495874, "train_speed(iter/s)": 0.242425 }, { "epoch": 0.7302385852580228, "grad_norm": 1.0594521760940552, "learning_rate": 9.482873269570094e-05, "loss": 0.15654901266098023, "memory(GiB)": 122.96, "step": 9580, "token_acc": 0.9259342638451148, "train_speed(iter/s)": 0.242456 }, { "epoch": 0.7306197118682827, "grad_norm": 0.8725131750106812, "learning_rate": 9.48234284594732e-05, "loss": 0.26044521331787107, "memory(GiB)": 122.96, "step": 9585, "token_acc": 0.9078838174273859, "train_speed(iter/s)": 0.242493 }, { "epoch": 0.7310008384785426, "grad_norm": 2.1509768962860107, "learning_rate": 9.481812165282987e-05, "loss": 0.12553904056549073, "memory(GiB)": 122.96, "step": 9590, "token_acc": 0.9537725823591924, "train_speed(iter/s)": 0.24251 }, { "epoch": 0.7313819650888025, "grad_norm": 0.8540927171707153, "learning_rate": 9.481281227607523e-05, "loss": 0.1360520601272583, "memory(GiB)": 122.96, "step": 9595, "token_acc": 0.9492507492507493, "train_speed(iter/s)": 0.242534 }, { "epoch": 0.7317630916990624, "grad_norm": 0.9505517482757568, "learning_rate": 9.480750032951377e-05, "loss": 0.14591158628463746, "memory(GiB)": 122.96, "step": 9600, "token_acc": 0.9419551934826884, "train_speed(iter/s)": 0.242573 }, { "epoch": 0.7317630916990624, "eval_loss": 0.12556853890419006, "eval_runtime": 177.3511, "eval_samples_per_second": 2.988, "eval_steps_per_second": 2.988, "eval_token_acc": 0.9394313595566532, "step": 9600 }, { "epoch": 0.7321442183093223, "grad_norm": 0.610419750213623, "learning_rate": 9.480218581345011e-05, "loss": 0.15734323263168334, "memory(GiB)": 122.96, "step": 9605, "token_acc": 0.9391855740170347, "train_speed(iter/s)": 0.241509 }, { "epoch": 0.7325253449195823, "grad_norm": 1.129184365272522, "learning_rate": 9.479686872818899e-05, "loss": 0.15857715606689454, "memory(GiB)": 122.96, "step": 9610, "token_acc": 0.9423385801477674, "train_speed(iter/s)": 0.241539 }, { "epoch": 0.7329064715298422, "grad_norm": 0.4709247648715973, "learning_rate": 9.479154907403531e-05, "loss": 0.11634982824325561, "memory(GiB)": 122.96, "step": 9615, "token_acc": 0.9361420243773185, "train_speed(iter/s)": 0.241577 }, { "epoch": 0.7332875981401021, "grad_norm": 0.9982877373695374, "learning_rate": 9.478622685129418e-05, "loss": 0.18759751319885254, "memory(GiB)": 122.96, "step": 9620, "token_acc": 0.9207955958089149, "train_speed(iter/s)": 0.241605 }, { "epoch": 0.733668724750362, "grad_norm": 0.7970172762870789, "learning_rate": 9.478090206027074e-05, "loss": 0.18188464641571045, "memory(GiB)": 122.96, "step": 9625, "token_acc": 0.928188196450681, "train_speed(iter/s)": 0.241626 }, { "epoch": 0.734049851360622, "grad_norm": 1.150672197341919, "learning_rate": 9.47755747012704e-05, "loss": 0.21206355094909668, "memory(GiB)": 122.96, "step": 9630, "token_acc": 0.9088504088504088, "train_speed(iter/s)": 0.241659 }, { "epoch": 0.734430977970882, "grad_norm": 0.8234087824821472, "learning_rate": 9.477024477459864e-05, "loss": 0.1261660099029541, "memory(GiB)": 122.96, "step": 9635, "token_acc": 0.9429967426710097, "train_speed(iter/s)": 0.241687 }, { "epoch": 0.7348121045811419, "grad_norm": 1.4177038669586182, "learning_rate": 9.476491228056109e-05, "loss": 0.16825098991394044, "memory(GiB)": 122.96, "step": 9640, "token_acc": 0.9292631578947368, "train_speed(iter/s)": 0.241732 }, { "epoch": 0.7351932311914018, "grad_norm": 0.5649355053901672, "learning_rate": 9.475957721946356e-05, "loss": 0.1483863115310669, "memory(GiB)": 122.96, "step": 9645, "token_acc": 0.9455744914788345, "train_speed(iter/s)": 0.241763 }, { "epoch": 0.7355743578016617, "grad_norm": 0.5706619024276733, "learning_rate": 9.475423959161198e-05, "loss": 0.14149978160858154, "memory(GiB)": 122.96, "step": 9650, "token_acc": 0.95, "train_speed(iter/s)": 0.241762 }, { "epoch": 0.7359554844119216, "grad_norm": 0.8660262227058411, "learning_rate": 9.474889939731245e-05, "loss": 0.2085973024368286, "memory(GiB)": 122.96, "step": 9655, "token_acc": 0.9146562905317769, "train_speed(iter/s)": 0.241775 }, { "epoch": 0.7363366110221816, "grad_norm": 0.5611258745193481, "learning_rate": 9.47435566368712e-05, "loss": 0.08104597330093384, "memory(GiB)": 122.96, "step": 9660, "token_acc": 0.9566523605150214, "train_speed(iter/s)": 0.241786 }, { "epoch": 0.7367177376324415, "grad_norm": 1.653222918510437, "learning_rate": 9.473821131059462e-05, "loss": 0.19141640663146972, "memory(GiB)": 122.96, "step": 9665, "token_acc": 0.9127230411171451, "train_speed(iter/s)": 0.241826 }, { "epoch": 0.7370988642427014, "grad_norm": 1.4369605779647827, "learning_rate": 9.473286341878921e-05, "loss": 0.20618739128112792, "memory(GiB)": 122.96, "step": 9670, "token_acc": 0.9296497584541062, "train_speed(iter/s)": 0.241862 }, { "epoch": 0.7374799908529613, "grad_norm": 0.692276656627655, "learning_rate": 9.472751296176168e-05, "loss": 0.14784332513809204, "memory(GiB)": 122.96, "step": 9675, "token_acc": 0.9454705364995603, "train_speed(iter/s)": 0.241896 }, { "epoch": 0.7378611174632212, "grad_norm": 0.7921292185783386, "learning_rate": 9.472215993981885e-05, "loss": 0.15219603776931762, "memory(GiB)": 122.96, "step": 9680, "token_acc": 0.9408060453400504, "train_speed(iter/s)": 0.241937 }, { "epoch": 0.7382422440734812, "grad_norm": 0.8768872022628784, "learning_rate": 9.471680435326767e-05, "loss": 0.12949321269989014, "memory(GiB)": 122.96, "step": 9685, "token_acc": 0.9418074550817913, "train_speed(iter/s)": 0.241972 }, { "epoch": 0.7386233706837412, "grad_norm": 1.658180594444275, "learning_rate": 9.471144620241528e-05, "loss": 0.1659456968307495, "memory(GiB)": 122.96, "step": 9690, "token_acc": 0.9275167785234899, "train_speed(iter/s)": 0.242015 }, { "epoch": 0.7390044972940011, "grad_norm": 1.8701764345169067, "learning_rate": 9.470608548756894e-05, "loss": 0.14761550426483155, "memory(GiB)": 122.96, "step": 9695, "token_acc": 0.9227184466019418, "train_speed(iter/s)": 0.242055 }, { "epoch": 0.739385623904261, "grad_norm": 1.4696731567382812, "learning_rate": 9.470072220903605e-05, "loss": 0.13869086503982545, "memory(GiB)": 122.96, "step": 9700, "token_acc": 0.9386724386724387, "train_speed(iter/s)": 0.242099 }, { "epoch": 0.7397667505145209, "grad_norm": 0.8274171948432922, "learning_rate": 9.469535636712419e-05, "loss": 0.13499306440353392, "memory(GiB)": 122.96, "step": 9705, "token_acc": 0.9539641943734015, "train_speed(iter/s)": 0.242129 }, { "epoch": 0.7401478771247808, "grad_norm": 0.9982659816741943, "learning_rate": 9.468998796214105e-05, "loss": 0.1871713638305664, "memory(GiB)": 122.96, "step": 9710, "token_acc": 0.9339862169024302, "train_speed(iter/s)": 0.242157 }, { "epoch": 0.7405290037350408, "grad_norm": 1.2357797622680664, "learning_rate": 9.468461699439448e-05, "loss": 0.14164385795593262, "memory(GiB)": 122.96, "step": 9715, "token_acc": 0.9490610835311893, "train_speed(iter/s)": 0.242185 }, { "epoch": 0.7409101303453007, "grad_norm": 0.7687814235687256, "learning_rate": 9.46792434641925e-05, "loss": 0.16590912342071534, "memory(GiB)": 122.96, "step": 9720, "token_acc": 0.9344993141289437, "train_speed(iter/s)": 0.242206 }, { "epoch": 0.7412912569555606, "grad_norm": 1.006686806678772, "learning_rate": 9.467386737184323e-05, "loss": 0.13399699926376343, "memory(GiB)": 122.96, "step": 9725, "token_acc": 0.9455845771144279, "train_speed(iter/s)": 0.24224 }, { "epoch": 0.7416723835658205, "grad_norm": 0.8445634245872498, "learning_rate": 9.466848871765498e-05, "loss": 0.15944788455963135, "memory(GiB)": 122.96, "step": 9730, "token_acc": 0.938961906883493, "train_speed(iter/s)": 0.242268 }, { "epoch": 0.7420535101760805, "grad_norm": 0.7931368947029114, "learning_rate": 9.466310750193618e-05, "loss": 0.10543488264083863, "memory(GiB)": 122.96, "step": 9735, "token_acc": 0.9571428571428572, "train_speed(iter/s)": 0.242298 }, { "epoch": 0.7424346367863405, "grad_norm": 0.8846327662467957, "learning_rate": 9.465772372499545e-05, "loss": 0.18347625732421874, "memory(GiB)": 122.96, "step": 9740, "token_acc": 0.9367333763718528, "train_speed(iter/s)": 0.242332 }, { "epoch": 0.7428157633966004, "grad_norm": 2.4335834980010986, "learning_rate": 9.46523373871415e-05, "loss": 0.090405935049057, "memory(GiB)": 122.96, "step": 9745, "token_acc": 0.9663705583756346, "train_speed(iter/s)": 0.242366 }, { "epoch": 0.7431968900068603, "grad_norm": 0.9144492149353027, "learning_rate": 9.464694848868321e-05, "loss": 0.17692941427230835, "memory(GiB)": 122.96, "step": 9750, "token_acc": 0.9279426816786079, "train_speed(iter/s)": 0.242399 }, { "epoch": 0.7435780166171202, "grad_norm": 1.8017278909683228, "learning_rate": 9.464155702992959e-05, "loss": 0.20904991626739503, "memory(GiB)": 122.96, "step": 9755, "token_acc": 0.9430740037950665, "train_speed(iter/s)": 0.24243 }, { "epoch": 0.7439591432273801, "grad_norm": 0.7906907796859741, "learning_rate": 9.463616301118987e-05, "loss": 0.16922208070755004, "memory(GiB)": 122.96, "step": 9760, "token_acc": 0.9392718822618126, "train_speed(iter/s)": 0.24246 }, { "epoch": 0.74434026983764, "grad_norm": 0.6670966148376465, "learning_rate": 9.46307664327733e-05, "loss": 0.14137485027313232, "memory(GiB)": 122.96, "step": 9765, "token_acc": 0.9473282442748091, "train_speed(iter/s)": 0.242492 }, { "epoch": 0.7447213964479, "grad_norm": 0.7380162477493286, "learning_rate": 9.462536729498942e-05, "loss": 0.184731125831604, "memory(GiB)": 122.96, "step": 9770, "token_acc": 0.9306559571619812, "train_speed(iter/s)": 0.242528 }, { "epoch": 0.7451025230581599, "grad_norm": 0.907791793346405, "learning_rate": 9.46199655981478e-05, "loss": 0.15084266662597656, "memory(GiB)": 122.96, "step": 9775, "token_acc": 0.941376550620248, "train_speed(iter/s)": 0.242549 }, { "epoch": 0.7454836496684198, "grad_norm": 0.8116359114646912, "learning_rate": 9.461456134255821e-05, "loss": 0.12521952390670776, "memory(GiB)": 122.96, "step": 9780, "token_acc": 0.9502164502164502, "train_speed(iter/s)": 0.2426 }, { "epoch": 0.7458647762786798, "grad_norm": 0.7579566240310669, "learning_rate": 9.460915452853057e-05, "loss": 0.0974208652973175, "memory(GiB)": 122.96, "step": 9785, "token_acc": 0.9567546380977267, "train_speed(iter/s)": 0.242607 }, { "epoch": 0.7462459028889397, "grad_norm": 0.7426918148994446, "learning_rate": 9.460374515637493e-05, "loss": 0.1450915217399597, "memory(GiB)": 122.96, "step": 9790, "token_acc": 0.9392077607113986, "train_speed(iter/s)": 0.242624 }, { "epoch": 0.7466270294991997, "grad_norm": 1.4536398649215698, "learning_rate": 9.459833322640149e-05, "loss": 0.15610674619674683, "memory(GiB)": 122.96, "step": 9795, "token_acc": 0.934260429835651, "train_speed(iter/s)": 0.242658 }, { "epoch": 0.7470081561094596, "grad_norm": 0.5939784646034241, "learning_rate": 9.45929187389206e-05, "loss": 0.20035262107849122, "memory(GiB)": 122.96, "step": 9800, "token_acc": 0.923921568627451, "train_speed(iter/s)": 0.242692 }, { "epoch": 0.7470081561094596, "eval_loss": 0.12852832674980164, "eval_runtime": 179.5577, "eval_samples_per_second": 2.952, "eval_steps_per_second": 2.952, "eval_token_acc": 0.9392506475513523, "step": 9800 }, { "epoch": 0.7473892827197195, "grad_norm": 0.7172967791557312, "learning_rate": 9.458750169424277e-05, "loss": 0.22302870750427245, "memory(GiB)": 122.96, "step": 9805, "token_acc": 0.9384932119071867, "train_speed(iter/s)": 0.241652 }, { "epoch": 0.7477704093299794, "grad_norm": 0.45996665954589844, "learning_rate": 9.458208209267861e-05, "loss": 0.21264286041259767, "memory(GiB)": 122.96, "step": 9810, "token_acc": 0.917193789534215, "train_speed(iter/s)": 0.241689 }, { "epoch": 0.7481515359402393, "grad_norm": 0.668114185333252, "learning_rate": 9.457665993453894e-05, "loss": 0.19275894165039062, "memory(GiB)": 122.96, "step": 9815, "token_acc": 0.9345279117849759, "train_speed(iter/s)": 0.24172 }, { "epoch": 0.7485326625504993, "grad_norm": 0.9888028502464294, "learning_rate": 9.457123522013468e-05, "loss": 0.1603376030921936, "memory(GiB)": 122.96, "step": 9820, "token_acc": 0.9278818852924475, "train_speed(iter/s)": 0.241746 }, { "epoch": 0.7489137891607592, "grad_norm": 1.2633556127548218, "learning_rate": 9.456580794977692e-05, "loss": 0.14284130334854125, "memory(GiB)": 122.96, "step": 9825, "token_acc": 0.9365288385567553, "train_speed(iter/s)": 0.241781 }, { "epoch": 0.7492949157710191, "grad_norm": 0.7736327052116394, "learning_rate": 9.456037812377687e-05, "loss": 0.17930821180343628, "memory(GiB)": 122.96, "step": 9830, "token_acc": 0.9209093061804404, "train_speed(iter/s)": 0.24182 }, { "epoch": 0.749676042381279, "grad_norm": 1.0758336782455444, "learning_rate": 9.455494574244593e-05, "loss": 0.17710413932800292, "memory(GiB)": 122.96, "step": 9835, "token_acc": 0.9232730644413537, "train_speed(iter/s)": 0.241857 }, { "epoch": 0.750057168991539, "grad_norm": 0.6678187847137451, "learning_rate": 9.454951080609562e-05, "loss": 0.13782687187194825, "memory(GiB)": 122.96, "step": 9840, "token_acc": 0.9334828101644245, "train_speed(iter/s)": 0.241895 }, { "epoch": 0.7504382956017989, "grad_norm": 0.8791162371635437, "learning_rate": 9.454407331503759e-05, "loss": 0.19369581937789918, "memory(GiB)": 122.96, "step": 9845, "token_acc": 0.9381207028265852, "train_speed(iter/s)": 0.24192 }, { "epoch": 0.7508194222120589, "grad_norm": 0.604066014289856, "learning_rate": 9.453863326958367e-05, "loss": 0.1585877776145935, "memory(GiB)": 122.96, "step": 9850, "token_acc": 0.926711668273867, "train_speed(iter/s)": 0.241938 }, { "epoch": 0.7512005488223188, "grad_norm": 1.0204887390136719, "learning_rate": 9.453319067004582e-05, "loss": 0.16560683250427247, "memory(GiB)": 122.96, "step": 9855, "token_acc": 0.9277280858676208, "train_speed(iter/s)": 0.241978 }, { "epoch": 0.7515816754325787, "grad_norm": 1.352710485458374, "learning_rate": 9.452774551673614e-05, "loss": 0.1613044857978821, "memory(GiB)": 122.96, "step": 9860, "token_acc": 0.9371957156767283, "train_speed(iter/s)": 0.242004 }, { "epoch": 0.7519628020428386, "grad_norm": 0.7990666627883911, "learning_rate": 9.452229780996687e-05, "loss": 0.16148719787597657, "memory(GiB)": 122.96, "step": 9865, "token_acc": 0.9467756584922797, "train_speed(iter/s)": 0.242029 }, { "epoch": 0.7523439286530985, "grad_norm": 0.7430155873298645, "learning_rate": 9.451684755005046e-05, "loss": 0.16029212474822999, "memory(GiB)": 122.96, "step": 9870, "token_acc": 0.9414125200642055, "train_speed(iter/s)": 0.24207 }, { "epoch": 0.7527250552633585, "grad_norm": 0.7618395686149597, "learning_rate": 9.45113947372994e-05, "loss": 0.17550100088119508, "memory(GiB)": 122.96, "step": 9875, "token_acc": 0.9378847995621836, "train_speed(iter/s)": 0.242085 }, { "epoch": 0.7531061818736184, "grad_norm": 1.0987098217010498, "learning_rate": 9.450593937202643e-05, "loss": 0.17197105884552003, "memory(GiB)": 122.96, "step": 9880, "token_acc": 0.9398074989225687, "train_speed(iter/s)": 0.242098 }, { "epoch": 0.7534873084838783, "grad_norm": 0.6541916131973267, "learning_rate": 9.450048145454436e-05, "loss": 0.17654073238372803, "memory(GiB)": 122.96, "step": 9885, "token_acc": 0.9377442769402569, "train_speed(iter/s)": 0.242097 }, { "epoch": 0.7538684350941383, "grad_norm": 0.7859042286872864, "learning_rate": 9.449502098516618e-05, "loss": 0.1931756854057312, "memory(GiB)": 122.96, "step": 9890, "token_acc": 0.9284565916398714, "train_speed(iter/s)": 0.242121 }, { "epoch": 0.7542495617043982, "grad_norm": 0.8613477945327759, "learning_rate": 9.448955796420504e-05, "loss": 0.154740309715271, "memory(GiB)": 122.96, "step": 9895, "token_acc": 0.9390700160342063, "train_speed(iter/s)": 0.24216 }, { "epoch": 0.7546306883146582, "grad_norm": 0.7947913408279419, "learning_rate": 9.448409239197421e-05, "loss": 0.16298364400863646, "memory(GiB)": 122.96, "step": 9900, "token_acc": 0.9443956687152473, "train_speed(iter/s)": 0.242187 }, { "epoch": 0.7550118149249181, "grad_norm": 0.8859321475028992, "learning_rate": 9.447862426878711e-05, "loss": 0.15721626281738282, "memory(GiB)": 122.96, "step": 9905, "token_acc": 0.9324531777709548, "train_speed(iter/s)": 0.242223 }, { "epoch": 0.755392941535178, "grad_norm": 0.8989928960800171, "learning_rate": 9.447315359495731e-05, "loss": 0.12190951108932495, "memory(GiB)": 122.96, "step": 9910, "token_acc": 0.9421052631578948, "train_speed(iter/s)": 0.242267 }, { "epoch": 0.7557740681454379, "grad_norm": 1.4257994890213013, "learning_rate": 9.446768037079853e-05, "loss": 0.1995999813079834, "memory(GiB)": 122.96, "step": 9915, "token_acc": 0.9245331286774111, "train_speed(iter/s)": 0.2423 }, { "epoch": 0.7561551947556978, "grad_norm": 0.7679669857025146, "learning_rate": 9.446220459662465e-05, "loss": 0.12771419286727906, "memory(GiB)": 122.96, "step": 9920, "token_acc": 0.9552346570397112, "train_speed(iter/s)": 0.24232 }, { "epoch": 0.7565363213659577, "grad_norm": 0.6501082181930542, "learning_rate": 9.445672627274964e-05, "loss": 0.20506398677825927, "memory(GiB)": 122.96, "step": 9925, "token_acc": 0.9420320855614973, "train_speed(iter/s)": 0.242323 }, { "epoch": 0.7569174479762177, "grad_norm": 0.6196282505989075, "learning_rate": 9.445124539948771e-05, "loss": 0.1855010986328125, "memory(GiB)": 122.96, "step": 9930, "token_acc": 0.922675026123302, "train_speed(iter/s)": 0.242354 }, { "epoch": 0.7572985745864776, "grad_norm": 0.5875713229179382, "learning_rate": 9.444576197715311e-05, "loss": 0.1592625617980957, "memory(GiB)": 122.96, "step": 9935, "token_acc": 0.9370283018867924, "train_speed(iter/s)": 0.242381 }, { "epoch": 0.7576797011967376, "grad_norm": 1.2036080360412598, "learning_rate": 9.444027600606033e-05, "loss": 0.21202497482299804, "memory(GiB)": 122.96, "step": 9940, "token_acc": 0.9194528875379939, "train_speed(iter/s)": 0.242421 }, { "epoch": 0.7580608278069975, "grad_norm": 0.7774091362953186, "learning_rate": 9.443478748652394e-05, "loss": 0.1458479046821594, "memory(GiB)": 122.96, "step": 9945, "token_acc": 0.9455326460481099, "train_speed(iter/s)": 0.242434 }, { "epoch": 0.7584419544172574, "grad_norm": 0.445627361536026, "learning_rate": 9.442929641885871e-05, "loss": 0.17077217102050782, "memory(GiB)": 122.96, "step": 9950, "token_acc": 0.926, "train_speed(iter/s)": 0.242455 }, { "epoch": 0.7588230810275174, "grad_norm": 1.2411736249923706, "learning_rate": 9.44238028033795e-05, "loss": 0.15312187671661376, "memory(GiB)": 122.96, "step": 9955, "token_acc": 0.938275251033668, "train_speed(iter/s)": 0.242479 }, { "epoch": 0.7592042076377773, "grad_norm": 0.42862576246261597, "learning_rate": 9.441830664040135e-05, "loss": 0.143123996257782, "memory(GiB)": 122.96, "step": 9960, "token_acc": 0.94595221526328, "train_speed(iter/s)": 0.242503 }, { "epoch": 0.7595853342480372, "grad_norm": 0.834311306476593, "learning_rate": 9.441280793023944e-05, "loss": 0.23034977912902832, "memory(GiB)": 122.96, "step": 9965, "token_acc": 0.9096669080376538, "train_speed(iter/s)": 0.242521 }, { "epoch": 0.7599664608582971, "grad_norm": 0.6895515322685242, "learning_rate": 9.440730667320908e-05, "loss": 0.14444001913070678, "memory(GiB)": 122.96, "step": 9970, "token_acc": 0.9546213808463252, "train_speed(iter/s)": 0.242551 }, { "epoch": 0.760347587468557, "grad_norm": 0.6669225096702576, "learning_rate": 9.440180286962577e-05, "loss": 0.13719649314880372, "memory(GiB)": 122.96, "step": 9975, "token_acc": 0.9459751485683414, "train_speed(iter/s)": 0.242595 }, { "epoch": 0.760728714078817, "grad_norm": 0.5618553161621094, "learning_rate": 9.439629651980511e-05, "loss": 0.14297524690628052, "memory(GiB)": 122.96, "step": 9980, "token_acc": 0.9300312575138254, "train_speed(iter/s)": 0.242624 }, { "epoch": 0.7611098406890769, "grad_norm": 0.9065801501274109, "learning_rate": 9.439078762406288e-05, "loss": 0.09632376432418824, "memory(GiB)": 122.96, "step": 9985, "token_acc": 0.9547675872258211, "train_speed(iter/s)": 0.242631 }, { "epoch": 0.7614909672993369, "grad_norm": 1.6461727619171143, "learning_rate": 9.438527618271496e-05, "loss": 0.16396851539611818, "memory(GiB)": 122.96, "step": 9990, "token_acc": 0.9366391184573003, "train_speed(iter/s)": 0.242667 }, { "epoch": 0.7618720939095968, "grad_norm": 0.6022230982780457, "learning_rate": 9.437976219607744e-05, "loss": 0.11436672210693359, "memory(GiB)": 122.96, "step": 9995, "token_acc": 0.956983024691358, "train_speed(iter/s)": 0.242692 }, { "epoch": 0.7622532205198567, "grad_norm": 0.9264677166938782, "learning_rate": 9.43742456644665e-05, "loss": 0.2112905502319336, "memory(GiB)": 122.96, "step": 10000, "token_acc": 0.9170635572343571, "train_speed(iter/s)": 0.242716 }, { "epoch": 0.7622532205198567, "eval_loss": 0.1262308955192566, "eval_runtime": 183.2008, "eval_samples_per_second": 2.893, "eval_steps_per_second": 2.893, "eval_token_acc": 0.9408619962652852, "step": 10000 }, { "epoch": 0.7626343471301166, "grad_norm": 1.1761114597320557, "learning_rate": 9.436872658819849e-05, "loss": 0.1687753438949585, "memory(GiB)": 122.96, "step": 10005, "token_acc": 0.9406503587832187, "train_speed(iter/s)": 0.241671 }, { "epoch": 0.7630154737403766, "grad_norm": 0.5939816236495972, "learning_rate": 9.43632049675899e-05, "loss": 0.138466215133667, "memory(GiB)": 122.96, "step": 10010, "token_acc": 0.9492884017667267, "train_speed(iter/s)": 0.241686 }, { "epoch": 0.7633966003506365, "grad_norm": 0.9326996803283691, "learning_rate": 9.435768080295739e-05, "loss": 0.17694741487503052, "memory(GiB)": 122.96, "step": 10015, "token_acc": 0.9313267600761114, "train_speed(iter/s)": 0.241704 }, { "epoch": 0.7637777269608964, "grad_norm": 0.3736574649810791, "learning_rate": 9.435215409461773e-05, "loss": 0.1318502187728882, "memory(GiB)": 122.96, "step": 10020, "token_acc": 0.9435715866342111, "train_speed(iter/s)": 0.241736 }, { "epoch": 0.7641588535711563, "grad_norm": 0.6198335289955139, "learning_rate": 9.434662484288785e-05, "loss": 0.12952210903167724, "memory(GiB)": 122.96, "step": 10025, "token_acc": 0.9394979618107702, "train_speed(iter/s)": 0.241762 }, { "epoch": 0.7645399801814162, "grad_norm": 0.9166069626808167, "learning_rate": 9.434109304808483e-05, "loss": 0.11471017599105834, "memory(GiB)": 122.96, "step": 10030, "token_acc": 0.9524940617577197, "train_speed(iter/s)": 0.241803 }, { "epoch": 0.7649211067916762, "grad_norm": 1.0876930952072144, "learning_rate": 9.433555871052588e-05, "loss": 0.14897937774658204, "memory(GiB)": 122.96, "step": 10035, "token_acc": 0.9416666666666667, "train_speed(iter/s)": 0.241846 }, { "epoch": 0.7653022334019361, "grad_norm": 0.9251031875610352, "learning_rate": 9.433002183052839e-05, "loss": 0.1854841709136963, "memory(GiB)": 122.96, "step": 10040, "token_acc": 0.933953488372093, "train_speed(iter/s)": 0.24188 }, { "epoch": 0.7656833600121961, "grad_norm": 0.48083794116973877, "learning_rate": 9.432448240840985e-05, "loss": 0.1282339334487915, "memory(GiB)": 122.96, "step": 10045, "token_acc": 0.9512546870493221, "train_speed(iter/s)": 0.241915 }, { "epoch": 0.766064486622456, "grad_norm": 0.7858202457427979, "learning_rate": 9.431894044448795e-05, "loss": 0.15743547677993774, "memory(GiB)": 122.96, "step": 10050, "token_acc": 0.9435823250920569, "train_speed(iter/s)": 0.241917 }, { "epoch": 0.7664456132327159, "grad_norm": 1.7340123653411865, "learning_rate": 9.431339593908049e-05, "loss": 0.1939536452293396, "memory(GiB)": 122.96, "step": 10055, "token_acc": 0.9142771804062126, "train_speed(iter/s)": 0.241948 }, { "epoch": 0.7668267398429759, "grad_norm": 1.1218425035476685, "learning_rate": 9.430784889250539e-05, "loss": 0.18309401273727416, "memory(GiB)": 122.96, "step": 10060, "token_acc": 0.923827392120075, "train_speed(iter/s)": 0.241968 }, { "epoch": 0.7672078664532358, "grad_norm": 0.9787033200263977, "learning_rate": 9.430229930508078e-05, "loss": 0.13447102308273315, "memory(GiB)": 122.96, "step": 10065, "token_acc": 0.9452200035341933, "train_speed(iter/s)": 0.241985 }, { "epoch": 0.7675889930634957, "grad_norm": 0.9535993933677673, "learning_rate": 9.429674717712489e-05, "loss": 0.22339208126068116, "memory(GiB)": 122.96, "step": 10070, "token_acc": 0.9107969778211065, "train_speed(iter/s)": 0.242013 }, { "epoch": 0.7679701196737556, "grad_norm": 0.8389396667480469, "learning_rate": 9.42911925089561e-05, "loss": 0.26694793701171876, "memory(GiB)": 122.96, "step": 10075, "token_acc": 0.9048640915593705, "train_speed(iter/s)": 0.242035 }, { "epoch": 0.7683512462840155, "grad_norm": 1.7508668899536133, "learning_rate": 9.428563530089298e-05, "loss": 0.15905604362487794, "memory(GiB)": 122.96, "step": 10080, "token_acc": 0.9247585155058464, "train_speed(iter/s)": 0.24208 }, { "epoch": 0.7687323728942754, "grad_norm": 0.8412601351737976, "learning_rate": 9.428007555325418e-05, "loss": 0.20968191623687743, "memory(GiB)": 122.96, "step": 10085, "token_acc": 0.9096121416526138, "train_speed(iter/s)": 0.242117 }, { "epoch": 0.7691134995045354, "grad_norm": 1.2597651481628418, "learning_rate": 9.427451326635852e-05, "loss": 0.1205409288406372, "memory(GiB)": 122.96, "step": 10090, "token_acc": 0.9366883116883117, "train_speed(iter/s)": 0.242153 }, { "epoch": 0.7694946261147954, "grad_norm": 0.7471310496330261, "learning_rate": 9.426894844052498e-05, "loss": 0.20515937805175782, "memory(GiB)": 122.96, "step": 10095, "token_acc": 0.9411388355726168, "train_speed(iter/s)": 0.242175 }, { "epoch": 0.7698757527250553, "grad_norm": 0.9959738254547119, "learning_rate": 9.42633810760727e-05, "loss": 0.16183044910430908, "memory(GiB)": 122.96, "step": 10100, "token_acc": 0.9368882888963888, "train_speed(iter/s)": 0.24221 }, { "epoch": 0.7702568793353152, "grad_norm": 0.7294350862503052, "learning_rate": 9.425781117332091e-05, "loss": 0.13832993507385255, "memory(GiB)": 122.96, "step": 10105, "token_acc": 0.9506083166878518, "train_speed(iter/s)": 0.242231 }, { "epoch": 0.7706380059455751, "grad_norm": 0.5784624218940735, "learning_rate": 9.425223873258904e-05, "loss": 0.1605436086654663, "memory(GiB)": 122.96, "step": 10110, "token_acc": 0.9165739710789766, "train_speed(iter/s)": 0.242273 }, { "epoch": 0.7710191325558351, "grad_norm": 1.3615195751190186, "learning_rate": 9.424666375419662e-05, "loss": 0.16338441371917725, "memory(GiB)": 122.96, "step": 10115, "token_acc": 0.931640625, "train_speed(iter/s)": 0.242302 }, { "epoch": 0.771400259166095, "grad_norm": 0.9296935200691223, "learning_rate": 9.424108623846337e-05, "loss": 0.1505724549293518, "memory(GiB)": 122.96, "step": 10120, "token_acc": 0.943804373383494, "train_speed(iter/s)": 0.242329 }, { "epoch": 0.7717813857763549, "grad_norm": 1.312263011932373, "learning_rate": 9.423550618570912e-05, "loss": 0.11956144571304321, "memory(GiB)": 122.96, "step": 10125, "token_acc": 0.9571642349420127, "train_speed(iter/s)": 0.242346 }, { "epoch": 0.7721625123866148, "grad_norm": 1.4668742418289185, "learning_rate": 9.422992359625387e-05, "loss": 0.17109134197235107, "memory(GiB)": 122.96, "step": 10130, "token_acc": 0.9216392704345868, "train_speed(iter/s)": 0.24237 }, { "epoch": 0.7725436389968747, "grad_norm": 0.7158969640731812, "learning_rate": 9.422433847041776e-05, "loss": 0.19327943325042723, "memory(GiB)": 122.96, "step": 10135, "token_acc": 0.9259970142887609, "train_speed(iter/s)": 0.242392 }, { "epoch": 0.7729247656071347, "grad_norm": 0.7863249778747559, "learning_rate": 9.421875080852107e-05, "loss": 0.14741979837417601, "memory(GiB)": 122.96, "step": 10140, "token_acc": 0.9338940728085162, "train_speed(iter/s)": 0.242402 }, { "epoch": 0.7733058922173947, "grad_norm": 1.155783772468567, "learning_rate": 9.421316061088421e-05, "loss": 0.16271607875823973, "memory(GiB)": 122.96, "step": 10145, "token_acc": 0.9319860085503303, "train_speed(iter/s)": 0.242444 }, { "epoch": 0.7736870188276546, "grad_norm": 0.6933473944664001, "learning_rate": 9.420756787782777e-05, "loss": 0.14023783206939697, "memory(GiB)": 122.96, "step": 10150, "token_acc": 0.9371219580813054, "train_speed(iter/s)": 0.242453 }, { "epoch": 0.7740681454379145, "grad_norm": 0.8344619870185852, "learning_rate": 9.420197260967246e-05, "loss": 0.14766404628753663, "memory(GiB)": 122.96, "step": 10155, "token_acc": 0.9424184261036468, "train_speed(iter/s)": 0.24249 }, { "epoch": 0.7744492720481744, "grad_norm": 0.69620680809021, "learning_rate": 9.419637480673916e-05, "loss": 0.18667682409286498, "memory(GiB)": 122.96, "step": 10160, "token_acc": 0.9236373448461953, "train_speed(iter/s)": 0.242499 }, { "epoch": 0.7748303986584343, "grad_norm": 0.2953942120075226, "learning_rate": 9.419077446934883e-05, "loss": 0.1722819685935974, "memory(GiB)": 122.96, "step": 10165, "token_acc": 0.9291825095057035, "train_speed(iter/s)": 0.242537 }, { "epoch": 0.7752115252686943, "grad_norm": 1.504596471786499, "learning_rate": 9.418517159782267e-05, "loss": 0.19174317121505738, "memory(GiB)": 122.96, "step": 10170, "token_acc": 0.9420699399505476, "train_speed(iter/s)": 0.242556 }, { "epoch": 0.7755926518789542, "grad_norm": 1.0394821166992188, "learning_rate": 9.417956619248196e-05, "loss": 0.19660117626190185, "memory(GiB)": 122.96, "step": 10175, "token_acc": 0.9263157894736842, "train_speed(iter/s)": 0.242584 }, { "epoch": 0.7759737784892141, "grad_norm": 1.115530014038086, "learning_rate": 9.417395825364814e-05, "loss": 0.13537335395812988, "memory(GiB)": 122.96, "step": 10180, "token_acc": 0.9436798644929071, "train_speed(iter/s)": 0.242616 }, { "epoch": 0.776354905099474, "grad_norm": 0.7701281309127808, "learning_rate": 9.416834778164281e-05, "loss": 0.1846461772918701, "memory(GiB)": 122.96, "step": 10185, "token_acc": 0.9064327485380117, "train_speed(iter/s)": 0.242657 }, { "epoch": 0.7767360317097339, "grad_norm": 0.5644527673721313, "learning_rate": 9.416273477678771e-05, "loss": 0.17863857746124268, "memory(GiB)": 122.96, "step": 10190, "token_acc": 0.9416715599921707, "train_speed(iter/s)": 0.24268 }, { "epoch": 0.777117158319994, "grad_norm": 0.9774928092956543, "learning_rate": 9.415711923940471e-05, "loss": 0.14011191129684447, "memory(GiB)": 122.96, "step": 10195, "token_acc": 0.9488123734118947, "train_speed(iter/s)": 0.242701 }, { "epoch": 0.7774982849302539, "grad_norm": 0.8542625904083252, "learning_rate": 9.415150116981583e-05, "loss": 0.2169797658920288, "memory(GiB)": 122.96, "step": 10200, "token_acc": 0.9156010230179028, "train_speed(iter/s)": 0.242732 }, { "epoch": 0.7774982849302539, "eval_loss": 0.12458353489637375, "eval_runtime": 183.8235, "eval_samples_per_second": 2.883, "eval_steps_per_second": 2.883, "eval_token_acc": 0.9413137762785374, "step": 10200 }, { "epoch": 0.7778794115405138, "grad_norm": 1.0876400470733643, "learning_rate": 9.414588056834323e-05, "loss": 0.14743691682815552, "memory(GiB)": 122.96, "step": 10205, "token_acc": 0.9412254645429016, "train_speed(iter/s)": 0.241713 }, { "epoch": 0.7782605381507737, "grad_norm": 0.6664872169494629, "learning_rate": 9.414025743530927e-05, "loss": 0.1404191732406616, "memory(GiB)": 122.96, "step": 10210, "token_acc": 0.9502509595512253, "train_speed(iter/s)": 0.241721 }, { "epoch": 0.7786416647610336, "grad_norm": 0.3842448592185974, "learning_rate": 9.413463177103637e-05, "loss": 0.10142707824707031, "memory(GiB)": 122.96, "step": 10215, "token_acc": 0.9546182594767753, "train_speed(iter/s)": 0.241752 }, { "epoch": 0.7790227913712936, "grad_norm": 1.743982195854187, "learning_rate": 9.412900357584716e-05, "loss": 0.16325678825378417, "memory(GiB)": 122.96, "step": 10220, "token_acc": 0.9423426400132165, "train_speed(iter/s)": 0.241773 }, { "epoch": 0.7794039179815535, "grad_norm": 1.8774921894073486, "learning_rate": 9.412337285006435e-05, "loss": 0.18970932960510253, "memory(GiB)": 122.96, "step": 10225, "token_acc": 0.928341384863124, "train_speed(iter/s)": 0.241809 }, { "epoch": 0.7797850445918134, "grad_norm": 1.1602296829223633, "learning_rate": 9.411773959401089e-05, "loss": 0.1505889415740967, "memory(GiB)": 122.96, "step": 10230, "token_acc": 0.9360119047619048, "train_speed(iter/s)": 0.241852 }, { "epoch": 0.7801661712020733, "grad_norm": 0.97850501537323, "learning_rate": 9.411210380800978e-05, "loss": 0.10819762945175171, "memory(GiB)": 122.96, "step": 10235, "token_acc": 0.960728389644581, "train_speed(iter/s)": 0.241879 }, { "epoch": 0.7805472978123332, "grad_norm": 1.051405668258667, "learning_rate": 9.410646549238422e-05, "loss": 0.19633734226226807, "memory(GiB)": 122.96, "step": 10240, "token_acc": 0.9292671961636445, "train_speed(iter/s)": 0.241887 }, { "epoch": 0.7809284244225931, "grad_norm": 1.4533792734146118, "learning_rate": 9.410082464745755e-05, "loss": 0.17310070991516113, "memory(GiB)": 122.96, "step": 10245, "token_acc": 0.9418483904465212, "train_speed(iter/s)": 0.241907 }, { "epoch": 0.7813095510328532, "grad_norm": 0.7465944290161133, "learning_rate": 9.409518127355324e-05, "loss": 0.18299177885055543, "memory(GiB)": 122.96, "step": 10250, "token_acc": 0.9159636599610642, "train_speed(iter/s)": 0.24195 }, { "epoch": 0.7816906776431131, "grad_norm": 1.000803828239441, "learning_rate": 9.408953537099492e-05, "loss": 0.15438640117645264, "memory(GiB)": 122.96, "step": 10255, "token_acc": 0.9422308344435025, "train_speed(iter/s)": 0.241958 }, { "epoch": 0.782071804253373, "grad_norm": 0.6200625896453857, "learning_rate": 9.408388694010633e-05, "loss": 0.1917045831680298, "memory(GiB)": 122.96, "step": 10260, "token_acc": 0.9287201348270051, "train_speed(iter/s)": 0.241968 }, { "epoch": 0.7824529308636329, "grad_norm": 1.1774468421936035, "learning_rate": 9.407823598121142e-05, "loss": 0.13135336637496947, "memory(GiB)": 122.96, "step": 10265, "token_acc": 0.9460400348128808, "train_speed(iter/s)": 0.241998 }, { "epoch": 0.7828340574738928, "grad_norm": 1.0562764406204224, "learning_rate": 9.40725824946342e-05, "loss": 0.1914328455924988, "memory(GiB)": 122.96, "step": 10270, "token_acc": 0.935064935064935, "train_speed(iter/s)": 0.242024 }, { "epoch": 0.7832151840841528, "grad_norm": 0.8816249370574951, "learning_rate": 9.40669264806989e-05, "loss": 0.14099621772766113, "memory(GiB)": 122.96, "step": 10275, "token_acc": 0.9373246024321796, "train_speed(iter/s)": 0.242042 }, { "epoch": 0.7835963106944127, "grad_norm": 0.800359845161438, "learning_rate": 9.406126793972987e-05, "loss": 0.15888605117797852, "memory(GiB)": 122.96, "step": 10280, "token_acc": 0.9431659693165969, "train_speed(iter/s)": 0.242056 }, { "epoch": 0.7839774373046726, "grad_norm": 0.9142786264419556, "learning_rate": 9.405560687205159e-05, "loss": 0.106557297706604, "memory(GiB)": 122.96, "step": 10285, "token_acc": 0.9477256317689531, "train_speed(iter/s)": 0.242079 }, { "epoch": 0.7843585639149325, "grad_norm": 0.6874719262123108, "learning_rate": 9.404994327798871e-05, "loss": 0.20827360153198243, "memory(GiB)": 122.96, "step": 10290, "token_acc": 0.9161576901721266, "train_speed(iter/s)": 0.242119 }, { "epoch": 0.7847396905251924, "grad_norm": 1.3623566627502441, "learning_rate": 9.404427715786599e-05, "loss": 0.1490943670272827, "memory(GiB)": 122.96, "step": 10295, "token_acc": 0.935052298710776, "train_speed(iter/s)": 0.242151 }, { "epoch": 0.7851208171354523, "grad_norm": 1.812449336051941, "learning_rate": 9.403860851200837e-05, "loss": 0.19132256507873535, "memory(GiB)": 122.96, "step": 10300, "token_acc": 0.9326138457488813, "train_speed(iter/s)": 0.242183 }, { "epoch": 0.7855019437457124, "grad_norm": 1.300950288772583, "learning_rate": 9.40329373407409e-05, "loss": 0.11235989332199096, "memory(GiB)": 122.96, "step": 10305, "token_acc": 0.9493142516398331, "train_speed(iter/s)": 0.242216 }, { "epoch": 0.7858830703559723, "grad_norm": 0.9311965703964233, "learning_rate": 9.402726364438883e-05, "loss": 0.1981670618057251, "memory(GiB)": 122.96, "step": 10310, "token_acc": 0.9197353914002205, "train_speed(iter/s)": 0.242245 }, { "epoch": 0.7862641969662322, "grad_norm": 0.9519211649894714, "learning_rate": 9.402158742327749e-05, "loss": 0.09531230926513672, "memory(GiB)": 122.96, "step": 10315, "token_acc": 0.9487391484084332, "train_speed(iter/s)": 0.242272 }, { "epoch": 0.7866453235764921, "grad_norm": 1.2270889282226562, "learning_rate": 9.40159086777324e-05, "loss": 0.21883184909820558, "memory(GiB)": 122.96, "step": 10320, "token_acc": 0.9143610013175231, "train_speed(iter/s)": 0.242299 }, { "epoch": 0.787026450186752, "grad_norm": 0.8737512826919556, "learning_rate": 9.401022740807921e-05, "loss": 0.1618422031402588, "memory(GiB)": 122.96, "step": 10325, "token_acc": 0.9213759213759214, "train_speed(iter/s)": 0.242326 }, { "epoch": 0.787407576797012, "grad_norm": 0.8279870748519897, "learning_rate": 9.40045436146437e-05, "loss": 0.15270328521728516, "memory(GiB)": 122.96, "step": 10330, "token_acc": 0.9269406392694064, "train_speed(iter/s)": 0.242368 }, { "epoch": 0.7877887034072719, "grad_norm": 0.7508084177970886, "learning_rate": 9.399885729775184e-05, "loss": 0.16004294157028198, "memory(GiB)": 122.96, "step": 10335, "token_acc": 0.9399704079475798, "train_speed(iter/s)": 0.242393 }, { "epoch": 0.7881698300175318, "grad_norm": 0.593233048915863, "learning_rate": 9.399316845772968e-05, "loss": 0.12106168270111084, "memory(GiB)": 122.96, "step": 10340, "token_acc": 0.9498332828129736, "train_speed(iter/s)": 0.2424 }, { "epoch": 0.7885509566277917, "grad_norm": 0.3860870897769928, "learning_rate": 9.398747709490345e-05, "loss": 0.1334306478500366, "memory(GiB)": 122.96, "step": 10345, "token_acc": 0.9488910318225651, "train_speed(iter/s)": 0.24242 }, { "epoch": 0.7889320832380516, "grad_norm": 0.6042333245277405, "learning_rate": 9.398178320959955e-05, "loss": 0.1790782928466797, "memory(GiB)": 122.96, "step": 10350, "token_acc": 0.9308157970596713, "train_speed(iter/s)": 0.242435 }, { "epoch": 0.7893132098483117, "grad_norm": 1.5253959894180298, "learning_rate": 9.397608680214447e-05, "loss": 0.15976942777633668, "memory(GiB)": 122.96, "step": 10355, "token_acc": 0.9345063538611925, "train_speed(iter/s)": 0.242476 }, { "epoch": 0.7896943364585716, "grad_norm": 0.7172838449478149, "learning_rate": 9.397038787286491e-05, "loss": 0.10646889209747315, "memory(GiB)": 122.96, "step": 10360, "token_acc": 0.9462827675118178, "train_speed(iter/s)": 0.242501 }, { "epoch": 0.7900754630688315, "grad_norm": 0.7561727166175842, "learning_rate": 9.396468642208764e-05, "loss": 0.17067475318908693, "memory(GiB)": 122.96, "step": 10365, "token_acc": 0.9375408052230686, "train_speed(iter/s)": 0.242529 }, { "epoch": 0.7904565896790914, "grad_norm": 0.6545218825340271, "learning_rate": 9.395898245013962e-05, "loss": 0.13535504341125487, "memory(GiB)": 122.96, "step": 10370, "token_acc": 0.9462098642833499, "train_speed(iter/s)": 0.242548 }, { "epoch": 0.7908377162893513, "grad_norm": 0.8547434210777283, "learning_rate": 9.395327595734796e-05, "loss": 0.18405344486236572, "memory(GiB)": 122.96, "step": 10375, "token_acc": 0.9301258075484529, "train_speed(iter/s)": 0.242564 }, { "epoch": 0.7912188428996112, "grad_norm": 1.0439496040344238, "learning_rate": 9.394756694403988e-05, "loss": 0.19842528104782103, "memory(GiB)": 122.96, "step": 10380, "token_acc": 0.9240918127598048, "train_speed(iter/s)": 0.242591 }, { "epoch": 0.7915999695098712, "grad_norm": 0.7558311820030212, "learning_rate": 9.394185541054279e-05, "loss": 0.15426919460296631, "memory(GiB)": 122.96, "step": 10385, "token_acc": 0.9409044193216856, "train_speed(iter/s)": 0.24261 }, { "epoch": 0.7919810961201311, "grad_norm": 0.5151486396789551, "learning_rate": 9.393614135718421e-05, "loss": 0.118315589427948, "memory(GiB)": 122.96, "step": 10390, "token_acc": 0.9550040905372239, "train_speed(iter/s)": 0.242623 }, { "epoch": 0.792362222730391, "grad_norm": 0.815010666847229, "learning_rate": 9.39304247842918e-05, "loss": 0.1966944694519043, "memory(GiB)": 122.96, "step": 10395, "token_acc": 0.9262371615312792, "train_speed(iter/s)": 0.242646 }, { "epoch": 0.7927433493406509, "grad_norm": 0.9891412258148193, "learning_rate": 9.392470569219338e-05, "loss": 0.12950125932693482, "memory(GiB)": 122.96, "step": 10400, "token_acc": 0.9354932866316404, "train_speed(iter/s)": 0.242675 }, { "epoch": 0.7927433493406509, "eval_loss": 0.12092699855566025, "eval_runtime": 184.3402, "eval_samples_per_second": 2.875, "eval_steps_per_second": 2.875, "eval_token_acc": 0.9420065056321908, "step": 10400 }, { "epoch": 0.7931244759509108, "grad_norm": 1.0076584815979004, "learning_rate": 9.391898408121695e-05, "loss": 0.16607811450958251, "memory(GiB)": 122.96, "step": 10405, "token_acc": 0.9418785602096136, "train_speed(iter/s)": 0.241653 }, { "epoch": 0.7935056025611709, "grad_norm": 1.0159857273101807, "learning_rate": 9.391325995169058e-05, "loss": 0.15128093957901, "memory(GiB)": 122.96, "step": 10410, "token_acc": 0.9373852062113875, "train_speed(iter/s)": 0.241667 }, { "epoch": 0.7938867291714308, "grad_norm": 0.8562823534011841, "learning_rate": 9.390753330394253e-05, "loss": 0.1382489800453186, "memory(GiB)": 122.96, "step": 10415, "token_acc": 0.9444291609353508, "train_speed(iter/s)": 0.241694 }, { "epoch": 0.7942678557816907, "grad_norm": 1.2479914426803589, "learning_rate": 9.39018041383012e-05, "loss": 0.1549461603164673, "memory(GiB)": 122.96, "step": 10420, "token_acc": 0.9338081273565144, "train_speed(iter/s)": 0.241715 }, { "epoch": 0.7946489823919506, "grad_norm": 0.7958732843399048, "learning_rate": 9.389607245509516e-05, "loss": 0.1448938250541687, "memory(GiB)": 122.96, "step": 10425, "token_acc": 0.9453944532260382, "train_speed(iter/s)": 0.241734 }, { "epoch": 0.7950301090022105, "grad_norm": 0.8930846452713013, "learning_rate": 9.389033825465303e-05, "loss": 0.18153517246246337, "memory(GiB)": 122.96, "step": 10430, "token_acc": 0.9298689353874454, "train_speed(iter/s)": 0.241748 }, { "epoch": 0.7954112356124705, "grad_norm": 1.8274303674697876, "learning_rate": 9.388460153730369e-05, "loss": 0.24058408737182618, "memory(GiB)": 122.96, "step": 10435, "token_acc": 0.9321077919364946, "train_speed(iter/s)": 0.241774 }, { "epoch": 0.7957923622227304, "grad_norm": 0.594234824180603, "learning_rate": 9.387886230337611e-05, "loss": 0.16767632961273193, "memory(GiB)": 122.96, "step": 10440, "token_acc": 0.9351514190581893, "train_speed(iter/s)": 0.241783 }, { "epoch": 0.7961734888329903, "grad_norm": 1.3462164402008057, "learning_rate": 9.38731205531994e-05, "loss": 0.14289262294769287, "memory(GiB)": 122.96, "step": 10445, "token_acc": 0.9447799827437446, "train_speed(iter/s)": 0.241809 }, { "epoch": 0.7965546154432502, "grad_norm": 1.0372138023376465, "learning_rate": 9.386737628710281e-05, "loss": 0.17768731117248535, "memory(GiB)": 122.96, "step": 10450, "token_acc": 0.9278120713305898, "train_speed(iter/s)": 0.241831 }, { "epoch": 0.7969357420535101, "grad_norm": 0.7743672728538513, "learning_rate": 9.386162950541577e-05, "loss": 0.1944607377052307, "memory(GiB)": 122.96, "step": 10455, "token_acc": 0.9256756756756757, "train_speed(iter/s)": 0.241855 }, { "epoch": 0.79731686866377, "grad_norm": 1.0271313190460205, "learning_rate": 9.38558802084678e-05, "loss": 0.11160448789596558, "memory(GiB)": 122.96, "step": 10460, "token_acc": 0.9432037981360999, "train_speed(iter/s)": 0.241879 }, { "epoch": 0.7976979952740301, "grad_norm": 0.9333913326263428, "learning_rate": 9.385012839658864e-05, "loss": 0.11425718069076538, "memory(GiB)": 122.96, "step": 10465, "token_acc": 0.9446195407474111, "train_speed(iter/s)": 0.241921 }, { "epoch": 0.79807912188429, "grad_norm": 1.1622281074523926, "learning_rate": 9.38443740701081e-05, "loss": 0.18089321851730347, "memory(GiB)": 122.96, "step": 10470, "token_acc": 0.9326700601573346, "train_speed(iter/s)": 0.241948 }, { "epoch": 0.7984602484945499, "grad_norm": 1.3617419004440308, "learning_rate": 9.383861722935616e-05, "loss": 0.17532868385314943, "memory(GiB)": 122.96, "step": 10475, "token_acc": 0.9421241050119332, "train_speed(iter/s)": 0.241974 }, { "epoch": 0.7988413751048098, "grad_norm": 0.8373824954032898, "learning_rate": 9.383285787466297e-05, "loss": 0.15992238521575927, "memory(GiB)": 122.96, "step": 10480, "token_acc": 0.9494184473897755, "train_speed(iter/s)": 0.242007 }, { "epoch": 0.7992225017150697, "grad_norm": 1.1844931840896606, "learning_rate": 9.382709600635878e-05, "loss": 0.22324609756469727, "memory(GiB)": 122.96, "step": 10485, "token_acc": 0.9242424242424242, "train_speed(iter/s)": 0.242041 }, { "epoch": 0.7996036283253297, "grad_norm": 0.7478426098823547, "learning_rate": 9.382133162477402e-05, "loss": 0.16789034605026246, "memory(GiB)": 122.96, "step": 10490, "token_acc": 0.9397575145238697, "train_speed(iter/s)": 0.242045 }, { "epoch": 0.7999847549355896, "grad_norm": 0.80352783203125, "learning_rate": 9.381556473023925e-05, "loss": 0.11218827962875366, "memory(GiB)": 122.96, "step": 10495, "token_acc": 0.9556271690629649, "train_speed(iter/s)": 0.242071 }, { "epoch": 0.8003658815458495, "grad_norm": 1.0331326723098755, "learning_rate": 9.380979532308518e-05, "loss": 0.16499152183532714, "memory(GiB)": 122.96, "step": 10500, "token_acc": 0.9276613234689683, "train_speed(iter/s)": 0.242113 }, { "epoch": 0.8007470081561094, "grad_norm": 0.7992278933525085, "learning_rate": 9.380402340364264e-05, "loss": 0.142061448097229, "memory(GiB)": 122.96, "step": 10505, "token_acc": 0.9434942991281019, "train_speed(iter/s)": 0.242132 }, { "epoch": 0.8011281347663693, "grad_norm": 0.6687667965888977, "learning_rate": 9.379824897224263e-05, "loss": 0.1863141655921936, "memory(GiB)": 122.96, "step": 10510, "token_acc": 0.9429121231558691, "train_speed(iter/s)": 0.242162 }, { "epoch": 0.8015092613766294, "grad_norm": 0.7543011903762817, "learning_rate": 9.37924720292163e-05, "loss": 0.09256382584571839, "memory(GiB)": 122.96, "step": 10515, "token_acc": 0.9460737937559129, "train_speed(iter/s)": 0.242191 }, { "epoch": 0.8018903879868893, "grad_norm": 1.3944259881973267, "learning_rate": 9.378669257489492e-05, "loss": 0.17069848775863647, "memory(GiB)": 122.96, "step": 10520, "token_acc": 0.932698693823334, "train_speed(iter/s)": 0.242215 }, { "epoch": 0.8022715145971492, "grad_norm": 1.0913804769515991, "learning_rate": 9.378091060960992e-05, "loss": 0.17842953205108641, "memory(GiB)": 122.96, "step": 10525, "token_acc": 0.9389687235841082, "train_speed(iter/s)": 0.242239 }, { "epoch": 0.8026526412074091, "grad_norm": 0.9133718013763428, "learning_rate": 9.377512613369285e-05, "loss": 0.21049418449401855, "memory(GiB)": 122.96, "step": 10530, "token_acc": 0.9280407556782, "train_speed(iter/s)": 0.242265 }, { "epoch": 0.803033767817669, "grad_norm": 0.5866994261741638, "learning_rate": 9.376933914747545e-05, "loss": 0.10623915195465088, "memory(GiB)": 122.96, "step": 10535, "token_acc": 0.9437446074201898, "train_speed(iter/s)": 0.24229 }, { "epoch": 0.8034148944279289, "grad_norm": 0.6079117059707642, "learning_rate": 9.376354965128955e-05, "loss": 0.1679425835609436, "memory(GiB)": 122.96, "step": 10540, "token_acc": 0.934043229497775, "train_speed(iter/s)": 0.242306 }, { "epoch": 0.8037960210381889, "grad_norm": 0.7343176603317261, "learning_rate": 9.375775764546717e-05, "loss": 0.18345099687576294, "memory(GiB)": 122.96, "step": 10545, "token_acc": 0.9359964687706908, "train_speed(iter/s)": 0.24233 }, { "epoch": 0.8041771476484488, "grad_norm": 1.0620198249816895, "learning_rate": 9.375196313034046e-05, "loss": 0.10703575611114502, "memory(GiB)": 122.96, "step": 10550, "token_acc": 0.9565476190476191, "train_speed(iter/s)": 0.242358 }, { "epoch": 0.8045582742587087, "grad_norm": 0.5823842883110046, "learning_rate": 9.37461661062417e-05, "loss": 0.223058819770813, "memory(GiB)": 122.96, "step": 10555, "token_acc": 0.9052220776807285, "train_speed(iter/s)": 0.242382 }, { "epoch": 0.8049394008689686, "grad_norm": 0.7113386988639832, "learning_rate": 9.37403665735033e-05, "loss": 0.1315876841545105, "memory(GiB)": 122.96, "step": 10560, "token_acc": 0.9481831864115244, "train_speed(iter/s)": 0.242409 }, { "epoch": 0.8053205274792286, "grad_norm": 0.7072805166244507, "learning_rate": 9.373456453245788e-05, "loss": 0.13366265296936036, "memory(GiB)": 122.96, "step": 10565, "token_acc": 0.9394628392781015, "train_speed(iter/s)": 0.242421 }, { "epoch": 0.8057016540894886, "grad_norm": 0.7184557914733887, "learning_rate": 9.372875998343813e-05, "loss": 0.14011650085449218, "memory(GiB)": 122.96, "step": 10570, "token_acc": 0.9456345998383185, "train_speed(iter/s)": 0.242444 }, { "epoch": 0.8060827806997485, "grad_norm": 0.625906229019165, "learning_rate": 9.372295292677691e-05, "loss": 0.15648391246795654, "memory(GiB)": 122.96, "step": 10575, "token_acc": 0.9432184798038457, "train_speed(iter/s)": 0.242455 }, { "epoch": 0.8064639073100084, "grad_norm": 0.9008362889289856, "learning_rate": 9.371714336280725e-05, "loss": 0.1931705355644226, "memory(GiB)": 122.96, "step": 10580, "token_acc": 0.9335837121960862, "train_speed(iter/s)": 0.242484 }, { "epoch": 0.8068450339202683, "grad_norm": 0.6111207604408264, "learning_rate": 9.37113312918623e-05, "loss": 0.12494570016860962, "memory(GiB)": 122.96, "step": 10585, "token_acc": 0.9561527581329562, "train_speed(iter/s)": 0.242513 }, { "epoch": 0.8072261605305282, "grad_norm": 1.5386730432510376, "learning_rate": 9.370551671427531e-05, "loss": 0.1675773024559021, "memory(GiB)": 122.96, "step": 10590, "token_acc": 0.9278099299421261, "train_speed(iter/s)": 0.242548 }, { "epoch": 0.8076072871407882, "grad_norm": 1.3422820568084717, "learning_rate": 9.369969963037979e-05, "loss": 0.19613367319107056, "memory(GiB)": 122.96, "step": 10595, "token_acc": 0.9243567753001716, "train_speed(iter/s)": 0.24257 }, { "epoch": 0.8079884137510481, "grad_norm": 1.5174806118011475, "learning_rate": 9.369388004050927e-05, "loss": 0.19347106218338012, "memory(GiB)": 122.96, "step": 10600, "token_acc": 0.9232650565576276, "train_speed(iter/s)": 0.242602 }, { "epoch": 0.8079884137510481, "eval_loss": 0.12085248529911041, "eval_runtime": 171.0115, "eval_samples_per_second": 3.099, "eval_steps_per_second": 3.099, "eval_token_acc": 0.9426615866514065, "step": 10600 }, { "epoch": 0.808369540361308, "grad_norm": 0.043467190116643906, "learning_rate": 9.368805794499752e-05, "loss": 0.14942909479141236, "memory(GiB)": 122.96, "step": 10605, "token_acc": 0.9424856378546682, "train_speed(iter/s)": 0.241687 }, { "epoch": 0.8087506669715679, "grad_norm": 1.0262973308563232, "learning_rate": 9.368223334417835e-05, "loss": 0.1801743984222412, "memory(GiB)": 122.96, "step": 10610, "token_acc": 0.9298065047344586, "train_speed(iter/s)": 0.241716 }, { "epoch": 0.8091317935818279, "grad_norm": 0.6798327565193176, "learning_rate": 9.367640623838583e-05, "loss": 0.16760859489440919, "memory(GiB)": 122.96, "step": 10615, "token_acc": 0.9317312943746586, "train_speed(iter/s)": 0.241739 }, { "epoch": 0.8095129201920878, "grad_norm": 0.9560171365737915, "learning_rate": 9.367057662795409e-05, "loss": 0.12464847564697265, "memory(GiB)": 122.96, "step": 10620, "token_acc": 0.9539902961351848, "train_speed(iter/s)": 0.241759 }, { "epoch": 0.8098940468023478, "grad_norm": 0.7493095397949219, "learning_rate": 9.366474451321742e-05, "loss": 0.1704413652420044, "memory(GiB)": 122.96, "step": 10625, "token_acc": 0.9299867899603699, "train_speed(iter/s)": 0.241792 }, { "epoch": 0.8102751734126077, "grad_norm": 0.2562902867794037, "learning_rate": 9.36589098945103e-05, "loss": 0.1004481315612793, "memory(GiB)": 122.96, "step": 10630, "token_acc": 0.9483333333333334, "train_speed(iter/s)": 0.241832 }, { "epoch": 0.8106563000228676, "grad_norm": 0.7692529559135437, "learning_rate": 9.36530727721673e-05, "loss": 0.1732357144355774, "memory(GiB)": 122.96, "step": 10635, "token_acc": 0.9440728618186993, "train_speed(iter/s)": 0.24185 }, { "epoch": 0.8110374266331275, "grad_norm": 0.5044536590576172, "learning_rate": 9.364723314652314e-05, "loss": 0.12741477489471437, "memory(GiB)": 122.96, "step": 10640, "token_acc": 0.956182852327786, "train_speed(iter/s)": 0.241878 }, { "epoch": 0.8114185532433874, "grad_norm": 0.5408362746238708, "learning_rate": 9.364139101791272e-05, "loss": 0.08211479187011719, "memory(GiB)": 122.96, "step": 10645, "token_acc": 0.9654657147600864, "train_speed(iter/s)": 0.241902 }, { "epoch": 0.8117996798536474, "grad_norm": 0.8718941807746887, "learning_rate": 9.363554638667105e-05, "loss": 0.15744494199752807, "memory(GiB)": 122.96, "step": 10650, "token_acc": 0.9397119843788138, "train_speed(iter/s)": 0.241928 }, { "epoch": 0.8121808064639073, "grad_norm": 1.4677070379257202, "learning_rate": 9.362969925313327e-05, "loss": 0.1561747670173645, "memory(GiB)": 122.96, "step": 10655, "token_acc": 0.94109396914446, "train_speed(iter/s)": 0.241951 }, { "epoch": 0.8125619330741672, "grad_norm": 0.6571265459060669, "learning_rate": 9.36238496176347e-05, "loss": 0.1979602098464966, "memory(GiB)": 122.96, "step": 10660, "token_acc": 0.9414160401002506, "train_speed(iter/s)": 0.241961 }, { "epoch": 0.8129430596844271, "grad_norm": 0.5700443983078003, "learning_rate": 9.361799748051081e-05, "loss": 0.20541536808013916, "memory(GiB)": 122.96, "step": 10665, "token_acc": 0.9379113646336112, "train_speed(iter/s)": 0.241991 }, { "epoch": 0.8133241862946871, "grad_norm": 1.1202160120010376, "learning_rate": 9.361214284209718e-05, "loss": 0.2366016149520874, "memory(GiB)": 122.96, "step": 10670, "token_acc": 0.9178272980501393, "train_speed(iter/s)": 0.242023 }, { "epoch": 0.8137053129049471, "grad_norm": 1.1226907968521118, "learning_rate": 9.360628570272954e-05, "loss": 0.1412135124206543, "memory(GiB)": 122.96, "step": 10675, "token_acc": 0.9446710861805201, "train_speed(iter/s)": 0.24203 }, { "epoch": 0.814086439515207, "grad_norm": 0.5565034747123718, "learning_rate": 9.360042606274377e-05, "loss": 0.1893878698348999, "memory(GiB)": 122.96, "step": 10680, "token_acc": 0.9265283230510376, "train_speed(iter/s)": 0.242049 }, { "epoch": 0.8144675661254669, "grad_norm": 0.659194827079773, "learning_rate": 9.35945639224759e-05, "loss": 0.15379323959350585, "memory(GiB)": 122.96, "step": 10685, "token_acc": 0.9375972831470214, "train_speed(iter/s)": 0.242075 }, { "epoch": 0.8148486927357268, "grad_norm": 0.844777524471283, "learning_rate": 9.358869928226209e-05, "loss": 0.17218339443206787, "memory(GiB)": 122.96, "step": 10690, "token_acc": 0.9250866435617169, "train_speed(iter/s)": 0.242104 }, { "epoch": 0.8152298193459867, "grad_norm": 1.6068127155303955, "learning_rate": 9.358283214243864e-05, "loss": 0.21194872856140137, "memory(GiB)": 122.96, "step": 10695, "token_acc": 0.9179634385858891, "train_speed(iter/s)": 0.242127 }, { "epoch": 0.8156109459562466, "grad_norm": 0.4072405993938446, "learning_rate": 9.357696250334203e-05, "loss": 0.12281076908111573, "memory(GiB)": 122.96, "step": 10700, "token_acc": 0.956857855361596, "train_speed(iter/s)": 0.242153 }, { "epoch": 0.8159920725665066, "grad_norm": 0.914492666721344, "learning_rate": 9.357109036530884e-05, "loss": 0.1384498119354248, "memory(GiB)": 122.96, "step": 10705, "token_acc": 0.941256459069894, "train_speed(iter/s)": 0.242185 }, { "epoch": 0.8163731991767665, "grad_norm": 1.2518452405929565, "learning_rate": 9.35652157286758e-05, "loss": 0.17320170402526855, "memory(GiB)": 122.96, "step": 10710, "token_acc": 0.940314822912112, "train_speed(iter/s)": 0.242211 }, { "epoch": 0.8167543257870264, "grad_norm": 1.0537068843841553, "learning_rate": 9.355933859377982e-05, "loss": 0.14413715600967408, "memory(GiB)": 122.96, "step": 10715, "token_acc": 0.9459947801646256, "train_speed(iter/s)": 0.242237 }, { "epoch": 0.8171354523972864, "grad_norm": 0.7839221954345703, "learning_rate": 9.35534589609579e-05, "loss": 0.21037988662719725, "memory(GiB)": 122.96, "step": 10720, "token_acc": 0.9163165266106442, "train_speed(iter/s)": 0.242273 }, { "epoch": 0.8175165790075463, "grad_norm": 0.854105532169342, "learning_rate": 9.354757683054724e-05, "loss": 0.1699491858482361, "memory(GiB)": 122.96, "step": 10725, "token_acc": 0.9361063464837049, "train_speed(iter/s)": 0.242311 }, { "epoch": 0.8178977056178063, "grad_norm": 2.2000865936279297, "learning_rate": 9.354169220288512e-05, "loss": 0.2090435266494751, "memory(GiB)": 122.96, "step": 10730, "token_acc": 0.9203793559770622, "train_speed(iter/s)": 0.242343 }, { "epoch": 0.8182788322280662, "grad_norm": 0.8004125952720642, "learning_rate": 9.353580507830901e-05, "loss": 0.12065560817718506, "memory(GiB)": 122.96, "step": 10735, "token_acc": 0.9558610709117221, "train_speed(iter/s)": 0.242376 }, { "epoch": 0.8186599588383261, "grad_norm": 0.5542896389961243, "learning_rate": 9.352991545715651e-05, "loss": 0.11057955026626587, "memory(GiB)": 122.96, "step": 10740, "token_acc": 0.9427128678391085, "train_speed(iter/s)": 0.2424 }, { "epoch": 0.819041085448586, "grad_norm": 1.5005683898925781, "learning_rate": 9.352402333976538e-05, "loss": 0.1631695508956909, "memory(GiB)": 122.96, "step": 10745, "token_acc": 0.928045789043336, "train_speed(iter/s)": 0.242431 }, { "epoch": 0.8194222120588459, "grad_norm": 0.5973544120788574, "learning_rate": 9.351812872647346e-05, "loss": 0.2064713716506958, "memory(GiB)": 122.96, "step": 10750, "token_acc": 0.9433690032463629, "train_speed(iter/s)": 0.242443 }, { "epoch": 0.8198033386691059, "grad_norm": 0.619206964969635, "learning_rate": 9.351223161761882e-05, "loss": 0.10870914459228516, "memory(GiB)": 122.96, "step": 10755, "token_acc": 0.961281239000352, "train_speed(iter/s)": 0.24247 }, { "epoch": 0.8201844652793658, "grad_norm": 0.8150497078895569, "learning_rate": 9.350633201353962e-05, "loss": 0.16726174354553222, "memory(GiB)": 122.96, "step": 10760, "token_acc": 0.94290089695909, "train_speed(iter/s)": 0.2425 }, { "epoch": 0.8205655918896257, "grad_norm": 0.8767359256744385, "learning_rate": 9.350042991457418e-05, "loss": 0.1738325834274292, "memory(GiB)": 122.96, "step": 10765, "token_acc": 0.941146965390403, "train_speed(iter/s)": 0.242518 }, { "epoch": 0.8209467184998857, "grad_norm": 1.003379464149475, "learning_rate": 9.349452532106094e-05, "loss": 0.14928394556045532, "memory(GiB)": 122.96, "step": 10770, "token_acc": 0.9452443723382681, "train_speed(iter/s)": 0.242539 }, { "epoch": 0.8213278451101456, "grad_norm": 1.2883267402648926, "learning_rate": 9.34886182333385e-05, "loss": 0.15184190273284912, "memory(GiB)": 122.96, "step": 10775, "token_acc": 0.9346062052505967, "train_speed(iter/s)": 0.242579 }, { "epoch": 0.8217089717204055, "grad_norm": 0.7896623015403748, "learning_rate": 9.348270865174563e-05, "loss": 0.1487217664718628, "memory(GiB)": 122.96, "step": 10780, "token_acc": 0.9212839910038847, "train_speed(iter/s)": 0.242608 }, { "epoch": 0.8220900983306655, "grad_norm": 0.9509725570678711, "learning_rate": 9.34767965766212e-05, "loss": 0.12904688119888305, "memory(GiB)": 122.96, "step": 10785, "token_acc": 0.9443590531816785, "train_speed(iter/s)": 0.24264 }, { "epoch": 0.8224712249409254, "grad_norm": 0.7725791335105896, "learning_rate": 9.347088200830425e-05, "loss": 0.1547078251838684, "memory(GiB)": 122.96, "step": 10790, "token_acc": 0.9389872717210846, "train_speed(iter/s)": 0.242651 }, { "epoch": 0.8228523515511853, "grad_norm": 1.2725965976715088, "learning_rate": 9.346496494713395e-05, "loss": 0.2095318078994751, "memory(GiB)": 122.96, "step": 10795, "token_acc": 0.9283374283374284, "train_speed(iter/s)": 0.24269 }, { "epoch": 0.8232334781614452, "grad_norm": 0.2965717911720276, "learning_rate": 9.345904539344959e-05, "loss": 0.2154148578643799, "memory(GiB)": 122.96, "step": 10800, "token_acc": 0.8855297157622739, "train_speed(iter/s)": 0.242729 }, { "epoch": 0.8232334781614452, "eval_loss": 0.1232714131474495, "eval_runtime": 175.9421, "eval_samples_per_second": 3.012, "eval_steps_per_second": 3.012, "eval_token_acc": 0.9415321366182761, "step": 10800 }, { "epoch": 0.8236146047717051, "grad_norm": 1.59852933883667, "learning_rate": 9.345312334759066e-05, "loss": 0.1863587021827698, "memory(GiB)": 122.96, "step": 10805, "token_acc": 0.9411258128316098, "train_speed(iter/s)": 0.2418 }, { "epoch": 0.8239957313819651, "grad_norm": 1.1068203449249268, "learning_rate": 9.344719880989677e-05, "loss": 0.18324304819107057, "memory(GiB)": 122.96, "step": 10810, "token_acc": 0.927762982689747, "train_speed(iter/s)": 0.241818 }, { "epoch": 0.824376857992225, "grad_norm": 1.3394430875778198, "learning_rate": 9.344127178070763e-05, "loss": 0.1280696988105774, "memory(GiB)": 122.96, "step": 10815, "token_acc": 0.953957267529341, "train_speed(iter/s)": 0.241851 }, { "epoch": 0.824757984602485, "grad_norm": 0.6359961628913879, "learning_rate": 9.343534226036316e-05, "loss": 0.15793397426605224, "memory(GiB)": 122.96, "step": 10820, "token_acc": 0.9437191760951552, "train_speed(iter/s)": 0.241889 }, { "epoch": 0.8251391112127449, "grad_norm": 0.7840981483459473, "learning_rate": 9.342941024920336e-05, "loss": 0.18080484867095947, "memory(GiB)": 122.96, "step": 10825, "token_acc": 0.9264344262295082, "train_speed(iter/s)": 0.241916 }, { "epoch": 0.8255202378230048, "grad_norm": 0.9391659498214722, "learning_rate": 9.342347574756843e-05, "loss": 0.18395013809204103, "memory(GiB)": 122.96, "step": 10830, "token_acc": 0.932446264073695, "train_speed(iter/s)": 0.241943 }, { "epoch": 0.8259013644332648, "grad_norm": 1.2795003652572632, "learning_rate": 9.341753875579868e-05, "loss": 0.140896737575531, "memory(GiB)": 122.96, "step": 10835, "token_acc": 0.9492269118261596, "train_speed(iter/s)": 0.241972 }, { "epoch": 0.8262824910435247, "grad_norm": 0.8375188708305359, "learning_rate": 9.341159927423456e-05, "loss": 0.1302001953125, "memory(GiB)": 122.96, "step": 10840, "token_acc": 0.9501428200467411, "train_speed(iter/s)": 0.242001 }, { "epoch": 0.8266636176537846, "grad_norm": 0.8336392045021057, "learning_rate": 9.340565730321666e-05, "loss": 0.10393034219741822, "memory(GiB)": 122.96, "step": 10845, "token_acc": 0.9438269750675878, "train_speed(iter/s)": 0.242032 }, { "epoch": 0.8270447442640445, "grad_norm": 0.4330224096775055, "learning_rate": 9.339971284308577e-05, "loss": 0.15688031911849976, "memory(GiB)": 122.96, "step": 10850, "token_acc": 0.9422382671480144, "train_speed(iter/s)": 0.24205 }, { "epoch": 0.8274258708743044, "grad_norm": 1.1242941617965698, "learning_rate": 9.339376589418272e-05, "loss": 0.13391684293746947, "memory(GiB)": 122.96, "step": 10855, "token_acc": 0.9441272430668842, "train_speed(iter/s)": 0.242073 }, { "epoch": 0.8278069974845643, "grad_norm": 1.247464895248413, "learning_rate": 9.338781645684857e-05, "loss": 0.1904120683670044, "memory(GiB)": 122.96, "step": 10860, "token_acc": 0.9265569917743831, "train_speed(iter/s)": 0.242092 }, { "epoch": 0.8281881240948243, "grad_norm": 0.7699156403541565, "learning_rate": 9.338186453142451e-05, "loss": 0.13738722801208497, "memory(GiB)": 122.96, "step": 10865, "token_acc": 0.9484736535096248, "train_speed(iter/s)": 0.242121 }, { "epoch": 0.8285692507050842, "grad_norm": 0.6154953837394714, "learning_rate": 9.337591011825182e-05, "loss": 0.15506752729415893, "memory(GiB)": 122.96, "step": 10870, "token_acc": 0.9436805922792173, "train_speed(iter/s)": 0.24215 }, { "epoch": 0.8289503773153442, "grad_norm": 1.6067928075790405, "learning_rate": 9.336995321767198e-05, "loss": 0.2513906002044678, "memory(GiB)": 122.96, "step": 10875, "token_acc": 0.9157112526539278, "train_speed(iter/s)": 0.242182 }, { "epoch": 0.8293315039256041, "grad_norm": 0.7470421195030212, "learning_rate": 9.336399383002658e-05, "loss": 0.17181236743927003, "memory(GiB)": 122.96, "step": 10880, "token_acc": 0.9340547162106982, "train_speed(iter/s)": 0.242206 }, { "epoch": 0.829712630535864, "grad_norm": 0.4758884310722351, "learning_rate": 9.335803195565738e-05, "loss": 0.12316359281539917, "memory(GiB)": 122.96, "step": 10885, "token_acc": 0.9395770392749244, "train_speed(iter/s)": 0.242235 }, { "epoch": 0.830093757146124, "grad_norm": 0.813215434551239, "learning_rate": 9.335206759490624e-05, "loss": 0.1133565902709961, "memory(GiB)": 122.96, "step": 10890, "token_acc": 0.9647279549718574, "train_speed(iter/s)": 0.242247 }, { "epoch": 0.8304748837563839, "grad_norm": 1.5027799606323242, "learning_rate": 9.334610074811521e-05, "loss": 0.18620024919509887, "memory(GiB)": 122.96, "step": 10895, "token_acc": 0.9290338645418327, "train_speed(iter/s)": 0.242274 }, { "epoch": 0.8308560103666438, "grad_norm": 0.6732718348503113, "learning_rate": 9.334013141562644e-05, "loss": 0.10942530632019043, "memory(GiB)": 122.96, "step": 10900, "token_acc": 0.9429797670141018, "train_speed(iter/s)": 0.242318 }, { "epoch": 0.8312371369769037, "grad_norm": 0.94988614320755, "learning_rate": 9.333415959778227e-05, "loss": 0.18650107383728026, "memory(GiB)": 122.96, "step": 10905, "token_acc": 0.9215631149845376, "train_speed(iter/s)": 0.242352 }, { "epoch": 0.8316182635871636, "grad_norm": 1.9878137111663818, "learning_rate": 9.332818529492513e-05, "loss": 0.1263636827468872, "memory(GiB)": 122.96, "step": 10910, "token_acc": 0.9475862068965517, "train_speed(iter/s)": 0.242391 }, { "epoch": 0.8319993901974236, "grad_norm": 0.6483054757118225, "learning_rate": 9.332220850739764e-05, "loss": 0.10581499338150024, "memory(GiB)": 122.96, "step": 10915, "token_acc": 0.9491422498861394, "train_speed(iter/s)": 0.242402 }, { "epoch": 0.8323805168076835, "grad_norm": 0.5818637609481812, "learning_rate": 9.331622923554253e-05, "loss": 0.16082143783569336, "memory(GiB)": 122.96, "step": 10920, "token_acc": 0.9372021306419961, "train_speed(iter/s)": 0.242437 }, { "epoch": 0.8327616434179435, "grad_norm": 1.0212641954421997, "learning_rate": 9.331024747970268e-05, "loss": 0.17940677404403688, "memory(GiB)": 122.96, "step": 10925, "token_acc": 0.9414634146341463, "train_speed(iter/s)": 0.24246 }, { "epoch": 0.8331427700282034, "grad_norm": 0.8136066198348999, "learning_rate": 9.330426324022112e-05, "loss": 0.1410413861274719, "memory(GiB)": 122.96, "step": 10930, "token_acc": 0.9478662560492741, "train_speed(iter/s)": 0.242464 }, { "epoch": 0.8335238966384633, "grad_norm": 0.8897073268890381, "learning_rate": 9.329827651744102e-05, "loss": 0.16079072952270507, "memory(GiB)": 122.96, "step": 10935, "token_acc": 0.931185944363104, "train_speed(iter/s)": 0.242494 }, { "epoch": 0.8339050232487232, "grad_norm": 0.7524811029434204, "learning_rate": 9.329228731170569e-05, "loss": 0.1738677978515625, "memory(GiB)": 122.96, "step": 10940, "token_acc": 0.9365698086463501, "train_speed(iter/s)": 0.24252 }, { "epoch": 0.8342861498589832, "grad_norm": 1.0789000988006592, "learning_rate": 9.32862956233586e-05, "loss": 0.1272106647491455, "memory(GiB)": 122.96, "step": 10945, "token_acc": 0.9457055932600047, "train_speed(iter/s)": 0.24255 }, { "epoch": 0.8346672764692431, "grad_norm": 0.8413866758346558, "learning_rate": 9.328030145274332e-05, "loss": 0.15169304609298706, "memory(GiB)": 122.96, "step": 10950, "token_acc": 0.9420600858369099, "train_speed(iter/s)": 0.24256 }, { "epoch": 0.835048403079503, "grad_norm": 0.6152269840240479, "learning_rate": 9.327430480020358e-05, "loss": 0.1927301526069641, "memory(GiB)": 122.96, "step": 10955, "token_acc": 0.9270280515542078, "train_speed(iter/s)": 0.24258 }, { "epoch": 0.8354295296897629, "grad_norm": 0.6123437881469727, "learning_rate": 9.32683056660833e-05, "loss": 0.11245507001876831, "memory(GiB)": 122.96, "step": 10960, "token_acc": 0.9571799796633149, "train_speed(iter/s)": 0.242581 }, { "epoch": 0.8358106563000228, "grad_norm": 0.6743918657302856, "learning_rate": 9.326230405072647e-05, "loss": 0.13175561428070068, "memory(GiB)": 122.96, "step": 10965, "token_acc": 0.947170823277018, "train_speed(iter/s)": 0.242597 }, { "epoch": 0.8361917829102828, "grad_norm": 1.2469751834869385, "learning_rate": 9.325629995447726e-05, "loss": 0.15788116455078124, "memory(GiB)": 122.96, "step": 10970, "token_acc": 0.9425328769591065, "train_speed(iter/s)": 0.242617 }, { "epoch": 0.8365729095205428, "grad_norm": 0.7009623646736145, "learning_rate": 9.325029337767998e-05, "loss": 0.1453192114830017, "memory(GiB)": 122.96, "step": 10975, "token_acc": 0.9438905930470347, "train_speed(iter/s)": 0.242628 }, { "epoch": 0.8369540361308027, "grad_norm": 0.679578423500061, "learning_rate": 9.324428432067909e-05, "loss": 0.14223899841308593, "memory(GiB)": 122.96, "step": 10980, "token_acc": 0.9413012729844413, "train_speed(iter/s)": 0.242648 }, { "epoch": 0.8373351627410626, "grad_norm": 0.9320142865180969, "learning_rate": 9.323827278381916e-05, "loss": 0.10234172344207763, "memory(GiB)": 122.96, "step": 10985, "token_acc": 0.9468223086900129, "train_speed(iter/s)": 0.242676 }, { "epoch": 0.8377162893513225, "grad_norm": 0.8450809121131897, "learning_rate": 9.323225876744495e-05, "loss": 0.1213609218597412, "memory(GiB)": 122.96, "step": 10990, "token_acc": 0.9358541525995948, "train_speed(iter/s)": 0.242701 }, { "epoch": 0.8380974159615825, "grad_norm": 0.285533607006073, "learning_rate": 9.322624227190132e-05, "loss": 0.1317624568939209, "memory(GiB)": 122.96, "step": 10995, "token_acc": 0.9393063583815029, "train_speed(iter/s)": 0.242736 }, { "epoch": 0.8384785425718424, "grad_norm": 1.151003360748291, "learning_rate": 9.322022329753329e-05, "loss": 0.11941642761230468, "memory(GiB)": 122.96, "step": 11000, "token_acc": 0.9437032693153496, "train_speed(iter/s)": 0.242762 }, { "epoch": 0.8384785425718424, "eval_loss": 0.1190381720662117, "eval_runtime": 177.629, "eval_samples_per_second": 2.984, "eval_steps_per_second": 2.984, "eval_token_acc": 0.9435877356785736, "step": 11000 }, { "epoch": 0.8388596691821023, "grad_norm": 0.6702337861061096, "learning_rate": 9.321420184468602e-05, "loss": 0.14373393058776857, "memory(GiB)": 122.96, "step": 11005, "token_acc": 0.9434516769527413, "train_speed(iter/s)": 0.241831 }, { "epoch": 0.8392407957923622, "grad_norm": 0.8021074533462524, "learning_rate": 9.32081779137048e-05, "loss": 0.14332401752471924, "memory(GiB)": 122.96, "step": 11010, "token_acc": 0.9203367875647669, "train_speed(iter/s)": 0.241866 }, { "epoch": 0.8396219224026221, "grad_norm": 0.7249941825866699, "learning_rate": 9.32021515049351e-05, "loss": 0.1467615008354187, "memory(GiB)": 122.96, "step": 11015, "token_acc": 0.944191814799504, "train_speed(iter/s)": 0.241877 }, { "epoch": 0.840003049012882, "grad_norm": 2.5019538402557373, "learning_rate": 9.319612261872249e-05, "loss": 0.18843120336532593, "memory(GiB)": 122.96, "step": 11020, "token_acc": 0.9276773296244785, "train_speed(iter/s)": 0.241908 }, { "epoch": 0.840384175623142, "grad_norm": 1.3736668825149536, "learning_rate": 9.31900912554127e-05, "loss": 0.14675939083099365, "memory(GiB)": 122.96, "step": 11025, "token_acc": 0.9391450831754053, "train_speed(iter/s)": 0.241931 }, { "epoch": 0.840765302233402, "grad_norm": 0.7341601252555847, "learning_rate": 9.318405741535161e-05, "loss": 0.19592134952545165, "memory(GiB)": 122.96, "step": 11030, "token_acc": 0.928843710292249, "train_speed(iter/s)": 0.241953 }, { "epoch": 0.8411464288436619, "grad_norm": 1.715691328048706, "learning_rate": 9.317802109888522e-05, "loss": 0.19210788011550903, "memory(GiB)": 122.96, "step": 11035, "token_acc": 0.940833915676683, "train_speed(iter/s)": 0.241978 }, { "epoch": 0.8415275554539218, "grad_norm": 0.8008217215538025, "learning_rate": 9.317198230635969e-05, "loss": 0.1412208318710327, "memory(GiB)": 122.96, "step": 11040, "token_acc": 0.9424826522744796, "train_speed(iter/s)": 0.241992 }, { "epoch": 0.8419086820641817, "grad_norm": 0.6606429815292358, "learning_rate": 9.316594103812131e-05, "loss": 0.16083933115005494, "memory(GiB)": 122.96, "step": 11045, "token_acc": 0.9404580152671755, "train_speed(iter/s)": 0.242015 }, { "epoch": 0.8422898086744417, "grad_norm": 1.5820889472961426, "learning_rate": 9.315989729451653e-05, "loss": 0.1473959803581238, "memory(GiB)": 122.96, "step": 11050, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.242041 }, { "epoch": 0.8426709352847016, "grad_norm": 1.3957444429397583, "learning_rate": 9.315385107589194e-05, "loss": 0.1614848017692566, "memory(GiB)": 122.96, "step": 11055, "token_acc": 0.9441920830629461, "train_speed(iter/s)": 0.24206 }, { "epoch": 0.8430520618949615, "grad_norm": 1.3653455972671509, "learning_rate": 9.314780238259424e-05, "loss": 0.12625062465667725, "memory(GiB)": 122.96, "step": 11060, "token_acc": 0.9507148231753197, "train_speed(iter/s)": 0.242097 }, { "epoch": 0.8434331885052214, "grad_norm": 1.0424755811691284, "learning_rate": 9.314175121497031e-05, "loss": 0.12409739494323731, "memory(GiB)": 122.96, "step": 11065, "token_acc": 0.9437889216650278, "train_speed(iter/s)": 0.242118 }, { "epoch": 0.8438143151154813, "grad_norm": 1.4346057176589966, "learning_rate": 9.313569757336713e-05, "loss": 0.1016870379447937, "memory(GiB)": 122.96, "step": 11070, "token_acc": 0.9661753590325018, "train_speed(iter/s)": 0.242146 }, { "epoch": 0.8441954417257412, "grad_norm": 0.9965929985046387, "learning_rate": 9.312964145813189e-05, "loss": 0.11996313333511352, "memory(GiB)": 122.96, "step": 11075, "token_acc": 0.9489986027014439, "train_speed(iter/s)": 0.242174 }, { "epoch": 0.8445765683360013, "grad_norm": 1.5527921915054321, "learning_rate": 9.312358286961185e-05, "loss": 0.15396604537963868, "memory(GiB)": 122.96, "step": 11080, "token_acc": 0.9349593495934959, "train_speed(iter/s)": 0.242217 }, { "epoch": 0.8449576949462612, "grad_norm": 1.03357994556427, "learning_rate": 9.311752180815445e-05, "loss": 0.17043306827545165, "memory(GiB)": 122.96, "step": 11085, "token_acc": 0.9431724137931035, "train_speed(iter/s)": 0.242243 }, { "epoch": 0.8453388215565211, "grad_norm": 2.1101996898651123, "learning_rate": 9.311145827410727e-05, "loss": 0.16430689096450807, "memory(GiB)": 122.96, "step": 11090, "token_acc": 0.9359543436905516, "train_speed(iter/s)": 0.242273 }, { "epoch": 0.845719948166781, "grad_norm": 1.087923288345337, "learning_rate": 9.3105392267818e-05, "loss": 0.16110520362854003, "memory(GiB)": 122.96, "step": 11095, "token_acc": 0.9399122807017544, "train_speed(iter/s)": 0.242297 }, { "epoch": 0.8461010747770409, "grad_norm": 1.9518238306045532, "learning_rate": 9.309932378963455e-05, "loss": 0.1971418857574463, "memory(GiB)": 122.96, "step": 11100, "token_acc": 0.9263971654050571, "train_speed(iter/s)": 0.242323 }, { "epoch": 0.8464822013873009, "grad_norm": 1.4333595037460327, "learning_rate": 9.309325283990487e-05, "loss": 0.16797823905944825, "memory(GiB)": 122.96, "step": 11105, "token_acc": 0.9340553549939832, "train_speed(iter/s)": 0.242346 }, { "epoch": 0.8468633279975608, "grad_norm": 0.9693551063537598, "learning_rate": 9.30871794189771e-05, "loss": 0.20840809345245362, "memory(GiB)": 122.96, "step": 11110, "token_acc": 0.9083108605050257, "train_speed(iter/s)": 0.242374 }, { "epoch": 0.8472444546078207, "grad_norm": 0.3867141008377075, "learning_rate": 9.308110352719955e-05, "loss": 0.11118177175521851, "memory(GiB)": 122.96, "step": 11115, "token_acc": 0.9452237808951236, "train_speed(iter/s)": 0.2424 }, { "epoch": 0.8476255812180806, "grad_norm": 0.9719766974449158, "learning_rate": 9.307502516492064e-05, "loss": 0.12820684909820557, "memory(GiB)": 122.96, "step": 11120, "token_acc": 0.9504270330486446, "train_speed(iter/s)": 0.24242 }, { "epoch": 0.8480067078283405, "grad_norm": 0.8631400465965271, "learning_rate": 9.306894433248893e-05, "loss": 0.13506240844726564, "memory(GiB)": 122.96, "step": 11125, "token_acc": 0.9508806747705284, "train_speed(iter/s)": 0.242446 }, { "epoch": 0.8483878344386006, "grad_norm": 0.8466391563415527, "learning_rate": 9.306286103025313e-05, "loss": 0.1773344874382019, "memory(GiB)": 122.96, "step": 11130, "token_acc": 0.9230026550808593, "train_speed(iter/s)": 0.242477 }, { "epoch": 0.8487689610488605, "grad_norm": 0.7910301089286804, "learning_rate": 9.305677525856207e-05, "loss": 0.13892335891723634, "memory(GiB)": 122.96, "step": 11135, "token_acc": 0.9491755577109602, "train_speed(iter/s)": 0.242498 }, { "epoch": 0.8491500876591204, "grad_norm": 1.0928938388824463, "learning_rate": 9.305068701776477e-05, "loss": 0.18649892807006835, "memory(GiB)": 122.96, "step": 11140, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 0.242525 }, { "epoch": 0.8495312142693803, "grad_norm": 1.3737156391143799, "learning_rate": 9.304459630821034e-05, "loss": 0.17914717197418212, "memory(GiB)": 122.96, "step": 11145, "token_acc": 0.9472784269022514, "train_speed(iter/s)": 0.242554 }, { "epoch": 0.8499123408796402, "grad_norm": 0.7425313591957092, "learning_rate": 9.303850313024806e-05, "loss": 0.13918216228485109, "memory(GiB)": 122.96, "step": 11150, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 0.242587 }, { "epoch": 0.8502934674899001, "grad_norm": 1.1064648628234863, "learning_rate": 9.303240748422736e-05, "loss": 0.1892155408859253, "memory(GiB)": 122.96, "step": 11155, "token_acc": 0.9357945425361156, "train_speed(iter/s)": 0.242612 }, { "epoch": 0.8506745941001601, "grad_norm": 0.3445730209350586, "learning_rate": 9.302630937049778e-05, "loss": 0.11232266426086426, "memory(GiB)": 122.96, "step": 11160, "token_acc": 0.9399386234107847, "train_speed(iter/s)": 0.24264 }, { "epoch": 0.85105572071042, "grad_norm": 1.1421757936477661, "learning_rate": 9.302020878940902e-05, "loss": 0.1918261766433716, "memory(GiB)": 122.96, "step": 11165, "token_acc": 0.9457999302892994, "train_speed(iter/s)": 0.242663 }, { "epoch": 0.8514368473206799, "grad_norm": 1.0139286518096924, "learning_rate": 9.301410574131093e-05, "loss": 0.13044490814208984, "memory(GiB)": 122.96, "step": 11170, "token_acc": 0.9522141255605381, "train_speed(iter/s)": 0.242671 }, { "epoch": 0.8518179739309398, "grad_norm": 0.6924458146095276, "learning_rate": 9.300800022655349e-05, "loss": 0.16573522090911866, "memory(GiB)": 122.96, "step": 11175, "token_acc": 0.9249779346866726, "train_speed(iter/s)": 0.242706 }, { "epoch": 0.8521991005411997, "grad_norm": 0.5491956472396851, "learning_rate": 9.30018922454868e-05, "loss": 0.1435001850128174, "memory(GiB)": 122.96, "step": 11180, "token_acc": 0.9444871597724127, "train_speed(iter/s)": 0.242716 }, { "epoch": 0.8525802271514598, "grad_norm": 0.8935181498527527, "learning_rate": 9.299578179846117e-05, "loss": 0.15703366994857787, "memory(GiB)": 122.96, "step": 11185, "token_acc": 0.947107438016529, "train_speed(iter/s)": 0.242746 }, { "epoch": 0.8529613537617197, "grad_norm": 0.9166231155395508, "learning_rate": 9.298966888582696e-05, "loss": 0.158802592754364, "memory(GiB)": 122.96, "step": 11190, "token_acc": 0.932872655478776, "train_speed(iter/s)": 0.242777 }, { "epoch": 0.8533424803719796, "grad_norm": 1.9896697998046875, "learning_rate": 9.298355350793475e-05, "loss": 0.13850829601287842, "memory(GiB)": 122.96, "step": 11195, "token_acc": 0.9463955637707948, "train_speed(iter/s)": 0.242812 }, { "epoch": 0.8537236069822395, "grad_norm": 0.6727337837219238, "learning_rate": 9.297743566513521e-05, "loss": 0.18870625495910645, "memory(GiB)": 122.96, "step": 11200, "token_acc": 0.9302450127192395, "train_speed(iter/s)": 0.24282 }, { "epoch": 0.8537236069822395, "eval_loss": 0.11900965869426727, "eval_runtime": 184.8537, "eval_samples_per_second": 2.867, "eval_steps_per_second": 2.867, "eval_token_acc": 0.9437082103487742, "step": 11200 }, { "epoch": 0.8541047335924994, "grad_norm": 1.0817548036575317, "learning_rate": 9.297131535777917e-05, "loss": 0.1828467845916748, "memory(GiB)": 122.96, "step": 11205, "token_acc": 0.9436407943174159, "train_speed(iter/s)": 0.241887 }, { "epoch": 0.8544858602027594, "grad_norm": 1.351295828819275, "learning_rate": 9.296519258621759e-05, "loss": 0.16453466415405274, "memory(GiB)": 122.96, "step": 11210, "token_acc": 0.9491260923845194, "train_speed(iter/s)": 0.241926 }, { "epoch": 0.8548669868130193, "grad_norm": 0.6206011176109314, "learning_rate": 9.295906735080161e-05, "loss": 0.14493274688720703, "memory(GiB)": 122.96, "step": 11215, "token_acc": 0.9507299270072993, "train_speed(iter/s)": 0.241933 }, { "epoch": 0.8552481134232792, "grad_norm": 0.6712293028831482, "learning_rate": 9.295293965188248e-05, "loss": 0.1310397982597351, "memory(GiB)": 122.96, "step": 11220, "token_acc": 0.9457283822383796, "train_speed(iter/s)": 0.241956 }, { "epoch": 0.8556292400335391, "grad_norm": 1.9602712392807007, "learning_rate": 9.29468094898116e-05, "loss": 0.15444633960723878, "memory(GiB)": 122.96, "step": 11225, "token_acc": 0.9420856975151634, "train_speed(iter/s)": 0.241978 }, { "epoch": 0.856010366643799, "grad_norm": 0.801242470741272, "learning_rate": 9.294067686494047e-05, "loss": 0.15099868774414063, "memory(GiB)": 122.96, "step": 11230, "token_acc": 0.9428631667720851, "train_speed(iter/s)": 0.242002 }, { "epoch": 0.8563914932540589, "grad_norm": 1.2892011404037476, "learning_rate": 9.29345417776208e-05, "loss": 0.12697191238403321, "memory(GiB)": 122.96, "step": 11235, "token_acc": 0.9519450800915332, "train_speed(iter/s)": 0.242004 }, { "epoch": 0.856772619864319, "grad_norm": 0.4490237832069397, "learning_rate": 9.292840422820439e-05, "loss": 0.14466590881347657, "memory(GiB)": 122.96, "step": 11240, "token_acc": 0.9419919737322145, "train_speed(iter/s)": 0.242027 }, { "epoch": 0.8571537464745789, "grad_norm": 0.8701170086860657, "learning_rate": 9.292226421704323e-05, "loss": 0.140582275390625, "memory(GiB)": 122.96, "step": 11245, "token_acc": 0.9461852861035422, "train_speed(iter/s)": 0.242045 }, { "epoch": 0.8575348730848388, "grad_norm": 0.5863073468208313, "learning_rate": 9.291612174448937e-05, "loss": 0.2405301570892334, "memory(GiB)": 122.96, "step": 11250, "token_acc": 0.9181735159817351, "train_speed(iter/s)": 0.242061 }, { "epoch": 0.8579159996950987, "grad_norm": 0.5197257995605469, "learning_rate": 9.290997681089511e-05, "loss": 0.12870147228240966, "memory(GiB)": 122.96, "step": 11255, "token_acc": 0.9487666034155597, "train_speed(iter/s)": 0.242083 }, { "epoch": 0.8582971263053586, "grad_norm": 0.7126216292381287, "learning_rate": 9.290382941661278e-05, "loss": 0.12530384063720704, "memory(GiB)": 122.96, "step": 11260, "token_acc": 0.9542597765363129, "train_speed(iter/s)": 0.242082 }, { "epoch": 0.8586782529156186, "grad_norm": 1.0275896787643433, "learning_rate": 9.289767956199495e-05, "loss": 0.1642824411392212, "memory(GiB)": 122.96, "step": 11265, "token_acc": 0.9333768778576094, "train_speed(iter/s)": 0.242115 }, { "epoch": 0.8590593795258785, "grad_norm": 1.2328941822052002, "learning_rate": 9.289152724739428e-05, "loss": 0.14040806293487548, "memory(GiB)": 122.96, "step": 11270, "token_acc": 0.9352856109906001, "train_speed(iter/s)": 0.242129 }, { "epoch": 0.8594405061361384, "grad_norm": 0.732254683971405, "learning_rate": 9.288537247316353e-05, "loss": 0.14184410572052003, "memory(GiB)": 122.96, "step": 11275, "token_acc": 0.9451833213515457, "train_speed(iter/s)": 0.242141 }, { "epoch": 0.8598216327463983, "grad_norm": 1.1022695302963257, "learning_rate": 9.28792152396557e-05, "loss": 0.23989293575286866, "memory(GiB)": 122.96, "step": 11280, "token_acc": 0.9231980405878236, "train_speed(iter/s)": 0.242159 }, { "epoch": 0.8602027593566582, "grad_norm": 0.6675364375114441, "learning_rate": 9.287305554722385e-05, "loss": 0.15799055099487305, "memory(GiB)": 122.96, "step": 11285, "token_acc": 0.9252317198764161, "train_speed(iter/s)": 0.242185 }, { "epoch": 0.8605838859669183, "grad_norm": 1.327260136604309, "learning_rate": 9.286689339622123e-05, "loss": 0.13705127239227294, "memory(GiB)": 122.96, "step": 11290, "token_acc": 0.9482699554642001, "train_speed(iter/s)": 0.242206 }, { "epoch": 0.8609650125771782, "grad_norm": 1.1228729486465454, "learning_rate": 9.286072878700121e-05, "loss": 0.1902400493621826, "memory(GiB)": 122.96, "step": 11295, "token_acc": 0.9357437422077299, "train_speed(iter/s)": 0.242214 }, { "epoch": 0.8613461391874381, "grad_norm": 0.845726728439331, "learning_rate": 9.285456171991727e-05, "loss": 0.15102235078811646, "memory(GiB)": 122.96, "step": 11300, "token_acc": 0.9455394190871369, "train_speed(iter/s)": 0.242252 }, { "epoch": 0.861727265797698, "grad_norm": 0.9008126258850098, "learning_rate": 9.284839219532311e-05, "loss": 0.12301328182220458, "memory(GiB)": 122.96, "step": 11305, "token_acc": 0.9505386875612145, "train_speed(iter/s)": 0.242279 }, { "epoch": 0.8621083924079579, "grad_norm": 0.8441014885902405, "learning_rate": 9.284222021357248e-05, "loss": 0.1429712176322937, "memory(GiB)": 122.96, "step": 11310, "token_acc": 0.9409408115792194, "train_speed(iter/s)": 0.242287 }, { "epoch": 0.8624895190182178, "grad_norm": 0.8268718719482422, "learning_rate": 9.283604577501933e-05, "loss": 0.13432111740112304, "memory(GiB)": 122.96, "step": 11315, "token_acc": 0.9461340758192333, "train_speed(iter/s)": 0.242279 }, { "epoch": 0.8628706456284778, "grad_norm": 1.0975680351257324, "learning_rate": 9.282986888001774e-05, "loss": 0.1309664487838745, "memory(GiB)": 122.96, "step": 11320, "token_acc": 0.9443148688046648, "train_speed(iter/s)": 0.242308 }, { "epoch": 0.8632517722387377, "grad_norm": 1.169417142868042, "learning_rate": 9.282368952892193e-05, "loss": 0.20588057041168212, "memory(GiB)": 122.96, "step": 11325, "token_acc": 0.9230307876849261, "train_speed(iter/s)": 0.242331 }, { "epoch": 0.8636328988489976, "grad_norm": 0.6297056674957275, "learning_rate": 9.281750772208626e-05, "loss": 0.15845749378204346, "memory(GiB)": 122.96, "step": 11330, "token_acc": 0.9447421918314545, "train_speed(iter/s)": 0.242349 }, { "epoch": 0.8640140254592575, "grad_norm": 1.0419367551803589, "learning_rate": 9.28113234598652e-05, "loss": 0.20952963829040527, "memory(GiB)": 122.96, "step": 11335, "token_acc": 0.908467603702434, "train_speed(iter/s)": 0.242366 }, { "epoch": 0.8643951520695174, "grad_norm": 0.6862680315971375, "learning_rate": 9.280513674261342e-05, "loss": 0.12869423627853394, "memory(GiB)": 122.96, "step": 11340, "token_acc": 0.9522039979497694, "train_speed(iter/s)": 0.242369 }, { "epoch": 0.8647762786797775, "grad_norm": 0.12061133980751038, "learning_rate": 9.27989475706857e-05, "loss": 0.12949740886688232, "memory(GiB)": 122.96, "step": 11345, "token_acc": 0.9511424856028237, "train_speed(iter/s)": 0.242393 }, { "epoch": 0.8651574052900374, "grad_norm": 1.2768725156784058, "learning_rate": 9.279275594443693e-05, "loss": 0.15031658411026, "memory(GiB)": 122.96, "step": 11350, "token_acc": 0.946282527881041, "train_speed(iter/s)": 0.242412 }, { "epoch": 0.8655385319002973, "grad_norm": 1.0683151483535767, "learning_rate": 9.278656186422218e-05, "loss": 0.1757448434829712, "memory(GiB)": 122.96, "step": 11355, "token_acc": 0.9324688663499807, "train_speed(iter/s)": 0.24243 }, { "epoch": 0.8659196585105572, "grad_norm": 0.5343469381332397, "learning_rate": 9.278036533039669e-05, "loss": 0.09582918882369995, "memory(GiB)": 122.96, "step": 11360, "token_acc": 0.943115234375, "train_speed(iter/s)": 0.242456 }, { "epoch": 0.8663007851208171, "grad_norm": 0.5921213030815125, "learning_rate": 9.277416634331577e-05, "loss": 0.12343356609344483, "memory(GiB)": 122.96, "step": 11365, "token_acc": 0.9520399305555556, "train_speed(iter/s)": 0.242483 }, { "epoch": 0.8666819117310771, "grad_norm": 1.1680989265441895, "learning_rate": 9.276796490333489e-05, "loss": 0.1946183919906616, "memory(GiB)": 122.96, "step": 11370, "token_acc": 0.9113680154142582, "train_speed(iter/s)": 0.242512 }, { "epoch": 0.867063038341337, "grad_norm": 1.3305295705795288, "learning_rate": 9.27617610108097e-05, "loss": 0.19429900646209716, "memory(GiB)": 122.96, "step": 11375, "token_acc": 0.9037304452466908, "train_speed(iter/s)": 0.242551 }, { "epoch": 0.8674441649515969, "grad_norm": 1.0473742485046387, "learning_rate": 9.275555466609593e-05, "loss": 0.20306687355041503, "memory(GiB)": 122.96, "step": 11380, "token_acc": 0.9079531691562374, "train_speed(iter/s)": 0.242587 }, { "epoch": 0.8678252915618568, "grad_norm": 1.1064177751541138, "learning_rate": 9.274934586954953e-05, "loss": 0.1528411865234375, "memory(GiB)": 122.96, "step": 11385, "token_acc": 0.9403420942845223, "train_speed(iter/s)": 0.242623 }, { "epoch": 0.8682064181721167, "grad_norm": 0.717984139919281, "learning_rate": 9.274313462152654e-05, "loss": 0.10176662206649781, "memory(GiB)": 122.96, "step": 11390, "token_acc": 0.9483264826776278, "train_speed(iter/s)": 0.242638 }, { "epoch": 0.8685875447823767, "grad_norm": 0.8964281678199768, "learning_rate": 9.273692092238311e-05, "loss": 0.11504189968109131, "memory(GiB)": 122.96, "step": 11395, "token_acc": 0.9562218094985407, "train_speed(iter/s)": 0.242665 }, { "epoch": 0.8689686713926367, "grad_norm": 1.0431890487670898, "learning_rate": 9.273070477247561e-05, "loss": 0.11975104808807373, "memory(GiB)": 122.96, "step": 11400, "token_acc": 0.952100960848989, "train_speed(iter/s)": 0.242678 }, { "epoch": 0.8689686713926367, "eval_loss": 0.12047483772039413, "eval_runtime": 172.761, "eval_samples_per_second": 3.068, "eval_steps_per_second": 3.068, "eval_token_acc": 0.9433919643394976, "step": 11400 }, { "epoch": 0.8693497980028966, "grad_norm": 1.627865195274353, "learning_rate": 9.272448617216046e-05, "loss": 0.29540698528289794, "memory(GiB)": 122.96, "step": 11405, "token_acc": 0.9424642134999599, "train_speed(iter/s)": 0.24182 }, { "epoch": 0.8697309246131565, "grad_norm": 0.05468102917075157, "learning_rate": 9.271826512179431e-05, "loss": 0.13939646482467652, "memory(GiB)": 122.96, "step": 11410, "token_acc": 0.9306818181818182, "train_speed(iter/s)": 0.241849 }, { "epoch": 0.8701120512234164, "grad_norm": 2.2837295532226562, "learning_rate": 9.27120416217339e-05, "loss": 0.14522193670272826, "memory(GiB)": 122.96, "step": 11415, "token_acc": 0.9382584628486268, "train_speed(iter/s)": 0.241875 }, { "epoch": 0.8704931778336763, "grad_norm": 0.6580895781517029, "learning_rate": 9.27058156723361e-05, "loss": 0.14301533699035646, "memory(GiB)": 122.96, "step": 11420, "token_acc": 0.950741673663588, "train_speed(iter/s)": 0.241903 }, { "epoch": 0.8708743044439363, "grad_norm": 1.1832242012023926, "learning_rate": 9.269958727395793e-05, "loss": 0.14156968593597413, "memory(GiB)": 122.96, "step": 11425, "token_acc": 0.9326463010673537, "train_speed(iter/s)": 0.241936 }, { "epoch": 0.8712554310541962, "grad_norm": 0.5878319144248962, "learning_rate": 9.269335642695661e-05, "loss": 0.13483787775039674, "memory(GiB)": 122.96, "step": 11430, "token_acc": 0.9416964133945266, "train_speed(iter/s)": 0.241957 }, { "epoch": 0.8716365576644561, "grad_norm": 0.5540229678153992, "learning_rate": 9.268712313168942e-05, "loss": 0.14003828763961793, "memory(GiB)": 122.96, "step": 11435, "token_acc": 0.9422492401215805, "train_speed(iter/s)": 0.241976 }, { "epoch": 0.872017684274716, "grad_norm": 0.4587554633617401, "learning_rate": 9.26808873885138e-05, "loss": 0.1087761402130127, "memory(GiB)": 122.96, "step": 11440, "token_acc": 0.9511930585683297, "train_speed(iter/s)": 0.24199 }, { "epoch": 0.872398810884976, "grad_norm": 2.100513458251953, "learning_rate": 9.267464919778734e-05, "loss": 0.15411088466644288, "memory(GiB)": 122.96, "step": 11445, "token_acc": 0.9380774032459426, "train_speed(iter/s)": 0.242018 }, { "epoch": 0.872779937495236, "grad_norm": 0.518129289150238, "learning_rate": 9.266840855986781e-05, "loss": 0.1339137077331543, "memory(GiB)": 122.96, "step": 11450, "token_acc": 0.9405251951738822, "train_speed(iter/s)": 0.242027 }, { "epoch": 0.8731610641054959, "grad_norm": 1.5014537572860718, "learning_rate": 9.266216547511304e-05, "loss": 0.10791488885879516, "memory(GiB)": 122.96, "step": 11455, "token_acc": 0.9474346868114574, "train_speed(iter/s)": 0.242058 }, { "epoch": 0.8735421907157558, "grad_norm": 0.5119888186454773, "learning_rate": 9.265591994388105e-05, "loss": 0.13979020118713378, "memory(GiB)": 122.96, "step": 11460, "token_acc": 0.9464082098061574, "train_speed(iter/s)": 0.242088 }, { "epoch": 0.8739233173260157, "grad_norm": 0.31261828541755676, "learning_rate": 9.264967196653e-05, "loss": 0.154273521900177, "memory(GiB)": 122.96, "step": 11465, "token_acc": 0.9279484262419416, "train_speed(iter/s)": 0.242109 }, { "epoch": 0.8743044439362756, "grad_norm": 0.9616721868515015, "learning_rate": 9.26434215434182e-05, "loss": 0.2128819465637207, "memory(GiB)": 122.96, "step": 11470, "token_acc": 0.9272114240380801, "train_speed(iter/s)": 0.242135 }, { "epoch": 0.8746855705465355, "grad_norm": 1.499384880065918, "learning_rate": 9.263716867490404e-05, "loss": 0.1638340950012207, "memory(GiB)": 122.96, "step": 11475, "token_acc": 0.9379679144385027, "train_speed(iter/s)": 0.242148 }, { "epoch": 0.8750666971567955, "grad_norm": 0.6719818115234375, "learning_rate": 9.263091336134612e-05, "loss": 0.10661208629608154, "memory(GiB)": 122.96, "step": 11480, "token_acc": 0.9521540599563232, "train_speed(iter/s)": 0.242168 }, { "epoch": 0.8754478237670554, "grad_norm": 0.7274910807609558, "learning_rate": 9.262465560310318e-05, "loss": 0.1181525468826294, "memory(GiB)": 122.96, "step": 11485, "token_acc": 0.9514869888475836, "train_speed(iter/s)": 0.242191 }, { "epoch": 0.8758289503773153, "grad_norm": 0.5980207920074463, "learning_rate": 9.261839540053402e-05, "loss": 0.19358291625976562, "memory(GiB)": 122.96, "step": 11490, "token_acc": 0.9183716515503059, "train_speed(iter/s)": 0.242213 }, { "epoch": 0.8762100769875752, "grad_norm": 0.7800714373588562, "learning_rate": 9.261213275399766e-05, "loss": 0.16730958223342896, "memory(GiB)": 122.96, "step": 11495, "token_acc": 0.9281686687913704, "train_speed(iter/s)": 0.242244 }, { "epoch": 0.8765912035978352, "grad_norm": 0.8332794308662415, "learning_rate": 9.260586766385323e-05, "loss": 0.10308022499084472, "memory(GiB)": 122.96, "step": 11500, "token_acc": 0.9508144362823379, "train_speed(iter/s)": 0.242279 }, { "epoch": 0.8769723302080952, "grad_norm": 0.7437598705291748, "learning_rate": 9.259960013046e-05, "loss": 0.12815870046615602, "memory(GiB)": 122.96, "step": 11505, "token_acc": 0.9502756215547306, "train_speed(iter/s)": 0.242281 }, { "epoch": 0.8773534568183551, "grad_norm": 0.8914576768875122, "learning_rate": 9.259333015417739e-05, "loss": 0.11342629194259643, "memory(GiB)": 122.96, "step": 11510, "token_acc": 0.9587782468338714, "train_speed(iter/s)": 0.24231 }, { "epoch": 0.877734583428615, "grad_norm": 1.3738980293273926, "learning_rate": 9.258705773536496e-05, "loss": 0.18521571159362793, "memory(GiB)": 122.96, "step": 11515, "token_acc": 0.9212152420185376, "train_speed(iter/s)": 0.242338 }, { "epoch": 0.8781157100388749, "grad_norm": 1.36734139919281, "learning_rate": 9.258078287438241e-05, "loss": 0.16222747564315795, "memory(GiB)": 122.96, "step": 11520, "token_acc": 0.9304314030314808, "train_speed(iter/s)": 0.242369 }, { "epoch": 0.8784968366491348, "grad_norm": 1.6461329460144043, "learning_rate": 9.257450557158954e-05, "loss": 0.1493427038192749, "memory(GiB)": 122.96, "step": 11525, "token_acc": 0.9365537130497477, "train_speed(iter/s)": 0.242404 }, { "epoch": 0.8788779632593948, "grad_norm": 0.5578780770301819, "learning_rate": 9.256822582734635e-05, "loss": 0.11434768438339234, "memory(GiB)": 122.96, "step": 11530, "token_acc": 0.9500564334085779, "train_speed(iter/s)": 0.242434 }, { "epoch": 0.8792590898696547, "grad_norm": 0.6220436692237854, "learning_rate": 9.256194364201296e-05, "loss": 0.12437794208526612, "memory(GiB)": 122.96, "step": 11535, "token_acc": 0.9473365617433414, "train_speed(iter/s)": 0.242464 }, { "epoch": 0.8796402164799146, "grad_norm": 0.9617224931716919, "learning_rate": 9.25556590159496e-05, "loss": 0.17805936336517333, "memory(GiB)": 122.96, "step": 11540, "token_acc": 0.9376747245518829, "train_speed(iter/s)": 0.242486 }, { "epoch": 0.8800213430901745, "grad_norm": 0.7395830154418945, "learning_rate": 9.254937194951669e-05, "loss": 0.12242883443832397, "memory(GiB)": 122.96, "step": 11545, "token_acc": 0.9524969549330086, "train_speed(iter/s)": 0.242497 }, { "epoch": 0.8804024697004345, "grad_norm": 0.8783916234970093, "learning_rate": 9.254308244307473e-05, "loss": 0.13247370719909668, "memory(GiB)": 122.96, "step": 11550, "token_acc": 0.959231217239371, "train_speed(iter/s)": 0.242517 }, { "epoch": 0.8807835963106944, "grad_norm": 0.6463296413421631, "learning_rate": 9.253679049698444e-05, "loss": 0.14341195821762084, "memory(GiB)": 122.96, "step": 11555, "token_acc": 0.945671223273747, "train_speed(iter/s)": 0.242536 }, { "epoch": 0.8811647229209544, "grad_norm": 0.7633922696113586, "learning_rate": 9.25304961116066e-05, "loss": 0.12439312934875488, "memory(GiB)": 122.96, "step": 11560, "token_acc": 0.9443637118538509, "train_speed(iter/s)": 0.242562 }, { "epoch": 0.8815458495312143, "grad_norm": 1.352342128753662, "learning_rate": 9.252419928730217e-05, "loss": 0.18157944679260254, "memory(GiB)": 122.96, "step": 11565, "token_acc": 0.9330065359477124, "train_speed(iter/s)": 0.242598 }, { "epoch": 0.8819269761414742, "grad_norm": 0.8785626888275146, "learning_rate": 9.251790002443226e-05, "loss": 0.10114047527313233, "memory(GiB)": 122.96, "step": 11570, "token_acc": 0.9517374517374517, "train_speed(iter/s)": 0.242627 }, { "epoch": 0.8823081027517341, "grad_norm": 1.6296240091323853, "learning_rate": 9.251159832335807e-05, "loss": 0.21737995147705078, "memory(GiB)": 122.96, "step": 11575, "token_acc": 0.9309249702262803, "train_speed(iter/s)": 0.242661 }, { "epoch": 0.882689229361994, "grad_norm": 1.0468300580978394, "learning_rate": 9.2505294184441e-05, "loss": 0.17732958793640136, "memory(GiB)": 122.96, "step": 11580, "token_acc": 0.9247654452115419, "train_speed(iter/s)": 0.242683 }, { "epoch": 0.883070355972254, "grad_norm": 0.7905191779136658, "learning_rate": 9.249898760804257e-05, "loss": 0.09798108339309693, "memory(GiB)": 122.96, "step": 11585, "token_acc": 0.9511450381679389, "train_speed(iter/s)": 0.24272 }, { "epoch": 0.8834514825825139, "grad_norm": 0.8342950940132141, "learning_rate": 9.249267859452441e-05, "loss": 0.19752393960952758, "memory(GiB)": 122.96, "step": 11590, "token_acc": 0.9200764209520776, "train_speed(iter/s)": 0.242737 }, { "epoch": 0.8838326091927738, "grad_norm": 1.3242619037628174, "learning_rate": 9.248636714424833e-05, "loss": 0.1638488531112671, "memory(GiB)": 122.96, "step": 11595, "token_acc": 0.9442482341069627, "train_speed(iter/s)": 0.242765 }, { "epoch": 0.8842137358030338, "grad_norm": 0.9804904460906982, "learning_rate": 9.248005325757626e-05, "loss": 0.12706501483917237, "memory(GiB)": 122.96, "step": 11600, "token_acc": 0.9495395241749808, "train_speed(iter/s)": 0.242785 }, { "epoch": 0.8842137358030338, "eval_loss": 0.11803941428661346, "eval_runtime": 178.586, "eval_samples_per_second": 2.968, "eval_steps_per_second": 2.968, "eval_token_acc": 0.9434145533401602, "step": 11600 }, { "epoch": 0.8845948624132937, "grad_norm": 0.7456228137016296, "learning_rate": 9.247373693487024e-05, "loss": 0.17875895500183106, "memory(GiB)": 122.96, "step": 11605, "token_acc": 0.9427135309656589, "train_speed(iter/s)": 0.24191 }, { "epoch": 0.8849759890235537, "grad_norm": 1.19225013256073, "learning_rate": 9.246741817649253e-05, "loss": 0.135236394405365, "memory(GiB)": 122.96, "step": 11610, "token_acc": 0.9504378283712784, "train_speed(iter/s)": 0.241929 }, { "epoch": 0.8853571156338136, "grad_norm": 0.9328842163085938, "learning_rate": 9.246109698280546e-05, "loss": 0.16903975009918212, "memory(GiB)": 122.96, "step": 11615, "token_acc": 0.9280866192630899, "train_speed(iter/s)": 0.241952 }, { "epoch": 0.8857382422440735, "grad_norm": 1.2180582284927368, "learning_rate": 9.245477335417152e-05, "loss": 0.15201431512832642, "memory(GiB)": 122.96, "step": 11620, "token_acc": 0.9369127516778524, "train_speed(iter/s)": 0.24198 }, { "epoch": 0.8861193688543334, "grad_norm": 0.6162218451499939, "learning_rate": 9.244844729095335e-05, "loss": 0.14544885158538817, "memory(GiB)": 122.96, "step": 11625, "token_acc": 0.9476904252064852, "train_speed(iter/s)": 0.24199 }, { "epoch": 0.8865004954645933, "grad_norm": 0.7765874266624451, "learning_rate": 9.24421187935137e-05, "loss": 0.13743438720703124, "memory(GiB)": 122.96, "step": 11630, "token_acc": 0.9500828010409273, "train_speed(iter/s)": 0.242012 }, { "epoch": 0.8868816220748532, "grad_norm": 0.828283429145813, "learning_rate": 9.24357878622155e-05, "loss": 0.150018048286438, "memory(GiB)": 122.96, "step": 11635, "token_acc": 0.920039980009995, "train_speed(iter/s)": 0.242045 }, { "epoch": 0.8872627486851132, "grad_norm": 1.2223256826400757, "learning_rate": 9.242945449742177e-05, "loss": 0.20054380893707274, "memory(GiB)": 122.96, "step": 11640, "token_acc": 0.9174092934436665, "train_speed(iter/s)": 0.242061 }, { "epoch": 0.8876438752953731, "grad_norm": 0.6012343764305115, "learning_rate": 9.242311869949575e-05, "loss": 0.1429295301437378, "memory(GiB)": 122.96, "step": 11645, "token_acc": 0.9400479616306955, "train_speed(iter/s)": 0.242084 }, { "epoch": 0.888025001905633, "grad_norm": 0.967304527759552, "learning_rate": 9.241678046880073e-05, "loss": 0.15435534715652466, "memory(GiB)": 122.96, "step": 11650, "token_acc": 0.9411280362289007, "train_speed(iter/s)": 0.242096 }, { "epoch": 0.888406128515893, "grad_norm": 0.3374031186103821, "learning_rate": 9.241043980570019e-05, "loss": 0.11826033592224121, "memory(GiB)": 122.96, "step": 11655, "token_acc": 0.9591666666666666, "train_speed(iter/s)": 0.242128 }, { "epoch": 0.8887872551261529, "grad_norm": 1.6713097095489502, "learning_rate": 9.240409671055774e-05, "loss": 0.16891655921936036, "memory(GiB)": 122.96, "step": 11660, "token_acc": 0.9296121097445601, "train_speed(iter/s)": 0.242154 }, { "epoch": 0.8891683817364129, "grad_norm": 0.9264869689941406, "learning_rate": 9.239775118373711e-05, "loss": 0.14773941040039062, "memory(GiB)": 122.96, "step": 11665, "token_acc": 0.9318497913769124, "train_speed(iter/s)": 0.242175 }, { "epoch": 0.8895495083466728, "grad_norm": 0.7910012602806091, "learning_rate": 9.23914032256022e-05, "loss": 0.15114855766296387, "memory(GiB)": 122.96, "step": 11670, "token_acc": 0.9447102013572144, "train_speed(iter/s)": 0.24218 }, { "epoch": 0.8899306349569327, "grad_norm": 0.6481488943099976, "learning_rate": 9.238505283651705e-05, "loss": 0.15131815671920776, "memory(GiB)": 122.96, "step": 11675, "token_acc": 0.946266829865361, "train_speed(iter/s)": 0.242182 }, { "epoch": 0.8903117615671926, "grad_norm": 0.5785846710205078, "learning_rate": 9.23787000168458e-05, "loss": 0.18101712465286254, "memory(GiB)": 122.96, "step": 11680, "token_acc": 0.938740713676775, "train_speed(iter/s)": 0.242192 }, { "epoch": 0.8906928881774525, "grad_norm": 1.0119166374206543, "learning_rate": 9.237234476695277e-05, "loss": 0.15444326400756836, "memory(GiB)": 122.96, "step": 11685, "token_acc": 0.928311057108141, "train_speed(iter/s)": 0.242225 }, { "epoch": 0.8910740147877125, "grad_norm": 0.7281562685966492, "learning_rate": 9.236598708720238e-05, "loss": 0.11219713687896729, "memory(GiB)": 122.96, "step": 11690, "token_acc": 0.9575577066269546, "train_speed(iter/s)": 0.242256 }, { "epoch": 0.8914551413979724, "grad_norm": 1.1952383518218994, "learning_rate": 9.235962697795926e-05, "loss": 0.16654957532882692, "memory(GiB)": 122.96, "step": 11695, "token_acc": 0.9535723497549652, "train_speed(iter/s)": 0.242282 }, { "epoch": 0.8918362680082323, "grad_norm": 0.6787243485450745, "learning_rate": 9.23532644395881e-05, "loss": 0.15942226648330687, "memory(GiB)": 122.96, "step": 11700, "token_acc": 0.9299330242143226, "train_speed(iter/s)": 0.242306 }, { "epoch": 0.8922173946184923, "grad_norm": 0.7359351515769958, "learning_rate": 9.234689947245377e-05, "loss": 0.1333617687225342, "memory(GiB)": 122.96, "step": 11705, "token_acc": 0.9478201634877385, "train_speed(iter/s)": 0.24231 }, { "epoch": 0.8925985212287522, "grad_norm": 0.865392804145813, "learning_rate": 9.234053207692125e-05, "loss": 0.2199159622192383, "memory(GiB)": 122.96, "step": 11710, "token_acc": 0.9093511450381679, "train_speed(iter/s)": 0.242344 }, { "epoch": 0.8929796478390121, "grad_norm": 0.7505701780319214, "learning_rate": 9.233416225335572e-05, "loss": 0.11937528848648071, "memory(GiB)": 122.96, "step": 11715, "token_acc": 0.9435110786462138, "train_speed(iter/s)": 0.242366 }, { "epoch": 0.8933607744492721, "grad_norm": 0.49155566096305847, "learning_rate": 9.232779000212242e-05, "loss": 0.12721607685089112, "memory(GiB)": 122.96, "step": 11720, "token_acc": 0.9677700348432056, "train_speed(iter/s)": 0.242391 }, { "epoch": 0.893741901059532, "grad_norm": 0.19501294195652008, "learning_rate": 9.23214153235868e-05, "loss": 0.10543086528778076, "memory(GiB)": 122.96, "step": 11725, "token_acc": 0.961091841351229, "train_speed(iter/s)": 0.242404 }, { "epoch": 0.8941230276697919, "grad_norm": 0.839466392993927, "learning_rate": 9.23150382181144e-05, "loss": 0.11526789665222167, "memory(GiB)": 122.96, "step": 11730, "token_acc": 0.9522190103723857, "train_speed(iter/s)": 0.242418 }, { "epoch": 0.8945041542800518, "grad_norm": 0.5504568219184875, "learning_rate": 9.230865868607092e-05, "loss": 0.14696993827819824, "memory(GiB)": 122.96, "step": 11735, "token_acc": 0.9401141946620635, "train_speed(iter/s)": 0.242424 }, { "epoch": 0.8948852808903117, "grad_norm": 1.1373040676116943, "learning_rate": 9.230227672782221e-05, "loss": 0.14533056020736695, "memory(GiB)": 122.96, "step": 11740, "token_acc": 0.9474373363262252, "train_speed(iter/s)": 0.242439 }, { "epoch": 0.8952664075005717, "grad_norm": 1.3203582763671875, "learning_rate": 9.229589234373423e-05, "loss": 0.15756341218948364, "memory(GiB)": 122.96, "step": 11745, "token_acc": 0.9163332120771189, "train_speed(iter/s)": 0.242475 }, { "epoch": 0.8956475341108316, "grad_norm": 1.3900340795516968, "learning_rate": 9.228950553417311e-05, "loss": 0.17532361745834352, "memory(GiB)": 122.96, "step": 11750, "token_acc": 0.9339250493096647, "train_speed(iter/s)": 0.242503 }, { "epoch": 0.8960286607210916, "grad_norm": 0.9940233826637268, "learning_rate": 9.22831162995051e-05, "loss": 0.1425628423690796, "memory(GiB)": 122.96, "step": 11755, "token_acc": 0.9324255319148936, "train_speed(iter/s)": 0.242524 }, { "epoch": 0.8964097873313515, "grad_norm": 0.858811616897583, "learning_rate": 9.227672464009658e-05, "loss": 0.14655332565307616, "memory(GiB)": 122.96, "step": 11760, "token_acc": 0.9474337748344371, "train_speed(iter/s)": 0.242545 }, { "epoch": 0.8967909139416114, "grad_norm": 0.8200286030769348, "learning_rate": 9.227033055631409e-05, "loss": 0.1415562868118286, "memory(GiB)": 122.96, "step": 11765, "token_acc": 0.9446992176962503, "train_speed(iter/s)": 0.242558 }, { "epoch": 0.8971720405518714, "grad_norm": 1.2545667886734009, "learning_rate": 9.22639340485243e-05, "loss": 0.14444782733917236, "memory(GiB)": 122.96, "step": 11770, "token_acc": 0.9365798414496036, "train_speed(iter/s)": 0.242586 }, { "epoch": 0.8975531671621313, "grad_norm": 1.0853227376937866, "learning_rate": 9.225753511709401e-05, "loss": 0.14715808629989624, "memory(GiB)": 122.96, "step": 11775, "token_acc": 0.9423682140047207, "train_speed(iter/s)": 0.242611 }, { "epoch": 0.8979342937723912, "grad_norm": 0.9125546216964722, "learning_rate": 9.22511337623902e-05, "loss": 0.17790710926055908, "memory(GiB)": 122.96, "step": 11780, "token_acc": 0.9352867657269823, "train_speed(iter/s)": 0.242618 }, { "epoch": 0.8983154203826511, "grad_norm": 0.8053238391876221, "learning_rate": 9.224472998477993e-05, "loss": 0.14402978420257567, "memory(GiB)": 122.96, "step": 11785, "token_acc": 0.953808572063069, "train_speed(iter/s)": 0.242636 }, { "epoch": 0.898696546992911, "grad_norm": 1.1952511072158813, "learning_rate": 9.223832378463042e-05, "loss": 0.14119811058044435, "memory(GiB)": 122.96, "step": 11790, "token_acc": 0.9331240946402704, "train_speed(iter/s)": 0.242664 }, { "epoch": 0.8990776736031709, "grad_norm": 1.422877311706543, "learning_rate": 9.223191516230907e-05, "loss": 0.15266720056533814, "memory(GiB)": 122.96, "step": 11795, "token_acc": 0.9465305626443067, "train_speed(iter/s)": 0.242674 }, { "epoch": 0.8994588002134309, "grad_norm": 1.2361427545547485, "learning_rate": 9.222550411818336e-05, "loss": 0.1222700834274292, "memory(GiB)": 122.96, "step": 11800, "token_acc": 0.9519366197183099, "train_speed(iter/s)": 0.242695 }, { "epoch": 0.8994588002134309, "eval_loss": 0.11847015470266342, "eval_runtime": 178.1484, "eval_samples_per_second": 2.975, "eval_steps_per_second": 2.975, "eval_token_acc": 0.9444611770375279, "step": 11800 }, { "epoch": 0.8998399268236908, "grad_norm": 1.0086513757705688, "learning_rate": 9.221909065262093e-05, "loss": 0.18087037801742553, "memory(GiB)": 122.96, "step": 11805, "token_acc": 0.9434511706268518, "train_speed(iter/s)": 0.241829 }, { "epoch": 0.9002210534339508, "grad_norm": 0.6679299473762512, "learning_rate": 9.221267476598959e-05, "loss": 0.14509079456329346, "memory(GiB)": 122.96, "step": 11810, "token_acc": 0.9386200716845878, "train_speed(iter/s)": 0.241864 }, { "epoch": 0.9006021800442107, "grad_norm": 0.5772103667259216, "learning_rate": 9.220625645865724e-05, "loss": 0.18981798887252807, "memory(GiB)": 122.96, "step": 11815, "token_acc": 0.9249644381223329, "train_speed(iter/s)": 0.241897 }, { "epoch": 0.9009833066544706, "grad_norm": 0.9887216687202454, "learning_rate": 9.219983573099194e-05, "loss": 0.10760596990585328, "memory(GiB)": 122.96, "step": 11820, "token_acc": 0.9529329118370214, "train_speed(iter/s)": 0.241931 }, { "epoch": 0.9013644332647306, "grad_norm": 1.367976427078247, "learning_rate": 9.219341258336187e-05, "loss": 0.16150444746017456, "memory(GiB)": 122.96, "step": 11825, "token_acc": 0.9263207770670457, "train_speed(iter/s)": 0.241956 }, { "epoch": 0.9017455598749905, "grad_norm": 1.2807461023330688, "learning_rate": 9.218698701613542e-05, "loss": 0.17334593534469606, "memory(GiB)": 122.96, "step": 11830, "token_acc": 0.9500411184210527, "train_speed(iter/s)": 0.241976 }, { "epoch": 0.9021266864852504, "grad_norm": 1.0547317266464233, "learning_rate": 9.218055902968101e-05, "loss": 0.16150217056274413, "memory(GiB)": 122.96, "step": 11835, "token_acc": 0.9371771027071709, "train_speed(iter/s)": 0.242 }, { "epoch": 0.9025078130955103, "grad_norm": 1.1199029684066772, "learning_rate": 9.217412862436729e-05, "loss": 0.1572587490081787, "memory(GiB)": 122.96, "step": 11840, "token_acc": 0.9306865540688748, "train_speed(iter/s)": 0.242022 }, { "epoch": 0.9028889397057702, "grad_norm": 0.5875632762908936, "learning_rate": 9.216769580056301e-05, "loss": 0.15320951938629152, "memory(GiB)": 122.96, "step": 11845, "token_acc": 0.9401131719585036, "train_speed(iter/s)": 0.242045 }, { "epoch": 0.9032700663160302, "grad_norm": 0.4891464412212372, "learning_rate": 9.216126055863707e-05, "loss": 0.1438019871711731, "memory(GiB)": 122.96, "step": 11850, "token_acc": 0.9581428915082992, "train_speed(iter/s)": 0.242072 }, { "epoch": 0.9036511929262901, "grad_norm": 0.7824639678001404, "learning_rate": 9.215482289895847e-05, "loss": 0.17016725540161132, "memory(GiB)": 122.96, "step": 11855, "token_acc": 0.9211845102505695, "train_speed(iter/s)": 0.242111 }, { "epoch": 0.90403231953655, "grad_norm": 0.7167559862136841, "learning_rate": 9.214838282189642e-05, "loss": 0.15830509662628173, "memory(GiB)": 122.96, "step": 11860, "token_acc": 0.9486000270526174, "train_speed(iter/s)": 0.242111 }, { "epoch": 0.90441344614681, "grad_norm": 0.43831273913383484, "learning_rate": 9.214194032782019e-05, "loss": 0.19446541070938111, "memory(GiB)": 122.96, "step": 11865, "token_acc": 0.933820968303727, "train_speed(iter/s)": 0.242143 }, { "epoch": 0.9047945727570699, "grad_norm": 0.7044629454612732, "learning_rate": 9.213549541709924e-05, "loss": 0.14148727655410767, "memory(GiB)": 122.96, "step": 11870, "token_acc": 0.9492692916317602, "train_speed(iter/s)": 0.242131 }, { "epoch": 0.9051756993673298, "grad_norm": 0.7193948030471802, "learning_rate": 9.212904809010317e-05, "loss": 0.1030422568321228, "memory(GiB)": 122.96, "step": 11875, "token_acc": 0.9562241551404534, "train_speed(iter/s)": 0.242138 }, { "epoch": 0.9055568259775898, "grad_norm": 1.2893426418304443, "learning_rate": 9.21225983472017e-05, "loss": 0.12540862560272217, "memory(GiB)": 122.96, "step": 11880, "token_acc": 0.9421841541755889, "train_speed(iter/s)": 0.242166 }, { "epoch": 0.9059379525878497, "grad_norm": 1.9233492612838745, "learning_rate": 9.211614618876468e-05, "loss": 0.15760860443115235, "memory(GiB)": 122.96, "step": 11885, "token_acc": 0.9387096774193548, "train_speed(iter/s)": 0.242196 }, { "epoch": 0.9063190791981096, "grad_norm": 0.6106871962547302, "learning_rate": 9.210969161516212e-05, "loss": 0.13520212173461915, "memory(GiB)": 122.96, "step": 11890, "token_acc": 0.949454200284765, "train_speed(iter/s)": 0.242222 }, { "epoch": 0.9067002058083695, "grad_norm": 1.1322214603424072, "learning_rate": 9.210323462676415e-05, "loss": 0.13717477321624755, "memory(GiB)": 122.96, "step": 11895, "token_acc": 0.935416040853109, "train_speed(iter/s)": 0.242251 }, { "epoch": 0.9070813324186294, "grad_norm": 3.6114487648010254, "learning_rate": 9.209677522394106e-05, "loss": 0.15325380563735963, "memory(GiB)": 122.96, "step": 11900, "token_acc": 0.9436504695794202, "train_speed(iter/s)": 0.242285 }, { "epoch": 0.9074624590288894, "grad_norm": 0.8233906030654907, "learning_rate": 9.209031340706329e-05, "loss": 0.1379368543624878, "memory(GiB)": 122.96, "step": 11905, "token_acc": 0.9398841139546112, "train_speed(iter/s)": 0.242312 }, { "epoch": 0.9078435856391494, "grad_norm": 0.9061114192008972, "learning_rate": 9.208384917650135e-05, "loss": 0.1842959403991699, "memory(GiB)": 122.96, "step": 11910, "token_acc": 0.9273004575495679, "train_speed(iter/s)": 0.242342 }, { "epoch": 0.9082247122494093, "grad_norm": 1.4810551404953003, "learning_rate": 9.207738253262594e-05, "loss": 0.13917871713638305, "memory(GiB)": 122.96, "step": 11915, "token_acc": 0.9427231960864247, "train_speed(iter/s)": 0.242368 }, { "epoch": 0.9086058388596692, "grad_norm": 0.5086352229118347, "learning_rate": 9.207091347580791e-05, "loss": 0.12181053161621094, "memory(GiB)": 122.96, "step": 11920, "token_acc": 0.9508238276299112, "train_speed(iter/s)": 0.242392 }, { "epoch": 0.9089869654699291, "grad_norm": 0.9137089848518372, "learning_rate": 9.206444200641823e-05, "loss": 0.16192402839660644, "memory(GiB)": 122.96, "step": 11925, "token_acc": 0.9349429912810194, "train_speed(iter/s)": 0.24242 }, { "epoch": 0.909368092080189, "grad_norm": 0.5920147895812988, "learning_rate": 9.205796812482802e-05, "loss": 0.18976017236709594, "memory(GiB)": 122.96, "step": 11930, "token_acc": 0.937398900032352, "train_speed(iter/s)": 0.242439 }, { "epoch": 0.909749218690449, "grad_norm": 1.4929907321929932, "learning_rate": 9.205149183140849e-05, "loss": 0.19894405603408813, "memory(GiB)": 122.96, "step": 11935, "token_acc": 0.9274106175514626, "train_speed(iter/s)": 0.242467 }, { "epoch": 0.9101303453007089, "grad_norm": 0.6337088346481323, "learning_rate": 9.204501312653105e-05, "loss": 0.17703309059143066, "memory(GiB)": 122.96, "step": 11940, "token_acc": 0.9204326923076923, "train_speed(iter/s)": 0.242495 }, { "epoch": 0.9105114719109688, "grad_norm": 0.8873234987258911, "learning_rate": 9.203853201056722e-05, "loss": 0.10702053308486939, "memory(GiB)": 122.96, "step": 11945, "token_acc": 0.9467270194986073, "train_speed(iter/s)": 0.242528 }, { "epoch": 0.9108925985212287, "grad_norm": 1.1136358976364136, "learning_rate": 9.203204848388867e-05, "loss": 0.16947510242462158, "memory(GiB)": 122.96, "step": 11950, "token_acc": 0.9409078328259064, "train_speed(iter/s)": 0.242537 }, { "epoch": 0.9112737251314886, "grad_norm": 0.2798648476600647, "learning_rate": 9.202556254686719e-05, "loss": 0.13364312648773194, "memory(GiB)": 122.96, "step": 11955, "token_acc": 0.946081319976429, "train_speed(iter/s)": 0.242567 }, { "epoch": 0.9116548517417487, "grad_norm": 1.156320571899414, "learning_rate": 9.201907419987471e-05, "loss": 0.20252470970153807, "memory(GiB)": 122.96, "step": 11960, "token_acc": 0.9322588180331698, "train_speed(iter/s)": 0.242572 }, { "epoch": 0.9120359783520086, "grad_norm": 1.3011653423309326, "learning_rate": 9.201258344328332e-05, "loss": 0.1324612021446228, "memory(GiB)": 122.96, "step": 11965, "token_acc": 0.938949938949939, "train_speed(iter/s)": 0.242595 }, { "epoch": 0.9124171049622685, "grad_norm": 0.8753924369812012, "learning_rate": 9.200609027746524e-05, "loss": 0.16521816253662108, "memory(GiB)": 122.96, "step": 11970, "token_acc": 0.9456469456469456, "train_speed(iter/s)": 0.242619 }, { "epoch": 0.9127982315725284, "grad_norm": 1.6770938634872437, "learning_rate": 9.19995947027928e-05, "loss": 0.14216171503067015, "memory(GiB)": 122.96, "step": 11975, "token_acc": 0.9473977177845812, "train_speed(iter/s)": 0.242649 }, { "epoch": 0.9131793581827883, "grad_norm": 0.8921284079551697, "learning_rate": 9.199309671963852e-05, "loss": 0.11614675521850586, "memory(GiB)": 122.96, "step": 11980, "token_acc": 0.9495674581262654, "train_speed(iter/s)": 0.242668 }, { "epoch": 0.9135604847930483, "grad_norm": 0.9243173003196716, "learning_rate": 9.198659632837501e-05, "loss": 0.075668466091156, "memory(GiB)": 122.96, "step": 11985, "token_acc": 0.959643605870021, "train_speed(iter/s)": 0.242695 }, { "epoch": 0.9139416114033082, "grad_norm": 1.1301816701889038, "learning_rate": 9.198009352937504e-05, "loss": 0.19226405620574952, "memory(GiB)": 122.96, "step": 11990, "token_acc": 0.9277880884089759, "train_speed(iter/s)": 0.242717 }, { "epoch": 0.9143227380135681, "grad_norm": 0.7278009057044983, "learning_rate": 9.197358832301153e-05, "loss": 0.12943798303604126, "memory(GiB)": 122.96, "step": 11995, "token_acc": 0.9490366687383468, "train_speed(iter/s)": 0.242755 }, { "epoch": 0.914703864623828, "grad_norm": 0.3542937636375427, "learning_rate": 9.19670807096575e-05, "loss": 0.12744930982589722, "memory(GiB)": 122.96, "step": 12000, "token_acc": 0.943364457370274, "train_speed(iter/s)": 0.24277 }, { "epoch": 0.914703864623828, "eval_loss": 0.11662054061889648, "eval_runtime": 175.8834, "eval_samples_per_second": 3.013, "eval_steps_per_second": 3.013, "eval_token_acc": 0.9444009397024276, "step": 12000 }, { "epoch": 0.9150849912340879, "grad_norm": 1.172133207321167, "learning_rate": 9.196057068968613e-05, "loss": 0.16453678607940675, "memory(GiB)": 122.96, "step": 12005, "token_acc": 0.9444388799061342, "train_speed(iter/s)": 0.241925 }, { "epoch": 0.9154661178443478, "grad_norm": 0.588714063167572, "learning_rate": 9.195405826347077e-05, "loss": 0.13364335298538207, "memory(GiB)": 122.96, "step": 12010, "token_acc": 0.9535227459352932, "train_speed(iter/s)": 0.241944 }, { "epoch": 0.9158472444546079, "grad_norm": 1.0444954633712769, "learning_rate": 9.194754343138486e-05, "loss": 0.1653128981590271, "memory(GiB)": 122.96, "step": 12015, "token_acc": 0.9463912133891214, "train_speed(iter/s)": 0.24197 }, { "epoch": 0.9162283710648678, "grad_norm": 0.8094541430473328, "learning_rate": 9.194102619380198e-05, "loss": 0.1403631091117859, "memory(GiB)": 122.96, "step": 12020, "token_acc": 0.9442064463484292, "train_speed(iter/s)": 0.241981 }, { "epoch": 0.9166094976751277, "grad_norm": 1.0604108572006226, "learning_rate": 9.193450655109589e-05, "loss": 0.14506103992462158, "memory(GiB)": 122.96, "step": 12025, "token_acc": 0.9461538461538461, "train_speed(iter/s)": 0.241983 }, { "epoch": 0.9169906242853876, "grad_norm": 1.016266942024231, "learning_rate": 9.192798450364044e-05, "loss": 0.17972655296325685, "memory(GiB)": 122.96, "step": 12030, "token_acc": 0.9269921695071396, "train_speed(iter/s)": 0.24201 }, { "epoch": 0.9173717508956475, "grad_norm": 1.5686322450637817, "learning_rate": 9.192146005180967e-05, "loss": 0.16812283992767335, "memory(GiB)": 122.96, "step": 12035, "token_acc": 0.9339080459770115, "train_speed(iter/s)": 0.242035 }, { "epoch": 0.9177528775059075, "grad_norm": 0.5908207297325134, "learning_rate": 9.191493319597769e-05, "loss": 0.13720533847808838, "memory(GiB)": 122.96, "step": 12040, "token_acc": 0.9281705948372615, "train_speed(iter/s)": 0.242074 }, { "epoch": 0.9181340041161674, "grad_norm": 0.6789950728416443, "learning_rate": 9.190840393651878e-05, "loss": 0.18171563148498535, "memory(GiB)": 122.96, "step": 12045, "token_acc": 0.9337491337491337, "train_speed(iter/s)": 0.24209 }, { "epoch": 0.9185151307264273, "grad_norm": 0.45283934473991394, "learning_rate": 9.190187227380741e-05, "loss": 0.10858960151672363, "memory(GiB)": 122.96, "step": 12050, "token_acc": 0.9612636329447161, "train_speed(iter/s)": 0.242121 }, { "epoch": 0.9188962573366872, "grad_norm": 1.2656975984573364, "learning_rate": 9.18953382082181e-05, "loss": 0.14404940605163574, "memory(GiB)": 122.96, "step": 12055, "token_acc": 0.9592130518234165, "train_speed(iter/s)": 0.242146 }, { "epoch": 0.9192773839469471, "grad_norm": 1.6916420459747314, "learning_rate": 9.188880174012557e-05, "loss": 0.19252575635910035, "memory(GiB)": 122.96, "step": 12060, "token_acc": 0.9238694905552376, "train_speed(iter/s)": 0.242173 }, { "epoch": 0.9196585105572072, "grad_norm": 0.5286811590194702, "learning_rate": 9.188226286990465e-05, "loss": 0.20890052318573, "memory(GiB)": 122.96, "step": 12065, "token_acc": 0.9302971396834213, "train_speed(iter/s)": 0.242193 }, { "epoch": 0.9200396371674671, "grad_norm": 0.5750043988227844, "learning_rate": 9.18757215979303e-05, "loss": 0.18122923374176025, "memory(GiB)": 122.96, "step": 12070, "token_acc": 0.9187755102040817, "train_speed(iter/s)": 0.24222 }, { "epoch": 0.920420763777727, "grad_norm": 0.4938739538192749, "learning_rate": 9.186917792457766e-05, "loss": 0.10699106454849243, "memory(GiB)": 122.96, "step": 12075, "token_acc": 0.9535802469135802, "train_speed(iter/s)": 0.242249 }, { "epoch": 0.9208018903879869, "grad_norm": 0.4918135702610016, "learning_rate": 9.186263185022195e-05, "loss": 0.1499289631843567, "memory(GiB)": 122.96, "step": 12080, "token_acc": 0.9258737316798196, "train_speed(iter/s)": 0.24228 }, { "epoch": 0.9211830169982468, "grad_norm": 0.8623130917549133, "learning_rate": 9.185608337523858e-05, "loss": 0.10095444917678834, "memory(GiB)": 122.96, "step": 12085, "token_acc": 0.9533930469105815, "train_speed(iter/s)": 0.242293 }, { "epoch": 0.9215641436085067, "grad_norm": 0.4240442216396332, "learning_rate": 9.184953250000306e-05, "loss": 0.10906807184219361, "memory(GiB)": 122.96, "step": 12090, "token_acc": 0.9564817652467055, "train_speed(iter/s)": 0.242309 }, { "epoch": 0.9219452702187667, "grad_norm": 0.7125605344772339, "learning_rate": 9.184297922489104e-05, "loss": 0.1341407537460327, "memory(GiB)": 122.96, "step": 12095, "token_acc": 0.9400577756379394, "train_speed(iter/s)": 0.242338 }, { "epoch": 0.9223263968290266, "grad_norm": 0.5970594882965088, "learning_rate": 9.183642355027837e-05, "loss": 0.1077890157699585, "memory(GiB)": 122.96, "step": 12100, "token_acc": 0.9543482751497624, "train_speed(iter/s)": 0.24236 }, { "epoch": 0.9227075234392865, "grad_norm": 0.7044920921325684, "learning_rate": 9.182986547654093e-05, "loss": 0.11380608081817627, "memory(GiB)": 122.96, "step": 12105, "token_acc": 0.9502672554032071, "train_speed(iter/s)": 0.242391 }, { "epoch": 0.9230886500495464, "grad_norm": 1.2874997854232788, "learning_rate": 9.182330500405483e-05, "loss": 0.15849599838256836, "memory(GiB)": 122.96, "step": 12110, "token_acc": 0.942582378730024, "train_speed(iter/s)": 0.242408 }, { "epoch": 0.9234697766598063, "grad_norm": 0.9350869059562683, "learning_rate": 9.181674213319625e-05, "loss": 0.1166802167892456, "memory(GiB)": 122.96, "step": 12115, "token_acc": 0.9410282258064516, "train_speed(iter/s)": 0.242441 }, { "epoch": 0.9238509032700664, "grad_norm": 0.5365926623344421, "learning_rate": 9.181017686434159e-05, "loss": 0.14451483488082886, "memory(GiB)": 122.96, "step": 12120, "token_acc": 0.9372252902062437, "train_speed(iter/s)": 0.242452 }, { "epoch": 0.9242320298803263, "grad_norm": 1.0447312593460083, "learning_rate": 9.18036091978673e-05, "loss": 0.1282801151275635, "memory(GiB)": 122.96, "step": 12125, "token_acc": 0.9342583415923357, "train_speed(iter/s)": 0.242485 }, { "epoch": 0.9246131564905862, "grad_norm": 0.7450137734413147, "learning_rate": 9.179703913415001e-05, "loss": 0.10404415130615234, "memory(GiB)": 122.96, "step": 12130, "token_acc": 0.9616138516824567, "train_speed(iter/s)": 0.242501 }, { "epoch": 0.9249942831008461, "grad_norm": 1.4281253814697266, "learning_rate": 9.179046667356649e-05, "loss": 0.1529651403427124, "memory(GiB)": 122.96, "step": 12135, "token_acc": 0.9367204137511409, "train_speed(iter/s)": 0.242529 }, { "epoch": 0.925375409711106, "grad_norm": 0.7040519118309021, "learning_rate": 9.178389181649364e-05, "loss": 0.13391484022140504, "memory(GiB)": 122.96, "step": 12140, "token_acc": 0.942960615663196, "train_speed(iter/s)": 0.242528 }, { "epoch": 0.925756536321366, "grad_norm": 0.8738211989402771, "learning_rate": 9.177731456330849e-05, "loss": 0.14983537197113037, "memory(GiB)": 122.96, "step": 12145, "token_acc": 0.9513040607461208, "train_speed(iter/s)": 0.242541 }, { "epoch": 0.9261376629316259, "grad_norm": 0.19297455251216888, "learning_rate": 9.177073491438823e-05, "loss": 0.12810500860214233, "memory(GiB)": 122.96, "step": 12150, "token_acc": 0.9293655984303466, "train_speed(iter/s)": 0.24258 }, { "epoch": 0.9265187895418858, "grad_norm": 0.18724516034126282, "learning_rate": 9.176415287011015e-05, "loss": 0.12594324350357056, "memory(GiB)": 122.96, "step": 12155, "token_acc": 0.9547442799461642, "train_speed(iter/s)": 0.242602 }, { "epoch": 0.9268999161521457, "grad_norm": 0.749176025390625, "learning_rate": 9.175756843085173e-05, "loss": 0.17756813764572144, "memory(GiB)": 122.96, "step": 12160, "token_acc": 0.9319654427645788, "train_speed(iter/s)": 0.242622 }, { "epoch": 0.9272810427624056, "grad_norm": 1.18508780002594, "learning_rate": 9.175098159699052e-05, "loss": 0.1512345552444458, "memory(GiB)": 122.96, "step": 12165, "token_acc": 0.9407054750054101, "train_speed(iter/s)": 0.242647 }, { "epoch": 0.9276621693726655, "grad_norm": 1.6245020627975464, "learning_rate": 9.174439236890426e-05, "loss": 0.177998948097229, "memory(GiB)": 122.96, "step": 12170, "token_acc": 0.930623867460523, "train_speed(iter/s)": 0.242671 }, { "epoch": 0.9280432959829256, "grad_norm": 0.6511953473091125, "learning_rate": 9.173780074697084e-05, "loss": 0.16304129362106323, "memory(GiB)": 122.96, "step": 12175, "token_acc": 0.928555678059537, "train_speed(iter/s)": 0.242696 }, { "epoch": 0.9284244225931855, "grad_norm": 0.6800899505615234, "learning_rate": 9.173120673156822e-05, "loss": 0.10091533660888671, "memory(GiB)": 122.96, "step": 12180, "token_acc": 0.9547600913937547, "train_speed(iter/s)": 0.242709 }, { "epoch": 0.9288055492034454, "grad_norm": 0.5256918668746948, "learning_rate": 9.172461032307455e-05, "loss": 0.11389726400375366, "memory(GiB)": 122.96, "step": 12185, "token_acc": 0.9554455445544554, "train_speed(iter/s)": 0.242732 }, { "epoch": 0.9291866758137053, "grad_norm": 0.5612916946411133, "learning_rate": 9.171801152186811e-05, "loss": 0.1606438159942627, "memory(GiB)": 122.96, "step": 12190, "token_acc": 0.9496970809619975, "train_speed(iter/s)": 0.242756 }, { "epoch": 0.9295678024239652, "grad_norm": 0.5194783806800842, "learning_rate": 9.17114103283273e-05, "loss": 0.13194029331207274, "memory(GiB)": 122.96, "step": 12195, "token_acc": 0.9495949594959496, "train_speed(iter/s)": 0.242778 }, { "epoch": 0.9299489290342252, "grad_norm": 0.8489723205566406, "learning_rate": 9.170480674283066e-05, "loss": 0.127402400970459, "memory(GiB)": 122.96, "step": 12200, "token_acc": 0.9557522123893806, "train_speed(iter/s)": 0.242799 }, { "epoch": 0.9299489290342252, "eval_loss": 0.11756382882595062, "eval_runtime": 174.1205, "eval_samples_per_second": 3.044, "eval_steps_per_second": 3.044, "eval_token_acc": 0.9446644780434914, "step": 12200 }, { "epoch": 0.9303300556444851, "grad_norm": 0.6613253951072693, "learning_rate": 9.16982007657569e-05, "loss": 0.1644328236579895, "memory(GiB)": 122.96, "step": 12205, "token_acc": 0.9443151427538851, "train_speed(iter/s)": 0.241981 }, { "epoch": 0.930711182254745, "grad_norm": 0.9121660590171814, "learning_rate": 9.169159239748484e-05, "loss": 0.20351755619049072, "memory(GiB)": 122.96, "step": 12210, "token_acc": 0.9399624765478424, "train_speed(iter/s)": 0.241994 }, { "epoch": 0.9310923088650049, "grad_norm": 0.623970627784729, "learning_rate": 9.168498163839341e-05, "loss": 0.15243160724639893, "memory(GiB)": 122.96, "step": 12215, "token_acc": 0.936177533115431, "train_speed(iter/s)": 0.242016 }, { "epoch": 0.9314734354752648, "grad_norm": 1.2924596071243286, "learning_rate": 9.167836848886174e-05, "loss": 0.14186103343963624, "memory(GiB)": 122.96, "step": 12220, "token_acc": 0.9375605033881897, "train_speed(iter/s)": 0.242051 }, { "epoch": 0.9318545620855249, "grad_norm": 0.9273470044136047, "learning_rate": 9.167175294926904e-05, "loss": 0.21459746360778809, "memory(GiB)": 122.96, "step": 12225, "token_acc": 0.9415136120880553, "train_speed(iter/s)": 0.242062 }, { "epoch": 0.9322356886957848, "grad_norm": 0.9148798584938049, "learning_rate": 9.166513501999468e-05, "loss": 0.21885323524475098, "memory(GiB)": 122.96, "step": 12230, "token_acc": 0.9166825185466997, "train_speed(iter/s)": 0.242087 }, { "epoch": 0.9326168153060447, "grad_norm": 0.7379205822944641, "learning_rate": 9.16585147014182e-05, "loss": 0.13861508369445802, "memory(GiB)": 122.96, "step": 12235, "token_acc": 0.9403431993550616, "train_speed(iter/s)": 0.242091 }, { "epoch": 0.9329979419163046, "grad_norm": 2.0146560668945312, "learning_rate": 9.16518919939192e-05, "loss": 0.1408682346343994, "memory(GiB)": 122.96, "step": 12240, "token_acc": 0.9510413849066811, "train_speed(iter/s)": 0.242116 }, { "epoch": 0.9333790685265645, "grad_norm": 1.3773797750473022, "learning_rate": 9.164526689787749e-05, "loss": 0.13957394361495973, "memory(GiB)": 122.96, "step": 12245, "token_acc": 0.9570980615735462, "train_speed(iter/s)": 0.242116 }, { "epoch": 0.9337601951368244, "grad_norm": 0.7364859580993652, "learning_rate": 9.163863941367298e-05, "loss": 0.16790409088134767, "memory(GiB)": 122.96, "step": 12250, "token_acc": 0.9417133706965573, "train_speed(iter/s)": 0.242136 }, { "epoch": 0.9341413217470844, "grad_norm": 0.8190340399742126, "learning_rate": 9.163200954168573e-05, "loss": 0.1943003296852112, "memory(GiB)": 122.96, "step": 12255, "token_acc": 0.9168556311413454, "train_speed(iter/s)": 0.242167 }, { "epoch": 0.9345224483573443, "grad_norm": 0.8544818758964539, "learning_rate": 9.162537728229592e-05, "loss": 0.15196917057037354, "memory(GiB)": 122.96, "step": 12260, "token_acc": 0.9480213567839196, "train_speed(iter/s)": 0.242189 }, { "epoch": 0.9349035749676042, "grad_norm": 0.9462592005729675, "learning_rate": 9.16187426358839e-05, "loss": 0.19429467916488646, "memory(GiB)": 122.96, "step": 12265, "token_acc": 0.9252637423653526, "train_speed(iter/s)": 0.242194 }, { "epoch": 0.9352847015778641, "grad_norm": 0.6628739237785339, "learning_rate": 9.161210560283011e-05, "loss": 0.16618529558181763, "memory(GiB)": 122.96, "step": 12270, "token_acc": 0.9366056873754754, "train_speed(iter/s)": 0.242211 }, { "epoch": 0.935665828188124, "grad_norm": 1.2484318017959595, "learning_rate": 9.160546618351517e-05, "loss": 0.16949933767318726, "memory(GiB)": 122.96, "step": 12275, "token_acc": 0.9192233009708738, "train_speed(iter/s)": 0.242241 }, { "epoch": 0.9360469547983841, "grad_norm": 0.7353682518005371, "learning_rate": 9.159882437831984e-05, "loss": 0.0866227388381958, "memory(GiB)": 122.96, "step": 12280, "token_acc": 0.9572776949826131, "train_speed(iter/s)": 0.24227 }, { "epoch": 0.936428081408644, "grad_norm": 1.710636854171753, "learning_rate": 9.159218018762495e-05, "loss": 0.12940901517868042, "memory(GiB)": 122.96, "step": 12285, "token_acc": 0.9474446513552659, "train_speed(iter/s)": 0.242291 }, { "epoch": 0.9368092080189039, "grad_norm": 0.6022464036941528, "learning_rate": 9.158553361181154e-05, "loss": 0.10844582319259644, "memory(GiB)": 122.96, "step": 12290, "token_acc": 0.9587426326129665, "train_speed(iter/s)": 0.242305 }, { "epoch": 0.9371903346291638, "grad_norm": 0.8708229064941406, "learning_rate": 9.157888465126077e-05, "loss": 0.12513418197631837, "memory(GiB)": 122.96, "step": 12295, "token_acc": 0.9524375743162902, "train_speed(iter/s)": 0.242315 }, { "epoch": 0.9375714612394237, "grad_norm": 0.6610813140869141, "learning_rate": 9.157223330635391e-05, "loss": 0.09572759866714478, "memory(GiB)": 122.96, "step": 12300, "token_acc": 0.9571687110396139, "train_speed(iter/s)": 0.242335 }, { "epoch": 0.9379525878496837, "grad_norm": 1.1380579471588135, "learning_rate": 9.156557957747238e-05, "loss": 0.1792851209640503, "memory(GiB)": 122.96, "step": 12305, "token_acc": 0.9365604329524955, "train_speed(iter/s)": 0.242363 }, { "epoch": 0.9383337144599436, "grad_norm": 0.7449556589126587, "learning_rate": 9.155892346499776e-05, "loss": 0.157357656955719, "memory(GiB)": 122.96, "step": 12310, "token_acc": 0.9449644327967054, "train_speed(iter/s)": 0.242382 }, { "epoch": 0.9387148410702035, "grad_norm": 0.5486127138137817, "learning_rate": 9.155226496931173e-05, "loss": 0.11645959615707398, "memory(GiB)": 122.96, "step": 12315, "token_acc": 0.9476993865030675, "train_speed(iter/s)": 0.242397 }, { "epoch": 0.9390959676804634, "grad_norm": 0.41119739413261414, "learning_rate": 9.154560409079614e-05, "loss": 0.1562546968460083, "memory(GiB)": 122.96, "step": 12320, "token_acc": 0.9329189632930691, "train_speed(iter/s)": 0.242404 }, { "epoch": 0.9394770942907233, "grad_norm": 0.8167228102684021, "learning_rate": 9.153894082983295e-05, "loss": 0.17147165536880493, "memory(GiB)": 122.96, "step": 12325, "token_acc": 0.9292970337261276, "train_speed(iter/s)": 0.242422 }, { "epoch": 0.9398582209009833, "grad_norm": 1.3649063110351562, "learning_rate": 9.153227518680426e-05, "loss": 0.20161380767822265, "memory(GiB)": 122.96, "step": 12330, "token_acc": 0.9389014606048138, "train_speed(iter/s)": 0.242445 }, { "epoch": 0.9402393475112433, "grad_norm": 0.6043210625648499, "learning_rate": 9.15256071620923e-05, "loss": 0.1568316102027893, "memory(GiB)": 122.96, "step": 12335, "token_acc": 0.944613227389577, "train_speed(iter/s)": 0.242466 }, { "epoch": 0.9406204741215032, "grad_norm": 0.9746297001838684, "learning_rate": 9.15189367560795e-05, "loss": 0.10245785713195801, "memory(GiB)": 122.96, "step": 12340, "token_acc": 0.9493368123505109, "train_speed(iter/s)": 0.242492 }, { "epoch": 0.9410016007317631, "grad_norm": 0.9292162656784058, "learning_rate": 9.151226396914834e-05, "loss": 0.13858909606933595, "memory(GiB)": 122.96, "step": 12345, "token_acc": 0.931513297246411, "train_speed(iter/s)": 0.242518 }, { "epoch": 0.941382727342023, "grad_norm": 0.5452598929405212, "learning_rate": 9.150558880168148e-05, "loss": 0.1288095474243164, "memory(GiB)": 122.96, "step": 12350, "token_acc": 0.9481005885500268, "train_speed(iter/s)": 0.242529 }, { "epoch": 0.9417638539522829, "grad_norm": 1.2067897319793701, "learning_rate": 9.149891125406172e-05, "loss": 0.148617160320282, "memory(GiB)": 122.96, "step": 12355, "token_acc": 0.9491894507621582, "train_speed(iter/s)": 0.242555 }, { "epoch": 0.9421449805625429, "grad_norm": 1.083177089691162, "learning_rate": 9.149223132667197e-05, "loss": 0.1334935188293457, "memory(GiB)": 122.96, "step": 12360, "token_acc": 0.955533790401567, "train_speed(iter/s)": 0.242578 }, { "epoch": 0.9425261071728028, "grad_norm": 0.6250692009925842, "learning_rate": 9.14855490198953e-05, "loss": 0.10021820068359374, "memory(GiB)": 122.96, "step": 12365, "token_acc": 0.9611163374098464, "train_speed(iter/s)": 0.24261 }, { "epoch": 0.9429072337830627, "grad_norm": 0.5646321177482605, "learning_rate": 9.147886433411492e-05, "loss": 0.12314031124114991, "memory(GiB)": 122.96, "step": 12370, "token_acc": 0.9439769707705934, "train_speed(iter/s)": 0.242634 }, { "epoch": 0.9432883603933226, "grad_norm": 0.7545549273490906, "learning_rate": 9.147217726971416e-05, "loss": 0.11838376522064209, "memory(GiB)": 122.96, "step": 12375, "token_acc": 0.9513023782559457, "train_speed(iter/s)": 0.242664 }, { "epoch": 0.9436694870035826, "grad_norm": 0.8307051062583923, "learning_rate": 9.146548782707647e-05, "loss": 0.17386358976364136, "memory(GiB)": 122.96, "step": 12380, "token_acc": 0.9273821183713447, "train_speed(iter/s)": 0.242678 }, { "epoch": 0.9440506136138426, "grad_norm": 0.8261755704879761, "learning_rate": 9.145879600658548e-05, "loss": 0.1268669605255127, "memory(GiB)": 122.96, "step": 12385, "token_acc": 0.9478021978021978, "train_speed(iter/s)": 0.24269 }, { "epoch": 0.9444317402241025, "grad_norm": 2.044796943664551, "learning_rate": 9.145210180862493e-05, "loss": 0.16295242309570312, "memory(GiB)": 122.96, "step": 12390, "token_acc": 0.9307644110275689, "train_speed(iter/s)": 0.242711 }, { "epoch": 0.9448128668343624, "grad_norm": 0.8609843254089355, "learning_rate": 9.144540523357872e-05, "loss": 0.11911549568176269, "memory(GiB)": 122.96, "step": 12395, "token_acc": 0.9535874439461883, "train_speed(iter/s)": 0.242737 }, { "epoch": 0.9451939934446223, "grad_norm": 2.5846669673919678, "learning_rate": 9.143870628183083e-05, "loss": 0.10483273267745971, "memory(GiB)": 122.96, "step": 12400, "token_acc": 0.9568657874321179, "train_speed(iter/s)": 0.242751 }, { "epoch": 0.9451939934446223, "eval_loss": 0.1134624108672142, "eval_runtime": 161.8205, "eval_samples_per_second": 3.275, "eval_steps_per_second": 3.275, "eval_token_acc": 0.9463059454249744, "step": 12400 }, { "epoch": 0.9455751200548822, "grad_norm": 2.0358991622924805, "learning_rate": 9.143200495376545e-05, "loss": 0.19119592905044555, "memory(GiB)": 122.96, "step": 12405, "token_acc": 0.9461599075049156, "train_speed(iter/s)": 0.242021 }, { "epoch": 0.9459562466651421, "grad_norm": 0.33811354637145996, "learning_rate": 9.142530124976683e-05, "loss": 0.1471969962120056, "memory(GiB)": 122.96, "step": 12410, "token_acc": 0.9510655090765588, "train_speed(iter/s)": 0.242061 }, { "epoch": 0.9463373732754021, "grad_norm": 0.9500938653945923, "learning_rate": 9.141859517021945e-05, "loss": 0.14927375316619873, "memory(GiB)": 122.96, "step": 12415, "token_acc": 0.9455623147714866, "train_speed(iter/s)": 0.242078 }, { "epoch": 0.946718499885662, "grad_norm": 0.8465932607650757, "learning_rate": 9.141188671550782e-05, "loss": 0.19367703199386596, "memory(GiB)": 122.96, "step": 12420, "token_acc": 0.915057915057915, "train_speed(iter/s)": 0.242099 }, { "epoch": 0.9470996264959219, "grad_norm": 1.0269887447357178, "learning_rate": 9.140517588601667e-05, "loss": 0.12113461494445801, "memory(GiB)": 122.96, "step": 12425, "token_acc": 0.9513055751587862, "train_speed(iter/s)": 0.242139 }, { "epoch": 0.9474807531061818, "grad_norm": 1.2539961338043213, "learning_rate": 9.139846268213083e-05, "loss": 0.1467184066772461, "memory(GiB)": 122.96, "step": 12430, "token_acc": 0.9373636646930508, "train_speed(iter/s)": 0.24217 }, { "epoch": 0.9478618797164418, "grad_norm": 1.065714716911316, "learning_rate": 9.139174710423525e-05, "loss": 0.17355780601501464, "memory(GiB)": 122.96, "step": 12435, "token_acc": 0.9239543726235742, "train_speed(iter/s)": 0.242198 }, { "epoch": 0.9482430063267018, "grad_norm": 1.2264153957366943, "learning_rate": 9.138502915271508e-05, "loss": 0.13309202194213868, "memory(GiB)": 122.96, "step": 12440, "token_acc": 0.9573585531773685, "train_speed(iter/s)": 0.242221 }, { "epoch": 0.9486241329369617, "grad_norm": 0.8538663387298584, "learning_rate": 9.137830882795552e-05, "loss": 0.19082493782043458, "memory(GiB)": 122.96, "step": 12445, "token_acc": 0.9269570011025359, "train_speed(iter/s)": 0.242236 }, { "epoch": 0.9490052595472216, "grad_norm": 1.2173811197280884, "learning_rate": 9.137158613034198e-05, "loss": 0.17732337713241578, "memory(GiB)": 122.96, "step": 12450, "token_acc": 0.9360730593607306, "train_speed(iter/s)": 0.242254 }, { "epoch": 0.9493863861574815, "grad_norm": 0.605402946472168, "learning_rate": 9.136486106025996e-05, "loss": 0.13959741592407227, "memory(GiB)": 122.96, "step": 12455, "token_acc": 0.9447998537744471, "train_speed(iter/s)": 0.242282 }, { "epoch": 0.9497675127677414, "grad_norm": 0.7813966274261475, "learning_rate": 9.13581336180951e-05, "loss": 0.13818759918212892, "memory(GiB)": 122.96, "step": 12460, "token_acc": 0.9445552453131233, "train_speed(iter/s)": 0.242307 }, { "epoch": 0.9501486393780014, "grad_norm": 0.8371070623397827, "learning_rate": 9.135140380423324e-05, "loss": 0.1973399519920349, "memory(GiB)": 122.96, "step": 12465, "token_acc": 0.9247193484481621, "train_speed(iter/s)": 0.242332 }, { "epoch": 0.9505297659882613, "grad_norm": 0.9141432046890259, "learning_rate": 9.134467161906024e-05, "loss": 0.1688783884048462, "memory(GiB)": 122.96, "step": 12470, "token_acc": 0.9245591710598073, "train_speed(iter/s)": 0.242358 }, { "epoch": 0.9509108925985212, "grad_norm": 1.034576654434204, "learning_rate": 9.133793706296217e-05, "loss": 0.14325401782989503, "memory(GiB)": 122.96, "step": 12475, "token_acc": 0.9196601941747573, "train_speed(iter/s)": 0.242391 }, { "epoch": 0.9512920192087811, "grad_norm": 0.9074895977973938, "learning_rate": 9.133120013632526e-05, "loss": 0.14475760459899903, "memory(GiB)": 122.96, "step": 12480, "token_acc": 0.9478218465539662, "train_speed(iter/s)": 0.242411 }, { "epoch": 0.951673145819041, "grad_norm": 1.2916032075881958, "learning_rate": 9.132446083953582e-05, "loss": 0.17250173091888427, "memory(GiB)": 122.96, "step": 12485, "token_acc": 0.9295634920634921, "train_speed(iter/s)": 0.242442 }, { "epoch": 0.952054272429301, "grad_norm": 0.4414958357810974, "learning_rate": 9.131771917298032e-05, "loss": 0.1321173667907715, "memory(GiB)": 122.96, "step": 12490, "token_acc": 0.9514209375759047, "train_speed(iter/s)": 0.242468 }, { "epoch": 0.952435399039561, "grad_norm": 0.7453198432922363, "learning_rate": 9.131097513704536e-05, "loss": 0.15776137113571168, "memory(GiB)": 122.96, "step": 12495, "token_acc": 0.936750651607298, "train_speed(iter/s)": 0.24249 }, { "epoch": 0.9528165256498209, "grad_norm": 0.9814322590827942, "learning_rate": 9.130422873211768e-05, "loss": 0.17042393684387208, "memory(GiB)": 122.96, "step": 12500, "token_acc": 0.9350923482849605, "train_speed(iter/s)": 0.242521 }, { "epoch": 0.9531976522600808, "grad_norm": 0.9558305144309998, "learning_rate": 9.129747995858418e-05, "loss": 0.12569780349731446, "memory(GiB)": 122.96, "step": 12505, "token_acc": 0.9492543957266859, "train_speed(iter/s)": 0.242543 }, { "epoch": 0.9535787788703407, "grad_norm": 0.7548251748085022, "learning_rate": 9.129072881683181e-05, "loss": 0.16030017137527466, "memory(GiB)": 122.96, "step": 12510, "token_acc": 0.9379007144165598, "train_speed(iter/s)": 0.242564 }, { "epoch": 0.9539599054806006, "grad_norm": 1.0573387145996094, "learning_rate": 9.128397530724778e-05, "loss": 0.11236531734466552, "memory(GiB)": 122.96, "step": 12515, "token_acc": 0.9453681710213777, "train_speed(iter/s)": 0.242601 }, { "epoch": 0.9543410320908606, "grad_norm": 0.846492350101471, "learning_rate": 9.127721943021934e-05, "loss": 0.11807132959365844, "memory(GiB)": 122.96, "step": 12520, "token_acc": 0.9395418700713482, "train_speed(iter/s)": 0.242633 }, { "epoch": 0.9547221587011205, "grad_norm": 1.4017573595046997, "learning_rate": 9.127046118613392e-05, "loss": 0.12895534038543702, "memory(GiB)": 122.96, "step": 12525, "token_acc": 0.9441970911249629, "train_speed(iter/s)": 0.242666 }, { "epoch": 0.9551032853113804, "grad_norm": 0.6587196588516235, "learning_rate": 9.126370057537906e-05, "loss": 0.12625349760055543, "memory(GiB)": 122.96, "step": 12530, "token_acc": 0.945712523133868, "train_speed(iter/s)": 0.242691 }, { "epoch": 0.9554844119216404, "grad_norm": 1.005194902420044, "learning_rate": 9.125693759834247e-05, "loss": 0.18841396570205687, "memory(GiB)": 122.96, "step": 12535, "token_acc": 0.9406636670416197, "train_speed(iter/s)": 0.242697 }, { "epoch": 0.9558655385319003, "grad_norm": 0.6691261529922485, "learning_rate": 9.125017225541196e-05, "loss": 0.1545950651168823, "memory(GiB)": 122.96, "step": 12540, "token_acc": 0.9371418338108882, "train_speed(iter/s)": 0.242721 }, { "epoch": 0.9562466651421603, "grad_norm": 1.5457665920257568, "learning_rate": 9.124340454697549e-05, "loss": 0.15577960014343262, "memory(GiB)": 122.96, "step": 12545, "token_acc": 0.9561944904410657, "train_speed(iter/s)": 0.242736 }, { "epoch": 0.9566277917524202, "grad_norm": 4.163644313812256, "learning_rate": 9.123663447342117e-05, "loss": 0.05284888744354248, "memory(GiB)": 122.96, "step": 12550, "token_acc": 0.96533203125, "train_speed(iter/s)": 0.242772 }, { "epoch": 0.9570089183626801, "grad_norm": 0.9373558163642883, "learning_rate": 9.122986203513722e-05, "loss": 0.11360034942626954, "memory(GiB)": 122.96, "step": 12555, "token_acc": 0.9516150650046911, "train_speed(iter/s)": 0.242776 }, { "epoch": 0.95739004497294, "grad_norm": 1.166609525680542, "learning_rate": 9.1223087232512e-05, "loss": 0.12866644859313964, "memory(GiB)": 122.96, "step": 12560, "token_acc": 0.9610027855153204, "train_speed(iter/s)": 0.242796 }, { "epoch": 0.9577711715831999, "grad_norm": 1.1007999181747437, "learning_rate": 9.121631006593406e-05, "loss": 0.14236094951629638, "memory(GiB)": 122.96, "step": 12565, "token_acc": 0.9532984293193717, "train_speed(iter/s)": 0.242815 }, { "epoch": 0.9581522981934598, "grad_norm": 1.0412156581878662, "learning_rate": 9.120953053579198e-05, "loss": 0.1653854489326477, "memory(GiB)": 122.96, "step": 12570, "token_acc": 0.9299290780141845, "train_speed(iter/s)": 0.242845 }, { "epoch": 0.9585334248037198, "grad_norm": 1.0808460712432861, "learning_rate": 9.120274864247455e-05, "loss": 0.16031842231750487, "memory(GiB)": 122.96, "step": 12575, "token_acc": 0.9420631182289213, "train_speed(iter/s)": 0.242869 }, { "epoch": 0.9589145514139797, "grad_norm": 2.9490091800689697, "learning_rate": 9.11959643863707e-05, "loss": 0.17578827142715453, "memory(GiB)": 122.96, "step": 12580, "token_acc": 0.9136325148179509, "train_speed(iter/s)": 0.242904 }, { "epoch": 0.9592956780242397, "grad_norm": 0.4628011882305145, "learning_rate": 9.118917776786949e-05, "loss": 0.12529417276382446, "memory(GiB)": 122.96, "step": 12585, "token_acc": 0.949454200284765, "train_speed(iter/s)": 0.24292 }, { "epoch": 0.9596768046344996, "grad_norm": 0.6801323294639587, "learning_rate": 9.118238878736004e-05, "loss": 0.19715020656585694, "memory(GiB)": 122.96, "step": 12590, "token_acc": 0.9165322580645161, "train_speed(iter/s)": 0.242939 }, { "epoch": 0.9600579312447595, "grad_norm": 0.5149279236793518, "learning_rate": 9.117559744523172e-05, "loss": 0.16272096633911132, "memory(GiB)": 122.96, "step": 12595, "token_acc": 0.935361216730038, "train_speed(iter/s)": 0.242968 }, { "epoch": 0.9604390578550195, "grad_norm": 0.7931864857673645, "learning_rate": 9.116880374187395e-05, "loss": 0.09934694766998291, "memory(GiB)": 122.96, "step": 12600, "token_acc": 0.9559386973180076, "train_speed(iter/s)": 0.242998 }, { "epoch": 0.9604390578550195, "eval_loss": 0.11736884713172913, "eval_runtime": 160.8089, "eval_samples_per_second": 3.296, "eval_steps_per_second": 3.296, "eval_token_acc": 0.945108728389856, "step": 12600 }, { "epoch": 0.9608201844652794, "grad_norm": 0.4606764018535614, "learning_rate": 9.116200767767636e-05, "loss": 0.19204888343811036, "memory(GiB)": 122.96, "step": 12605, "token_acc": 0.9445288359651138, "train_speed(iter/s)": 0.242283 }, { "epoch": 0.9612013110755393, "grad_norm": 1.0171664953231812, "learning_rate": 9.115520925302862e-05, "loss": 0.15200179815292358, "memory(GiB)": 122.96, "step": 12610, "token_acc": 0.9323843416370107, "train_speed(iter/s)": 0.242317 }, { "epoch": 0.9615824376857992, "grad_norm": 0.8562371134757996, "learning_rate": 9.114840846832063e-05, "loss": 0.1556476831436157, "memory(GiB)": 122.96, "step": 12615, "token_acc": 0.9403681290973273, "train_speed(iter/s)": 0.242327 }, { "epoch": 0.9619635642960591, "grad_norm": 0.7140572667121887, "learning_rate": 9.114160532394235e-05, "loss": 0.1564452528953552, "memory(GiB)": 122.96, "step": 12620, "token_acc": 0.9382213170400543, "train_speed(iter/s)": 0.242344 }, { "epoch": 0.9623446909063191, "grad_norm": 0.536054253578186, "learning_rate": 9.113479982028392e-05, "loss": 0.10607349872589111, "memory(GiB)": 122.96, "step": 12625, "token_acc": 0.9530367717945186, "train_speed(iter/s)": 0.242356 }, { "epoch": 0.962725817516579, "grad_norm": 1.4696619510650635, "learning_rate": 9.112799195773562e-05, "loss": 0.12701845169067383, "memory(GiB)": 122.96, "step": 12630, "token_acc": 0.9432058584214809, "train_speed(iter/s)": 0.242377 }, { "epoch": 0.963106944126839, "grad_norm": 1.049810767173767, "learning_rate": 9.112118173668784e-05, "loss": 0.09789316654205323, "memory(GiB)": 122.96, "step": 12635, "token_acc": 0.9566787003610109, "train_speed(iter/s)": 0.242411 }, { "epoch": 0.9634880707370989, "grad_norm": 2.4460391998291016, "learning_rate": 9.111436915753112e-05, "loss": 0.24358758926391602, "memory(GiB)": 122.96, "step": 12640, "token_acc": 0.9136690647482014, "train_speed(iter/s)": 0.242438 }, { "epoch": 0.9638691973473588, "grad_norm": 1.1424875259399414, "learning_rate": 9.110755422065611e-05, "loss": 0.1291264057159424, "memory(GiB)": 122.96, "step": 12645, "token_acc": 0.9537767756482526, "train_speed(iter/s)": 0.242458 }, { "epoch": 0.9642503239576187, "grad_norm": 0.5858159065246582, "learning_rate": 9.110073692645363e-05, "loss": 0.22132444381713867, "memory(GiB)": 122.96, "step": 12650, "token_acc": 0.9265114662960389, "train_speed(iter/s)": 0.24248 }, { "epoch": 0.9646314505678787, "grad_norm": 0.9348229169845581, "learning_rate": 9.109391727531463e-05, "loss": 0.1623238205909729, "memory(GiB)": 122.96, "step": 12655, "token_acc": 0.9408872041798996, "train_speed(iter/s)": 0.242476 }, { "epoch": 0.9650125771781386, "grad_norm": 1.4766432046890259, "learning_rate": 9.108709526763016e-05, "loss": 0.21986565589904786, "memory(GiB)": 122.96, "step": 12660, "token_acc": 0.9324947589098532, "train_speed(iter/s)": 0.2425 }, { "epoch": 0.9653937037883985, "grad_norm": 1.383824348449707, "learning_rate": 9.108027090379145e-05, "loss": 0.18708276748657227, "memory(GiB)": 122.96, "step": 12665, "token_acc": 0.9366827253957329, "train_speed(iter/s)": 0.242518 }, { "epoch": 0.9657748303986584, "grad_norm": 1.3704112768173218, "learning_rate": 9.107344418418984e-05, "loss": 0.09474117755889892, "memory(GiB)": 122.96, "step": 12670, "token_acc": 0.9607694637988489, "train_speed(iter/s)": 0.242525 }, { "epoch": 0.9661559570089183, "grad_norm": 1.1993939876556396, "learning_rate": 9.10666151092168e-05, "loss": 0.14071383476257324, "memory(GiB)": 122.96, "step": 12675, "token_acc": 0.9421878358048571, "train_speed(iter/s)": 0.242548 }, { "epoch": 0.9665370836191783, "grad_norm": 0.9820361137390137, "learning_rate": 9.105978367926396e-05, "loss": 0.1502423882484436, "memory(GiB)": 122.96, "step": 12680, "token_acc": 0.9358552631578947, "train_speed(iter/s)": 0.242567 }, { "epoch": 0.9669182102294382, "grad_norm": 0.6660642027854919, "learning_rate": 9.105294989472308e-05, "loss": 0.09964171051979065, "memory(GiB)": 122.96, "step": 12685, "token_acc": 0.9600261054005548, "train_speed(iter/s)": 0.242586 }, { "epoch": 0.9672993368396982, "grad_norm": 1.3371837139129639, "learning_rate": 9.104611375598602e-05, "loss": 0.1763326644897461, "memory(GiB)": 122.96, "step": 12690, "token_acc": 0.9477501480165779, "train_speed(iter/s)": 0.242596 }, { "epoch": 0.9676804634499581, "grad_norm": 0.8384644389152527, "learning_rate": 9.103927526344482e-05, "loss": 0.14857335090637208, "memory(GiB)": 122.96, "step": 12695, "token_acc": 0.9402325581395349, "train_speed(iter/s)": 0.242619 }, { "epoch": 0.968061590060218, "grad_norm": 0.9216745495796204, "learning_rate": 9.103243441749162e-05, "loss": 0.21381356716156005, "memory(GiB)": 122.96, "step": 12700, "token_acc": 0.9041146216017634, "train_speed(iter/s)": 0.242653 }, { "epoch": 0.968442716670478, "grad_norm": 1.0271165370941162, "learning_rate": 9.102559121851873e-05, "loss": 0.10878422260284423, "memory(GiB)": 122.96, "step": 12705, "token_acc": 0.9571082670356278, "train_speed(iter/s)": 0.242664 }, { "epoch": 0.9688238432807379, "grad_norm": 0.9268117547035217, "learning_rate": 9.101874566691855e-05, "loss": 0.15361542701721193, "memory(GiB)": 122.96, "step": 12710, "token_acc": 0.9438943894389439, "train_speed(iter/s)": 0.24269 }, { "epoch": 0.9692049698909978, "grad_norm": 0.15602844953536987, "learning_rate": 9.101189776308368e-05, "loss": 0.1676180362701416, "memory(GiB)": 122.96, "step": 12715, "token_acc": 0.9165558510638298, "train_speed(iter/s)": 0.242723 }, { "epoch": 0.9695860965012577, "grad_norm": 0.8222043514251709, "learning_rate": 9.100504750740677e-05, "loss": 0.137641441822052, "memory(GiB)": 122.96, "step": 12720, "token_acc": 0.9568965517241379, "train_speed(iter/s)": 0.24274 }, { "epoch": 0.9699672231115176, "grad_norm": 0.40221932530403137, "learning_rate": 9.099819490028067e-05, "loss": 0.12060710191726684, "memory(GiB)": 122.96, "step": 12725, "token_acc": 0.9425182481751825, "train_speed(iter/s)": 0.242776 }, { "epoch": 0.9703483497217775, "grad_norm": 1.5900017023086548, "learning_rate": 9.099133994209837e-05, "loss": 0.1688591480255127, "memory(GiB)": 122.96, "step": 12730, "token_acc": 0.9474062107341591, "train_speed(iter/s)": 0.242795 }, { "epoch": 0.9707294763320375, "grad_norm": 0.674226701259613, "learning_rate": 9.098448263325294e-05, "loss": 0.12622933387756347, "memory(GiB)": 122.96, "step": 12735, "token_acc": 0.9477977161500816, "train_speed(iter/s)": 0.242802 }, { "epoch": 0.9711106029422975, "grad_norm": 0.5303128957748413, "learning_rate": 9.097762297413761e-05, "loss": 0.12346469163894654, "memory(GiB)": 122.96, "step": 12740, "token_acc": 0.9631317315658657, "train_speed(iter/s)": 0.242821 }, { "epoch": 0.9714917295525574, "grad_norm": 0.5571014285087585, "learning_rate": 9.097076096514576e-05, "loss": 0.11265636682510376, "memory(GiB)": 122.96, "step": 12745, "token_acc": 0.9571862540345548, "train_speed(iter/s)": 0.242822 }, { "epoch": 0.9718728561628173, "grad_norm": 1.0209358930587769, "learning_rate": 9.096389660667091e-05, "loss": 0.14302513599395753, "memory(GiB)": 122.96, "step": 12750, "token_acc": 0.941367022999676, "train_speed(iter/s)": 0.242852 }, { "epoch": 0.9722539827730772, "grad_norm": 0.6059298515319824, "learning_rate": 9.095702989910669e-05, "loss": 0.1385814905166626, "memory(GiB)": 122.96, "step": 12755, "token_acc": 0.9496085409252669, "train_speed(iter/s)": 0.242867 }, { "epoch": 0.9726351093833372, "grad_norm": 0.9591848850250244, "learning_rate": 9.095016084284686e-05, "loss": 0.16359013319015503, "memory(GiB)": 122.96, "step": 12760, "token_acc": 0.9422512234910277, "train_speed(iter/s)": 0.242882 }, { "epoch": 0.9730162359935971, "grad_norm": 0.9698325395584106, "learning_rate": 9.094328943828534e-05, "loss": 0.12806707620620728, "memory(GiB)": 122.96, "step": 12765, "token_acc": 0.9471705992900397, "train_speed(iter/s)": 0.242905 }, { "epoch": 0.973397362603857, "grad_norm": 1.2850149869918823, "learning_rate": 9.093641568581617e-05, "loss": 0.13033506870269776, "memory(GiB)": 122.96, "step": 12770, "token_acc": 0.9460882192775458, "train_speed(iter/s)": 0.242931 }, { "epoch": 0.9737784892141169, "grad_norm": 1.2702388763427734, "learning_rate": 9.092953958583352e-05, "loss": 0.14732524156570434, "memory(GiB)": 122.96, "step": 12775, "token_acc": 0.9439182915506036, "train_speed(iter/s)": 0.242954 }, { "epoch": 0.9741596158243768, "grad_norm": 1.1373991966247559, "learning_rate": 9.092266113873171e-05, "loss": 0.09388877749443054, "memory(GiB)": 122.96, "step": 12780, "token_acc": 0.9460161662817552, "train_speed(iter/s)": 0.242985 }, { "epoch": 0.9745407424346367, "grad_norm": 0.868877112865448, "learning_rate": 9.09157803449052e-05, "loss": 0.13880642652511596, "memory(GiB)": 122.96, "step": 12785, "token_acc": 0.9407204742362061, "train_speed(iter/s)": 0.243012 }, { "epoch": 0.9749218690448967, "grad_norm": 1.4632521867752075, "learning_rate": 9.090889720474856e-05, "loss": 0.1460828423500061, "memory(GiB)": 122.96, "step": 12790, "token_acc": 0.9382174911292006, "train_speed(iter/s)": 0.243039 }, { "epoch": 0.9753029956551567, "grad_norm": 1.0088013410568237, "learning_rate": 9.09020117186565e-05, "loss": 0.14504342079162597, "memory(GiB)": 122.96, "step": 12795, "token_acc": 0.9451922041563476, "train_speed(iter/s)": 0.243046 }, { "epoch": 0.9756841222654166, "grad_norm": 1.6049000024795532, "learning_rate": 9.089512388702388e-05, "loss": 0.13781250715255738, "memory(GiB)": 122.96, "step": 12800, "token_acc": 0.9433322022395657, "train_speed(iter/s)": 0.243067 }, { "epoch": 0.9756841222654166, "eval_loss": 0.11753977090120316, "eval_runtime": 157.412, "eval_samples_per_second": 3.367, "eval_steps_per_second": 3.367, "eval_token_acc": 0.9456358050719836, "step": 12800 }, { "epoch": 0.9760652488756765, "grad_norm": 1.3490016460418701, "learning_rate": 9.088823371024568e-05, "loss": 0.20663738250732422, "memory(GiB)": 122.96, "step": 12805, "token_acc": 0.9450270873520912, "train_speed(iter/s)": 0.242376 }, { "epoch": 0.9764463754859364, "grad_norm": 0.8341593146324158, "learning_rate": 9.088134118871702e-05, "loss": 0.13109879493713378, "memory(GiB)": 122.96, "step": 12810, "token_acc": 0.9411356138218441, "train_speed(iter/s)": 0.242396 }, { "epoch": 0.9768275020961964, "grad_norm": 0.6089208722114563, "learning_rate": 9.087444632283315e-05, "loss": 0.13605031967163086, "memory(GiB)": 122.96, "step": 12815, "token_acc": 0.9483695652173914, "train_speed(iter/s)": 0.24242 }, { "epoch": 0.9772086287064563, "grad_norm": 1.1056259870529175, "learning_rate": 9.086754911298946e-05, "loss": 0.18811968564987183, "memory(GiB)": 122.96, "step": 12820, "token_acc": 0.9230628988149499, "train_speed(iter/s)": 0.242446 }, { "epoch": 0.9775897553167162, "grad_norm": 1.0476961135864258, "learning_rate": 9.08606495595815e-05, "loss": 0.15532393455505372, "memory(GiB)": 122.96, "step": 12825, "token_acc": 0.9351137487636004, "train_speed(iter/s)": 0.242469 }, { "epoch": 0.9779708819269761, "grad_norm": 1.5626649856567383, "learning_rate": 9.085374766300489e-05, "loss": 0.15314501523971558, "memory(GiB)": 122.96, "step": 12830, "token_acc": 0.9534286762009534, "train_speed(iter/s)": 0.242488 }, { "epoch": 0.978352008537236, "grad_norm": 0.9460096955299377, "learning_rate": 9.084684342365544e-05, "loss": 0.1483892560005188, "memory(GiB)": 122.96, "step": 12835, "token_acc": 0.9460101867572156, "train_speed(iter/s)": 0.242509 }, { "epoch": 0.978733135147496, "grad_norm": 0.9304004907608032, "learning_rate": 9.083993684192907e-05, "loss": 0.15346908569335938, "memory(GiB)": 122.96, "step": 12840, "token_acc": 0.9366768897708821, "train_speed(iter/s)": 0.242529 }, { "epoch": 0.979114261757756, "grad_norm": 0.9307335615158081, "learning_rate": 9.083302791822184e-05, "loss": 0.1419435977935791, "memory(GiB)": 122.96, "step": 12845, "token_acc": 0.9553813687066627, "train_speed(iter/s)": 0.242558 }, { "epoch": 0.9794953883680159, "grad_norm": 1.736047387123108, "learning_rate": 9.082611665292995e-05, "loss": 0.23424105644226073, "memory(GiB)": 122.96, "step": 12850, "token_acc": 0.9054652880354506, "train_speed(iter/s)": 0.24259 }, { "epoch": 0.9798765149782758, "grad_norm": 0.5830547213554382, "learning_rate": 9.081920304644973e-05, "loss": 0.14928051233291625, "memory(GiB)": 122.96, "step": 12855, "token_acc": 0.9533396494552345, "train_speed(iter/s)": 0.242616 }, { "epoch": 0.9802576415885357, "grad_norm": 2.2217087745666504, "learning_rate": 9.081228709917764e-05, "loss": 0.12877252101898193, "memory(GiB)": 122.96, "step": 12860, "token_acc": 0.9378228049264998, "train_speed(iter/s)": 0.242641 }, { "epoch": 0.9806387681987956, "grad_norm": 0.4636951684951782, "learning_rate": 9.080536881151027e-05, "loss": 0.17830157279968262, "memory(GiB)": 122.96, "step": 12865, "token_acc": 0.9321659299557974, "train_speed(iter/s)": 0.24266 }, { "epoch": 0.9810198948090556, "grad_norm": 1.1657698154449463, "learning_rate": 9.079844818384436e-05, "loss": 0.138388991355896, "memory(GiB)": 122.96, "step": 12870, "token_acc": 0.9357820481634639, "train_speed(iter/s)": 0.242689 }, { "epoch": 0.9814010214193155, "grad_norm": 0.8685896992683411, "learning_rate": 9.079152521657676e-05, "loss": 0.15551928281784058, "memory(GiB)": 122.96, "step": 12875, "token_acc": 0.9359165424739195, "train_speed(iter/s)": 0.242717 }, { "epoch": 0.9817821480295754, "grad_norm": 0.7978358268737793, "learning_rate": 9.07845999101045e-05, "loss": 0.12114787101745605, "memory(GiB)": 122.96, "step": 12880, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.242748 }, { "epoch": 0.9821632746398353, "grad_norm": 0.8418326377868652, "learning_rate": 9.077767226482472e-05, "loss": 0.14338310956954955, "memory(GiB)": 122.96, "step": 12885, "token_acc": 0.9326814591951862, "train_speed(iter/s)": 0.24278 }, { "epoch": 0.9825444012500952, "grad_norm": 0.8159209489822388, "learning_rate": 9.077074228113463e-05, "loss": 0.17863924503326417, "memory(GiB)": 122.96, "step": 12890, "token_acc": 0.929047131147541, "train_speed(iter/s)": 0.242805 }, { "epoch": 0.9829255278603553, "grad_norm": 0.5453921556472778, "learning_rate": 9.076380995943169e-05, "loss": 0.17117899656295776, "memory(GiB)": 122.96, "step": 12895, "token_acc": 0.9308768154922001, "train_speed(iter/s)": 0.242829 }, { "epoch": 0.9833066544706152, "grad_norm": 1.0200836658477783, "learning_rate": 9.07568753001134e-05, "loss": 0.14029661417007447, "memory(GiB)": 122.96, "step": 12900, "token_acc": 0.9508733624454149, "train_speed(iter/s)": 0.242857 }, { "epoch": 0.9836877810808751, "grad_norm": 0.9157674908638, "learning_rate": 9.074993830357748e-05, "loss": 0.11890754699707032, "memory(GiB)": 122.96, "step": 12905, "token_acc": 0.951346893897746, "train_speed(iter/s)": 0.242862 }, { "epoch": 0.984068907691135, "grad_norm": 0.8744694590568542, "learning_rate": 9.074299897022167e-05, "loss": 0.12556229829788207, "memory(GiB)": 122.96, "step": 12910, "token_acc": 0.939961759082218, "train_speed(iter/s)": 0.242896 }, { "epoch": 0.9844500343013949, "grad_norm": 0.8613981008529663, "learning_rate": 9.073605730044394e-05, "loss": 0.14889154434204102, "memory(GiB)": 122.96, "step": 12915, "token_acc": 0.9409794225554314, "train_speed(iter/s)": 0.242915 }, { "epoch": 0.9848311609116549, "grad_norm": 1.2100050449371338, "learning_rate": 9.072911329464238e-05, "loss": 0.11992695331573486, "memory(GiB)": 122.96, "step": 12920, "token_acc": 0.9549038935436176, "train_speed(iter/s)": 0.242941 }, { "epoch": 0.9852122875219148, "grad_norm": 0.44918274879455566, "learning_rate": 9.072216695321517e-05, "loss": 0.1055149793624878, "memory(GiB)": 122.96, "step": 12925, "token_acc": 0.9600169240533107, "train_speed(iter/s)": 0.242962 }, { "epoch": 0.9855934141321747, "grad_norm": 1.1258149147033691, "learning_rate": 9.071521827656066e-05, "loss": 0.12806191444396972, "memory(GiB)": 122.96, "step": 12930, "token_acc": 0.9515228831127411, "train_speed(iter/s)": 0.242973 }, { "epoch": 0.9859745407424346, "grad_norm": 0.8813290596008301, "learning_rate": 9.070826726507732e-05, "loss": 0.1328412413597107, "memory(GiB)": 122.96, "step": 12935, "token_acc": 0.9544626593806922, "train_speed(iter/s)": 0.243007 }, { "epoch": 0.9863556673526945, "grad_norm": 0.7307548522949219, "learning_rate": 9.070131391916376e-05, "loss": 0.17557835578918457, "memory(GiB)": 122.96, "step": 12940, "token_acc": 0.930705079605762, "train_speed(iter/s)": 0.243025 }, { "epoch": 0.9867367939629544, "grad_norm": 1.7199678421020508, "learning_rate": 9.069435823921874e-05, "loss": 0.13195364475250243, "memory(GiB)": 122.96, "step": 12945, "token_acc": 0.954388389771942, "train_speed(iter/s)": 0.243055 }, { "epoch": 0.9871179205732145, "grad_norm": 1.5133916139602661, "learning_rate": 9.06874002256411e-05, "loss": 0.16090396642684937, "memory(GiB)": 122.96, "step": 12950, "token_acc": 0.9381992541289291, "train_speed(iter/s)": 0.243081 }, { "epoch": 0.9874990471834744, "grad_norm": 1.088442087173462, "learning_rate": 9.068043987882989e-05, "loss": 0.19202930927276612, "memory(GiB)": 122.96, "step": 12955, "token_acc": 0.9103042479908151, "train_speed(iter/s)": 0.243098 }, { "epoch": 0.9878801737937343, "grad_norm": 1.1103661060333252, "learning_rate": 9.067347719918422e-05, "loss": 0.1846510052680969, "memory(GiB)": 122.96, "step": 12960, "token_acc": 0.9356280733124721, "train_speed(iter/s)": 0.243112 }, { "epoch": 0.9882613004039942, "grad_norm": 0.6558491587638855, "learning_rate": 9.066651218710337e-05, "loss": 0.14351265430450438, "memory(GiB)": 122.96, "step": 12965, "token_acc": 0.9527410207939508, "train_speed(iter/s)": 0.243139 }, { "epoch": 0.9886424270142541, "grad_norm": 1.186589241027832, "learning_rate": 9.065954484298678e-05, "loss": 0.1753328800201416, "memory(GiB)": 122.96, "step": 12970, "token_acc": 0.9383424862705941, "train_speed(iter/s)": 0.243163 }, { "epoch": 0.9890235536245141, "grad_norm": 0.7713607549667358, "learning_rate": 9.065257516723398e-05, "loss": 0.15498749017715455, "memory(GiB)": 122.96, "step": 12975, "token_acc": 0.9519569268999792, "train_speed(iter/s)": 0.243183 }, { "epoch": 0.989404680234774, "grad_norm": 0.5007254481315613, "learning_rate": 9.064560316024462e-05, "loss": 0.13800753355026246, "memory(GiB)": 122.96, "step": 12980, "token_acc": 0.952803294266709, "train_speed(iter/s)": 0.243193 }, { "epoch": 0.9897858068450339, "grad_norm": 1.283984899520874, "learning_rate": 9.063862882241856e-05, "loss": 0.14775224924087524, "memory(GiB)": 122.96, "step": 12985, "token_acc": 0.946290395994538, "train_speed(iter/s)": 0.243216 }, { "epoch": 0.9901669334552938, "grad_norm": 0.7066991925239563, "learning_rate": 9.06316521541557e-05, "loss": 0.11097780466079712, "memory(GiB)": 122.96, "step": 12990, "token_acc": 0.9526174218911649, "train_speed(iter/s)": 0.243235 }, { "epoch": 0.9905480600655537, "grad_norm": 0.9233891367912292, "learning_rate": 9.062467315585616e-05, "loss": 0.20944910049438475, "memory(GiB)": 122.96, "step": 12995, "token_acc": 0.9191530317613089, "train_speed(iter/s)": 0.24327 }, { "epoch": 0.9909291866758138, "grad_norm": 0.7900567054748535, "learning_rate": 9.061769182792015e-05, "loss": 0.15398424863815308, "memory(GiB)": 122.96, "step": 13000, "token_acc": 0.940550510783201, "train_speed(iter/s)": 0.243281 }, { "epoch": 0.9909291866758138, "eval_loss": 0.11379073560237885, "eval_runtime": 159.7392, "eval_samples_per_second": 3.318, "eval_steps_per_second": 3.318, "eval_token_acc": 0.9462758267574243, "step": 13000 }, { "epoch": 0.9913103132860737, "grad_norm": 1.4866052865982056, "learning_rate": 9.0610708170748e-05, "loss": 0.149062442779541, "memory(GiB)": 122.96, "step": 13005, "token_acc": 0.9460534493474207, "train_speed(iter/s)": 0.24258 }, { "epoch": 0.9916914398963336, "grad_norm": 0.9612509608268738, "learning_rate": 9.060372218474016e-05, "loss": 0.1529853105545044, "memory(GiB)": 122.96, "step": 13010, "token_acc": 0.9418627644848725, "train_speed(iter/s)": 0.242603 }, { "epoch": 0.9920725665065935, "grad_norm": 0.7497698664665222, "learning_rate": 9.05967338702973e-05, "loss": 0.14684865474700928, "memory(GiB)": 122.96, "step": 13015, "token_acc": 0.9452863295581433, "train_speed(iter/s)": 0.242616 }, { "epoch": 0.9924536931168534, "grad_norm": 1.25754976272583, "learning_rate": 9.058974322782015e-05, "loss": 0.14794299602508545, "memory(GiB)": 122.96, "step": 13020, "token_acc": 0.9421052631578948, "train_speed(iter/s)": 0.242644 }, { "epoch": 0.9928348197271133, "grad_norm": 0.6167917847633362, "learning_rate": 9.058275025770956e-05, "loss": 0.2108780860900879, "memory(GiB)": 122.96, "step": 13025, "token_acc": 0.9378743557598551, "train_speed(iter/s)": 0.242658 }, { "epoch": 0.9932159463373733, "grad_norm": 0.8814374208450317, "learning_rate": 9.057575496036661e-05, "loss": 0.1810707449913025, "memory(GiB)": 122.96, "step": 13030, "token_acc": 0.9348845285639868, "train_speed(iter/s)": 0.242675 }, { "epoch": 0.9935970729476332, "grad_norm": 0.5326366424560547, "learning_rate": 9.056875733619238e-05, "loss": 0.14560816287994385, "memory(GiB)": 122.96, "step": 13035, "token_acc": 0.9625057155921354, "train_speed(iter/s)": 0.242696 }, { "epoch": 0.9939781995578931, "grad_norm": 0.6444352269172668, "learning_rate": 9.056175738558818e-05, "loss": 0.1773803114891052, "memory(GiB)": 122.96, "step": 13040, "token_acc": 0.9274994227661049, "train_speed(iter/s)": 0.242722 }, { "epoch": 0.994359326168153, "grad_norm": 0.7549293637275696, "learning_rate": 9.055475510895543e-05, "loss": 0.1321724534034729, "memory(GiB)": 122.96, "step": 13045, "token_acc": 0.942737896494157, "train_speed(iter/s)": 0.242737 }, { "epoch": 0.9947404527784129, "grad_norm": 1.5126726627349854, "learning_rate": 9.054775050669566e-05, "loss": 0.14355943202972413, "memory(GiB)": 122.96, "step": 13050, "token_acc": 0.9419802867383512, "train_speed(iter/s)": 0.242761 }, { "epoch": 0.995121579388673, "grad_norm": 1.109665036201477, "learning_rate": 9.054074357921057e-05, "loss": 0.18673934936523437, "memory(GiB)": 122.96, "step": 13055, "token_acc": 0.9169354838709678, "train_speed(iter/s)": 0.242793 }, { "epoch": 0.9955027059989329, "grad_norm": 0.3576335906982422, "learning_rate": 9.053373432690197e-05, "loss": 0.10564805269241333, "memory(GiB)": 122.96, "step": 13060, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.242818 }, { "epoch": 0.9958838326091928, "grad_norm": 0.9800400733947754, "learning_rate": 9.052672275017181e-05, "loss": 0.1497722625732422, "memory(GiB)": 122.96, "step": 13065, "token_acc": 0.9382538770821367, "train_speed(iter/s)": 0.242849 }, { "epoch": 0.9962649592194527, "grad_norm": 0.7127009034156799, "learning_rate": 9.051970884942216e-05, "loss": 0.12221091985702515, "memory(GiB)": 122.96, "step": 13070, "token_acc": 0.9496090356211989, "train_speed(iter/s)": 0.24287 }, { "epoch": 0.9966460858297126, "grad_norm": 0.7085050344467163, "learning_rate": 9.051269262505524e-05, "loss": 0.16754279136657715, "memory(GiB)": 122.96, "step": 13075, "token_acc": 0.9357743983463753, "train_speed(iter/s)": 0.242886 }, { "epoch": 0.9970272124399726, "grad_norm": 0.4136047065258026, "learning_rate": 9.05056740774734e-05, "loss": 0.10579663515090942, "memory(GiB)": 122.96, "step": 13080, "token_acc": 0.9545211342964152, "train_speed(iter/s)": 0.242922 }, { "epoch": 0.9974083390502325, "grad_norm": 0.8508806228637695, "learning_rate": 9.049865320707914e-05, "loss": 0.14258871078491211, "memory(GiB)": 122.96, "step": 13085, "token_acc": 0.9512245745122457, "train_speed(iter/s)": 0.242944 }, { "epoch": 0.9977894656604924, "grad_norm": 0.6361384391784668, "learning_rate": 9.049163001427503e-05, "loss": 0.10106058120727539, "memory(GiB)": 122.96, "step": 13090, "token_acc": 0.9586235720256339, "train_speed(iter/s)": 0.242961 }, { "epoch": 0.9981705922707523, "grad_norm": 0.6288278102874756, "learning_rate": 9.048460449946386e-05, "loss": 0.14365031719207763, "memory(GiB)": 122.96, "step": 13095, "token_acc": 0.9433814916048419, "train_speed(iter/s)": 0.24298 }, { "epoch": 0.9985517188810122, "grad_norm": 0.7413514852523804, "learning_rate": 9.047757666304848e-05, "loss": 0.08960820436477661, "memory(GiB)": 122.96, "step": 13100, "token_acc": 0.9526737967914438, "train_speed(iter/s)": 0.242991 }, { "epoch": 0.9989328454912721, "grad_norm": 0.9728160500526428, "learning_rate": 9.047054650543193e-05, "loss": 0.15240256786346434, "memory(GiB)": 122.96, "step": 13105, "token_acc": 0.9412811387900356, "train_speed(iter/s)": 0.243008 }, { "epoch": 0.9993139721015322, "grad_norm": 0.6683516502380371, "learning_rate": 9.046351402701734e-05, "loss": 0.16470314264297486, "memory(GiB)": 122.96, "step": 13110, "token_acc": 0.9415539766215907, "train_speed(iter/s)": 0.243033 }, { "epoch": 0.9996950987117921, "grad_norm": 0.5164461731910706, "learning_rate": 9.0456479228208e-05, "loss": 0.14240710735321044, "memory(GiB)": 122.96, "step": 13115, "token_acc": 0.9549052869663557, "train_speed(iter/s)": 0.243039 }, { "epoch": 1.000076225322052, "grad_norm": 0.6638708114624023, "learning_rate": 9.044944210940729e-05, "loss": 0.18563666343688964, "memory(GiB)": 122.96, "step": 13120, "token_acc": 0.9323958896700919, "train_speed(iter/s)": 0.24307 }, { "epoch": 1.000457351932312, "grad_norm": 0.8887478113174438, "learning_rate": 9.044240267101882e-05, "loss": 0.10783932209014893, "memory(GiB)": 122.96, "step": 13125, "token_acc": 0.9501797637390857, "train_speed(iter/s)": 0.243096 }, { "epoch": 1.000838478542572, "grad_norm": 0.8701349496841431, "learning_rate": 9.043536091344621e-05, "loss": 0.15447317361831664, "memory(GiB)": 122.96, "step": 13130, "token_acc": 0.9359906213364596, "train_speed(iter/s)": 0.243124 }, { "epoch": 1.0012196051528317, "grad_norm": 0.7305511832237244, "learning_rate": 9.04283168370933e-05, "loss": 0.15913857221603395, "memory(GiB)": 122.96, "step": 13135, "token_acc": 0.9464746019711903, "train_speed(iter/s)": 0.24314 }, { "epoch": 1.0016007317630917, "grad_norm": 0.8776249885559082, "learning_rate": 9.042127044236403e-05, "loss": 0.13561135530471802, "memory(GiB)": 122.96, "step": 13140, "token_acc": 0.9492924528301887, "train_speed(iter/s)": 0.243158 }, { "epoch": 1.0019818583733515, "grad_norm": 1.4002827405929565, "learning_rate": 9.041422172966247e-05, "loss": 0.1536510705947876, "memory(GiB)": 122.96, "step": 13145, "token_acc": 0.9460507757404796, "train_speed(iter/s)": 0.243191 }, { "epoch": 1.0023629849836115, "grad_norm": 1.39417564868927, "learning_rate": 9.040717069939286e-05, "loss": 0.11698276996612549, "memory(GiB)": 122.96, "step": 13150, "token_acc": 0.9457159069415547, "train_speed(iter/s)": 0.24321 }, { "epoch": 1.0027441115938716, "grad_norm": 0.3120267391204834, "learning_rate": 9.04001173519595e-05, "loss": 0.16960211992263793, "memory(GiB)": 122.96, "step": 13155, "token_acc": 0.9252684637300022, "train_speed(iter/s)": 0.243237 }, { "epoch": 1.0031252382041314, "grad_norm": 0.5013481378555298, "learning_rate": 9.03930616877669e-05, "loss": 0.12895805835723878, "memory(GiB)": 122.96, "step": 13160, "token_acc": 0.9584788513775708, "train_speed(iter/s)": 0.243259 }, { "epoch": 1.0035063648143914, "grad_norm": 1.1895321607589722, "learning_rate": 9.038600370721966e-05, "loss": 0.17277244329452515, "memory(GiB)": 122.96, "step": 13165, "token_acc": 0.9431674665319525, "train_speed(iter/s)": 0.243284 }, { "epoch": 1.0038874914246512, "grad_norm": 0.8038957118988037, "learning_rate": 9.03789434107225e-05, "loss": 0.12657909393310546, "memory(GiB)": 122.96, "step": 13170, "token_acc": 0.949971081550029, "train_speed(iter/s)": 0.243311 }, { "epoch": 1.0042686180349112, "grad_norm": 0.27494892477989197, "learning_rate": 9.037188079868035e-05, "loss": 0.10776898860931397, "memory(GiB)": 122.96, "step": 13175, "token_acc": 0.9546599496221663, "train_speed(iter/s)": 0.243339 }, { "epoch": 1.0046497446451712, "grad_norm": 1.2263998985290527, "learning_rate": 9.036481587149816e-05, "loss": 0.14088907241821289, "memory(GiB)": 122.96, "step": 13180, "token_acc": 0.9186949766960124, "train_speed(iter/s)": 0.243374 }, { "epoch": 1.005030871255431, "grad_norm": 59.28468322753906, "learning_rate": 9.035774862958111e-05, "loss": 0.11438615322113037, "memory(GiB)": 122.96, "step": 13185, "token_acc": 0.9485744737543299, "train_speed(iter/s)": 0.2434 }, { "epoch": 1.005411997865691, "grad_norm": 0.6580718159675598, "learning_rate": 9.035067907333446e-05, "loss": 0.17062005996704102, "memory(GiB)": 122.96, "step": 13190, "token_acc": 0.9296903910201061, "train_speed(iter/s)": 0.243402 }, { "epoch": 1.0057931244759508, "grad_norm": 1.9844533205032349, "learning_rate": 9.03436072031636e-05, "loss": 0.16813061237335206, "memory(GiB)": 122.96, "step": 13195, "token_acc": 0.9432911392405063, "train_speed(iter/s)": 0.243429 }, { "epoch": 1.0061742510862108, "grad_norm": 0.4521082639694214, "learning_rate": 9.03365330194741e-05, "loss": 0.08492686748504638, "memory(GiB)": 122.96, "step": 13200, "token_acc": 0.9628831814415907, "train_speed(iter/s)": 0.243442 }, { "epoch": 1.0061742510862108, "eval_loss": 0.1140500158071518, "eval_runtime": 161.2966, "eval_samples_per_second": 3.286, "eval_steps_per_second": 3.286, "eval_token_acc": 0.9459595807481477, "step": 13200 }, { "epoch": 1.0065553776964709, "grad_norm": 0.5249303579330444, "learning_rate": 9.03294565226716e-05, "loss": 0.12711071968078613, "memory(GiB)": 122.96, "step": 13205, "token_acc": 0.9461722317859125, "train_speed(iter/s)": 0.242723 }, { "epoch": 1.0069365043067307, "grad_norm": 0.7792975902557373, "learning_rate": 9.032237771316193e-05, "loss": 0.1343394160270691, "memory(GiB)": 122.96, "step": 13210, "token_acc": 0.9535687824154112, "train_speed(iter/s)": 0.242752 }, { "epoch": 1.0073176309169907, "grad_norm": 0.5957159399986267, "learning_rate": 9.031529659135101e-05, "loss": 0.1373907208442688, "memory(GiB)": 122.96, "step": 13215, "token_acc": 0.9434825870646766, "train_speed(iter/s)": 0.242772 }, { "epoch": 1.0076987575272505, "grad_norm": 1.5517845153808594, "learning_rate": 9.030821315764493e-05, "loss": 0.13840944766998292, "memory(GiB)": 122.96, "step": 13220, "token_acc": 0.9344709897610921, "train_speed(iter/s)": 0.242804 }, { "epoch": 1.0080798841375105, "grad_norm": 0.5660856366157532, "learning_rate": 9.030112741244987e-05, "loss": 0.1246480941772461, "memory(GiB)": 122.96, "step": 13225, "token_acc": 0.9604140439222268, "train_speed(iter/s)": 0.242822 }, { "epoch": 1.0084610107477705, "grad_norm": 1.0082930326461792, "learning_rate": 9.029403935617218e-05, "loss": 0.10962262153625488, "memory(GiB)": 122.96, "step": 13230, "token_acc": 0.9449580735325737, "train_speed(iter/s)": 0.242846 }, { "epoch": 1.0088421373580303, "grad_norm": 1.0502551794052124, "learning_rate": 9.02869489892183e-05, "loss": 0.17590911388397218, "memory(GiB)": 122.96, "step": 13235, "token_acc": 0.9146230699364214, "train_speed(iter/s)": 0.242881 }, { "epoch": 1.0092232639682903, "grad_norm": 0.9352285861968994, "learning_rate": 9.027985631199487e-05, "loss": 0.1284429430961609, "memory(GiB)": 122.96, "step": 13240, "token_acc": 0.9492310933220982, "train_speed(iter/s)": 0.2429 }, { "epoch": 1.0096043905785501, "grad_norm": 0.5141494870185852, "learning_rate": 9.02727613249086e-05, "loss": 0.1310176134109497, "memory(GiB)": 122.96, "step": 13245, "token_acc": 0.9491150442477876, "train_speed(iter/s)": 0.242918 }, { "epoch": 1.0099855171888101, "grad_norm": 1.0539319515228271, "learning_rate": 9.026566402836634e-05, "loss": 0.11778910160064697, "memory(GiB)": 122.96, "step": 13250, "token_acc": 0.9528529332976159, "train_speed(iter/s)": 0.242942 }, { "epoch": 1.0103666437990702, "grad_norm": 1.0462214946746826, "learning_rate": 9.025856442277512e-05, "loss": 0.19528688192367555, "memory(GiB)": 122.96, "step": 13255, "token_acc": 0.9256484149855908, "train_speed(iter/s)": 0.242964 }, { "epoch": 1.01074777040933, "grad_norm": 1.0990525484085083, "learning_rate": 9.025146250854204e-05, "loss": 0.13569742441177368, "memory(GiB)": 122.96, "step": 13260, "token_acc": 0.9524520786656709, "train_speed(iter/s)": 0.242974 }, { "epoch": 1.01112889701959, "grad_norm": 0.7722269296646118, "learning_rate": 9.024435828607439e-05, "loss": 0.1402994155883789, "memory(GiB)": 122.96, "step": 13265, "token_acc": 0.9566722618548049, "train_speed(iter/s)": 0.242973 }, { "epoch": 1.0115100236298498, "grad_norm": 1.4359697103500366, "learning_rate": 9.023725175577955e-05, "loss": 0.11814293861389161, "memory(GiB)": 122.96, "step": 13270, "token_acc": 0.946689232071081, "train_speed(iter/s)": 0.242992 }, { "epoch": 1.0118911502401098, "grad_norm": 0.5311808586120605, "learning_rate": 9.023014291806503e-05, "loss": 0.12027857303619385, "memory(GiB)": 122.96, "step": 13275, "token_acc": 0.9551998389207692, "train_speed(iter/s)": 0.242988 }, { "epoch": 1.0122722768503698, "grad_norm": 0.6212030649185181, "learning_rate": 9.022303177333851e-05, "loss": 0.1370632290840149, "memory(GiB)": 122.96, "step": 13280, "token_acc": 0.9468209443665264, "train_speed(iter/s)": 0.242999 }, { "epoch": 1.0126534034606296, "grad_norm": 0.5942872762680054, "learning_rate": 9.021591832200778e-05, "loss": 0.09242379665374756, "memory(GiB)": 122.96, "step": 13285, "token_acc": 0.9550517104216388, "train_speed(iter/s)": 0.243014 }, { "epoch": 1.0130345300708896, "grad_norm": 0.5524414777755737, "learning_rate": 9.020880256448075e-05, "loss": 0.12993178367614747, "memory(GiB)": 122.96, "step": 13290, "token_acc": 0.956109496864964, "train_speed(iter/s)": 0.243027 }, { "epoch": 1.0134156566811494, "grad_norm": 0.5235651731491089, "learning_rate": 9.020168450116549e-05, "loss": 0.11526881456375122, "memory(GiB)": 122.96, "step": 13295, "token_acc": 0.9517369727047147, "train_speed(iter/s)": 0.243039 }, { "epoch": 1.0137967832914094, "grad_norm": 0.8513157963752747, "learning_rate": 9.019456413247019e-05, "loss": 0.12882033586502076, "memory(GiB)": 122.96, "step": 13300, "token_acc": 0.9535859269282815, "train_speed(iter/s)": 0.243052 }, { "epoch": 1.0141779099016692, "grad_norm": 1.1090227365493774, "learning_rate": 9.018744145880316e-05, "loss": 0.15540637969970703, "memory(GiB)": 122.96, "step": 13305, "token_acc": 0.9312883435582822, "train_speed(iter/s)": 0.243078 }, { "epoch": 1.0145590365119292, "grad_norm": 0.7994403839111328, "learning_rate": 9.018031648057283e-05, "loss": 0.11464146375656128, "memory(GiB)": 122.96, "step": 13310, "token_acc": 0.9619089316987741, "train_speed(iter/s)": 0.243105 }, { "epoch": 1.0149401631221893, "grad_norm": 1.1008329391479492, "learning_rate": 9.017318919818784e-05, "loss": 0.1328161120414734, "memory(GiB)": 122.96, "step": 13315, "token_acc": 0.9442259563818377, "train_speed(iter/s)": 0.243136 }, { "epoch": 1.015321289732449, "grad_norm": 1.126471996307373, "learning_rate": 9.016605961205686e-05, "loss": 0.13819352388381959, "memory(GiB)": 122.96, "step": 13320, "token_acc": 0.9490605427974947, "train_speed(iter/s)": 0.243167 }, { "epoch": 1.015702416342709, "grad_norm": 0.3152206242084503, "learning_rate": 9.015892772258876e-05, "loss": 0.14500752687454224, "memory(GiB)": 122.96, "step": 13325, "token_acc": 0.9495431068732618, "train_speed(iter/s)": 0.243197 }, { "epoch": 1.0160835429529689, "grad_norm": 1.2332167625427246, "learning_rate": 9.015179353019252e-05, "loss": 0.18771822452545167, "memory(GiB)": 122.96, "step": 13330, "token_acc": 0.932094943240454, "train_speed(iter/s)": 0.243216 }, { "epoch": 1.016464669563229, "grad_norm": 1.695331335067749, "learning_rate": 9.014465703527724e-05, "loss": 0.18234039545059205, "memory(GiB)": 122.96, "step": 13335, "token_acc": 0.9225554106910039, "train_speed(iter/s)": 0.243242 }, { "epoch": 1.016845796173489, "grad_norm": 0.54533451795578, "learning_rate": 9.013751823825218e-05, "loss": 0.09690825939178467, "memory(GiB)": 122.96, "step": 13340, "token_acc": 0.9511137688742712, "train_speed(iter/s)": 0.243261 }, { "epoch": 1.0172269227837487, "grad_norm": 0.8418424129486084, "learning_rate": 9.01303771395267e-05, "loss": 0.09796565771102905, "memory(GiB)": 122.96, "step": 13345, "token_acc": 0.9516716319099636, "train_speed(iter/s)": 0.24329 }, { "epoch": 1.0176080493940087, "grad_norm": 1.126841425895691, "learning_rate": 9.012323373951032e-05, "loss": 0.1392220973968506, "memory(GiB)": 122.96, "step": 13350, "token_acc": 0.9399915182357931, "train_speed(iter/s)": 0.243313 }, { "epoch": 1.0179891760042685, "grad_norm": 0.894351601600647, "learning_rate": 9.011608803861268e-05, "loss": 0.12688040733337402, "memory(GiB)": 122.96, "step": 13355, "token_acc": 0.9531675440188782, "train_speed(iter/s)": 0.243332 }, { "epoch": 1.0183703026145285, "grad_norm": 1.2176145315170288, "learning_rate": 9.010894003724357e-05, "loss": 0.11399437189102173, "memory(GiB)": 122.96, "step": 13360, "token_acc": 0.9510022271714922, "train_speed(iter/s)": 0.243359 }, { "epoch": 1.0187514292247886, "grad_norm": 0.8465023636817932, "learning_rate": 9.010178973581287e-05, "loss": 0.15052452087402343, "memory(GiB)": 122.96, "step": 13365, "token_acc": 0.9456114028507127, "train_speed(iter/s)": 0.243394 }, { "epoch": 1.0191325558350484, "grad_norm": 0.8254810571670532, "learning_rate": 9.00946371347306e-05, "loss": 0.129919695854187, "memory(GiB)": 122.96, "step": 13370, "token_acc": 0.940236275191105, "train_speed(iter/s)": 0.243425 }, { "epoch": 1.0195136824453084, "grad_norm": 0.6567272543907166, "learning_rate": 9.008748223440697e-05, "loss": 0.15339882373809816, "memory(GiB)": 122.96, "step": 13375, "token_acc": 0.9417758369723436, "train_speed(iter/s)": 0.243439 }, { "epoch": 1.0198948090555682, "grad_norm": 0.8173088431358337, "learning_rate": 9.008032503525227e-05, "loss": 0.1255262851715088, "memory(GiB)": 122.96, "step": 13380, "token_acc": 0.9579896907216495, "train_speed(iter/s)": 0.243466 }, { "epoch": 1.0202759356658282, "grad_norm": 1.875491976737976, "learning_rate": 9.007316553767691e-05, "loss": 0.17698462009429933, "memory(GiB)": 122.96, "step": 13385, "token_acc": 0.9510851602952517, "train_speed(iter/s)": 0.243471 }, { "epoch": 1.0206570622760882, "grad_norm": 0.7572685480117798, "learning_rate": 9.006600374209149e-05, "loss": 0.14281053543090821, "memory(GiB)": 122.96, "step": 13390, "token_acc": 0.9430399025281755, "train_speed(iter/s)": 0.243499 }, { "epoch": 1.021038188886348, "grad_norm": 1.0179612636566162, "learning_rate": 9.005883964890666e-05, "loss": 0.15240601301193238, "memory(GiB)": 122.96, "step": 13395, "token_acc": 0.9403085177733065, "train_speed(iter/s)": 0.243517 }, { "epoch": 1.021419315496608, "grad_norm": 1.1154353618621826, "learning_rate": 9.005167325853328e-05, "loss": 0.15707879066467284, "memory(GiB)": 122.96, "step": 13400, "token_acc": 0.9370748299319728, "train_speed(iter/s)": 0.243541 }, { "epoch": 1.021419315496608, "eval_loss": 0.11693435907363892, "eval_runtime": 160.762, "eval_samples_per_second": 3.297, "eval_steps_per_second": 3.297, "eval_token_acc": 0.9461252334196735, "step": 13400 }, { "epoch": 1.0218004421068678, "grad_norm": 0.7211269736289978, "learning_rate": 9.004450457138231e-05, "loss": 0.12977230548858643, "memory(GiB)": 122.96, "step": 13405, "token_acc": 0.9463530195896603, "train_speed(iter/s)": 0.242851 }, { "epoch": 1.0221815687171278, "grad_norm": 1.0241775512695312, "learning_rate": 9.003733358786483e-05, "loss": 0.11355311870574951, "memory(GiB)": 122.96, "step": 13410, "token_acc": 0.9634649381261049, "train_speed(iter/s)": 0.24288 }, { "epoch": 1.0225626953273879, "grad_norm": 0.6859497427940369, "learning_rate": 9.003016030839205e-05, "loss": 0.14380919933319092, "memory(GiB)": 122.96, "step": 13415, "token_acc": 0.9430281534027932, "train_speed(iter/s)": 0.2429 }, { "epoch": 1.0229438219376477, "grad_norm": 1.180594801902771, "learning_rate": 9.002298473337535e-05, "loss": 0.14033219814300538, "memory(GiB)": 122.96, "step": 13420, "token_acc": 0.9255184088023699, "train_speed(iter/s)": 0.242933 }, { "epoch": 1.0233249485479077, "grad_norm": 1.17043137550354, "learning_rate": 9.00158068632262e-05, "loss": 0.1468792200088501, "memory(GiB)": 122.96, "step": 13425, "token_acc": 0.9432612029306526, "train_speed(iter/s)": 0.242947 }, { "epoch": 1.0237060751581675, "grad_norm": 0.9427582025527954, "learning_rate": 9.000862669835624e-05, "loss": 0.15820050239562988, "memory(GiB)": 122.96, "step": 13430, "token_acc": 0.9271895152902018, "train_speed(iter/s)": 0.242968 }, { "epoch": 1.0240872017684275, "grad_norm": 1.3545323610305786, "learning_rate": 9.000144423917718e-05, "loss": 0.135398006439209, "memory(GiB)": 122.96, "step": 13435, "token_acc": 0.9559393156875403, "train_speed(iter/s)": 0.242987 }, { "epoch": 1.0244683283786875, "grad_norm": 0.6405087113380432, "learning_rate": 8.999425948610093e-05, "loss": 0.18731287717819214, "memory(GiB)": 122.96, "step": 13440, "token_acc": 0.9419390428906358, "train_speed(iter/s)": 0.242996 }, { "epoch": 1.0248494549889473, "grad_norm": 0.754671573638916, "learning_rate": 8.998707243953949e-05, "loss": 0.13950384855270387, "memory(GiB)": 122.96, "step": 13445, "token_acc": 0.9544454225352113, "train_speed(iter/s)": 0.243003 }, { "epoch": 1.0252305815992073, "grad_norm": 0.8181285262107849, "learning_rate": 8.997988309990501e-05, "loss": 0.15849127769470214, "memory(GiB)": 122.96, "step": 13450, "token_acc": 0.9339890296954795, "train_speed(iter/s)": 0.243023 }, { "epoch": 1.0256117082094671, "grad_norm": 0.8961131572723389, "learning_rate": 8.997269146760976e-05, "loss": 0.13048157691955567, "memory(GiB)": 122.96, "step": 13455, "token_acc": 0.9372661870503597, "train_speed(iter/s)": 0.243048 }, { "epoch": 1.0259928348197271, "grad_norm": 1.0455275774002075, "learning_rate": 8.996549754306615e-05, "loss": 0.133545184135437, "memory(GiB)": 122.96, "step": 13460, "token_acc": 0.9448340345956054, "train_speed(iter/s)": 0.243085 }, { "epoch": 1.026373961429987, "grad_norm": 0.566551923751831, "learning_rate": 8.995830132668674e-05, "loss": 0.12568373680114747, "memory(GiB)": 122.96, "step": 13465, "token_acc": 0.948565482384421, "train_speed(iter/s)": 0.243104 }, { "epoch": 1.026755088040247, "grad_norm": 1.247444987297058, "learning_rate": 8.995110281888416e-05, "loss": 0.1739397406578064, "memory(GiB)": 122.96, "step": 13470, "token_acc": 0.9298029556650246, "train_speed(iter/s)": 0.243127 }, { "epoch": 1.027136214650507, "grad_norm": 0.8384561538696289, "learning_rate": 8.994390202007124e-05, "loss": 0.10171937942504883, "memory(GiB)": 122.96, "step": 13475, "token_acc": 0.95, "train_speed(iter/s)": 0.243157 }, { "epoch": 1.0275173412607668, "grad_norm": 0.4667125344276428, "learning_rate": 8.993669893066088e-05, "loss": 0.09824130535125733, "memory(GiB)": 122.96, "step": 13480, "token_acc": 0.9593291958272812, "train_speed(iter/s)": 0.243173 }, { "epoch": 1.0278984678710268, "grad_norm": 0.8166374564170837, "learning_rate": 8.992949355106619e-05, "loss": 0.14350948333740235, "memory(GiB)": 122.96, "step": 13485, "token_acc": 0.9425333068204415, "train_speed(iter/s)": 0.243197 }, { "epoch": 1.0282795944812866, "grad_norm": 1.6082546710968018, "learning_rate": 8.992228588170032e-05, "loss": 0.1368964672088623, "memory(GiB)": 122.96, "step": 13490, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.243222 }, { "epoch": 1.0286607210915466, "grad_norm": 0.9413194060325623, "learning_rate": 8.991507592297663e-05, "loss": 0.14596915245056152, "memory(GiB)": 122.96, "step": 13495, "token_acc": 0.9421871841520113, "train_speed(iter/s)": 0.243247 }, { "epoch": 1.0290418477018066, "grad_norm": 0.6232536435127258, "learning_rate": 8.990786367530856e-05, "loss": 0.109859037399292, "memory(GiB)": 122.96, "step": 13500, "token_acc": 0.9625079974408189, "train_speed(iter/s)": 0.243249 }, { "epoch": 1.0294229743120664, "grad_norm": 0.7154970169067383, "learning_rate": 8.99006491391097e-05, "loss": 0.14309734106063843, "memory(GiB)": 122.96, "step": 13505, "token_acc": 0.9467871485943775, "train_speed(iter/s)": 0.243268 }, { "epoch": 1.0298041009223264, "grad_norm": 1.215544581413269, "learning_rate": 8.989343231479377e-05, "loss": 0.1301755428314209, "memory(GiB)": 122.96, "step": 13510, "token_acc": 0.9507859281437125, "train_speed(iter/s)": 0.243283 }, { "epoch": 1.0301852275325862, "grad_norm": 1.1776058673858643, "learning_rate": 8.988621320277463e-05, "loss": 0.12737114429473878, "memory(GiB)": 122.96, "step": 13515, "token_acc": 0.947353027200936, "train_speed(iter/s)": 0.243315 }, { "epoch": 1.0305663541428463, "grad_norm": 3.1133933067321777, "learning_rate": 8.987899180346625e-05, "loss": 0.16982563734054565, "memory(GiB)": 122.96, "step": 13520, "token_acc": 0.93134684147795, "train_speed(iter/s)": 0.243345 }, { "epoch": 1.0309474807531063, "grad_norm": 1.10660982131958, "learning_rate": 8.987176811728277e-05, "loss": 0.11491932868957519, "memory(GiB)": 122.96, "step": 13525, "token_acc": 0.9483721826237719, "train_speed(iter/s)": 0.243367 }, { "epoch": 1.031328607363366, "grad_norm": 0.7260339260101318, "learning_rate": 8.986454214463842e-05, "loss": 0.14321138858795165, "memory(GiB)": 122.96, "step": 13530, "token_acc": 0.9523390526470157, "train_speed(iter/s)": 0.243385 }, { "epoch": 1.031709733973626, "grad_norm": 0.5288922786712646, "learning_rate": 8.985731388594755e-05, "loss": 0.09017077684402466, "memory(GiB)": 122.96, "step": 13535, "token_acc": 0.9629013079667063, "train_speed(iter/s)": 0.243413 }, { "epoch": 1.0320908605838859, "grad_norm": 2.512619972229004, "learning_rate": 8.98500833416247e-05, "loss": 0.15156651735305787, "memory(GiB)": 122.96, "step": 13540, "token_acc": 0.9346608587429994, "train_speed(iter/s)": 0.243439 }, { "epoch": 1.032471987194146, "grad_norm": 1.079639196395874, "learning_rate": 8.984285051208449e-05, "loss": 0.14731051921844482, "memory(GiB)": 122.96, "step": 13545, "token_acc": 0.9429579535430755, "train_speed(iter/s)": 0.24347 }, { "epoch": 1.032853113804406, "grad_norm": 0.7776508331298828, "learning_rate": 8.98356153977417e-05, "loss": 0.14867217540740968, "memory(GiB)": 122.96, "step": 13550, "token_acc": 0.945613338626439, "train_speed(iter/s)": 0.243481 }, { "epoch": 1.0332342404146657, "grad_norm": 0.8485033512115479, "learning_rate": 8.982837799901124e-05, "loss": 0.12658956050872802, "memory(GiB)": 122.96, "step": 13555, "token_acc": 0.9404239068168799, "train_speed(iter/s)": 0.243503 }, { "epoch": 1.0336153670249257, "grad_norm": 2.018871545791626, "learning_rate": 8.982113831630812e-05, "loss": 0.20261495113372802, "memory(GiB)": 122.96, "step": 13560, "token_acc": 0.9464012251148545, "train_speed(iter/s)": 0.243528 }, { "epoch": 1.0339964936351855, "grad_norm": 1.4046076536178589, "learning_rate": 8.981389635004749e-05, "loss": 0.1104088306427002, "memory(GiB)": 122.96, "step": 13565, "token_acc": 0.9462447579107892, "train_speed(iter/s)": 0.243559 }, { "epoch": 1.0343776202454456, "grad_norm": 0.9755273461341858, "learning_rate": 8.98066521006447e-05, "loss": 0.07407600283622742, "memory(GiB)": 122.96, "step": 13570, "token_acc": 0.9617224880382775, "train_speed(iter/s)": 0.243582 }, { "epoch": 1.0347587468557056, "grad_norm": 0.8309898376464844, "learning_rate": 8.97994055685151e-05, "loss": 0.11751409769058227, "memory(GiB)": 122.96, "step": 13575, "token_acc": 0.9573117338003503, "train_speed(iter/s)": 0.243602 }, { "epoch": 1.0351398734659654, "grad_norm": 1.7204824686050415, "learning_rate": 8.97921567540743e-05, "loss": 0.14962058067321776, "memory(GiB)": 122.96, "step": 13580, "token_acc": 0.939297124600639, "train_speed(iter/s)": 0.243624 }, { "epoch": 1.0355210000762254, "grad_norm": 1.1543312072753906, "learning_rate": 8.978490565773798e-05, "loss": 0.14077852964401244, "memory(GiB)": 122.96, "step": 13585, "token_acc": 0.9554995801847187, "train_speed(iter/s)": 0.243646 }, { "epoch": 1.0359021266864852, "grad_norm": 0.7658385038375854, "learning_rate": 8.977765227992192e-05, "loss": 0.12712208032608033, "memory(GiB)": 122.96, "step": 13590, "token_acc": 0.9386213408876298, "train_speed(iter/s)": 0.243676 }, { "epoch": 1.0362832532967452, "grad_norm": 0.8528062701225281, "learning_rate": 8.977039662104211e-05, "loss": 0.2031987190246582, "memory(GiB)": 122.96, "step": 13595, "token_acc": 0.9162072767364939, "train_speed(iter/s)": 0.243712 }, { "epoch": 1.036664379907005, "grad_norm": 1.5451500415802002, "learning_rate": 8.97631386815146e-05, "loss": 0.16884269714355468, "memory(GiB)": 122.96, "step": 13600, "token_acc": 0.9259622456484432, "train_speed(iter/s)": 0.243736 }, { "epoch": 1.036664379907005, "eval_loss": 0.1134137436747551, "eval_runtime": 158.0854, "eval_samples_per_second": 3.353, "eval_steps_per_second": 3.353, "eval_token_acc": 0.9462833564243118, "step": 13600 }, { "epoch": 1.037045506517265, "grad_norm": 0.8775653839111328, "learning_rate": 8.975587846175563e-05, "loss": 0.11757107973098754, "memory(GiB)": 122.96, "step": 13605, "token_acc": 0.9465571295866233, "train_speed(iter/s)": 0.243069 }, { "epoch": 1.037426633127525, "grad_norm": 0.8536204695701599, "learning_rate": 8.97486159621815e-05, "loss": 0.12389830350875855, "memory(GiB)": 122.96, "step": 13610, "token_acc": 0.9472337703869523, "train_speed(iter/s)": 0.243099 }, { "epoch": 1.0378077597377848, "grad_norm": 1.0681958198547363, "learning_rate": 8.97413511832087e-05, "loss": 0.12852122783660888, "memory(GiB)": 122.96, "step": 13615, "token_acc": 0.9492719586660404, "train_speed(iter/s)": 0.243111 }, { "epoch": 1.0381888863480448, "grad_norm": 0.701020359992981, "learning_rate": 8.973408412525385e-05, "loss": 0.13967133760452272, "memory(GiB)": 122.96, "step": 13620, "token_acc": 0.9421415153412649, "train_speed(iter/s)": 0.243125 }, { "epoch": 1.0385700129583046, "grad_norm": 0.5792496204376221, "learning_rate": 8.972681478873365e-05, "loss": 0.13783787488937377, "memory(GiB)": 122.96, "step": 13625, "token_acc": 0.9483410331793364, "train_speed(iter/s)": 0.243159 }, { "epoch": 1.0389511395685647, "grad_norm": 1.1490252017974854, "learning_rate": 8.971954317406498e-05, "loss": 0.1697959303855896, "memory(GiB)": 122.96, "step": 13630, "token_acc": 0.9397717666948436, "train_speed(iter/s)": 0.243184 }, { "epoch": 1.0393322661788247, "grad_norm": 0.9637802839279175, "learning_rate": 8.971226928166484e-05, "loss": 0.13800439834594727, "memory(GiB)": 122.96, "step": 13635, "token_acc": 0.9437381660806059, "train_speed(iter/s)": 0.243213 }, { "epoch": 1.0397133927890845, "grad_norm": 1.467260718345642, "learning_rate": 8.970499311195034e-05, "loss": 0.15870745182037355, "memory(GiB)": 122.96, "step": 13640, "token_acc": 0.9407481797639969, "train_speed(iter/s)": 0.243242 }, { "epoch": 1.0400945193993445, "grad_norm": 0.8862055540084839, "learning_rate": 8.969771466533871e-05, "loss": 0.13408082723617554, "memory(GiB)": 122.96, "step": 13645, "token_acc": 0.9406494960806271, "train_speed(iter/s)": 0.243258 }, { "epoch": 1.0404756460096043, "grad_norm": 0.9647778272628784, "learning_rate": 8.96904339422474e-05, "loss": 0.10010493993759155, "memory(GiB)": 122.96, "step": 13650, "token_acc": 0.9520036223681232, "train_speed(iter/s)": 0.243281 }, { "epoch": 1.0408567726198643, "grad_norm": 1.5681296586990356, "learning_rate": 8.968315094309387e-05, "loss": 0.12826627492904663, "memory(GiB)": 122.96, "step": 13655, "token_acc": 0.937682312383983, "train_speed(iter/s)": 0.243306 }, { "epoch": 1.0412378992301243, "grad_norm": 0.8008802533149719, "learning_rate": 8.96758656682958e-05, "loss": 0.12088322639465332, "memory(GiB)": 122.96, "step": 13660, "token_acc": 0.9491942092324501, "train_speed(iter/s)": 0.24333 }, { "epoch": 1.0416190258403841, "grad_norm": 0.9233777523040771, "learning_rate": 8.966857811827094e-05, "loss": 0.12159066200256348, "memory(GiB)": 122.96, "step": 13665, "token_acc": 0.95357095883204, "train_speed(iter/s)": 0.24334 }, { "epoch": 1.0420001524506441, "grad_norm": 1.7468844652175903, "learning_rate": 8.966128829343721e-05, "loss": 0.17538166046142578, "memory(GiB)": 122.96, "step": 13670, "token_acc": 0.9310431293881645, "train_speed(iter/s)": 0.243365 }, { "epoch": 1.042381279060904, "grad_norm": 0.9375073909759521, "learning_rate": 8.965399619421267e-05, "loss": 0.07961732149124146, "memory(GiB)": 122.96, "step": 13675, "token_acc": 0.9574803149606299, "train_speed(iter/s)": 0.24339 }, { "epoch": 1.042762405671164, "grad_norm": 1.428511619567871, "learning_rate": 8.964670182101544e-05, "loss": 0.11720694303512573, "memory(GiB)": 122.96, "step": 13680, "token_acc": 0.9463044851547694, "train_speed(iter/s)": 0.243418 }, { "epoch": 1.043143532281424, "grad_norm": 0.8822304606437683, "learning_rate": 8.963940517426385e-05, "loss": 0.1352558970451355, "memory(GiB)": 122.96, "step": 13685, "token_acc": 0.9495377074746575, "train_speed(iter/s)": 0.243417 }, { "epoch": 1.0435246588916838, "grad_norm": 1.4004019498825073, "learning_rate": 8.963210625437632e-05, "loss": 0.15498522520065308, "memory(GiB)": 122.96, "step": 13690, "token_acc": 0.9524733268671193, "train_speed(iter/s)": 0.243441 }, { "epoch": 1.0439057855019438, "grad_norm": 0.7472198605537415, "learning_rate": 8.96248050617714e-05, "loss": 0.14953227043151857, "memory(GiB)": 122.96, "step": 13695, "token_acc": 0.9458939264328486, "train_speed(iter/s)": 0.24346 }, { "epoch": 1.0442869121122036, "grad_norm": 1.3499114513397217, "learning_rate": 8.961750159686782e-05, "loss": 0.12039797306060791, "memory(GiB)": 122.96, "step": 13700, "token_acc": 0.9536068651112899, "train_speed(iter/s)": 0.243485 }, { "epoch": 1.0446680387224636, "grad_norm": 0.9894657135009766, "learning_rate": 8.961019586008435e-05, "loss": 0.14435898065567015, "memory(GiB)": 122.96, "step": 13705, "token_acc": 0.9399815327793167, "train_speed(iter/s)": 0.243504 }, { "epoch": 1.0450491653327236, "grad_norm": 0.6942718625068665, "learning_rate": 8.960288785183997e-05, "loss": 0.11741418838500976, "memory(GiB)": 122.96, "step": 13710, "token_acc": 0.9460138104205901, "train_speed(iter/s)": 0.24353 }, { "epoch": 1.0454302919429834, "grad_norm": 0.8321689367294312, "learning_rate": 8.959557757255375e-05, "loss": 0.21252524852752686, "memory(GiB)": 122.96, "step": 13715, "token_acc": 0.9420043127908296, "train_speed(iter/s)": 0.243549 }, { "epoch": 1.0458114185532434, "grad_norm": 1.1780673265457153, "learning_rate": 8.95882650226449e-05, "loss": 0.1004792332649231, "memory(GiB)": 122.96, "step": 13720, "token_acc": 0.9627896613190731, "train_speed(iter/s)": 0.243571 }, { "epoch": 1.0461925451635032, "grad_norm": 1.1518492698669434, "learning_rate": 8.958095020253277e-05, "loss": 0.092930269241333, "memory(GiB)": 122.96, "step": 13725, "token_acc": 0.9592657782247925, "train_speed(iter/s)": 0.243595 }, { "epoch": 1.0465736717737633, "grad_norm": 0.7837052345275879, "learning_rate": 8.957363311263682e-05, "loss": 0.13908870220184327, "memory(GiB)": 122.96, "step": 13730, "token_acc": 0.9449851042701093, "train_speed(iter/s)": 0.24362 }, { "epoch": 1.0469547983840233, "grad_norm": 1.083979606628418, "learning_rate": 8.956631375337665e-05, "loss": 0.1276944398880005, "memory(GiB)": 122.96, "step": 13735, "token_acc": 0.9389770723104056, "train_speed(iter/s)": 0.24365 }, { "epoch": 1.047335924994283, "grad_norm": 0.7969292998313904, "learning_rate": 8.9558992125172e-05, "loss": 0.09670544862747192, "memory(GiB)": 122.96, "step": 13740, "token_acc": 0.9632058287795993, "train_speed(iter/s)": 0.243671 }, { "epoch": 1.047717051604543, "grad_norm": 0.7863472700119019, "learning_rate": 8.955166822844274e-05, "loss": 0.13300952911376954, "memory(GiB)": 122.96, "step": 13745, "token_acc": 0.944616853664359, "train_speed(iter/s)": 0.243696 }, { "epoch": 1.048098178214803, "grad_norm": 1.2049895524978638, "learning_rate": 8.954434206360884e-05, "loss": 0.10193939208984375, "memory(GiB)": 122.96, "step": 13750, "token_acc": 0.9538761368557818, "train_speed(iter/s)": 0.243723 }, { "epoch": 1.048479304825063, "grad_norm": 1.3849700689315796, "learning_rate": 8.953701363109042e-05, "loss": 0.17364826202392578, "memory(GiB)": 122.96, "step": 13755, "token_acc": 0.9324018902814876, "train_speed(iter/s)": 0.243749 }, { "epoch": 1.048860431435323, "grad_norm": 0.8267444968223572, "learning_rate": 8.952968293130774e-05, "loss": 0.11265556812286377, "memory(GiB)": 122.96, "step": 13760, "token_acc": 0.9510081358330386, "train_speed(iter/s)": 0.243765 }, { "epoch": 1.0492415580455827, "grad_norm": 0.925733745098114, "learning_rate": 8.95223499646812e-05, "loss": 0.16342424154281615, "memory(GiB)": 122.96, "step": 13765, "token_acc": 0.9317387798978769, "train_speed(iter/s)": 0.243793 }, { "epoch": 1.0496226846558427, "grad_norm": 0.797872006893158, "learning_rate": 8.951501473163129e-05, "loss": 0.1494942545890808, "memory(GiB)": 122.96, "step": 13770, "token_acc": 0.9489919844547, "train_speed(iter/s)": 0.243822 }, { "epoch": 1.0500038112661025, "grad_norm": 1.821158528327942, "learning_rate": 8.950767723257867e-05, "loss": 0.16833198070526123, "memory(GiB)": 122.96, "step": 13775, "token_acc": 0.9272308198880475, "train_speed(iter/s)": 0.243851 }, { "epoch": 1.0503849378763626, "grad_norm": 0.7974928021430969, "learning_rate": 8.950033746794409e-05, "loss": 0.13624104261398315, "memory(GiB)": 122.96, "step": 13780, "token_acc": 0.9484106305367379, "train_speed(iter/s)": 0.243858 }, { "epoch": 1.0507660644866224, "grad_norm": 0.9187918901443481, "learning_rate": 8.949299543814844e-05, "loss": 0.1357928156852722, "memory(GiB)": 122.96, "step": 13785, "token_acc": 0.9439864722046079, "train_speed(iter/s)": 0.243882 }, { "epoch": 1.0511471910968824, "grad_norm": 0.7949797511100769, "learning_rate": 8.94856511436128e-05, "loss": 0.1301784634590149, "memory(GiB)": 122.96, "step": 13790, "token_acc": 0.9540005575689992, "train_speed(iter/s)": 0.243908 }, { "epoch": 1.0515283177071424, "grad_norm": 0.05753763020038605, "learning_rate": 8.94783045847583e-05, "loss": 0.10907024145126343, "memory(GiB)": 122.96, "step": 13795, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.24393 }, { "epoch": 1.0519094443174022, "grad_norm": 0.9887524843215942, "learning_rate": 8.947095576200621e-05, "loss": 0.12767888307571412, "memory(GiB)": 122.96, "step": 13800, "token_acc": 0.9526031434184676, "train_speed(iter/s)": 0.243952 }, { "epoch": 1.0519094443174022, "eval_loss": 0.11490591615438461, "eval_runtime": 160.0928, "eval_samples_per_second": 3.311, "eval_steps_per_second": 3.311, "eval_token_acc": 0.9462381784229865, "step": 13800 }, { "epoch": 1.0522905709276622, "grad_norm": 1.1243432760238647, "learning_rate": 8.946360467577799e-05, "loss": 0.14066786766052247, "memory(GiB)": 122.96, "step": 13805, "token_acc": 0.9462167150013182, "train_speed(iter/s)": 0.24329 }, { "epoch": 1.052671697537922, "grad_norm": 1.9596192836761475, "learning_rate": 8.945625132649518e-05, "loss": 0.12184176445007325, "memory(GiB)": 122.96, "step": 13810, "token_acc": 0.9519650655021834, "train_speed(iter/s)": 0.243325 }, { "epoch": 1.053052824148182, "grad_norm": 0.831046998500824, "learning_rate": 8.944889571457944e-05, "loss": 0.1344318151473999, "memory(GiB)": 122.96, "step": 13815, "token_acc": 0.9370733155238943, "train_speed(iter/s)": 0.243346 }, { "epoch": 1.053433950758442, "grad_norm": 0.572076678276062, "learning_rate": 8.944153784045262e-05, "loss": 0.13695544004440308, "memory(GiB)": 122.96, "step": 13820, "token_acc": 0.9485861182519281, "train_speed(iter/s)": 0.243376 }, { "epoch": 1.0538150773687018, "grad_norm": 0.5379874110221863, "learning_rate": 8.943417770453662e-05, "loss": 0.15187511444091797, "memory(GiB)": 122.96, "step": 13825, "token_acc": 0.9423661800486618, "train_speed(iter/s)": 0.243388 }, { "epoch": 1.0541962039789619, "grad_norm": 1.014495611190796, "learning_rate": 8.942681530725352e-05, "loss": 0.19357091188430786, "memory(GiB)": 122.96, "step": 13830, "token_acc": 0.932656023222061, "train_speed(iter/s)": 0.243416 }, { "epoch": 1.0545773305892217, "grad_norm": 0.47639355063438416, "learning_rate": 8.941945064902553e-05, "loss": 0.15310170650482177, "memory(GiB)": 122.96, "step": 13835, "token_acc": 0.9466460799451021, "train_speed(iter/s)": 0.243432 }, { "epoch": 1.0549584571994817, "grad_norm": 0.8637543320655823, "learning_rate": 8.941208373027498e-05, "loss": 0.13954622745513917, "memory(GiB)": 122.96, "step": 13840, "token_acc": 0.9377431906614786, "train_speed(iter/s)": 0.243458 }, { "epoch": 1.0553395838097417, "grad_norm": 0.7267767190933228, "learning_rate": 8.940471455142432e-05, "loss": 0.14207327365875244, "memory(GiB)": 122.96, "step": 13845, "token_acc": 0.9513824168996583, "train_speed(iter/s)": 0.243474 }, { "epoch": 1.0557207104200015, "grad_norm": 0.7471698522567749, "learning_rate": 8.939734311289614e-05, "loss": 0.15841224193572997, "memory(GiB)": 122.96, "step": 13850, "token_acc": 0.9506980802792321, "train_speed(iter/s)": 0.243486 }, { "epoch": 1.0561018370302615, "grad_norm": 1.390010952949524, "learning_rate": 8.938996941511316e-05, "loss": 0.08919198513031006, "memory(GiB)": 122.96, "step": 13855, "token_acc": 0.9586956521739131, "train_speed(iter/s)": 0.243514 }, { "epoch": 1.0564829636405213, "grad_norm": 1.0175960063934326, "learning_rate": 8.938259345849822e-05, "loss": 0.13083903789520263, "memory(GiB)": 122.96, "step": 13860, "token_acc": 0.9539399412960036, "train_speed(iter/s)": 0.243533 }, { "epoch": 1.0568640902507813, "grad_norm": 0.5142428278923035, "learning_rate": 8.937521524347432e-05, "loss": 0.15616945028305054, "memory(GiB)": 122.96, "step": 13865, "token_acc": 0.941733160141028, "train_speed(iter/s)": 0.243546 }, { "epoch": 1.0572452168610413, "grad_norm": 1.099473237991333, "learning_rate": 8.936783477046453e-05, "loss": 0.11118948459625244, "memory(GiB)": 122.96, "step": 13870, "token_acc": 0.9517506404782238, "train_speed(iter/s)": 0.243567 }, { "epoch": 1.0576263434713011, "grad_norm": 0.9400036334991455, "learning_rate": 8.93604520398921e-05, "loss": 0.1440316319465637, "memory(GiB)": 122.96, "step": 13875, "token_acc": 0.9414990859232175, "train_speed(iter/s)": 0.243591 }, { "epoch": 1.0580074700815612, "grad_norm": 0.4430847465991974, "learning_rate": 8.935306705218041e-05, "loss": 0.11485897302627564, "memory(GiB)": 122.96, "step": 13880, "token_acc": 0.9565094871313169, "train_speed(iter/s)": 0.243604 }, { "epoch": 1.058388596691821, "grad_norm": 0.7600257396697998, "learning_rate": 8.934567980775294e-05, "loss": 0.15391680002212524, "memory(GiB)": 122.96, "step": 13885, "token_acc": 0.9308956725930896, "train_speed(iter/s)": 0.243622 }, { "epoch": 1.058769723302081, "grad_norm": 1.4620006084442139, "learning_rate": 8.933829030703334e-05, "loss": 0.11927628517150879, "memory(GiB)": 122.96, "step": 13890, "token_acc": 0.9592225609756098, "train_speed(iter/s)": 0.243652 }, { "epoch": 1.059150849912341, "grad_norm": 0.852737307548523, "learning_rate": 8.933089855044533e-05, "loss": 0.15493178367614746, "memory(GiB)": 122.96, "step": 13895, "token_acc": 0.9485917882592467, "train_speed(iter/s)": 0.243662 }, { "epoch": 1.0595319765226008, "grad_norm": 0.7927077412605286, "learning_rate": 8.932350453841281e-05, "loss": 0.1438792824745178, "memory(GiB)": 122.96, "step": 13900, "token_acc": 0.9544666088464874, "train_speed(iter/s)": 0.243684 }, { "epoch": 1.0599131031328608, "grad_norm": 1.4425526857376099, "learning_rate": 8.931610827135978e-05, "loss": 0.13518056869506836, "memory(GiB)": 122.96, "step": 13905, "token_acc": 0.9502220144893667, "train_speed(iter/s)": 0.24371 }, { "epoch": 1.0602942297431206, "grad_norm": 0.9809650182723999, "learning_rate": 8.93087097497104e-05, "loss": 0.1633934736251831, "memory(GiB)": 122.96, "step": 13910, "token_acc": 0.9278074866310161, "train_speed(iter/s)": 0.243727 }, { "epoch": 1.0606753563533806, "grad_norm": 0.5744962692260742, "learning_rate": 8.93013089738889e-05, "loss": 0.12379004955291747, "memory(GiB)": 122.96, "step": 13915, "token_acc": 0.9547186729432863, "train_speed(iter/s)": 0.24375 }, { "epoch": 1.0610564829636404, "grad_norm": 0.33915218710899353, "learning_rate": 8.929390594431974e-05, "loss": 0.13455127477645873, "memory(GiB)": 122.96, "step": 13920, "token_acc": 0.9553587565515995, "train_speed(iter/s)": 0.243768 }, { "epoch": 1.0614376095739004, "grad_norm": 0.7223771810531616, "learning_rate": 8.928650066142742e-05, "loss": 0.09828578233718872, "memory(GiB)": 122.96, "step": 13925, "token_acc": 0.9656078860898138, "train_speed(iter/s)": 0.243787 }, { "epoch": 1.0618187361841604, "grad_norm": 1.0227817296981812, "learning_rate": 8.92790931256366e-05, "loss": 0.14081578254699706, "memory(GiB)": 122.96, "step": 13930, "token_acc": 0.9445037353255069, "train_speed(iter/s)": 0.243811 }, { "epoch": 1.0621998627944202, "grad_norm": 1.1046584844589233, "learning_rate": 8.927168333737205e-05, "loss": 0.11488572359085084, "memory(GiB)": 122.96, "step": 13935, "token_acc": 0.9477968004490598, "train_speed(iter/s)": 0.243836 }, { "epoch": 1.0625809894046803, "grad_norm": 0.7464026212692261, "learning_rate": 8.926427129705872e-05, "loss": 0.13014965057373046, "memory(GiB)": 122.96, "step": 13940, "token_acc": 0.9396619807297425, "train_speed(iter/s)": 0.243852 }, { "epoch": 1.06296211601494, "grad_norm": 1.3379030227661133, "learning_rate": 8.925685700512161e-05, "loss": 0.11515038013458252, "memory(GiB)": 122.96, "step": 13945, "token_acc": 0.9568113450795015, "train_speed(iter/s)": 0.243877 }, { "epoch": 1.0633432426252, "grad_norm": 1.869718313217163, "learning_rate": 8.924944046198596e-05, "loss": 0.2573657512664795, "memory(GiB)": 122.96, "step": 13950, "token_acc": 0.8844953173777316, "train_speed(iter/s)": 0.243908 }, { "epoch": 1.06372436923546, "grad_norm": 0.8353025317192078, "learning_rate": 8.924202166807702e-05, "loss": 0.14185887575149536, "memory(GiB)": 122.96, "step": 13955, "token_acc": 0.9480432972522898, "train_speed(iter/s)": 0.243926 }, { "epoch": 1.06410549584572, "grad_norm": 1.9787334203720093, "learning_rate": 8.923460062382026e-05, "loss": 0.18336135149002075, "memory(GiB)": 122.96, "step": 13960, "token_acc": 0.9223648850403106, "train_speed(iter/s)": 0.243954 }, { "epoch": 1.06448662245598, "grad_norm": 0.9908289909362793, "learning_rate": 8.922717732964121e-05, "loss": 0.20720796585083007, "memory(GiB)": 122.96, "step": 13965, "token_acc": 0.9230355220667384, "train_speed(iter/s)": 0.243977 }, { "epoch": 1.0648677490662397, "grad_norm": 1.0205495357513428, "learning_rate": 8.921975178596558e-05, "loss": 0.20223214626312255, "memory(GiB)": 122.96, "step": 13970, "token_acc": 0.9097416744475568, "train_speed(iter/s)": 0.243994 }, { "epoch": 1.0652488756764997, "grad_norm": 0.74333256483078, "learning_rate": 8.921232399321919e-05, "loss": 0.14606393575668336, "memory(GiB)": 122.96, "step": 13975, "token_acc": 0.9411417322834645, "train_speed(iter/s)": 0.244013 }, { "epoch": 1.0656300022867597, "grad_norm": 1.5916481018066406, "learning_rate": 8.920489395182798e-05, "loss": 0.1726184606552124, "memory(GiB)": 122.96, "step": 13980, "token_acc": 0.9316843345111896, "train_speed(iter/s)": 0.244044 }, { "epoch": 1.0660111288970195, "grad_norm": 0.7216887474060059, "learning_rate": 8.919746166221802e-05, "loss": 0.1451572895050049, "memory(GiB)": 122.96, "step": 13985, "token_acc": 0.945054945054945, "train_speed(iter/s)": 0.244066 }, { "epoch": 1.0663922555072796, "grad_norm": 0.9098125100135803, "learning_rate": 8.919002712481557e-05, "loss": 0.1438741445541382, "memory(GiB)": 122.96, "step": 13990, "token_acc": 0.9410542716177617, "train_speed(iter/s)": 0.244093 }, { "epoch": 1.0667733821175394, "grad_norm": 1.905616283416748, "learning_rate": 8.918259034004691e-05, "loss": 0.12207678556442261, "memory(GiB)": 122.96, "step": 13995, "token_acc": 0.9657553551296505, "train_speed(iter/s)": 0.244103 }, { "epoch": 1.0671545087277994, "grad_norm": 0.7123406529426575, "learning_rate": 8.917515130833851e-05, "loss": 0.08823396563529969, "memory(GiB)": 122.96, "step": 14000, "token_acc": 0.9643886372993001, "train_speed(iter/s)": 0.244126 }, { "epoch": 1.0671545087277994, "eval_loss": 0.11337019503116608, "eval_runtime": 161.096, "eval_samples_per_second": 3.29, "eval_steps_per_second": 3.29, "eval_token_acc": 0.9467652551051141, "step": 14000 }, { "epoch": 1.0675356353380594, "grad_norm": 0.6830775737762451, "learning_rate": 8.916771003011699e-05, "loss": 0.17265145778656005, "memory(GiB)": 122.96, "step": 14005, "token_acc": 0.9463833503129093, "train_speed(iter/s)": 0.243459 }, { "epoch": 1.0679167619483192, "grad_norm": 0.462451696395874, "learning_rate": 8.916026650580906e-05, "loss": 0.12832536697387695, "memory(GiB)": 122.96, "step": 14010, "token_acc": 0.9569700827498409, "train_speed(iter/s)": 0.243468 }, { "epoch": 1.0682978885585792, "grad_norm": 1.1834667921066284, "learning_rate": 8.915282073584157e-05, "loss": 0.12904551029205322, "memory(GiB)": 122.96, "step": 14015, "token_acc": 0.9385325558794947, "train_speed(iter/s)": 0.243495 }, { "epoch": 1.068679015168839, "grad_norm": 1.362908959388733, "learning_rate": 8.91453727206415e-05, "loss": 0.11929768323898315, "memory(GiB)": 122.96, "step": 14020, "token_acc": 0.9573584905660377, "train_speed(iter/s)": 0.243525 }, { "epoch": 1.069060141779099, "grad_norm": 0.9988250136375427, "learning_rate": 8.913792246063596e-05, "loss": 0.156122624874115, "memory(GiB)": 122.96, "step": 14025, "token_acc": 0.9432242022378782, "train_speed(iter/s)": 0.243551 }, { "epoch": 1.069441268389359, "grad_norm": 1.387105107307434, "learning_rate": 8.91304699562522e-05, "loss": 0.17082748413085938, "memory(GiB)": 122.96, "step": 14030, "token_acc": 0.9485873089393237, "train_speed(iter/s)": 0.243561 }, { "epoch": 1.0698223949996188, "grad_norm": 1.707722783088684, "learning_rate": 8.912301520791757e-05, "loss": 0.1387547731399536, "memory(GiB)": 122.96, "step": 14035, "token_acc": 0.9413847364280095, "train_speed(iter/s)": 0.243593 }, { "epoch": 1.0702035216098789, "grad_norm": 0.7509340047836304, "learning_rate": 8.911555821605957e-05, "loss": 0.12211596965789795, "memory(GiB)": 122.96, "step": 14040, "token_acc": 0.952988792029888, "train_speed(iter/s)": 0.243616 }, { "epoch": 1.0705846482201387, "grad_norm": 0.7022151947021484, "learning_rate": 8.910809898110582e-05, "loss": 0.08998556733131409, "memory(GiB)": 122.96, "step": 14045, "token_acc": 0.964412306819793, "train_speed(iter/s)": 0.243626 }, { "epoch": 1.0709657748303987, "grad_norm": 0.962763249874115, "learning_rate": 8.910063750348408e-05, "loss": 0.19588856697082518, "memory(GiB)": 122.96, "step": 14050, "token_acc": 0.9306442251886244, "train_speed(iter/s)": 0.24365 }, { "epoch": 1.0713469014406587, "grad_norm": 0.9701728224754333, "learning_rate": 8.909317378362223e-05, "loss": 0.1199905276298523, "memory(GiB)": 122.96, "step": 14055, "token_acc": 0.9503513942416686, "train_speed(iter/s)": 0.243674 }, { "epoch": 1.0717280280509185, "grad_norm": 0.4670375883579254, "learning_rate": 8.908570782194829e-05, "loss": 0.1108126163482666, "memory(GiB)": 122.96, "step": 14060, "token_acc": 0.9580820265379976, "train_speed(iter/s)": 0.243685 }, { "epoch": 1.0721091546611785, "grad_norm": 0.8073314428329468, "learning_rate": 8.907823961889037e-05, "loss": 0.11033324003219605, "memory(GiB)": 122.96, "step": 14065, "token_acc": 0.9592572062084257, "train_speed(iter/s)": 0.243709 }, { "epoch": 1.0724902812714383, "grad_norm": 0.7255701422691345, "learning_rate": 8.907076917487676e-05, "loss": 0.13623740673065185, "memory(GiB)": 122.96, "step": 14070, "token_acc": 0.9529576746557878, "train_speed(iter/s)": 0.243716 }, { "epoch": 1.0728714078816983, "grad_norm": 1.1095976829528809, "learning_rate": 8.906329649033585e-05, "loss": 0.134912109375, "memory(GiB)": 122.96, "step": 14075, "token_acc": 0.9330181245074862, "train_speed(iter/s)": 0.243744 }, { "epoch": 1.0732525344919583, "grad_norm": 1.7091161012649536, "learning_rate": 8.905582156569615e-05, "loss": 0.145004940032959, "memory(GiB)": 122.96, "step": 14080, "token_acc": 0.9442388354305453, "train_speed(iter/s)": 0.24377 }, { "epoch": 1.0736336611022181, "grad_norm": 1.3228789567947388, "learning_rate": 8.904834440138633e-05, "loss": 0.12184611558914185, "memory(GiB)": 122.96, "step": 14085, "token_acc": 0.9559861799609434, "train_speed(iter/s)": 0.243784 }, { "epoch": 1.0740147877124782, "grad_norm": 0.9856445789337158, "learning_rate": 8.904086499783517e-05, "loss": 0.14568943977355958, "memory(GiB)": 122.96, "step": 14090, "token_acc": 0.9472259810554804, "train_speed(iter/s)": 0.243802 }, { "epoch": 1.074395914322738, "grad_norm": 0.36612966656684875, "learning_rate": 8.903338335547157e-05, "loss": 0.089752459526062, "memory(GiB)": 122.96, "step": 14095, "token_acc": 0.9629165173772841, "train_speed(iter/s)": 0.243828 }, { "epoch": 1.074777040932998, "grad_norm": 0.8229888677597046, "learning_rate": 8.902589947472457e-05, "loss": 0.13899683952331543, "memory(GiB)": 122.96, "step": 14100, "token_acc": 0.9489256780556534, "train_speed(iter/s)": 0.243858 }, { "epoch": 1.0751581675432578, "grad_norm": 0.5527825951576233, "learning_rate": 8.901841335602334e-05, "loss": 0.08099815249443054, "memory(GiB)": 122.96, "step": 14105, "token_acc": 0.9730568256041803, "train_speed(iter/s)": 0.243871 }, { "epoch": 1.0755392941535178, "grad_norm": 1.2650723457336426, "learning_rate": 8.901092499979718e-05, "loss": 0.14886187314987182, "memory(GiB)": 122.96, "step": 14110, "token_acc": 0.9340937896070975, "train_speed(iter/s)": 0.243881 }, { "epoch": 1.0759204207637778, "grad_norm": 1.1389076709747314, "learning_rate": 8.90034344064755e-05, "loss": 0.17556400299072267, "memory(GiB)": 122.96, "step": 14115, "token_acc": 0.9383792909397861, "train_speed(iter/s)": 0.243906 }, { "epoch": 1.0763015473740376, "grad_norm": 1.5875388383865356, "learning_rate": 8.899594157648784e-05, "loss": 0.16824105978012086, "memory(GiB)": 122.96, "step": 14120, "token_acc": 0.9340369393139841, "train_speed(iter/s)": 0.243917 }, { "epoch": 1.0766826739842976, "grad_norm": 1.0428729057312012, "learning_rate": 8.89884465102639e-05, "loss": 0.12246975898742676, "memory(GiB)": 122.96, "step": 14125, "token_acc": 0.9405383688228204, "train_speed(iter/s)": 0.243947 }, { "epoch": 1.0770638005945574, "grad_norm": 1.0009219646453857, "learning_rate": 8.898094920823349e-05, "loss": 0.1099663257598877, "memory(GiB)": 122.96, "step": 14130, "token_acc": 0.9516752577319587, "train_speed(iter/s)": 0.243972 }, { "epoch": 1.0774449272048174, "grad_norm": 1.3694859743118286, "learning_rate": 8.897344967082652e-05, "loss": 0.12390153408050537, "memory(GiB)": 122.96, "step": 14135, "token_acc": 0.9482558139534883, "train_speed(iter/s)": 0.243989 }, { "epoch": 1.0778260538150775, "grad_norm": 1.509225606918335, "learning_rate": 8.89659478984731e-05, "loss": 0.13058322668075562, "memory(GiB)": 122.96, "step": 14140, "token_acc": 0.9525389497980381, "train_speed(iter/s)": 0.244004 }, { "epoch": 1.0782071804253373, "grad_norm": 0.6984720826148987, "learning_rate": 8.895844389160338e-05, "loss": 0.10018734931945801, "memory(GiB)": 122.96, "step": 14145, "token_acc": 0.9524241051200725, "train_speed(iter/s)": 0.244016 }, { "epoch": 1.0785883070355973, "grad_norm": 0.47876617312431335, "learning_rate": 8.895093765064767e-05, "loss": 0.10984331369400024, "memory(GiB)": 122.96, "step": 14150, "token_acc": 0.956457345971564, "train_speed(iter/s)": 0.244041 }, { "epoch": 1.078969433645857, "grad_norm": 1.0510331392288208, "learning_rate": 8.894342917603646e-05, "loss": 0.17121741771697999, "memory(GiB)": 122.96, "step": 14155, "token_acc": 0.9340433482810164, "train_speed(iter/s)": 0.244065 }, { "epoch": 1.079350560256117, "grad_norm": 0.6981080174446106, "learning_rate": 8.893591846820031e-05, "loss": 0.11933771371841431, "memory(GiB)": 122.96, "step": 14160, "token_acc": 0.9552810161823561, "train_speed(iter/s)": 0.244081 }, { "epoch": 1.079731686866377, "grad_norm": 1.4020987749099731, "learning_rate": 8.892840552756991e-05, "loss": 0.0819251537322998, "memory(GiB)": 122.96, "step": 14165, "token_acc": 0.9652358992550549, "train_speed(iter/s)": 0.244098 }, { "epoch": 1.080112813476637, "grad_norm": 0.5612790584564209, "learning_rate": 8.89208903545761e-05, "loss": 0.10373153686523437, "memory(GiB)": 122.96, "step": 14170, "token_acc": 0.9556891249650545, "train_speed(iter/s)": 0.244103 }, { "epoch": 1.080493940086897, "grad_norm": 1.0501195192337036, "learning_rate": 8.891337294964985e-05, "loss": 0.14267386198043824, "memory(GiB)": 122.96, "step": 14175, "token_acc": 0.9438502673796791, "train_speed(iter/s)": 0.244127 }, { "epoch": 1.0808750666971567, "grad_norm": 0.9344440698623657, "learning_rate": 8.890585331322224e-05, "loss": 0.14275239706039428, "memory(GiB)": 122.96, "step": 14180, "token_acc": 0.9444655281467426, "train_speed(iter/s)": 0.24413 }, { "epoch": 1.0812561933074167, "grad_norm": 1.2886723279953003, "learning_rate": 8.889833144572449e-05, "loss": 0.12619506120681762, "memory(GiB)": 122.96, "step": 14185, "token_acc": 0.9417791104447776, "train_speed(iter/s)": 0.244155 }, { "epoch": 1.0816373199176768, "grad_norm": 0.8645256161689758, "learning_rate": 8.889080734758795e-05, "loss": 0.1693003296852112, "memory(GiB)": 122.96, "step": 14190, "token_acc": 0.9369737177851493, "train_speed(iter/s)": 0.244162 }, { "epoch": 1.0820184465279366, "grad_norm": 0.654602587223053, "learning_rate": 8.888328101924407e-05, "loss": 0.1439509153366089, "memory(GiB)": 122.96, "step": 14195, "token_acc": 0.9446881091617934, "train_speed(iter/s)": 0.244181 }, { "epoch": 1.0823995731381966, "grad_norm": 0.6642621755599976, "learning_rate": 8.887575246112447e-05, "loss": 0.20056264400482177, "memory(GiB)": 122.96, "step": 14200, "token_acc": 0.9385435168738899, "train_speed(iter/s)": 0.244183 }, { "epoch": 1.0823995731381966, "eval_loss": 0.11249065399169922, "eval_runtime": 161.3497, "eval_samples_per_second": 3.285, "eval_steps_per_second": 3.285, "eval_token_acc": 0.9476386964640684, "step": 14200 }, { "epoch": 1.0827806997484564, "grad_norm": 0.8324598670005798, "learning_rate": 8.886822167366086e-05, "loss": 0.14480102062225342, "memory(GiB)": 122.96, "step": 14205, "token_acc": 0.9473828142322872, "train_speed(iter/s)": 0.243537 }, { "epoch": 1.0831618263587164, "grad_norm": 0.8573809266090393, "learning_rate": 8.886068865728513e-05, "loss": 0.1487090229988098, "memory(GiB)": 122.96, "step": 14210, "token_acc": 0.9468797236018138, "train_speed(iter/s)": 0.243561 }, { "epoch": 1.0835429529689762, "grad_norm": 0.9249890446662903, "learning_rate": 8.885315341242923e-05, "loss": 0.1498643636703491, "memory(GiB)": 122.96, "step": 14215, "token_acc": 0.9458754483208347, "train_speed(iter/s)": 0.243586 }, { "epoch": 1.0839240795792362, "grad_norm": 1.3766473531723022, "learning_rate": 8.884561593952528e-05, "loss": 0.12732200622558593, "memory(GiB)": 122.96, "step": 14220, "token_acc": 0.9499175597691674, "train_speed(iter/s)": 0.243608 }, { "epoch": 1.0843052061894962, "grad_norm": 1.398710012435913, "learning_rate": 8.883807623900552e-05, "loss": 0.1911759376525879, "memory(GiB)": 122.96, "step": 14225, "token_acc": 0.931395105626438, "train_speed(iter/s)": 0.243631 }, { "epoch": 1.084686332799756, "grad_norm": 1.1975528001785278, "learning_rate": 8.883053431130233e-05, "loss": 0.16268826723098756, "memory(GiB)": 122.96, "step": 14230, "token_acc": 0.9468698517298187, "train_speed(iter/s)": 0.24365 }, { "epoch": 1.085067459410016, "grad_norm": 0.9337657690048218, "learning_rate": 8.882299015684818e-05, "loss": 0.06861156821250916, "memory(GiB)": 122.96, "step": 14235, "token_acc": 0.9583945178658835, "train_speed(iter/s)": 0.243682 }, { "epoch": 1.0854485860202758, "grad_norm": 0.857903778553009, "learning_rate": 8.881544377607571e-05, "loss": 0.12826788425445557, "memory(GiB)": 122.96, "step": 14240, "token_acc": 0.9514312529328953, "train_speed(iter/s)": 0.243704 }, { "epoch": 1.0858297126305358, "grad_norm": 0.7988215088844299, "learning_rate": 8.880789516941766e-05, "loss": 0.14227854013442992, "memory(GiB)": 122.96, "step": 14245, "token_acc": 0.9469325153374233, "train_speed(iter/s)": 0.243729 }, { "epoch": 1.0862108392407959, "grad_norm": 0.8608855605125427, "learning_rate": 8.880034433730694e-05, "loss": 0.15314873456954955, "memory(GiB)": 122.96, "step": 14250, "token_acc": 0.9539914163090129, "train_speed(iter/s)": 0.243748 }, { "epoch": 1.0865919658510557, "grad_norm": 1.296857476234436, "learning_rate": 8.879279128017647e-05, "loss": 0.154669451713562, "memory(GiB)": 122.96, "step": 14255, "token_acc": 0.9459850494333253, "train_speed(iter/s)": 0.243771 }, { "epoch": 1.0869730924613157, "grad_norm": 1.24712073802948, "learning_rate": 8.878523599845949e-05, "loss": 0.12981812953948973, "memory(GiB)": 122.96, "step": 14260, "token_acc": 0.9557449789817842, "train_speed(iter/s)": 0.243779 }, { "epoch": 1.0873542190715755, "grad_norm": 0.8143262267112732, "learning_rate": 8.87776784925892e-05, "loss": 0.11611336469650269, "memory(GiB)": 122.96, "step": 14265, "token_acc": 0.957934131736527, "train_speed(iter/s)": 0.243791 }, { "epoch": 1.0877353456818355, "grad_norm": 0.7102507948875427, "learning_rate": 8.877011876299899e-05, "loss": 0.126455557346344, "memory(GiB)": 122.96, "step": 14270, "token_acc": 0.9401843939734652, "train_speed(iter/s)": 0.243817 }, { "epoch": 1.0881164722920955, "grad_norm": 0.7546711564064026, "learning_rate": 8.87625568101224e-05, "loss": 0.11931931972503662, "memory(GiB)": 122.96, "step": 14275, "token_acc": 0.9574883948204251, "train_speed(iter/s)": 0.243845 }, { "epoch": 1.0884975989023553, "grad_norm": 0.9797837734222412, "learning_rate": 8.875499263439304e-05, "loss": 0.10101989507675171, "memory(GiB)": 122.96, "step": 14280, "token_acc": 0.9583673469387755, "train_speed(iter/s)": 0.243873 }, { "epoch": 1.0888787255126153, "grad_norm": 1.045759916305542, "learning_rate": 8.87474262362447e-05, "loss": 0.11701341867446899, "memory(GiB)": 122.96, "step": 14285, "token_acc": 0.9603614705278292, "train_speed(iter/s)": 0.243896 }, { "epoch": 1.0892598521228751, "grad_norm": 0.932837963104248, "learning_rate": 8.873985761611128e-05, "loss": 0.11453988552093505, "memory(GiB)": 122.96, "step": 14290, "token_acc": 0.9497159090909091, "train_speed(iter/s)": 0.243904 }, { "epoch": 1.0896409787331351, "grad_norm": 1.546210527420044, "learning_rate": 8.873228677442681e-05, "loss": 0.1746220827102661, "memory(GiB)": 122.96, "step": 14295, "token_acc": 0.9386949924127466, "train_speed(iter/s)": 0.243932 }, { "epoch": 1.0900221053433952, "grad_norm": 0.7339175939559937, "learning_rate": 8.872471371162543e-05, "loss": 0.15134212970733643, "memory(GiB)": 122.96, "step": 14300, "token_acc": 0.9432355723746452, "train_speed(iter/s)": 0.243952 }, { "epoch": 1.090403231953655, "grad_norm": 1.3828098773956299, "learning_rate": 8.871713842814141e-05, "loss": 0.17620112895965576, "memory(GiB)": 122.96, "step": 14305, "token_acc": 0.9315367807720321, "train_speed(iter/s)": 0.243974 }, { "epoch": 1.090784358563915, "grad_norm": 0.9878448247909546, "learning_rate": 8.870956092440918e-05, "loss": 0.12643036842346192, "memory(GiB)": 122.96, "step": 14310, "token_acc": 0.9470553242117787, "train_speed(iter/s)": 0.243985 }, { "epoch": 1.0911654851741748, "grad_norm": 1.2611618041992188, "learning_rate": 8.870198120086327e-05, "loss": 0.13811444044113158, "memory(GiB)": 122.96, "step": 14315, "token_acc": 0.9505213657738704, "train_speed(iter/s)": 0.24401 }, { "epoch": 1.0915466117844348, "grad_norm": 0.4351796805858612, "learning_rate": 8.869439925793832e-05, "loss": 0.1470876455307007, "memory(GiB)": 122.96, "step": 14320, "token_acc": 0.9518434913468774, "train_speed(iter/s)": 0.244032 }, { "epoch": 1.0919277383946948, "grad_norm": 1.020097017288208, "learning_rate": 8.868681509606916e-05, "loss": 0.14730892181396485, "memory(GiB)": 122.96, "step": 14325, "token_acc": 0.9492015418502202, "train_speed(iter/s)": 0.244038 }, { "epoch": 1.0923088650049546, "grad_norm": 0.9121341705322266, "learning_rate": 8.867922871569066e-05, "loss": 0.13442734479904175, "memory(GiB)": 122.96, "step": 14330, "token_acc": 0.9462404000714413, "train_speed(iter/s)": 0.244054 }, { "epoch": 1.0926899916152146, "grad_norm": 0.8346598744392395, "learning_rate": 8.86716401172379e-05, "loss": 0.12837202548980714, "memory(GiB)": 122.96, "step": 14335, "token_acc": 0.9576423936553713, "train_speed(iter/s)": 0.244071 }, { "epoch": 1.0930711182254744, "grad_norm": 0.650312602519989, "learning_rate": 8.866404930114603e-05, "loss": 0.11385934352874756, "memory(GiB)": 122.96, "step": 14340, "token_acc": 0.9402852049910874, "train_speed(iter/s)": 0.2441 }, { "epoch": 1.0934522448357344, "grad_norm": 0.1723540723323822, "learning_rate": 8.865645626785036e-05, "loss": 0.13434340953826904, "memory(GiB)": 122.96, "step": 14345, "token_acc": 0.9593002499107461, "train_speed(iter/s)": 0.244129 }, { "epoch": 1.0938333714459945, "grad_norm": 0.35588544607162476, "learning_rate": 8.864886101778631e-05, "loss": 0.15625797510147094, "memory(GiB)": 122.96, "step": 14350, "token_acc": 0.9267592149353758, "train_speed(iter/s)": 0.244163 }, { "epoch": 1.0942144980562543, "grad_norm": 0.8230992555618286, "learning_rate": 8.864126355138945e-05, "loss": 0.15003788471221924, "memory(GiB)": 122.96, "step": 14355, "token_acc": 0.929266368656324, "train_speed(iter/s)": 0.244192 }, { "epoch": 1.0945956246665143, "grad_norm": 0.288656622171402, "learning_rate": 8.863366386909541e-05, "loss": 0.1376652479171753, "memory(GiB)": 122.96, "step": 14360, "token_acc": 0.9524089306698003, "train_speed(iter/s)": 0.244222 }, { "epoch": 1.094976751276774, "grad_norm": 1.3268550634384155, "learning_rate": 8.862606197134005e-05, "loss": 0.1131251573562622, "memory(GiB)": 122.96, "step": 14365, "token_acc": 0.9500157183275699, "train_speed(iter/s)": 0.244251 }, { "epoch": 1.095357877887034, "grad_norm": 1.29462730884552, "learning_rate": 8.861845785855928e-05, "loss": 0.12927931547164917, "memory(GiB)": 122.96, "step": 14370, "token_acc": 0.9487179487179487, "train_speed(iter/s)": 0.244269 }, { "epoch": 1.0957390044972941, "grad_norm": 1.998456597328186, "learning_rate": 8.861085153118916e-05, "loss": 0.1220848798751831, "memory(GiB)": 122.96, "step": 14375, "token_acc": 0.9476529160739687, "train_speed(iter/s)": 0.244291 }, { "epoch": 1.096120131107554, "grad_norm": 1.5576528310775757, "learning_rate": 8.86032429896659e-05, "loss": 0.1355413317680359, "memory(GiB)": 122.96, "step": 14380, "token_acc": 0.9463929284288566, "train_speed(iter/s)": 0.24432 }, { "epoch": 1.096501257717814, "grad_norm": 0.8468164801597595, "learning_rate": 8.859563223442576e-05, "loss": 0.11906529664993286, "memory(GiB)": 122.96, "step": 14385, "token_acc": 0.9495086411385971, "train_speed(iter/s)": 0.244339 }, { "epoch": 1.0968823843280737, "grad_norm": 1.1479028463363647, "learning_rate": 8.858801926590524e-05, "loss": 0.15391225814819337, "memory(GiB)": 122.96, "step": 14390, "token_acc": 0.9381824290273811, "train_speed(iter/s)": 0.244358 }, { "epoch": 1.0972635109383337, "grad_norm": 0.31084245443344116, "learning_rate": 8.858040408454088e-05, "loss": 0.09148204326629639, "memory(GiB)": 122.96, "step": 14395, "token_acc": 0.9595903165735568, "train_speed(iter/s)": 0.244374 }, { "epoch": 1.0976446375485938, "grad_norm": 1.3860517740249634, "learning_rate": 8.857278669076938e-05, "loss": 0.1814399003982544, "memory(GiB)": 122.96, "step": 14400, "token_acc": 0.9332304980167475, "train_speed(iter/s)": 0.244395 }, { "epoch": 1.0976446375485938, "eval_loss": 0.11170077323913574, "eval_runtime": 159.4039, "eval_samples_per_second": 3.325, "eval_steps_per_second": 3.325, "eval_token_acc": 0.9485874344918981, "step": 14400 }, { "epoch": 1.0980257641588536, "grad_norm": 1.3578943014144897, "learning_rate": 8.856516708502757e-05, "loss": 0.11858257055282592, "memory(GiB)": 122.96, "step": 14405, "token_acc": 0.9485876539567815, "train_speed(iter/s)": 0.243764 }, { "epoch": 1.0984068907691136, "grad_norm": 1.1676934957504272, "learning_rate": 8.855754526775239e-05, "loss": 0.14838911294937135, "memory(GiB)": 122.96, "step": 14410, "token_acc": 0.9460500963391136, "train_speed(iter/s)": 0.243778 }, { "epoch": 1.0987880173793734, "grad_norm": 1.561812162399292, "learning_rate": 8.85499212393809e-05, "loss": 0.15635422468185425, "memory(GiB)": 122.96, "step": 14415, "token_acc": 0.9448609431680773, "train_speed(iter/s)": 0.243802 }, { "epoch": 1.0991691439896334, "grad_norm": 0.9555893540382385, "learning_rate": 8.854229500035034e-05, "loss": 0.19608778953552247, "memory(GiB)": 122.96, "step": 14420, "token_acc": 0.9379789272030651, "train_speed(iter/s)": 0.243828 }, { "epoch": 1.0995502705998932, "grad_norm": 0.6626488566398621, "learning_rate": 8.853466655109801e-05, "loss": 0.1451410174369812, "memory(GiB)": 122.96, "step": 14425, "token_acc": 0.9410217881292261, "train_speed(iter/s)": 0.243851 }, { "epoch": 1.0999313972101532, "grad_norm": 1.8461352586746216, "learning_rate": 8.852703589206139e-05, "loss": 0.09144092202186585, "memory(GiB)": 122.96, "step": 14430, "token_acc": 0.9621478873239436, "train_speed(iter/s)": 0.243883 }, { "epoch": 1.1003125238204132, "grad_norm": 1.0859041213989258, "learning_rate": 8.851940302367804e-05, "loss": 0.16422631740570068, "memory(GiB)": 122.96, "step": 14435, "token_acc": 0.9375793866811831, "train_speed(iter/s)": 0.243903 }, { "epoch": 1.100693650430673, "grad_norm": 0.4465910494327545, "learning_rate": 8.851176794638567e-05, "loss": 0.1650601863861084, "memory(GiB)": 122.96, "step": 14440, "token_acc": 0.9379354021532615, "train_speed(iter/s)": 0.243921 }, { "epoch": 1.101074777040933, "grad_norm": 0.8427920937538147, "learning_rate": 8.850413066062212e-05, "loss": 0.16799304485321045, "memory(GiB)": 122.96, "step": 14445, "token_acc": 0.9427995971802619, "train_speed(iter/s)": 0.24394 }, { "epoch": 1.1014559036511928, "grad_norm": 0.7271862626075745, "learning_rate": 8.849649116682539e-05, "loss": 0.11650708913803101, "memory(GiB)": 122.96, "step": 14450, "token_acc": 0.945708801754867, "train_speed(iter/s)": 0.24397 }, { "epoch": 1.1018370302614529, "grad_norm": 1.1876842975616455, "learning_rate": 8.848884946543352e-05, "loss": 0.15535416603088378, "memory(GiB)": 122.96, "step": 14455, "token_acc": 0.9346011131725418, "train_speed(iter/s)": 0.243986 }, { "epoch": 1.1022181568717129, "grad_norm": 0.8193598985671997, "learning_rate": 8.848120555688473e-05, "loss": 0.15821361541748047, "memory(GiB)": 122.96, "step": 14460, "token_acc": 0.9363226319978775, "train_speed(iter/s)": 0.244012 }, { "epoch": 1.1025992834819727, "grad_norm": 1.228663682937622, "learning_rate": 8.84735594416174e-05, "loss": 0.14682830572128297, "memory(GiB)": 122.96, "step": 14465, "token_acc": 0.942713567839196, "train_speed(iter/s)": 0.244022 }, { "epoch": 1.1029804100922327, "grad_norm": 0.7804664373397827, "learning_rate": 8.846591112006995e-05, "loss": 0.1353921413421631, "memory(GiB)": 122.96, "step": 14470, "token_acc": 0.9526889447838064, "train_speed(iter/s)": 0.244028 }, { "epoch": 1.1033615367024925, "grad_norm": 0.843709409236908, "learning_rate": 8.8458260592681e-05, "loss": 0.11843812465667725, "memory(GiB)": 122.96, "step": 14475, "token_acc": 0.9421278254091972, "train_speed(iter/s)": 0.244045 }, { "epoch": 1.1037426633127525, "grad_norm": 0.9855577349662781, "learning_rate": 8.84506078598893e-05, "loss": 0.16073298454284668, "memory(GiB)": 122.96, "step": 14480, "token_acc": 0.9398885350318471, "train_speed(iter/s)": 0.244073 }, { "epoch": 1.1041237899230125, "grad_norm": 1.7700817584991455, "learning_rate": 8.844295292213365e-05, "loss": 0.17864352464675903, "memory(GiB)": 122.96, "step": 14485, "token_acc": 0.9403085177733065, "train_speed(iter/s)": 0.244087 }, { "epoch": 1.1045049165332723, "grad_norm": 1.1716434955596924, "learning_rate": 8.843529577985306e-05, "loss": 0.1596297264099121, "memory(GiB)": 122.96, "step": 14490, "token_acc": 0.9412376495059802, "train_speed(iter/s)": 0.244112 }, { "epoch": 1.1048860431435323, "grad_norm": 1.1746630668640137, "learning_rate": 8.842763643348661e-05, "loss": 0.11899219751358033, "memory(GiB)": 122.96, "step": 14495, "token_acc": 0.9475170628383149, "train_speed(iter/s)": 0.244134 }, { "epoch": 1.1052671697537921, "grad_norm": 0.4147237241268158, "learning_rate": 8.841997488347354e-05, "loss": 0.10175942182540894, "memory(GiB)": 122.96, "step": 14500, "token_acc": 0.9626955475330926, "train_speed(iter/s)": 0.244153 }, { "epoch": 1.1056482963640522, "grad_norm": 1.2778043746948242, "learning_rate": 8.841231113025321e-05, "loss": 0.16685280799865723, "memory(GiB)": 122.96, "step": 14505, "token_acc": 0.9174904942965779, "train_speed(iter/s)": 0.244179 }, { "epoch": 1.106029422974312, "grad_norm": 2.0955970287323, "learning_rate": 8.840464517426508e-05, "loss": 0.11966350078582763, "memory(GiB)": 122.96, "step": 14510, "token_acc": 0.9421281891723708, "train_speed(iter/s)": 0.244201 }, { "epoch": 1.106410549584572, "grad_norm": 0.8662545680999756, "learning_rate": 8.839697701594876e-05, "loss": 0.1479501485824585, "memory(GiB)": 122.96, "step": 14515, "token_acc": 0.9454851104707013, "train_speed(iter/s)": 0.24423 }, { "epoch": 1.106791676194832, "grad_norm": 1.3616070747375488, "learning_rate": 8.838930665574401e-05, "loss": 0.10350685119628907, "memory(GiB)": 122.96, "step": 14520, "token_acc": 0.9562851442046073, "train_speed(iter/s)": 0.244248 }, { "epoch": 1.1071728028050918, "grad_norm": 1.029227375984192, "learning_rate": 8.838163409409066e-05, "loss": 0.16952912807464598, "memory(GiB)": 122.96, "step": 14525, "token_acc": 0.9378808395396073, "train_speed(iter/s)": 0.244262 }, { "epoch": 1.1075539294153518, "grad_norm": 1.2725201845169067, "learning_rate": 8.837395933142871e-05, "loss": 0.1009417176246643, "memory(GiB)": 122.96, "step": 14530, "token_acc": 0.9537203047960556, "train_speed(iter/s)": 0.244271 }, { "epoch": 1.1079350560256116, "grad_norm": 0.46861401200294495, "learning_rate": 8.836628236819827e-05, "loss": 0.154563307762146, "memory(GiB)": 122.96, "step": 14535, "token_acc": 0.9261033185083818, "train_speed(iter/s)": 0.244298 }, { "epoch": 1.1083161826358716, "grad_norm": 0.7519727945327759, "learning_rate": 8.835860320483959e-05, "loss": 0.12474924325942993, "memory(GiB)": 122.96, "step": 14540, "token_acc": 0.9482368165642187, "train_speed(iter/s)": 0.244325 }, { "epoch": 1.1086973092461316, "grad_norm": 0.7186149954795837, "learning_rate": 8.835092184179301e-05, "loss": 0.1126839280128479, "memory(GiB)": 122.96, "step": 14545, "token_acc": 0.9551388559221458, "train_speed(iter/s)": 0.244345 }, { "epoch": 1.1090784358563914, "grad_norm": 0.8930983543395996, "learning_rate": 8.834323827949901e-05, "loss": 0.09922429919242859, "memory(GiB)": 122.96, "step": 14550, "token_acc": 0.9562792772696854, "train_speed(iter/s)": 0.244369 }, { "epoch": 1.1094595624666514, "grad_norm": 1.49032461643219, "learning_rate": 8.833555251839826e-05, "loss": 0.13515920639038087, "memory(GiB)": 122.96, "step": 14555, "token_acc": 0.9598923283983849, "train_speed(iter/s)": 0.244393 }, { "epoch": 1.1098406890769112, "grad_norm": 1.0114067792892456, "learning_rate": 8.832786455893147e-05, "loss": 0.11715936660766602, "memory(GiB)": 122.96, "step": 14560, "token_acc": 0.9502708025603152, "train_speed(iter/s)": 0.244416 }, { "epoch": 1.1102218156871713, "grad_norm": 0.7081486582756042, "learning_rate": 8.83201744015395e-05, "loss": 0.15945316553115846, "memory(GiB)": 122.96, "step": 14565, "token_acc": 0.9519461797212878, "train_speed(iter/s)": 0.244424 }, { "epoch": 1.1106029422974313, "grad_norm": 0.766555905342102, "learning_rate": 8.831248204666336e-05, "loss": 0.11560359001159667, "memory(GiB)": 122.96, "step": 14570, "token_acc": 0.9564336372847011, "train_speed(iter/s)": 0.244446 }, { "epoch": 1.110984068907691, "grad_norm": 0.784356415271759, "learning_rate": 8.830478749474417e-05, "loss": 0.12432987689971924, "memory(GiB)": 122.96, "step": 14575, "token_acc": 0.9549266247379455, "train_speed(iter/s)": 0.244469 }, { "epoch": 1.111365195517951, "grad_norm": 0.9182413816452026, "learning_rate": 8.829709074622317e-05, "loss": 0.13628878593444824, "memory(GiB)": 122.96, "step": 14580, "token_acc": 0.9379025598678777, "train_speed(iter/s)": 0.244489 }, { "epoch": 1.111746322128211, "grad_norm": 1.1239757537841797, "learning_rate": 8.828939180154173e-05, "loss": 0.16028662919998168, "memory(GiB)": 122.96, "step": 14585, "token_acc": 0.9489946051986268, "train_speed(iter/s)": 0.24451 }, { "epoch": 1.112127448738471, "grad_norm": 1.4984540939331055, "learning_rate": 8.828169066114136e-05, "loss": 0.14382741451263428, "memory(GiB)": 122.96, "step": 14590, "token_acc": 0.9437080079898311, "train_speed(iter/s)": 0.244531 }, { "epoch": 1.112508575348731, "grad_norm": 0.5357720255851746, "learning_rate": 8.827398732546368e-05, "loss": 0.10743606090545654, "memory(GiB)": 122.96, "step": 14595, "token_acc": 0.950766221062928, "train_speed(iter/s)": 0.244554 }, { "epoch": 1.1128897019589907, "grad_norm": 1.0382788181304932, "learning_rate": 8.826628179495044e-05, "loss": 0.12864969968795775, "memory(GiB)": 122.96, "step": 14600, "token_acc": 0.956228046473926, "train_speed(iter/s)": 0.244561 }, { "epoch": 1.1128897019589907, "eval_loss": 0.11288680881261826, "eval_runtime": 160.3547, "eval_samples_per_second": 3.305, "eval_steps_per_second": 3.305, "eval_token_acc": 0.9474805734594302, "step": 14600 }, { "epoch": 1.1132708285692507, "grad_norm": 1.510717511177063, "learning_rate": 8.82585740700435e-05, "loss": 0.1493336796760559, "memory(GiB)": 122.96, "step": 14605, "token_acc": 0.9473271616815969, "train_speed(iter/s)": 0.24391 }, { "epoch": 1.1136519551795105, "grad_norm": 1.6160142421722412, "learning_rate": 8.825086415118491e-05, "loss": 0.17118076086044312, "memory(GiB)": 122.96, "step": 14610, "token_acc": 0.9390541199414919, "train_speed(iter/s)": 0.243933 }, { "epoch": 1.1140330817897706, "grad_norm": 0.5297818779945374, "learning_rate": 8.824315203881675e-05, "loss": 0.10271905660629273, "memory(GiB)": 122.96, "step": 14615, "token_acc": 0.9553903345724907, "train_speed(iter/s)": 0.243954 }, { "epoch": 1.1144142084000306, "grad_norm": 1.4766556024551392, "learning_rate": 8.823543773338128e-05, "loss": 0.13967974185943605, "memory(GiB)": 122.96, "step": 14620, "token_acc": 0.9389948531637905, "train_speed(iter/s)": 0.243975 }, { "epoch": 1.1147953350102904, "grad_norm": 0.777630627155304, "learning_rate": 8.82277212353209e-05, "loss": 0.1507936716079712, "memory(GiB)": 122.96, "step": 14625, "token_acc": 0.9484667802385008, "train_speed(iter/s)": 0.243993 }, { "epoch": 1.1151764616205504, "grad_norm": 0.004666919820010662, "learning_rate": 8.82200025450781e-05, "loss": 0.11755968332290649, "memory(GiB)": 122.96, "step": 14630, "token_acc": 0.9454516024603431, "train_speed(iter/s)": 0.24401 }, { "epoch": 1.1155575882308102, "grad_norm": 0.9860005378723145, "learning_rate": 8.821228166309553e-05, "loss": 0.12462644577026367, "memory(GiB)": 122.96, "step": 14635, "token_acc": 0.9519892884468248, "train_speed(iter/s)": 0.244031 }, { "epoch": 1.1159387148410702, "grad_norm": 0.4192967712879181, "learning_rate": 8.820455858981593e-05, "loss": 0.09451914429664612, "memory(GiB)": 122.96, "step": 14640, "token_acc": 0.9662309368191722, "train_speed(iter/s)": 0.244035 }, { "epoch": 1.1163198414513302, "grad_norm": 1.1363403797149658, "learning_rate": 8.819683332568219e-05, "loss": 0.15708003044128419, "memory(GiB)": 122.96, "step": 14645, "token_acc": 0.9207547169811321, "train_speed(iter/s)": 0.244067 }, { "epoch": 1.11670096806159, "grad_norm": 2.042970895767212, "learning_rate": 8.818910587113729e-05, "loss": 0.16134674549102784, "memory(GiB)": 122.96, "step": 14650, "token_acc": 0.9447743467933492, "train_speed(iter/s)": 0.244085 }, { "epoch": 1.11708209467185, "grad_norm": 0.9141850471496582, "learning_rate": 8.81813762266244e-05, "loss": 0.19086229801177979, "memory(GiB)": 122.96, "step": 14655, "token_acc": 0.9212491513917176, "train_speed(iter/s)": 0.24411 }, { "epoch": 1.1174632212821098, "grad_norm": 0.6576411128044128, "learning_rate": 8.817364439258677e-05, "loss": 0.13876715898513795, "memory(GiB)": 122.96, "step": 14660, "token_acc": 0.9421093148575828, "train_speed(iter/s)": 0.244127 }, { "epoch": 1.1178443478923699, "grad_norm": 1.1344268321990967, "learning_rate": 8.816591036946778e-05, "loss": 0.1559647798538208, "memory(GiB)": 122.96, "step": 14665, "token_acc": 0.9381529460927706, "train_speed(iter/s)": 0.244149 }, { "epoch": 1.1182254745026299, "grad_norm": 0.9037122130393982, "learning_rate": 8.815817415771095e-05, "loss": 0.1437970995903015, "memory(GiB)": 122.96, "step": 14670, "token_acc": 0.9390669628689305, "train_speed(iter/s)": 0.244175 }, { "epoch": 1.1186066011128897, "grad_norm": 0.7030108571052551, "learning_rate": 8.81504357577599e-05, "loss": 0.1492432951927185, "memory(GiB)": 122.96, "step": 14675, "token_acc": 0.9349730458221024, "train_speed(iter/s)": 0.244201 }, { "epoch": 1.1189877277231497, "grad_norm": 0.698906660079956, "learning_rate": 8.81426951700584e-05, "loss": 0.10723373889923096, "memory(GiB)": 122.96, "step": 14680, "token_acc": 0.9609246009906439, "train_speed(iter/s)": 0.244235 }, { "epoch": 1.1193688543334095, "grad_norm": 0.6497915387153625, "learning_rate": 8.813495239505032e-05, "loss": 0.14099397659301757, "memory(GiB)": 122.96, "step": 14685, "token_acc": 0.946651369568335, "train_speed(iter/s)": 0.244247 }, { "epoch": 1.1197499809436695, "grad_norm": 0.1224902793765068, "learning_rate": 8.81272074331797e-05, "loss": 0.1007909893989563, "memory(GiB)": 122.96, "step": 14690, "token_acc": 0.9580510682647212, "train_speed(iter/s)": 0.244268 }, { "epoch": 1.1201311075539295, "grad_norm": 1.0913958549499512, "learning_rate": 8.811946028489067e-05, "loss": 0.1308163046836853, "memory(GiB)": 122.96, "step": 14695, "token_acc": 0.9434656849855377, "train_speed(iter/s)": 0.244295 }, { "epoch": 1.1205122341641893, "grad_norm": 0.18087823688983917, "learning_rate": 8.81117109506275e-05, "loss": 0.11786898374557495, "memory(GiB)": 122.96, "step": 14700, "token_acc": 0.9368826788725608, "train_speed(iter/s)": 0.244316 }, { "epoch": 1.1208933607744493, "grad_norm": 0.7965733408927917, "learning_rate": 8.810395943083455e-05, "loss": 0.10318766832351685, "memory(GiB)": 122.96, "step": 14705, "token_acc": 0.9507096774193549, "train_speed(iter/s)": 0.24434 }, { "epoch": 1.1212744873847091, "grad_norm": 3.6499099731445312, "learning_rate": 8.809620572595635e-05, "loss": 0.15136798620223998, "memory(GiB)": 122.96, "step": 14710, "token_acc": 0.9311154598825832, "train_speed(iter/s)": 0.244369 }, { "epoch": 1.1216556139949692, "grad_norm": 1.2160944938659668, "learning_rate": 8.808844983643754e-05, "loss": 0.130560040473938, "memory(GiB)": 122.96, "step": 14715, "token_acc": 0.947881413293624, "train_speed(iter/s)": 0.244376 }, { "epoch": 1.122036740605229, "grad_norm": 0.8200365304946899, "learning_rate": 8.808069176272289e-05, "loss": 0.1337457060813904, "memory(GiB)": 122.96, "step": 14720, "token_acc": 0.9474412171507607, "train_speed(iter/s)": 0.244388 }, { "epoch": 1.122417867215489, "grad_norm": 0.7821569442749023, "learning_rate": 8.807293150525728e-05, "loss": 0.11512157917022706, "memory(GiB)": 122.96, "step": 14725, "token_acc": 0.9521149241819633, "train_speed(iter/s)": 0.244405 }, { "epoch": 1.122798993825749, "grad_norm": 1.949230670928955, "learning_rate": 8.806516906448574e-05, "loss": 0.15147820711135865, "memory(GiB)": 122.96, "step": 14730, "token_acc": 0.9409395973154362, "train_speed(iter/s)": 0.244424 }, { "epoch": 1.1231801204360088, "grad_norm": 1.1075332164764404, "learning_rate": 8.805740444085338e-05, "loss": 0.12038544416427613, "memory(GiB)": 122.96, "step": 14735, "token_acc": 0.9583847102342786, "train_speed(iter/s)": 0.244435 }, { "epoch": 1.1235612470462688, "grad_norm": 1.2451515197753906, "learning_rate": 8.80496376348055e-05, "loss": 0.21417253017425536, "memory(GiB)": 122.96, "step": 14740, "token_acc": 0.9138143314454568, "train_speed(iter/s)": 0.244458 }, { "epoch": 1.1239423736565286, "grad_norm": 0.6266849637031555, "learning_rate": 8.804186864678745e-05, "loss": 0.08756917715072632, "memory(GiB)": 122.96, "step": 14745, "token_acc": 0.9668599834299917, "train_speed(iter/s)": 0.244488 }, { "epoch": 1.1243235002667886, "grad_norm": 0.596644401550293, "learning_rate": 8.803409747724479e-05, "loss": 0.17536920309066772, "memory(GiB)": 122.96, "step": 14750, "token_acc": 0.9457782573509577, "train_speed(iter/s)": 0.244513 }, { "epoch": 1.1247046268770486, "grad_norm": 0.5178549289703369, "learning_rate": 8.802632412662312e-05, "loss": 0.08749451637268066, "memory(GiB)": 122.96, "step": 14755, "token_acc": 0.9687286007760785, "train_speed(iter/s)": 0.244535 }, { "epoch": 1.1250857534873084, "grad_norm": 1.3540711402893066, "learning_rate": 8.801854859536824e-05, "loss": 0.17378873825073243, "memory(GiB)": 122.96, "step": 14760, "token_acc": 0.9253525971792226, "train_speed(iter/s)": 0.244562 }, { "epoch": 1.1254668800975685, "grad_norm": 0.8283711671829224, "learning_rate": 8.801077088392604e-05, "loss": 0.20466501712799073, "memory(GiB)": 122.96, "step": 14765, "token_acc": 0.931148262208535, "train_speed(iter/s)": 0.244586 }, { "epoch": 1.1258480067078283, "grad_norm": 0.44806790351867676, "learning_rate": 8.80029909927425e-05, "loss": 0.12286052703857422, "memory(GiB)": 122.96, "step": 14770, "token_acc": 0.9592271629492826, "train_speed(iter/s)": 0.244598 }, { "epoch": 1.1262291333180883, "grad_norm": 1.2301825284957886, "learning_rate": 8.799520892226378e-05, "loss": 0.13751307725906373, "memory(GiB)": 122.96, "step": 14775, "token_acc": 0.9510426110607434, "train_speed(iter/s)": 0.244609 }, { "epoch": 1.1266102599283483, "grad_norm": 0.7519361972808838, "learning_rate": 8.798742467293615e-05, "loss": 0.12390105724334717, "memory(GiB)": 122.96, "step": 14780, "token_acc": 0.9479095270733379, "train_speed(iter/s)": 0.244643 }, { "epoch": 1.126991386538608, "grad_norm": 0.9533618092536926, "learning_rate": 8.797963824520601e-05, "loss": 0.15460102558135985, "memory(GiB)": 122.96, "step": 14785, "token_acc": 0.9421759386133187, "train_speed(iter/s)": 0.244669 }, { "epoch": 1.127372513148868, "grad_norm": 0.4909072518348694, "learning_rate": 8.797184963951986e-05, "loss": 0.15204998254776, "memory(GiB)": 122.96, "step": 14790, "token_acc": 0.9477621244233886, "train_speed(iter/s)": 0.244686 }, { "epoch": 1.127753639759128, "grad_norm": 0.38940319418907166, "learning_rate": 8.796405885632433e-05, "loss": 0.10481984615325927, "memory(GiB)": 122.96, "step": 14795, "token_acc": 0.9579094354677837, "train_speed(iter/s)": 0.244707 }, { "epoch": 1.128134766369388, "grad_norm": 0.9325776696205139, "learning_rate": 8.79562658960662e-05, "loss": 0.11775988340377808, "memory(GiB)": 122.96, "step": 14800, "token_acc": 0.9579124579124579, "train_speed(iter/s)": 0.244716 }, { "epoch": 1.128134766369388, "eval_loss": 0.11246661841869354, "eval_runtime": 156.9299, "eval_samples_per_second": 3.377, "eval_steps_per_second": 3.377, "eval_token_acc": 0.9481507138124209, "step": 14800 }, { "epoch": 1.1285158929796477, "grad_norm": 0.5858659148216248, "learning_rate": 8.794847075919238e-05, "loss": 0.1429687261581421, "memory(GiB)": 122.96, "step": 14805, "token_acc": 0.9480018019177553, "train_speed(iter/s)": 0.244098 }, { "epoch": 1.1288970195899077, "grad_norm": 0.7057172060012817, "learning_rate": 8.794067344614984e-05, "loss": 0.10204832553863526, "memory(GiB)": 122.96, "step": 14810, "token_acc": 0.9516283524904214, "train_speed(iter/s)": 0.244118 }, { "epoch": 1.1292781462001678, "grad_norm": 0.8164618015289307, "learning_rate": 8.793287395738576e-05, "loss": 0.12823518514633178, "memory(GiB)": 122.96, "step": 14815, "token_acc": 0.9502997485979501, "train_speed(iter/s)": 0.24413 }, { "epoch": 1.1296592728104276, "grad_norm": 0.7177162766456604, "learning_rate": 8.792507229334738e-05, "loss": 0.1512230634689331, "memory(GiB)": 122.96, "step": 14820, "token_acc": 0.947203196347032, "train_speed(iter/s)": 0.244161 }, { "epoch": 1.1300403994206876, "grad_norm": 0.7411829233169556, "learning_rate": 8.791726845448212e-05, "loss": 0.1054340124130249, "memory(GiB)": 122.96, "step": 14825, "token_acc": 0.9445681211041852, "train_speed(iter/s)": 0.244182 }, { "epoch": 1.1304215260309474, "grad_norm": 2.1831984519958496, "learning_rate": 8.790946244123746e-05, "loss": 0.14504587650299072, "memory(GiB)": 122.96, "step": 14830, "token_acc": 0.9354909249945331, "train_speed(iter/s)": 0.24421 }, { "epoch": 1.1308026526412074, "grad_norm": 0.8571719527244568, "learning_rate": 8.790165425406104e-05, "loss": 0.12789541482925415, "memory(GiB)": 122.96, "step": 14835, "token_acc": 0.94520890668001, "train_speed(iter/s)": 0.244234 }, { "epoch": 1.1311837792514674, "grad_norm": 1.4965510368347168, "learning_rate": 8.789384389340063e-05, "loss": 0.10111509561538697, "memory(GiB)": 122.96, "step": 14840, "token_acc": 0.9646038172353962, "train_speed(iter/s)": 0.244245 }, { "epoch": 1.1315649058617272, "grad_norm": 1.0690295696258545, "learning_rate": 8.788603135970413e-05, "loss": 0.12717626094818116, "memory(GiB)": 122.96, "step": 14845, "token_acc": 0.9432637571157495, "train_speed(iter/s)": 0.244267 }, { "epoch": 1.1319460324719872, "grad_norm": 1.2258834838867188, "learning_rate": 8.787821665341956e-05, "loss": 0.13925180435180665, "memory(GiB)": 122.96, "step": 14850, "token_acc": 0.9537860707311782, "train_speed(iter/s)": 0.244287 }, { "epoch": 1.132327159082247, "grad_norm": 0.9318856000900269, "learning_rate": 8.787039977499502e-05, "loss": 0.13183165788650514, "memory(GiB)": 122.96, "step": 14855, "token_acc": 0.9423138897840799, "train_speed(iter/s)": 0.244314 }, { "epoch": 1.132708285692507, "grad_norm": 2.22027850151062, "learning_rate": 8.786258072487881e-05, "loss": 0.20236563682556152, "memory(GiB)": 122.96, "step": 14860, "token_acc": 0.9181002243829469, "train_speed(iter/s)": 0.24434 }, { "epoch": 1.133089412302767, "grad_norm": 1.4645618200302124, "learning_rate": 8.785475950351927e-05, "loss": 0.1679174780845642, "memory(GiB)": 122.96, "step": 14865, "token_acc": 0.9429183018194791, "train_speed(iter/s)": 0.244354 }, { "epoch": 1.1334705389130268, "grad_norm": 0.7779101729393005, "learning_rate": 8.784693611136496e-05, "loss": 0.11979987621307372, "memory(GiB)": 122.96, "step": 14870, "token_acc": 0.9418321142459074, "train_speed(iter/s)": 0.244376 }, { "epoch": 1.1338516655232869, "grad_norm": 0.8253455758094788, "learning_rate": 8.78391105488645e-05, "loss": 0.16935352087020875, "memory(GiB)": 122.96, "step": 14875, "token_acc": 0.938872864433104, "train_speed(iter/s)": 0.244375 }, { "epoch": 1.1342327921335467, "grad_norm": 1.2045767307281494, "learning_rate": 8.783128281646664e-05, "loss": 0.10881816148757935, "memory(GiB)": 122.96, "step": 14880, "token_acc": 0.962984552608569, "train_speed(iter/s)": 0.244402 }, { "epoch": 1.1346139187438067, "grad_norm": 0.5174950361251831, "learning_rate": 8.782345291462027e-05, "loss": 0.07671124935150146, "memory(GiB)": 122.96, "step": 14885, "token_acc": 0.9571341090018372, "train_speed(iter/s)": 0.244419 }, { "epoch": 1.1349950453540667, "grad_norm": 0.9042118191719055, "learning_rate": 8.781562084377439e-05, "loss": 0.15886917114257812, "memory(GiB)": 122.96, "step": 14890, "token_acc": 0.9354309165526675, "train_speed(iter/s)": 0.244445 }, { "epoch": 1.1353761719643265, "grad_norm": 0.768493115901947, "learning_rate": 8.780778660437815e-05, "loss": 0.15011091232299806, "memory(GiB)": 122.96, "step": 14895, "token_acc": 0.9439552760307477, "train_speed(iter/s)": 0.244461 }, { "epoch": 1.1357572985745865, "grad_norm": 1.073806881904602, "learning_rate": 8.779995019688079e-05, "loss": 0.11822941303253173, "memory(GiB)": 122.96, "step": 14900, "token_acc": 0.9606418407508326, "train_speed(iter/s)": 0.244487 }, { "epoch": 1.1361384251848463, "grad_norm": 0.48068055510520935, "learning_rate": 8.77921116217317e-05, "loss": 0.1279631733894348, "memory(GiB)": 122.96, "step": 14905, "token_acc": 0.9527766518038103, "train_speed(iter/s)": 0.244504 }, { "epoch": 1.1365195517951063, "grad_norm": 1.0315064191818237, "learning_rate": 8.77842708793804e-05, "loss": 0.16808602809906006, "memory(GiB)": 122.96, "step": 14910, "token_acc": 0.9380841121495327, "train_speed(iter/s)": 0.244524 }, { "epoch": 1.1369006784053663, "grad_norm": 0.5866919755935669, "learning_rate": 8.777642797027648e-05, "loss": 0.14136891365051268, "memory(GiB)": 122.96, "step": 14915, "token_acc": 0.947906976744186, "train_speed(iter/s)": 0.244544 }, { "epoch": 1.1372818050156261, "grad_norm": 1.1678366661071777, "learning_rate": 8.776858289486975e-05, "loss": 0.09477418065071105, "memory(GiB)": 122.96, "step": 14920, "token_acc": 0.9581163685759565, "train_speed(iter/s)": 0.24457 }, { "epoch": 1.1376629316258862, "grad_norm": 1.369999885559082, "learning_rate": 8.776073565361002e-05, "loss": 0.18082426786422728, "memory(GiB)": 122.96, "step": 14925, "token_acc": 0.944112295295035, "train_speed(iter/s)": 0.244587 }, { "epoch": 1.138044058236146, "grad_norm": 0.7855144143104553, "learning_rate": 8.775288624694738e-05, "loss": 0.15698699951171874, "memory(GiB)": 122.96, "step": 14930, "token_acc": 0.9462188337111065, "train_speed(iter/s)": 0.244609 }, { "epoch": 1.138425184846406, "grad_norm": 0.9337587356567383, "learning_rate": 8.774503467533187e-05, "loss": 0.15934780836105347, "memory(GiB)": 122.96, "step": 14935, "token_acc": 0.9347480106100796, "train_speed(iter/s)": 0.244631 }, { "epoch": 1.138806311456666, "grad_norm": 0.44149667024612427, "learning_rate": 8.77371809392138e-05, "loss": 0.14023571014404296, "memory(GiB)": 122.96, "step": 14940, "token_acc": 0.9555067837190743, "train_speed(iter/s)": 0.244654 }, { "epoch": 1.1391874380669258, "grad_norm": 0.9286788105964661, "learning_rate": 8.772932503904352e-05, "loss": 0.1243199348449707, "memory(GiB)": 122.96, "step": 14945, "token_acc": 0.9517913325890591, "train_speed(iter/s)": 0.244647 }, { "epoch": 1.1395685646771858, "grad_norm": 0.8473654985427856, "learning_rate": 8.772146697527153e-05, "loss": 0.128795325756073, "memory(GiB)": 122.96, "step": 14950, "token_acc": 0.9396535313154388, "train_speed(iter/s)": 0.244666 }, { "epoch": 1.1399496912874456, "grad_norm": 0.8285189867019653, "learning_rate": 8.771360674834846e-05, "loss": 0.13803331851959227, "memory(GiB)": 122.96, "step": 14955, "token_acc": 0.947929479294793, "train_speed(iter/s)": 0.244687 }, { "epoch": 1.1403308178977056, "grad_norm": 0.8456258177757263, "learning_rate": 8.770574435872505e-05, "loss": 0.13609519004821777, "memory(GiB)": 122.96, "step": 14960, "token_acc": 0.9460149553164326, "train_speed(iter/s)": 0.244707 }, { "epoch": 1.1407119445079656, "grad_norm": 1.3935569524765015, "learning_rate": 8.769787980685218e-05, "loss": 0.15916311740875244, "memory(GiB)": 122.96, "step": 14965, "token_acc": 0.9456541628545289, "train_speed(iter/s)": 0.244725 }, { "epoch": 1.1410930711182254, "grad_norm": 0.732819139957428, "learning_rate": 8.769001309318085e-05, "loss": 0.11329989433288574, "memory(GiB)": 122.96, "step": 14970, "token_acc": 0.9592274678111588, "train_speed(iter/s)": 0.244748 }, { "epoch": 1.1414741977284855, "grad_norm": 0.5422601699829102, "learning_rate": 8.768214421816217e-05, "loss": 0.10292205810546876, "memory(GiB)": 122.96, "step": 14975, "token_acc": 0.9599937733499377, "train_speed(iter/s)": 0.244756 }, { "epoch": 1.1418553243387453, "grad_norm": 0.6721330881118774, "learning_rate": 8.767427318224737e-05, "loss": 0.08533362746238708, "memory(GiB)": 122.96, "step": 14980, "token_acc": 0.9642041127189642, "train_speed(iter/s)": 0.244783 }, { "epoch": 1.1422364509490053, "grad_norm": 0.6754940748214722, "learning_rate": 8.766639998588785e-05, "loss": 0.1099397897720337, "memory(GiB)": 122.96, "step": 14985, "token_acc": 0.944663167104112, "train_speed(iter/s)": 0.244803 }, { "epoch": 1.1426175775592653, "grad_norm": 0.7263115644454956, "learning_rate": 8.765852462953508e-05, "loss": 0.12096083164215088, "memory(GiB)": 122.96, "step": 14990, "token_acc": 0.9602803738317757, "train_speed(iter/s)": 0.244827 }, { "epoch": 1.142998704169525, "grad_norm": 0.7146492600440979, "learning_rate": 8.765064711364068e-05, "loss": 0.1614994525909424, "memory(GiB)": 122.96, "step": 14995, "token_acc": 0.936611601997695, "train_speed(iter/s)": 0.244854 }, { "epoch": 1.1433798307797851, "grad_norm": 1.0148382186889648, "learning_rate": 8.764276743865638e-05, "loss": 0.13319382667541504, "memory(GiB)": 122.96, "step": 15000, "token_acc": 0.9509239624356256, "train_speed(iter/s)": 0.244879 }, { "epoch": 1.1433798307797851, "eval_loss": 0.11213986575603485, "eval_runtime": 158.4148, "eval_samples_per_second": 3.346, "eval_steps_per_second": 3.346, "eval_token_acc": 0.9479925908077826, "step": 15000 }, { "epoch": 1.143760957390045, "grad_norm": 0.8521045446395874, "learning_rate": 8.763488560503408e-05, "loss": 0.09414032697677613, "memory(GiB)": 122.96, "step": 15005, "token_acc": 0.9486597194218647, "train_speed(iter/s)": 0.24424 }, { "epoch": 1.144142084000305, "grad_norm": 1.1516413688659668, "learning_rate": 8.76270016132257e-05, "loss": 0.16544830799102783, "memory(GiB)": 122.96, "step": 15010, "token_acc": 0.9204401948403391, "train_speed(iter/s)": 0.244258 }, { "epoch": 1.144523210610565, "grad_norm": 1.593096375465393, "learning_rate": 8.76191154636834e-05, "loss": 0.14657498598098756, "memory(GiB)": 122.96, "step": 15015, "token_acc": 0.9407274360125729, "train_speed(iter/s)": 0.244269 }, { "epoch": 1.1449043372208247, "grad_norm": 0.6142476797103882, "learning_rate": 8.761122715685942e-05, "loss": 0.1265580654144287, "memory(GiB)": 122.96, "step": 15020, "token_acc": 0.9478336789261159, "train_speed(iter/s)": 0.24428 }, { "epoch": 1.1452854638310848, "grad_norm": 0.8141835331916809, "learning_rate": 8.760333669320608e-05, "loss": 0.19002195596694946, "memory(GiB)": 122.96, "step": 15025, "token_acc": 0.9304451510333863, "train_speed(iter/s)": 0.244296 }, { "epoch": 1.1456665904413446, "grad_norm": 0.08428604900836945, "learning_rate": 8.759544407317589e-05, "loss": 0.11336742639541626, "memory(GiB)": 122.96, "step": 15030, "token_acc": 0.9532277710109622, "train_speed(iter/s)": 0.24432 }, { "epoch": 1.1460477170516046, "grad_norm": 0.7287114858627319, "learning_rate": 8.758754929722145e-05, "loss": 0.14064549207687377, "memory(GiB)": 122.96, "step": 15035, "token_acc": 0.9470761961015948, "train_speed(iter/s)": 0.244324 }, { "epoch": 1.1464288436618646, "grad_norm": 0.986757218837738, "learning_rate": 8.757965236579548e-05, "loss": 0.14259891510009765, "memory(GiB)": 122.96, "step": 15040, "token_acc": 0.9471435353695205, "train_speed(iter/s)": 0.24434 }, { "epoch": 1.1468099702721244, "grad_norm": 0.5509956479072571, "learning_rate": 8.757175327935083e-05, "loss": 0.14767595529556274, "memory(GiB)": 122.96, "step": 15045, "token_acc": 0.9495128432240921, "train_speed(iter/s)": 0.244364 }, { "epoch": 1.1471910968823844, "grad_norm": 0.46354901790618896, "learning_rate": 8.75638520383405e-05, "loss": 0.09168213605880737, "memory(GiB)": 122.96, "step": 15050, "token_acc": 0.9661255272913205, "train_speed(iter/s)": 0.244369 }, { "epoch": 1.1475722234926442, "grad_norm": 0.7158856391906738, "learning_rate": 8.755594864321757e-05, "loss": 0.11930320262908936, "memory(GiB)": 122.96, "step": 15055, "token_acc": 0.9463586530931871, "train_speed(iter/s)": 0.244387 }, { "epoch": 1.1479533501029042, "grad_norm": 0.8513442873954773, "learning_rate": 8.754804309443526e-05, "loss": 0.14333151578903197, "memory(GiB)": 122.96, "step": 15060, "token_acc": 0.9496208713532965, "train_speed(iter/s)": 0.244397 }, { "epoch": 1.148334476713164, "grad_norm": 0.854579508304596, "learning_rate": 8.754013539244692e-05, "loss": 0.09215972423553467, "memory(GiB)": 122.96, "step": 15065, "token_acc": 0.9577002053388091, "train_speed(iter/s)": 0.244426 }, { "epoch": 1.148715603323424, "grad_norm": 0.7479031682014465, "learning_rate": 8.753222553770603e-05, "loss": 0.17512450218200684, "memory(GiB)": 122.96, "step": 15070, "token_acc": 0.9483050847457627, "train_speed(iter/s)": 0.244449 }, { "epoch": 1.149096729933684, "grad_norm": 0.7710641026496887, "learning_rate": 8.752431353066616e-05, "loss": 0.16083064079284667, "memory(GiB)": 122.96, "step": 15075, "token_acc": 0.9357914812460267, "train_speed(iter/s)": 0.244471 }, { "epoch": 1.1494778565439439, "grad_norm": 0.8333407640457153, "learning_rate": 8.751639937178107e-05, "loss": 0.11266645193099975, "memory(GiB)": 122.96, "step": 15080, "token_acc": 0.950712297363681, "train_speed(iter/s)": 0.244493 }, { "epoch": 1.1498589831542039, "grad_norm": 0.7921438813209534, "learning_rate": 8.750848306150455e-05, "loss": 0.11827307939529419, "memory(GiB)": 122.96, "step": 15085, "token_acc": 0.9544825677919203, "train_speed(iter/s)": 0.24451 }, { "epoch": 1.1502401097644637, "grad_norm": 0.6485952734947205, "learning_rate": 8.750056460029059e-05, "loss": 0.13288892507553102, "memory(GiB)": 122.96, "step": 15090, "token_acc": 0.9458377239199157, "train_speed(iter/s)": 0.244528 }, { "epoch": 1.1506212363747237, "grad_norm": 0.05959833040833473, "learning_rate": 8.74926439885933e-05, "loss": 0.14683165550231933, "memory(GiB)": 122.96, "step": 15095, "token_acc": 0.9210357372137813, "train_speed(iter/s)": 0.244554 }, { "epoch": 1.1510023629849837, "grad_norm": 0.9276690483093262, "learning_rate": 8.748472122686683e-05, "loss": 0.11480458974838256, "memory(GiB)": 122.96, "step": 15100, "token_acc": 0.9485192837465565, "train_speed(iter/s)": 0.244574 }, { "epoch": 1.1513834895952435, "grad_norm": 0.9079185128211975, "learning_rate": 8.747679631556557e-05, "loss": 0.11390597820281982, "memory(GiB)": 122.96, "step": 15105, "token_acc": 0.9557759867741269, "train_speed(iter/s)": 0.244594 }, { "epoch": 1.1517646162055035, "grad_norm": 0.943495512008667, "learning_rate": 8.746886925514395e-05, "loss": 0.10988938808441162, "memory(GiB)": 122.96, "step": 15110, "token_acc": 0.9588994565217391, "train_speed(iter/s)": 0.244618 }, { "epoch": 1.1521457428157633, "grad_norm": 0.3423910140991211, "learning_rate": 8.746094004605653e-05, "loss": 0.1330575466156006, "memory(GiB)": 122.96, "step": 15115, "token_acc": 0.9495996231747527, "train_speed(iter/s)": 0.244636 }, { "epoch": 1.1525268694260233, "grad_norm": 0.5606217384338379, "learning_rate": 8.745300868875808e-05, "loss": 0.13739627599716187, "memory(GiB)": 122.96, "step": 15120, "token_acc": 0.9459366391184573, "train_speed(iter/s)": 0.244661 }, { "epoch": 1.1529079960362831, "grad_norm": 1.1202778816223145, "learning_rate": 8.744507518370337e-05, "loss": 0.14448046684265137, "memory(GiB)": 122.96, "step": 15125, "token_acc": 0.9424819678194932, "train_speed(iter/s)": 0.24468 }, { "epoch": 1.1532891226465432, "grad_norm": 0.3090142607688904, "learning_rate": 8.743713953134736e-05, "loss": 0.12370638847351074, "memory(GiB)": 122.96, "step": 15130, "token_acc": 0.947866563227308, "train_speed(iter/s)": 0.244692 }, { "epoch": 1.1536702492568032, "grad_norm": 1.1007739305496216, "learning_rate": 8.742920173214513e-05, "loss": 0.14784564971923828, "memory(GiB)": 122.96, "step": 15135, "token_acc": 0.9356408869659275, "train_speed(iter/s)": 0.244707 }, { "epoch": 1.154051375867063, "grad_norm": 0.931355893611908, "learning_rate": 8.742126178655186e-05, "loss": 0.13279253244400024, "memory(GiB)": 122.96, "step": 15140, "token_acc": 0.9485232067510548, "train_speed(iter/s)": 0.244731 }, { "epoch": 1.154432502477323, "grad_norm": 0.638060450553894, "learning_rate": 8.74133196950229e-05, "loss": 0.0913454532623291, "memory(GiB)": 122.96, "step": 15145, "token_acc": 0.9690667598741699, "train_speed(iter/s)": 0.244744 }, { "epoch": 1.1548136290875828, "grad_norm": 1.0747016668319702, "learning_rate": 8.740537545801366e-05, "loss": 0.1265228271484375, "memory(GiB)": 122.96, "step": 15150, "token_acc": 0.9510316768381285, "train_speed(iter/s)": 0.244753 }, { "epoch": 1.1551947556978428, "grad_norm": 0.7324453592300415, "learning_rate": 8.739742907597973e-05, "loss": 0.14105077981948852, "memory(GiB)": 122.96, "step": 15155, "token_acc": 0.9411920529801324, "train_speed(iter/s)": 0.24478 }, { "epoch": 1.1555758823081028, "grad_norm": 0.951191246509552, "learning_rate": 8.73894805493768e-05, "loss": 0.16334066390991211, "memory(GiB)": 122.96, "step": 15160, "token_acc": 0.9300542957521558, "train_speed(iter/s)": 0.244797 }, { "epoch": 1.1559570089183626, "grad_norm": 0.9150108098983765, "learning_rate": 8.738152987866064e-05, "loss": 0.19450894594192505, "memory(GiB)": 122.96, "step": 15165, "token_acc": 0.9512043704991309, "train_speed(iter/s)": 0.244802 }, { "epoch": 1.1563381355286226, "grad_norm": 0.9924925565719604, "learning_rate": 8.737357706428724e-05, "loss": 0.13020154237747192, "memory(GiB)": 122.96, "step": 15170, "token_acc": 0.9495562130177515, "train_speed(iter/s)": 0.244815 }, { "epoch": 1.1567192621388824, "grad_norm": 0.32507431507110596, "learning_rate": 8.736562210671261e-05, "loss": 0.09521171450614929, "memory(GiB)": 122.96, "step": 15175, "token_acc": 0.9650856389986825, "train_speed(iter/s)": 0.24483 }, { "epoch": 1.1571003887491424, "grad_norm": 0.7084077000617981, "learning_rate": 8.735766500639295e-05, "loss": 0.09836294651031494, "memory(GiB)": 122.96, "step": 15180, "token_acc": 0.9591194968553459, "train_speed(iter/s)": 0.244851 }, { "epoch": 1.1574815153594025, "grad_norm": 2.0652875900268555, "learning_rate": 8.734970576378458e-05, "loss": 0.13604525327682496, "memory(GiB)": 122.96, "step": 15185, "token_acc": 0.9397697077059345, "train_speed(iter/s)": 0.244875 }, { "epoch": 1.1578626419696623, "grad_norm": 0.7420374751091003, "learning_rate": 8.734174437934389e-05, "loss": 0.15464385747909545, "memory(GiB)": 122.96, "step": 15190, "token_acc": 0.9380637402285027, "train_speed(iter/s)": 0.244888 }, { "epoch": 1.1582437685799223, "grad_norm": 0.6496666669845581, "learning_rate": 8.733378085352745e-05, "loss": 0.1267142653465271, "memory(GiB)": 122.96, "step": 15195, "token_acc": 0.9540229885057471, "train_speed(iter/s)": 0.244905 }, { "epoch": 1.158624895190182, "grad_norm": 1.033959150314331, "learning_rate": 8.732581518679194e-05, "loss": 0.1230698823928833, "memory(GiB)": 122.96, "step": 15200, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.244913 }, { "epoch": 1.158624895190182, "eval_loss": 0.11309267580509186, "eval_runtime": 160.6576, "eval_samples_per_second": 3.299, "eval_steps_per_second": 3.299, "eval_token_acc": 0.9473375097885669, "step": 15200 }, { "epoch": 1.159006021800442, "grad_norm": 0.8508667349815369, "learning_rate": 8.731784737959414e-05, "loss": 0.13024368286132812, "memory(GiB)": 122.96, "step": 15205, "token_acc": 0.9475545463092199, "train_speed(iter/s)": 0.244304 }, { "epoch": 1.1593871484107021, "grad_norm": 1.598263144493103, "learning_rate": 8.730987743239096e-05, "loss": 0.15528591871261596, "memory(GiB)": 122.96, "step": 15210, "token_acc": 0.9366013071895425, "train_speed(iter/s)": 0.244332 }, { "epoch": 1.159768275020962, "grad_norm": 0.9266265630722046, "learning_rate": 8.730190534563945e-05, "loss": 0.20747644901275636, "memory(GiB)": 122.96, "step": 15215, "token_acc": 0.9297395517867959, "train_speed(iter/s)": 0.244357 }, { "epoch": 1.160149401631222, "grad_norm": 1.4843882322311401, "learning_rate": 8.729393111979678e-05, "loss": 0.14198944568634034, "memory(GiB)": 122.96, "step": 15220, "token_acc": 0.9463578780680918, "train_speed(iter/s)": 0.244376 }, { "epoch": 1.1605305282414817, "grad_norm": 0.7738968133926392, "learning_rate": 8.728595475532022e-05, "loss": 0.1432621717453003, "memory(GiB)": 122.96, "step": 15225, "token_acc": 0.9423778264040846, "train_speed(iter/s)": 0.244403 }, { "epoch": 1.1609116548517417, "grad_norm": 1.2536051273345947, "learning_rate": 8.727797625266716e-05, "loss": 0.17608290910720825, "memory(GiB)": 122.96, "step": 15230, "token_acc": 0.9275902211874273, "train_speed(iter/s)": 0.244423 }, { "epoch": 1.1612927814620018, "grad_norm": 0.40909719467163086, "learning_rate": 8.726999561229518e-05, "loss": 0.10799274444580079, "memory(GiB)": 122.96, "step": 15235, "token_acc": 0.945540484997945, "train_speed(iter/s)": 0.244442 }, { "epoch": 1.1616739080722616, "grad_norm": 1.249367117881775, "learning_rate": 8.72620128346619e-05, "loss": 0.09496196508407592, "memory(GiB)": 122.96, "step": 15240, "token_acc": 0.9598517603458925, "train_speed(iter/s)": 0.244465 }, { "epoch": 1.1620550346825216, "grad_norm": 1.3569941520690918, "learning_rate": 8.725402792022511e-05, "loss": 0.11940066814422608, "memory(GiB)": 122.96, "step": 15245, "token_acc": 0.9581162742237966, "train_speed(iter/s)": 0.244485 }, { "epoch": 1.1624361612927814, "grad_norm": 0.5919360518455505, "learning_rate": 8.724604086944267e-05, "loss": 0.12390017509460449, "memory(GiB)": 122.96, "step": 15250, "token_acc": 0.9535608308605341, "train_speed(iter/s)": 0.244499 }, { "epoch": 1.1628172879030414, "grad_norm": 1.2694308757781982, "learning_rate": 8.723805168277267e-05, "loss": 0.18417187929153442, "memory(GiB)": 122.96, "step": 15255, "token_acc": 0.9081885856079405, "train_speed(iter/s)": 0.244529 }, { "epoch": 1.1631984145133014, "grad_norm": 0.7929002046585083, "learning_rate": 8.723006036067317e-05, "loss": 0.11598962545394897, "memory(GiB)": 122.96, "step": 15260, "token_acc": 0.9540425531914893, "train_speed(iter/s)": 0.244526 }, { "epoch": 1.1635795411235612, "grad_norm": 0.5900220274925232, "learning_rate": 8.72220669036025e-05, "loss": 0.10466889142990113, "memory(GiB)": 122.96, "step": 15265, "token_acc": 0.9516775691074066, "train_speed(iter/s)": 0.244554 }, { "epoch": 1.1639606677338212, "grad_norm": 0.9482578039169312, "learning_rate": 8.721407131201902e-05, "loss": 0.14594424962997438, "memory(GiB)": 122.96, "step": 15270, "token_acc": 0.9458087753765554, "train_speed(iter/s)": 0.244565 }, { "epoch": 1.164341794344081, "grad_norm": 0.6127783060073853, "learning_rate": 8.720607358638124e-05, "loss": 0.11488378047943115, "memory(GiB)": 122.96, "step": 15275, "token_acc": 0.9558648786284162, "train_speed(iter/s)": 0.244579 }, { "epoch": 1.164722920954341, "grad_norm": 1.4611761569976807, "learning_rate": 8.719807372714781e-05, "loss": 0.11078819036483764, "memory(GiB)": 122.96, "step": 15280, "token_acc": 0.9608974358974359, "train_speed(iter/s)": 0.244596 }, { "epoch": 1.165104047564601, "grad_norm": 1.1277974843978882, "learning_rate": 8.719007173477746e-05, "loss": 0.11859327554702759, "memory(GiB)": 122.96, "step": 15285, "token_acc": 0.9539664804469273, "train_speed(iter/s)": 0.244613 }, { "epoch": 1.1654851741748609, "grad_norm": 1.4730879068374634, "learning_rate": 8.718206760972907e-05, "loss": 0.11753822565078735, "memory(GiB)": 122.96, "step": 15290, "token_acc": 0.9544832975190467, "train_speed(iter/s)": 0.24463 }, { "epoch": 1.1658663007851209, "grad_norm": 0.702673614025116, "learning_rate": 8.717406135246166e-05, "loss": 0.1130339503288269, "memory(GiB)": 122.96, "step": 15295, "token_acc": 0.9598645215918713, "train_speed(iter/s)": 0.244649 }, { "epoch": 1.1662474273953807, "grad_norm": 1.0372339487075806, "learning_rate": 8.716605296343435e-05, "loss": 0.126303768157959, "memory(GiB)": 122.96, "step": 15300, "token_acc": 0.9548931680295436, "train_speed(iter/s)": 0.244655 }, { "epoch": 1.1666285540056407, "grad_norm": 0.928277850151062, "learning_rate": 8.715804244310637e-05, "loss": 0.11490498781204224, "memory(GiB)": 122.96, "step": 15305, "token_acc": 0.952856159669649, "train_speed(iter/s)": 0.244683 }, { "epoch": 1.1670096806159007, "grad_norm": 0.9137842059135437, "learning_rate": 8.715002979193708e-05, "loss": 0.16258968114852906, "memory(GiB)": 122.96, "step": 15310, "token_acc": 0.9337374293979886, "train_speed(iter/s)": 0.244692 }, { "epoch": 1.1673908072261605, "grad_norm": 0.8712362051010132, "learning_rate": 8.714201501038599e-05, "loss": 0.11977760791778565, "memory(GiB)": 122.96, "step": 15315, "token_acc": 0.959858762311838, "train_speed(iter/s)": 0.244714 }, { "epoch": 1.1677719338364205, "grad_norm": 1.237574815750122, "learning_rate": 8.713399809891269e-05, "loss": 0.16022827625274658, "memory(GiB)": 122.96, "step": 15320, "token_acc": 0.929299572509043, "train_speed(iter/s)": 0.244743 }, { "epoch": 1.1681530604466803, "grad_norm": 1.0529792308807373, "learning_rate": 8.712597905797692e-05, "loss": 0.08282302618026734, "memory(GiB)": 122.96, "step": 15325, "token_acc": 0.958389781304432, "train_speed(iter/s)": 0.244757 }, { "epoch": 1.1685341870569403, "grad_norm": 0.7802228927612305, "learning_rate": 8.711795788803856e-05, "loss": 0.15539878606796265, "memory(GiB)": 122.96, "step": 15330, "token_acc": 0.9464586201621539, "train_speed(iter/s)": 0.24477 }, { "epoch": 1.1689153136672004, "grad_norm": 1.3973380327224731, "learning_rate": 8.710993458955753e-05, "loss": 0.174909508228302, "memory(GiB)": 122.96, "step": 15335, "token_acc": 0.9337270341207349, "train_speed(iter/s)": 0.244796 }, { "epoch": 1.1692964402774602, "grad_norm": 1.040671467781067, "learning_rate": 8.710190916299399e-05, "loss": 0.18078778982162474, "memory(GiB)": 122.96, "step": 15340, "token_acc": 0.9438982070561018, "train_speed(iter/s)": 0.244818 }, { "epoch": 1.1696775668877202, "grad_norm": 0.6526258587837219, "learning_rate": 8.709388160880812e-05, "loss": 0.1686447024345398, "memory(GiB)": 122.96, "step": 15345, "token_acc": 0.9266383359427421, "train_speed(iter/s)": 0.244841 }, { "epoch": 1.17005869349798, "grad_norm": 0.37981587648391724, "learning_rate": 8.708585192746026e-05, "loss": 0.13500214815139772, "memory(GiB)": 122.96, "step": 15350, "token_acc": 0.9651893360502632, "train_speed(iter/s)": 0.244856 }, { "epoch": 1.17043982010824, "grad_norm": 0.9267290234565735, "learning_rate": 8.707782011941092e-05, "loss": 0.14339447021484375, "memory(GiB)": 122.96, "step": 15355, "token_acc": 0.9490328523180841, "train_speed(iter/s)": 0.244873 }, { "epoch": 1.1708209467184998, "grad_norm": 0.11955124139785767, "learning_rate": 8.706978618512066e-05, "loss": 0.1625092625617981, "memory(GiB)": 122.96, "step": 15360, "token_acc": 0.9221967963386728, "train_speed(iter/s)": 0.2449 }, { "epoch": 1.1712020733287598, "grad_norm": 0.7671645283699036, "learning_rate": 8.706175012505015e-05, "loss": 0.119148850440979, "memory(GiB)": 122.96, "step": 15365, "token_acc": 0.9392541330257593, "train_speed(iter/s)": 0.244921 }, { "epoch": 1.1715831999390198, "grad_norm": 1.2860488891601562, "learning_rate": 8.705371193966028e-05, "loss": 0.18182966709136963, "memory(GiB)": 122.96, "step": 15370, "token_acc": 0.9235611510791367, "train_speed(iter/s)": 0.244949 }, { "epoch": 1.1719643265492796, "grad_norm": 0.7328260540962219, "learning_rate": 8.7045671629412e-05, "loss": 0.09445739984512329, "memory(GiB)": 122.96, "step": 15375, "token_acc": 0.9545975693648245, "train_speed(iter/s)": 0.244969 }, { "epoch": 1.1723454531595396, "grad_norm": 0.7912675142288208, "learning_rate": 8.703762919476634e-05, "loss": 0.14686689376831055, "memory(GiB)": 122.96, "step": 15380, "token_acc": 0.9421179973943793, "train_speed(iter/s)": 0.244985 }, { "epoch": 1.1727265797697994, "grad_norm": 0.6493186354637146, "learning_rate": 8.702958463618454e-05, "loss": 0.1516942024230957, "memory(GiB)": 122.96, "step": 15385, "token_acc": 0.947676740780189, "train_speed(iter/s)": 0.244992 }, { "epoch": 1.1731077063800595, "grad_norm": 0.8062649965286255, "learning_rate": 8.702153795412788e-05, "loss": 0.11498950719833374, "memory(GiB)": 122.96, "step": 15390, "token_acc": 0.9540777146367685, "train_speed(iter/s)": 0.245002 }, { "epoch": 1.1734888329903195, "grad_norm": 1.2523550987243652, "learning_rate": 8.701348914905782e-05, "loss": 0.17094634771347045, "memory(GiB)": 122.96, "step": 15395, "token_acc": 0.9330016583747927, "train_speed(iter/s)": 0.24503 }, { "epoch": 1.1738699596005793, "grad_norm": 0.8629603385925293, "learning_rate": 8.700543822143593e-05, "loss": 0.15060425996780397, "memory(GiB)": 122.96, "step": 15400, "token_acc": 0.9394589244473771, "train_speed(iter/s)": 0.245048 }, { "epoch": 1.1738699596005793, "eval_loss": 0.10974685102701187, "eval_runtime": 158.7485, "eval_samples_per_second": 3.339, "eval_steps_per_second": 3.339, "eval_token_acc": 0.9486928498283236, "step": 15400 }, { "epoch": 1.1742510862108393, "grad_norm": 0.7294900417327881, "learning_rate": 8.699738517172388e-05, "loss": 0.10440998077392578, "memory(GiB)": 122.96, "step": 15405, "token_acc": 0.9491162602331569, "train_speed(iter/s)": 0.244452 }, { "epoch": 1.174632212821099, "grad_norm": 1.291364073753357, "learning_rate": 8.69893300003835e-05, "loss": 0.17267327308654784, "memory(GiB)": 122.96, "step": 15410, "token_acc": 0.9383508362504862, "train_speed(iter/s)": 0.244469 }, { "epoch": 1.175013339431359, "grad_norm": 1.5825797319412231, "learning_rate": 8.698127270787667e-05, "loss": 0.14803647994995117, "memory(GiB)": 122.96, "step": 15415, "token_acc": 0.9389959754289345, "train_speed(iter/s)": 0.244488 }, { "epoch": 1.175394466041619, "grad_norm": 0.6356317400932312, "learning_rate": 8.697321329466547e-05, "loss": 0.10160845518112183, "memory(GiB)": 122.96, "step": 15420, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.244514 }, { "epoch": 1.175775592651879, "grad_norm": 0.9681187272071838, "learning_rate": 8.696515176121205e-05, "loss": 0.11745184659957886, "memory(GiB)": 122.96, "step": 15425, "token_acc": 0.9491120218579235, "train_speed(iter/s)": 0.244542 }, { "epoch": 1.176156719262139, "grad_norm": 0.2702987790107727, "learning_rate": 8.695708810797873e-05, "loss": 0.11490235328674317, "memory(GiB)": 122.96, "step": 15430, "token_acc": 0.9540871723282147, "train_speed(iter/s)": 0.244562 }, { "epoch": 1.1765378458723987, "grad_norm": 0.7735621929168701, "learning_rate": 8.694902233542792e-05, "loss": 0.15282104015350342, "memory(GiB)": 122.96, "step": 15435, "token_acc": 0.9415437003405221, "train_speed(iter/s)": 0.244589 }, { "epoch": 1.1769189724826588, "grad_norm": 1.4513568878173828, "learning_rate": 8.694095444402214e-05, "loss": 0.16595627069473268, "memory(GiB)": 122.96, "step": 15440, "token_acc": 0.9413680781758957, "train_speed(iter/s)": 0.244617 }, { "epoch": 1.1773000990929186, "grad_norm": 0.8187140226364136, "learning_rate": 8.693288443422405e-05, "loss": 0.1316475510597229, "memory(GiB)": 122.96, "step": 15445, "token_acc": 0.94806654676259, "train_speed(iter/s)": 0.244639 }, { "epoch": 1.1776812257031786, "grad_norm": 1.0944315195083618, "learning_rate": 8.692481230649641e-05, "loss": 0.10914645195007325, "memory(GiB)": 122.96, "step": 15450, "token_acc": 0.9579524680073126, "train_speed(iter/s)": 0.244643 }, { "epoch": 1.1780623523134386, "grad_norm": 0.9753233194351196, "learning_rate": 8.691673806130214e-05, "loss": 0.1696930170059204, "memory(GiB)": 122.96, "step": 15455, "token_acc": 0.9374301675977653, "train_speed(iter/s)": 0.244664 }, { "epoch": 1.1784434789236984, "grad_norm": 0.18618640303611755, "learning_rate": 8.690866169910427e-05, "loss": 0.10209591388702392, "memory(GiB)": 122.96, "step": 15460, "token_acc": 0.9627343392775491, "train_speed(iter/s)": 0.244685 }, { "epoch": 1.1788246055339584, "grad_norm": 0.9901086091995239, "learning_rate": 8.690058322036593e-05, "loss": 0.14055452346801758, "memory(GiB)": 122.96, "step": 15465, "token_acc": 0.9433962264150944, "train_speed(iter/s)": 0.244713 }, { "epoch": 1.1792057321442182, "grad_norm": 0.8790974617004395, "learning_rate": 8.689250262555037e-05, "loss": 0.08217062950134277, "memory(GiB)": 122.96, "step": 15470, "token_acc": 0.9719222462203023, "train_speed(iter/s)": 0.244723 }, { "epoch": 1.1795868587544782, "grad_norm": 1.0042043924331665, "learning_rate": 8.688441991512099e-05, "loss": 0.15218801498413087, "memory(GiB)": 122.96, "step": 15475, "token_acc": 0.9405996873667756, "train_speed(iter/s)": 0.244734 }, { "epoch": 1.1799679853647382, "grad_norm": 0.760420560836792, "learning_rate": 8.687633508954129e-05, "loss": 0.16309088468551636, "memory(GiB)": 122.96, "step": 15480, "token_acc": 0.9416348357524829, "train_speed(iter/s)": 0.244744 }, { "epoch": 1.180349111974998, "grad_norm": 1.5859681367874146, "learning_rate": 8.686824814927491e-05, "loss": 0.09084450006484986, "memory(GiB)": 122.96, "step": 15485, "token_acc": 0.9633348547086567, "train_speed(iter/s)": 0.244757 }, { "epoch": 1.180730238585258, "grad_norm": 1.3473327159881592, "learning_rate": 8.686015909478558e-05, "loss": 0.1329216480255127, "memory(GiB)": 122.96, "step": 15490, "token_acc": 0.910874897792314, "train_speed(iter/s)": 0.244788 }, { "epoch": 1.1811113651955178, "grad_norm": 0.6216985583305359, "learning_rate": 8.685206792653719e-05, "loss": 0.12665894031524658, "memory(GiB)": 122.96, "step": 15495, "token_acc": 0.9519827998088868, "train_speed(iter/s)": 0.244806 }, { "epoch": 1.1814924918057779, "grad_norm": 0.4470501244068146, "learning_rate": 8.68439746449937e-05, "loss": 0.13310775756835938, "memory(GiB)": 122.96, "step": 15500, "token_acc": 0.9427670668121741, "train_speed(iter/s)": 0.244816 }, { "epoch": 1.1818736184160379, "grad_norm": 0.9363273978233337, "learning_rate": 8.683587925061924e-05, "loss": 0.1751970410346985, "memory(GiB)": 122.96, "step": 15505, "token_acc": 0.9482320765774566, "train_speed(iter/s)": 0.244834 }, { "epoch": 1.1822547450262977, "grad_norm": 1.8551082611083984, "learning_rate": 8.682778174387807e-05, "loss": 0.16682401895523072, "memory(GiB)": 122.96, "step": 15510, "token_acc": 0.935353922885095, "train_speed(iter/s)": 0.244851 }, { "epoch": 1.1826358716365577, "grad_norm": 0.3091897666454315, "learning_rate": 8.681968212523451e-05, "loss": 0.10499422550201416, "memory(GiB)": 122.96, "step": 15515, "token_acc": 0.944206008583691, "train_speed(iter/s)": 0.244879 }, { "epoch": 1.1830169982468175, "grad_norm": 0.461551696062088, "learning_rate": 8.681158039515302e-05, "loss": 0.12954739332199097, "memory(GiB)": 122.96, "step": 15520, "token_acc": 0.9450662739322533, "train_speed(iter/s)": 0.244899 }, { "epoch": 1.1833981248570775, "grad_norm": 1.3881033658981323, "learning_rate": 8.680347655409824e-05, "loss": 0.12848298549652098, "memory(GiB)": 122.96, "step": 15525, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.244923 }, { "epoch": 1.1837792514673375, "grad_norm": 0.9536482691764832, "learning_rate": 8.679537060253486e-05, "loss": 0.11738846302032471, "memory(GiB)": 122.96, "step": 15530, "token_acc": 0.9513602638087386, "train_speed(iter/s)": 0.244948 }, { "epoch": 1.1841603780775973, "grad_norm": 0.8938249349594116, "learning_rate": 8.678726254092774e-05, "loss": 0.1280520439147949, "memory(GiB)": 122.96, "step": 15535, "token_acc": 0.9523954372623574, "train_speed(iter/s)": 0.244964 }, { "epoch": 1.1845415046878573, "grad_norm": 0.6342135071754456, "learning_rate": 8.677915236974181e-05, "loss": 0.14952415227890015, "memory(GiB)": 122.96, "step": 15540, "token_acc": 0.94377990430622, "train_speed(iter/s)": 0.244978 }, { "epoch": 1.1849226312981171, "grad_norm": 0.5193489789962769, "learning_rate": 8.67710400894422e-05, "loss": 0.1523041844367981, "memory(GiB)": 122.96, "step": 15545, "token_acc": 0.9512006196746708, "train_speed(iter/s)": 0.244988 }, { "epoch": 1.1853037579083772, "grad_norm": 2.058776378631592, "learning_rate": 8.676292570049405e-05, "loss": 0.12025766372680664, "memory(GiB)": 122.96, "step": 15550, "token_acc": 0.9612437048390629, "train_speed(iter/s)": 0.245004 }, { "epoch": 1.1856848845186372, "grad_norm": 0.9953367710113525, "learning_rate": 8.675480920336271e-05, "loss": 0.1556059718132019, "memory(GiB)": 122.96, "step": 15555, "token_acc": 0.9553868937430581, "train_speed(iter/s)": 0.245013 }, { "epoch": 1.186066011128897, "grad_norm": 0.5508955717086792, "learning_rate": 8.674669059851364e-05, "loss": 0.1146467924118042, "memory(GiB)": 122.96, "step": 15560, "token_acc": 0.9623529411764706, "train_speed(iter/s)": 0.245021 }, { "epoch": 1.186447137739157, "grad_norm": 1.2285789251327515, "learning_rate": 8.673856988641237e-05, "loss": 0.13746432065963746, "memory(GiB)": 122.96, "step": 15565, "token_acc": 0.9497860199714693, "train_speed(iter/s)": 0.245046 }, { "epoch": 1.1868282643494168, "grad_norm": 1.784615159034729, "learning_rate": 8.673044706752463e-05, "loss": 0.144450044631958, "memory(GiB)": 122.96, "step": 15570, "token_acc": 0.934375801076647, "train_speed(iter/s)": 0.24507 }, { "epoch": 1.1872093909596768, "grad_norm": 0.773529052734375, "learning_rate": 8.672232214231619e-05, "loss": 0.1562058448791504, "memory(GiB)": 122.96, "step": 15575, "token_acc": 0.9469313348187158, "train_speed(iter/s)": 0.245088 }, { "epoch": 1.1875905175699368, "grad_norm": 1.6124267578125, "learning_rate": 8.6714195111253e-05, "loss": 0.17003366947174073, "memory(GiB)": 122.96, "step": 15580, "token_acc": 0.9444320712694878, "train_speed(iter/s)": 0.245097 }, { "epoch": 1.1879716441801966, "grad_norm": 1.343233346939087, "learning_rate": 8.670606597480108e-05, "loss": 0.15675766468048097, "memory(GiB)": 122.96, "step": 15585, "token_acc": 0.9383770591824283, "train_speed(iter/s)": 0.245112 }, { "epoch": 1.1883527707904566, "grad_norm": 0.5944880247116089, "learning_rate": 8.66979347334266e-05, "loss": 0.11131103038787842, "memory(GiB)": 122.96, "step": 15590, "token_acc": 0.9613807982740021, "train_speed(iter/s)": 0.245132 }, { "epoch": 1.1887338974007164, "grad_norm": 0.7507705092430115, "learning_rate": 8.668980138759589e-05, "loss": 0.11792598962783814, "memory(GiB)": 122.96, "step": 15595, "token_acc": 0.9487422876127195, "train_speed(iter/s)": 0.245155 }, { "epoch": 1.1891150240109765, "grad_norm": 0.496099591255188, "learning_rate": 8.668166593777531e-05, "loss": 0.10193474292755127, "memory(GiB)": 122.96, "step": 15600, "token_acc": 0.9587339743589743, "train_speed(iter/s)": 0.245171 }, { "epoch": 1.1891150240109765, "eval_loss": 0.10949927568435669, "eval_runtime": 157.95, "eval_samples_per_second": 3.355, "eval_steps_per_second": 3.355, "eval_token_acc": 0.9488283838322993, "step": 15600 }, { "epoch": 1.1894961506212365, "grad_norm": 0.6254851222038269, "learning_rate": 8.667352838443144e-05, "loss": 0.09314851760864258, "memory(GiB)": 122.96, "step": 15605, "token_acc": 0.9500335486035394, "train_speed(iter/s)": 0.24456 }, { "epoch": 1.1898772772314963, "grad_norm": 0.8940487504005432, "learning_rate": 8.666538872803087e-05, "loss": 0.15930067300796508, "memory(GiB)": 122.96, "step": 15610, "token_acc": 0.9387334820662433, "train_speed(iter/s)": 0.244576 }, { "epoch": 1.1902584038417563, "grad_norm": 0.5735794305801392, "learning_rate": 8.665724696904043e-05, "loss": 0.15432335138320924, "memory(GiB)": 122.96, "step": 15615, "token_acc": 0.9517241379310345, "train_speed(iter/s)": 0.24458 }, { "epoch": 1.190639530452016, "grad_norm": 1.8929848670959473, "learning_rate": 8.664910310792697e-05, "loss": 0.20740699768066406, "memory(GiB)": 122.96, "step": 15620, "token_acc": 0.9142491467576792, "train_speed(iter/s)": 0.244607 }, { "epoch": 1.1910206570622761, "grad_norm": 1.5022095441818237, "learning_rate": 8.664095714515754e-05, "loss": 0.15379525423049928, "memory(GiB)": 122.96, "step": 15625, "token_acc": 0.9409730228268388, "train_speed(iter/s)": 0.244624 }, { "epoch": 1.1914017836725361, "grad_norm": 0.5857786536216736, "learning_rate": 8.663280908119923e-05, "loss": 0.060064852237701416, "memory(GiB)": 122.96, "step": 15630, "token_acc": 0.9789750328515112, "train_speed(iter/s)": 0.244656 }, { "epoch": 1.191782910282796, "grad_norm": 0.08278290927410126, "learning_rate": 8.662465891651932e-05, "loss": 0.07783631086349488, "memory(GiB)": 122.96, "step": 15635, "token_acc": 0.9616971125515615, "train_speed(iter/s)": 0.24468 }, { "epoch": 1.192164036893056, "grad_norm": 1.2153772115707397, "learning_rate": 8.66165066515852e-05, "loss": 0.10311132669448853, "memory(GiB)": 122.96, "step": 15640, "token_acc": 0.9491159982751186, "train_speed(iter/s)": 0.24471 }, { "epoch": 1.1925451635033157, "grad_norm": 0.2157495617866516, "learning_rate": 8.660835228686432e-05, "loss": 0.12466132640838623, "memory(GiB)": 122.96, "step": 15645, "token_acc": 0.9400272603362109, "train_speed(iter/s)": 0.244729 }, { "epoch": 1.1929262901135758, "grad_norm": 1.6295340061187744, "learning_rate": 8.660019582282432e-05, "loss": 0.11552400588989258, "memory(GiB)": 122.96, "step": 15650, "token_acc": 0.951071761416589, "train_speed(iter/s)": 0.244759 }, { "epoch": 1.1933074167238358, "grad_norm": 0.034936126321554184, "learning_rate": 8.659203725993296e-05, "loss": 0.11986469030380249, "memory(GiB)": 122.96, "step": 15655, "token_acc": 0.9437346437346438, "train_speed(iter/s)": 0.244785 }, { "epoch": 1.1936885433340956, "grad_norm": 0.8410263061523438, "learning_rate": 8.658387659865805e-05, "loss": 0.16565303802490233, "memory(GiB)": 122.96, "step": 15660, "token_acc": 0.9371381306865177, "train_speed(iter/s)": 0.244806 }, { "epoch": 1.1940696699443556, "grad_norm": 0.6600218415260315, "learning_rate": 8.65757138394676e-05, "loss": 0.102394700050354, "memory(GiB)": 122.96, "step": 15665, "token_acc": 0.955153791969946, "train_speed(iter/s)": 0.244832 }, { "epoch": 1.1944507965546154, "grad_norm": 1.1341533660888672, "learning_rate": 8.656754898282968e-05, "loss": 0.17225127220153807, "memory(GiB)": 122.96, "step": 15670, "token_acc": 0.9307740717432348, "train_speed(iter/s)": 0.244857 }, { "epoch": 1.1948319231648754, "grad_norm": 0.8788964748382568, "learning_rate": 8.655938202921253e-05, "loss": 0.1484083652496338, "memory(GiB)": 122.96, "step": 15675, "token_acc": 0.9420289855072463, "train_speed(iter/s)": 0.244884 }, { "epoch": 1.1952130497751352, "grad_norm": 1.010901689529419, "learning_rate": 8.655121297908447e-05, "loss": 0.136228609085083, "memory(GiB)": 122.96, "step": 15680, "token_acc": 0.9515794924909373, "train_speed(iter/s)": 0.244904 }, { "epoch": 1.1955941763853952, "grad_norm": 1.6960610151290894, "learning_rate": 8.654304183291398e-05, "loss": 0.15194953680038453, "memory(GiB)": 122.96, "step": 15685, "token_acc": 0.9472767968167123, "train_speed(iter/s)": 0.244929 }, { "epoch": 1.1959753029956552, "grad_norm": 0.7076501250267029, "learning_rate": 8.653486859116959e-05, "loss": 0.15752742290496827, "memory(GiB)": 122.96, "step": 15690, "token_acc": 0.9389458621512043, "train_speed(iter/s)": 0.24495 }, { "epoch": 1.196356429605915, "grad_norm": 0.7482985258102417, "learning_rate": 8.652669325432006e-05, "loss": 0.1388368248939514, "memory(GiB)": 122.96, "step": 15695, "token_acc": 0.9372411596049698, "train_speed(iter/s)": 0.244978 }, { "epoch": 1.196737556216175, "grad_norm": 0.6396218538284302, "learning_rate": 8.651851582283417e-05, "loss": 0.09900745153427123, "memory(GiB)": 122.96, "step": 15700, "token_acc": 0.963320058687906, "train_speed(iter/s)": 0.244993 }, { "epoch": 1.1971186828264349, "grad_norm": 0.9170348644256592, "learning_rate": 8.651033629718085e-05, "loss": 0.13875975608825683, "memory(GiB)": 122.96, "step": 15705, "token_acc": 0.9521077024939307, "train_speed(iter/s)": 0.245013 }, { "epoch": 1.1974998094366949, "grad_norm": 0.7421585321426392, "learning_rate": 8.650215467782919e-05, "loss": 0.17269353866577147, "memory(GiB)": 122.96, "step": 15710, "token_acc": 0.9453781512605042, "train_speed(iter/s)": 0.245029 }, { "epoch": 1.197880936046955, "grad_norm": 1.0272371768951416, "learning_rate": 8.649397096524832e-05, "loss": 0.17152791023254393, "memory(GiB)": 122.96, "step": 15715, "token_acc": 0.941958041958042, "train_speed(iter/s)": 0.245044 }, { "epoch": 1.1982620626572147, "grad_norm": 0.9068253636360168, "learning_rate": 8.64857851599076e-05, "loss": 0.1595659613609314, "memory(GiB)": 122.96, "step": 15720, "token_acc": 0.9454849498327759, "train_speed(iter/s)": 0.245068 }, { "epoch": 1.1986431892674747, "grad_norm": 0.7580740451812744, "learning_rate": 8.64775972622764e-05, "loss": 0.12419443130493164, "memory(GiB)": 122.96, "step": 15725, "token_acc": 0.9462897526501767, "train_speed(iter/s)": 0.245094 }, { "epoch": 1.1990243158777345, "grad_norm": 0.9802871346473694, "learning_rate": 8.646940727282427e-05, "loss": 0.16741663217544556, "memory(GiB)": 122.96, "step": 15730, "token_acc": 0.9327195467422096, "train_speed(iter/s)": 0.245121 }, { "epoch": 1.1994054424879945, "grad_norm": 0.9984568357467651, "learning_rate": 8.646121519202088e-05, "loss": 0.18072489500045777, "memory(GiB)": 122.96, "step": 15735, "token_acc": 0.9344744584113399, "train_speed(iter/s)": 0.245135 }, { "epoch": 1.1997865690982543, "grad_norm": 1.0426232814788818, "learning_rate": 8.6453021020336e-05, "loss": 0.11003752946853637, "memory(GiB)": 122.96, "step": 15740, "token_acc": 0.955937794533459, "train_speed(iter/s)": 0.245155 }, { "epoch": 1.2001676957085143, "grad_norm": 1.4015369415283203, "learning_rate": 8.644482475823954e-05, "loss": 0.1158212423324585, "memory(GiB)": 122.96, "step": 15745, "token_acc": 0.9469244288224956, "train_speed(iter/s)": 0.245183 }, { "epoch": 1.2005488223187744, "grad_norm": 0.26828399300575256, "learning_rate": 8.643662640620148e-05, "loss": 0.15168803930282593, "memory(GiB)": 122.96, "step": 15750, "token_acc": 0.9420875420875421, "train_speed(iter/s)": 0.245209 }, { "epoch": 1.2009299489290342, "grad_norm": 0.9683685302734375, "learning_rate": 8.642842596469199e-05, "loss": 0.1827712893486023, "memory(GiB)": 122.96, "step": 15755, "token_acc": 0.9277647560196036, "train_speed(iter/s)": 0.245229 }, { "epoch": 1.2013110755392942, "grad_norm": 1.439871072769165, "learning_rate": 8.642022343418133e-05, "loss": 0.16164228916168213, "memory(GiB)": 122.96, "step": 15760, "token_acc": 0.9485131690739167, "train_speed(iter/s)": 0.245242 }, { "epoch": 1.201692202149554, "grad_norm": 0.9930106401443481, "learning_rate": 8.641201881513985e-05, "loss": 0.14125605821609497, "memory(GiB)": 122.96, "step": 15765, "token_acc": 0.9570159123785906, "train_speed(iter/s)": 0.24526 }, { "epoch": 1.202073328759814, "grad_norm": 0.9577664136886597, "learning_rate": 8.640381210803808e-05, "loss": 0.17500603199005127, "memory(GiB)": 122.96, "step": 15770, "token_acc": 0.9268236908300937, "train_speed(iter/s)": 0.245279 }, { "epoch": 1.202454455370074, "grad_norm": 0.6015380024909973, "learning_rate": 8.639560331334662e-05, "loss": 0.10504404306411744, "memory(GiB)": 122.96, "step": 15775, "token_acc": 0.9545849002024877, "train_speed(iter/s)": 0.245302 }, { "epoch": 1.2028355819803338, "grad_norm": 0.6197881102561951, "learning_rate": 8.63873924315362e-05, "loss": 0.14085274934768677, "memory(GiB)": 122.96, "step": 15780, "token_acc": 0.949814126394052, "train_speed(iter/s)": 0.245306 }, { "epoch": 1.2032167085905938, "grad_norm": 0.49084654450416565, "learning_rate": 8.637917946307768e-05, "loss": 0.12354809045791626, "memory(GiB)": 122.96, "step": 15785, "token_acc": 0.9501192004401247, "train_speed(iter/s)": 0.245318 }, { "epoch": 1.2035978352008536, "grad_norm": 0.7021044492721558, "learning_rate": 8.637096440844202e-05, "loss": 0.08869106769561767, "memory(GiB)": 122.96, "step": 15790, "token_acc": 0.9635974304068522, "train_speed(iter/s)": 0.245336 }, { "epoch": 1.2039789618111136, "grad_norm": 0.8917171359062195, "learning_rate": 8.636274726810037e-05, "loss": 0.10839877128601075, "memory(GiB)": 122.96, "step": 15795, "token_acc": 0.955249569707401, "train_speed(iter/s)": 0.245359 }, { "epoch": 1.2043600884213737, "grad_norm": 0.47438985109329224, "learning_rate": 8.635452804252388e-05, "loss": 0.19416491985321044, "memory(GiB)": 122.96, "step": 15800, "token_acc": 0.9450234103609727, "train_speed(iter/s)": 0.245368 }, { "epoch": 1.2043600884213737, "eval_loss": 0.10888304561376572, "eval_runtime": 160.7788, "eval_samples_per_second": 3.296, "eval_steps_per_second": 3.296, "eval_token_acc": 0.9494608758508524, "step": 15800 }, { "epoch": 1.2047412150316335, "grad_norm": 0.7827815413475037, "learning_rate": 8.634630673218393e-05, "loss": 0.13567349910736085, "memory(GiB)": 122.96, "step": 15805, "token_acc": 0.949237296696636, "train_speed(iter/s)": 0.244767 }, { "epoch": 1.2051223416418935, "grad_norm": 0.4082328677177429, "learning_rate": 8.633808333755193e-05, "loss": 0.12117927074432373, "memory(GiB)": 122.96, "step": 15810, "token_acc": 0.9606437454279444, "train_speed(iter/s)": 0.244782 }, { "epoch": 1.2055034682521533, "grad_norm": 0.6525883674621582, "learning_rate": 8.63298578590995e-05, "loss": 0.1369357943534851, "memory(GiB)": 122.96, "step": 15815, "token_acc": 0.9481884057971014, "train_speed(iter/s)": 0.244797 }, { "epoch": 1.2058845948624133, "grad_norm": 1.0739715099334717, "learning_rate": 8.632163029729831e-05, "loss": 0.12826080322265626, "memory(GiB)": 122.96, "step": 15820, "token_acc": 0.9504482477587612, "train_speed(iter/s)": 0.244808 }, { "epoch": 1.2062657214726733, "grad_norm": 1.1617728471755981, "learning_rate": 8.631340065262018e-05, "loss": 0.13622546195983887, "memory(GiB)": 122.96, "step": 15825, "token_acc": 0.9423772609819121, "train_speed(iter/s)": 0.244831 }, { "epoch": 1.206646848082933, "grad_norm": 1.6007927656173706, "learning_rate": 8.630516892553703e-05, "loss": 0.13062527179718017, "memory(GiB)": 122.96, "step": 15830, "token_acc": 0.9404553415061296, "train_speed(iter/s)": 0.244858 }, { "epoch": 1.2070279746931931, "grad_norm": 0.9140941500663757, "learning_rate": 8.629693511652092e-05, "loss": 0.13574250936508178, "memory(GiB)": 122.96, "step": 15835, "token_acc": 0.9477265697494587, "train_speed(iter/s)": 0.244873 }, { "epoch": 1.207409101303453, "grad_norm": 0.3883593678474426, "learning_rate": 8.628869922604403e-05, "loss": 0.10227218866348267, "memory(GiB)": 122.96, "step": 15840, "token_acc": 0.9476772616136919, "train_speed(iter/s)": 0.244896 }, { "epoch": 1.207790227913713, "grad_norm": 0.05199310556054115, "learning_rate": 8.628046125457862e-05, "loss": 0.10925573110580444, "memory(GiB)": 122.96, "step": 15845, "token_acc": 0.9474954737477369, "train_speed(iter/s)": 0.244923 }, { "epoch": 1.208171354523973, "grad_norm": 1.6014018058776855, "learning_rate": 8.627222120259714e-05, "loss": 0.2327803373336792, "memory(GiB)": 122.96, "step": 15850, "token_acc": 0.9088380716934487, "train_speed(iter/s)": 0.244946 }, { "epoch": 1.2085524811342327, "grad_norm": 0.7493191957473755, "learning_rate": 8.626397907057209e-05, "loss": 0.09664825797080993, "memory(GiB)": 122.96, "step": 15855, "token_acc": 0.9623081406644647, "train_speed(iter/s)": 0.244966 }, { "epoch": 1.2089336077444928, "grad_norm": 1.074459195137024, "learning_rate": 8.625573485897613e-05, "loss": 0.18731815814971925, "memory(GiB)": 122.96, "step": 15860, "token_acc": 0.9222021910969352, "train_speed(iter/s)": 0.244981 }, { "epoch": 1.2093147343547526, "grad_norm": 0.8230160474777222, "learning_rate": 8.624748856828201e-05, "loss": 0.14536923170089722, "memory(GiB)": 122.96, "step": 15865, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.245002 }, { "epoch": 1.2096958609650126, "grad_norm": 1.1192909479141235, "learning_rate": 8.623924019896263e-05, "loss": 0.17896480560302735, "memory(GiB)": 122.96, "step": 15870, "token_acc": 0.9228855721393034, "train_speed(iter/s)": 0.245026 }, { "epoch": 1.2100769875752726, "grad_norm": 0.6741317510604858, "learning_rate": 8.6230989751491e-05, "loss": 0.19663692712783815, "memory(GiB)": 122.96, "step": 15875, "token_acc": 0.9253466050479915, "train_speed(iter/s)": 0.245035 }, { "epoch": 1.2104581141855324, "grad_norm": 0.47300955653190613, "learning_rate": 8.622273722634024e-05, "loss": 0.10869425535202026, "memory(GiB)": 122.96, "step": 15880, "token_acc": 0.9517799352750809, "train_speed(iter/s)": 0.245062 }, { "epoch": 1.2108392407957924, "grad_norm": 0.706558108329773, "learning_rate": 8.62144826239836e-05, "loss": 0.12261606454849243, "memory(GiB)": 122.96, "step": 15885, "token_acc": 0.9543232915666934, "train_speed(iter/s)": 0.245055 }, { "epoch": 1.2112203674060522, "grad_norm": 0.9239067435264587, "learning_rate": 8.620622594489443e-05, "loss": 0.14339258670806884, "memory(GiB)": 122.96, "step": 15890, "token_acc": 0.9480640357408786, "train_speed(iter/s)": 0.245075 }, { "epoch": 1.2116014940163122, "grad_norm": 0.8928197622299194, "learning_rate": 8.619796718954623e-05, "loss": 0.12770410776138305, "memory(GiB)": 122.96, "step": 15895, "token_acc": 0.9358757062146893, "train_speed(iter/s)": 0.245101 }, { "epoch": 1.2119826206265722, "grad_norm": 0.9507559537887573, "learning_rate": 8.618970635841258e-05, "loss": 0.1283632755279541, "memory(GiB)": 122.96, "step": 15900, "token_acc": 0.9529355835512241, "train_speed(iter/s)": 0.245122 }, { "epoch": 1.212363747236832, "grad_norm": 1.3958762884140015, "learning_rate": 8.618144345196721e-05, "loss": 0.15555522441864014, "memory(GiB)": 122.96, "step": 15905, "token_acc": 0.9422502489213409, "train_speed(iter/s)": 0.24514 }, { "epoch": 1.212744873847092, "grad_norm": 0.10395362973213196, "learning_rate": 8.617317847068397e-05, "loss": 0.10153658390045166, "memory(GiB)": 122.96, "step": 15910, "token_acc": 0.9442115446630119, "train_speed(iter/s)": 0.245166 }, { "epoch": 1.2131260004573519, "grad_norm": 0.49249958992004395, "learning_rate": 8.61649114150368e-05, "loss": 0.1260540723800659, "memory(GiB)": 122.96, "step": 15915, "token_acc": 0.9536866744691523, "train_speed(iter/s)": 0.245184 }, { "epoch": 1.2135071270676119, "grad_norm": 0.14692093431949615, "learning_rate": 8.615664228549979e-05, "loss": 0.1368972420692444, "memory(GiB)": 122.96, "step": 15920, "token_acc": 0.9256489221293445, "train_speed(iter/s)": 0.245213 }, { "epoch": 1.213888253677872, "grad_norm": 0.6109350919723511, "learning_rate": 8.614837108254713e-05, "loss": 0.10539435148239136, "memory(GiB)": 122.96, "step": 15925, "token_acc": 0.9557762960947067, "train_speed(iter/s)": 0.245225 }, { "epoch": 1.2142693802881317, "grad_norm": 0.719002902507782, "learning_rate": 8.614009780665314e-05, "loss": 0.1267549991607666, "memory(GiB)": 122.96, "step": 15930, "token_acc": 0.9577239030254459, "train_speed(iter/s)": 0.245228 }, { "epoch": 1.2146505068983917, "grad_norm": 0.9963420033454895, "learning_rate": 8.613182245829226e-05, "loss": 0.1612454891204834, "memory(GiB)": 122.96, "step": 15935, "token_acc": 0.9325890940317733, "train_speed(iter/s)": 0.245256 }, { "epoch": 1.2150316335086515, "grad_norm": 1.1787956953048706, "learning_rate": 8.612354503793902e-05, "loss": 0.1581188678741455, "memory(GiB)": 122.96, "step": 15940, "token_acc": 0.932569558101473, "train_speed(iter/s)": 0.245283 }, { "epoch": 1.2154127601189115, "grad_norm": 0.7018942832946777, "learning_rate": 8.61152655460681e-05, "loss": 0.15772944688796997, "memory(GiB)": 122.96, "step": 15945, "token_acc": 0.9376448771441818, "train_speed(iter/s)": 0.245306 }, { "epoch": 1.2157938867291715, "grad_norm": 0.4029046297073364, "learning_rate": 8.610698398315431e-05, "loss": 0.1264857292175293, "memory(GiB)": 122.96, "step": 15950, "token_acc": 0.945141065830721, "train_speed(iter/s)": 0.245328 }, { "epoch": 1.2161750133394313, "grad_norm": 1.4024724960327148, "learning_rate": 8.609870034967253e-05, "loss": 0.1959424376487732, "memory(GiB)": 122.96, "step": 15955, "token_acc": 0.9392953929539295, "train_speed(iter/s)": 0.245344 }, { "epoch": 1.2165561399496914, "grad_norm": 0.48563823103904724, "learning_rate": 8.609041464609782e-05, "loss": 0.14989123344421387, "memory(GiB)": 122.96, "step": 15960, "token_acc": 0.9505383580080754, "train_speed(iter/s)": 0.245368 }, { "epoch": 1.2169372665599512, "grad_norm": 1.5315569639205933, "learning_rate": 8.608212687290531e-05, "loss": 0.19434041976928712, "memory(GiB)": 122.96, "step": 15965, "token_acc": 0.935859230878388, "train_speed(iter/s)": 0.245382 }, { "epoch": 1.2173183931702112, "grad_norm": 2.983673572540283, "learning_rate": 8.607383703057026e-05, "loss": 0.13692327737808227, "memory(GiB)": 122.96, "step": 15970, "token_acc": 0.9494543365881677, "train_speed(iter/s)": 0.245404 }, { "epoch": 1.2176995197804712, "grad_norm": 0.6809452176094055, "learning_rate": 8.606554511956805e-05, "loss": 0.1288262963294983, "memory(GiB)": 122.96, "step": 15975, "token_acc": 0.9515175404020496, "train_speed(iter/s)": 0.245424 }, { "epoch": 1.218080646390731, "grad_norm": 1.2504374980926514, "learning_rate": 8.605725114037422e-05, "loss": 0.14783220291137694, "memory(GiB)": 122.96, "step": 15980, "token_acc": 0.937702922077922, "train_speed(iter/s)": 0.245442 }, { "epoch": 1.218461773000991, "grad_norm": 0.8203782439231873, "learning_rate": 8.604895509346433e-05, "loss": 0.102177894115448, "memory(GiB)": 122.96, "step": 15985, "token_acc": 0.9572903629536921, "train_speed(iter/s)": 0.245454 }, { "epoch": 1.2188428996112508, "grad_norm": 0.7793991565704346, "learning_rate": 8.604065697931418e-05, "loss": 0.1759890079498291, "memory(GiB)": 122.96, "step": 15990, "token_acc": 0.9277742682670036, "train_speed(iter/s)": 0.245477 }, { "epoch": 1.2192240262215108, "grad_norm": 0.5459764003753662, "learning_rate": 8.603235679839959e-05, "loss": 0.18720703125, "memory(GiB)": 122.96, "step": 15995, "token_acc": 0.933494431603278, "train_speed(iter/s)": 0.245485 }, { "epoch": 1.2196051528317706, "grad_norm": 1.2532260417938232, "learning_rate": 8.602405455119656e-05, "loss": 0.13662257194519042, "memory(GiB)": 122.96, "step": 16000, "token_acc": 0.954750346740638, "train_speed(iter/s)": 0.2455 }, { "epoch": 1.2196051528317706, "eval_loss": 0.10765693336725235, "eval_runtime": 161.6495, "eval_samples_per_second": 3.279, "eval_steps_per_second": 3.279, "eval_token_acc": 0.9499804228660924, "step": 16000 }, { "epoch": 1.2199862794420306, "grad_norm": 0.6384474039077759, "learning_rate": 8.601575023818115e-05, "loss": 0.1702946901321411, "memory(GiB)": 122.96, "step": 16005, "token_acc": 0.9496507164137791, "train_speed(iter/s)": 0.244911 }, { "epoch": 1.2203674060522907, "grad_norm": 0.9726187586784363, "learning_rate": 8.60074438598296e-05, "loss": 0.1425405979156494, "memory(GiB)": 122.96, "step": 16010, "token_acc": 0.9548547717842324, "train_speed(iter/s)": 0.244926 }, { "epoch": 1.2207485326625505, "grad_norm": 0.7182465195655823, "learning_rate": 8.599913541661825e-05, "loss": 0.12409054040908814, "memory(GiB)": 122.96, "step": 16015, "token_acc": 0.934047619047619, "train_speed(iter/s)": 0.244952 }, { "epoch": 1.2211296592728105, "grad_norm": 0.9416847229003906, "learning_rate": 8.599082490902354e-05, "loss": 0.13901689052581787, "memory(GiB)": 122.96, "step": 16020, "token_acc": 0.9472693032015066, "train_speed(iter/s)": 0.244971 }, { "epoch": 1.2215107858830703, "grad_norm": 0.16284970939159393, "learning_rate": 8.598251233752203e-05, "loss": 0.10334191322326661, "memory(GiB)": 122.96, "step": 16025, "token_acc": 0.9538539553752535, "train_speed(iter/s)": 0.245 }, { "epoch": 1.2218919124933303, "grad_norm": 0.8743412494659424, "learning_rate": 8.597419770259044e-05, "loss": 0.13739452362060547, "memory(GiB)": 122.96, "step": 16030, "token_acc": 0.9483311660164716, "train_speed(iter/s)": 0.24499 }, { "epoch": 1.2222730391035903, "grad_norm": 0.47932201623916626, "learning_rate": 8.596588100470553e-05, "loss": 0.11962087154388427, "memory(GiB)": 122.96, "step": 16035, "token_acc": 0.9538190512142947, "train_speed(iter/s)": 0.244997 }, { "epoch": 1.22265416571385, "grad_norm": 0.6411938071250916, "learning_rate": 8.595756224434425e-05, "loss": 0.11554520130157471, "memory(GiB)": 122.96, "step": 16040, "token_acc": 0.9540955631399317, "train_speed(iter/s)": 0.245012 }, { "epoch": 1.2230352923241101, "grad_norm": 0.708158552646637, "learning_rate": 8.594924142198364e-05, "loss": 0.2054734468460083, "memory(GiB)": 122.96, "step": 16045, "token_acc": 0.9259470236399886, "train_speed(iter/s)": 0.245023 }, { "epoch": 1.22341641893437, "grad_norm": 0.6682152152061462, "learning_rate": 8.594091853810087e-05, "loss": 0.12679922580718994, "memory(GiB)": 122.96, "step": 16050, "token_acc": 0.9533145275035261, "train_speed(iter/s)": 0.245031 }, { "epoch": 1.22379754554463, "grad_norm": 0.6523780822753906, "learning_rate": 8.59325935931732e-05, "loss": 0.1569245457649231, "memory(GiB)": 122.96, "step": 16055, "token_acc": 0.9481351981351981, "train_speed(iter/s)": 0.245051 }, { "epoch": 1.2241786721548897, "grad_norm": 0.8597134947776794, "learning_rate": 8.592426658767803e-05, "loss": 0.10363597869873047, "memory(GiB)": 122.96, "step": 16060, "token_acc": 0.9558843308675185, "train_speed(iter/s)": 0.245061 }, { "epoch": 1.2245597987651498, "grad_norm": 0.5574375987052917, "learning_rate": 8.591593752209288e-05, "loss": 0.12690383195877075, "memory(GiB)": 122.96, "step": 16065, "token_acc": 0.9537845057880677, "train_speed(iter/s)": 0.245054 }, { "epoch": 1.2249409253754098, "grad_norm": 0.6397616267204285, "learning_rate": 8.590760639689539e-05, "loss": 0.18539944887161255, "memory(GiB)": 122.96, "step": 16070, "token_acc": 0.9324473975636767, "train_speed(iter/s)": 0.245071 }, { "epoch": 1.2253220519856696, "grad_norm": 0.5023388266563416, "learning_rate": 8.58992732125633e-05, "loss": 0.15264731645584106, "memory(GiB)": 122.96, "step": 16075, "token_acc": 0.9356622998544396, "train_speed(iter/s)": 0.245094 }, { "epoch": 1.2257031785959296, "grad_norm": 0.7483315467834473, "learning_rate": 8.58909379695745e-05, "loss": 0.14215537309646606, "memory(GiB)": 122.96, "step": 16080, "token_acc": 0.9486088379705401, "train_speed(iter/s)": 0.245104 }, { "epoch": 1.2260843052061894, "grad_norm": 0.9191861152648926, "learning_rate": 8.588260066840694e-05, "loss": 0.13580310344696045, "memory(GiB)": 122.96, "step": 16085, "token_acc": 0.9340746624305004, "train_speed(iter/s)": 0.245125 }, { "epoch": 1.2264654318164494, "grad_norm": 1.4546343088150024, "learning_rate": 8.587426130953876e-05, "loss": 0.13323121070861815, "memory(GiB)": 122.96, "step": 16090, "token_acc": 0.947333480858597, "train_speed(iter/s)": 0.245144 }, { "epoch": 1.2268465584267094, "grad_norm": 1.099069356918335, "learning_rate": 8.586591989344816e-05, "loss": 0.1484699845314026, "memory(GiB)": 122.96, "step": 16095, "token_acc": 0.9420977169956987, "train_speed(iter/s)": 0.245154 }, { "epoch": 1.2272276850369692, "grad_norm": 0.8957284092903137, "learning_rate": 8.58575764206135e-05, "loss": 0.15352662801742553, "memory(GiB)": 122.96, "step": 16100, "token_acc": 0.9432672590567327, "train_speed(iter/s)": 0.245176 }, { "epoch": 1.2276088116472292, "grad_norm": 1.0219942331314087, "learning_rate": 8.584923089151324e-05, "loss": 0.11999895572662353, "memory(GiB)": 122.96, "step": 16105, "token_acc": 0.9593679458239278, "train_speed(iter/s)": 0.245202 }, { "epoch": 1.227989938257489, "grad_norm": 1.288794755935669, "learning_rate": 8.584088330662593e-05, "loss": 0.13033831119537354, "memory(GiB)": 122.96, "step": 16110, "token_acc": 0.9480986639260021, "train_speed(iter/s)": 0.245222 }, { "epoch": 1.228371064867749, "grad_norm": 1.104875922203064, "learning_rate": 8.583253366643029e-05, "loss": 0.1975583553314209, "memory(GiB)": 122.96, "step": 16115, "token_acc": 0.9361491175887369, "train_speed(iter/s)": 0.245238 }, { "epoch": 1.228752191478009, "grad_norm": 0.8471193313598633, "learning_rate": 8.582418197140513e-05, "loss": 0.14043872356414794, "memory(GiB)": 122.96, "step": 16120, "token_acc": 0.9478307637145931, "train_speed(iter/s)": 0.245255 }, { "epoch": 1.2291333180882689, "grad_norm": 0.23152922093868256, "learning_rate": 8.581582822202939e-05, "loss": 0.09041902422904968, "memory(GiB)": 122.96, "step": 16125, "token_acc": 0.9600461006530926, "train_speed(iter/s)": 0.245272 }, { "epoch": 1.2295144446985289, "grad_norm": 0.9142738580703735, "learning_rate": 8.580747241878209e-05, "loss": 0.1333345890045166, "memory(GiB)": 122.96, "step": 16130, "token_acc": 0.9382566585956417, "train_speed(iter/s)": 0.245294 }, { "epoch": 1.2298955713087887, "grad_norm": 1.1701644659042358, "learning_rate": 8.579911456214243e-05, "loss": 0.13622759580612182, "memory(GiB)": 122.96, "step": 16135, "token_acc": 0.9469596094096759, "train_speed(iter/s)": 0.245311 }, { "epoch": 1.2302766979190487, "grad_norm": 1.0073952674865723, "learning_rate": 8.579075465258966e-05, "loss": 0.1396502137184143, "memory(GiB)": 122.96, "step": 16140, "token_acc": 0.9413292920545573, "train_speed(iter/s)": 0.245329 }, { "epoch": 1.2306578245293087, "grad_norm": 0.7338088154792786, "learning_rate": 8.578239269060322e-05, "loss": 0.1352464437484741, "memory(GiB)": 122.96, "step": 16145, "token_acc": 0.9448759709346028, "train_speed(iter/s)": 0.245351 }, { "epoch": 1.2310389511395685, "grad_norm": 0.56617271900177, "learning_rate": 8.57740286766626e-05, "loss": 0.14350303411483764, "memory(GiB)": 122.96, "step": 16150, "token_acc": 0.9498313040927094, "train_speed(iter/s)": 0.245359 }, { "epoch": 1.2314200777498285, "grad_norm": 0.7738737463951111, "learning_rate": 8.576566261124744e-05, "loss": 0.13511101007461548, "memory(GiB)": 122.96, "step": 16155, "token_acc": 0.9497809762202754, "train_speed(iter/s)": 0.245369 }, { "epoch": 1.2318012043600883, "grad_norm": 0.7543452978134155, "learning_rate": 8.57572944948375e-05, "loss": 0.09294385313987732, "memory(GiB)": 122.96, "step": 16160, "token_acc": 0.9539319248826291, "train_speed(iter/s)": 0.245391 }, { "epoch": 1.2321823309703483, "grad_norm": 1.1241014003753662, "learning_rate": 8.574892432791268e-05, "loss": 0.10364333391189576, "memory(GiB)": 122.96, "step": 16165, "token_acc": 0.9556797020484171, "train_speed(iter/s)": 0.245408 }, { "epoch": 1.2325634575806084, "grad_norm": 0.6214132905006409, "learning_rate": 8.574055211095292e-05, "loss": 0.15829126834869384, "memory(GiB)": 122.96, "step": 16170, "token_acc": 0.9444979532867807, "train_speed(iter/s)": 0.24541 }, { "epoch": 1.2329445841908682, "grad_norm": 0.860433042049408, "learning_rate": 8.573217784443837e-05, "loss": 0.14953093528747557, "memory(GiB)": 122.96, "step": 16175, "token_acc": 0.9428657344760186, "train_speed(iter/s)": 0.245418 }, { "epoch": 1.2333257108011282, "grad_norm": 1.0455974340438843, "learning_rate": 8.572380152884923e-05, "loss": 0.12000889778137207, "memory(GiB)": 122.96, "step": 16180, "token_acc": 0.9553333333333334, "train_speed(iter/s)": 0.245437 }, { "epoch": 1.233706837411388, "grad_norm": 0.9940935969352722, "learning_rate": 8.571542316466583e-05, "loss": 0.15520564317703248, "memory(GiB)": 122.96, "step": 16185, "token_acc": 0.9447749809305873, "train_speed(iter/s)": 0.245448 }, { "epoch": 1.234087964021648, "grad_norm": 0.6602475047111511, "learning_rate": 8.570704275236869e-05, "loss": 0.12008627653121948, "memory(GiB)": 122.96, "step": 16190, "token_acc": 0.9549254816430389, "train_speed(iter/s)": 0.245456 }, { "epoch": 1.234469090631908, "grad_norm": 0.43843433260917664, "learning_rate": 8.569866029243831e-05, "loss": 0.19270925521850585, "memory(GiB)": 122.96, "step": 16195, "token_acc": 0.919500346981263, "train_speed(iter/s)": 0.245481 }, { "epoch": 1.2348502172421678, "grad_norm": 1.0461527109146118, "learning_rate": 8.569027578535545e-05, "loss": 0.1280996561050415, "memory(GiB)": 122.96, "step": 16200, "token_acc": 0.9569446946496127, "train_speed(iter/s)": 0.245498 }, { "epoch": 1.2348502172421678, "eval_loss": 0.1076628789305687, "eval_runtime": 161.5295, "eval_samples_per_second": 3.281, "eval_steps_per_second": 3.281, "eval_token_acc": 0.9496189988554906, "step": 16200 }, { "epoch": 1.2352313438524278, "grad_norm": 1.022774577140808, "learning_rate": 8.568188923160089e-05, "loss": 0.12476699352264405, "memory(GiB)": 122.96, "step": 16205, "token_acc": 0.9497708187667295, "train_speed(iter/s)": 0.244904 }, { "epoch": 1.2356124704626876, "grad_norm": 1.008844256401062, "learning_rate": 8.567350063165557e-05, "loss": 0.1749922513961792, "memory(GiB)": 122.96, "step": 16210, "token_acc": 0.9320887445887446, "train_speed(iter/s)": 0.244928 }, { "epoch": 1.2359935970729476, "grad_norm": 0.8388006687164307, "learning_rate": 8.566510998600055e-05, "loss": 0.19828248023986816, "memory(GiB)": 122.96, "step": 16215, "token_acc": 0.9346955681347507, "train_speed(iter/s)": 0.244937 }, { "epoch": 1.2363747236832077, "grad_norm": 1.3474334478378296, "learning_rate": 8.565671729511695e-05, "loss": 0.1534719228744507, "memory(GiB)": 122.96, "step": 16220, "token_acc": 0.9482410028305701, "train_speed(iter/s)": 0.244954 }, { "epoch": 1.2367558502934675, "grad_norm": 0.9822812080383301, "learning_rate": 8.56483225594861e-05, "loss": 0.13751211166381835, "memory(GiB)": 122.96, "step": 16225, "token_acc": 0.9480043442845506, "train_speed(iter/s)": 0.244975 }, { "epoch": 1.2371369769037275, "grad_norm": 1.3502331972122192, "learning_rate": 8.563992577958937e-05, "loss": 0.10468108654022217, "memory(GiB)": 122.96, "step": 16230, "token_acc": 0.9591907115760432, "train_speed(iter/s)": 0.24498 }, { "epoch": 1.2375181035139873, "grad_norm": 0.734050452709198, "learning_rate": 8.563152695590828e-05, "loss": 0.16945064067840576, "memory(GiB)": 122.96, "step": 16235, "token_acc": 0.9434931506849316, "train_speed(iter/s)": 0.244993 }, { "epoch": 1.2378992301242473, "grad_norm": 1.1526421308517456, "learning_rate": 8.562312608892447e-05, "loss": 0.14653047323226928, "memory(GiB)": 122.96, "step": 16240, "token_acc": 0.9507986309184255, "train_speed(iter/s)": 0.245005 }, { "epoch": 1.2382803567345073, "grad_norm": 0.9251593947410583, "learning_rate": 8.561472317911971e-05, "loss": 0.1089218258857727, "memory(GiB)": 122.96, "step": 16245, "token_acc": 0.953121373868647, "train_speed(iter/s)": 0.245029 }, { "epoch": 1.2386614833447671, "grad_norm": 0.8780396580696106, "learning_rate": 8.560631822697582e-05, "loss": 0.14373908042907715, "memory(GiB)": 122.96, "step": 16250, "token_acc": 0.9377880184331797, "train_speed(iter/s)": 0.245053 }, { "epoch": 1.2390426099550271, "grad_norm": 0.6174344420433044, "learning_rate": 8.559791123297483e-05, "loss": 0.11026408672332763, "memory(GiB)": 122.96, "step": 16255, "token_acc": 0.956318480642805, "train_speed(iter/s)": 0.245058 }, { "epoch": 1.239423736565287, "grad_norm": 2.2393927574157715, "learning_rate": 8.558950219759882e-05, "loss": 0.17896136045455932, "memory(GiB)": 122.96, "step": 16260, "token_acc": 0.943562110039705, "train_speed(iter/s)": 0.245078 }, { "epoch": 1.239804863175547, "grad_norm": 0.8893852829933167, "learning_rate": 8.558109112133004e-05, "loss": 0.11942565441131592, "memory(GiB)": 122.96, "step": 16265, "token_acc": 0.945926800472255, "train_speed(iter/s)": 0.245101 }, { "epoch": 1.240185989785807, "grad_norm": 0.9451327323913574, "learning_rate": 8.557267800465077e-05, "loss": 0.1296193242073059, "memory(GiB)": 122.96, "step": 16270, "token_acc": 0.9412670479542455, "train_speed(iter/s)": 0.24512 }, { "epoch": 1.2405671163960668, "grad_norm": 0.6511697173118591, "learning_rate": 8.556426284804351e-05, "loss": 0.1145636796951294, "memory(GiB)": 122.96, "step": 16275, "token_acc": 0.9564516129032258, "train_speed(iter/s)": 0.245147 }, { "epoch": 1.2409482430063268, "grad_norm": 1.461281180381775, "learning_rate": 8.555584565199079e-05, "loss": 0.06572734713554382, "memory(GiB)": 122.96, "step": 16280, "token_acc": 0.9666913397483345, "train_speed(iter/s)": 0.245172 }, { "epoch": 1.2413293696165866, "grad_norm": 0.6185061931610107, "learning_rate": 8.554742641697535e-05, "loss": 0.09961251020431519, "memory(GiB)": 122.96, "step": 16285, "token_acc": 0.9528130671506352, "train_speed(iter/s)": 0.245185 }, { "epoch": 1.2417104962268466, "grad_norm": 0.9334349036216736, "learning_rate": 8.553900514347994e-05, "loss": 0.11793203353881836, "memory(GiB)": 122.96, "step": 16290, "token_acc": 0.9545622460288142, "train_speed(iter/s)": 0.24521 }, { "epoch": 1.2420916228371064, "grad_norm": 0.9299949407577515, "learning_rate": 8.553058183198753e-05, "loss": 0.09529297351837158, "memory(GiB)": 122.96, "step": 16295, "token_acc": 0.9651732269181934, "train_speed(iter/s)": 0.245222 }, { "epoch": 1.2424727494473664, "grad_norm": 1.369661808013916, "learning_rate": 8.552215648298113e-05, "loss": 0.16186554431915284, "memory(GiB)": 122.96, "step": 16300, "token_acc": 0.9357547764014277, "train_speed(iter/s)": 0.245239 }, { "epoch": 1.2428538760576264, "grad_norm": 1.834374189376831, "learning_rate": 8.551372909694389e-05, "loss": 0.11763108968734741, "memory(GiB)": 122.96, "step": 16305, "token_acc": 0.9580162686958803, "train_speed(iter/s)": 0.245261 }, { "epoch": 1.2432350026678862, "grad_norm": 1.0423831939697266, "learning_rate": 8.550529967435909e-05, "loss": 0.11769866943359375, "memory(GiB)": 122.96, "step": 16310, "token_acc": 0.9575956596230726, "train_speed(iter/s)": 0.245272 }, { "epoch": 1.2436161292781462, "grad_norm": 0.640189528465271, "learning_rate": 8.549686821571012e-05, "loss": 0.1259806513786316, "memory(GiB)": 122.96, "step": 16315, "token_acc": 0.9440507273405446, "train_speed(iter/s)": 0.245298 }, { "epoch": 1.243997255888406, "grad_norm": 0.9018551707267761, "learning_rate": 8.548843472148049e-05, "loss": 0.18629682064056396, "memory(GiB)": 122.96, "step": 16320, "token_acc": 0.9366583541147132, "train_speed(iter/s)": 0.245306 }, { "epoch": 1.244378382498666, "grad_norm": 0.812984824180603, "learning_rate": 8.54799991921538e-05, "loss": 0.1318651556968689, "memory(GiB)": 122.96, "step": 16325, "token_acc": 0.9460930640913081, "train_speed(iter/s)": 0.245319 }, { "epoch": 1.244759509108926, "grad_norm": 0.4474635720252991, "learning_rate": 8.547156162821382e-05, "loss": 0.11338679790496826, "memory(GiB)": 122.96, "step": 16330, "token_acc": 0.9498855253116255, "train_speed(iter/s)": 0.245327 }, { "epoch": 1.2451406357191859, "grad_norm": 0.9136907458305359, "learning_rate": 8.546312203014438e-05, "loss": 0.1202282190322876, "memory(GiB)": 122.96, "step": 16335, "token_acc": 0.9558554437328454, "train_speed(iter/s)": 0.245347 }, { "epoch": 1.245521762329446, "grad_norm": 0.6102603077888489, "learning_rate": 8.545468039842945e-05, "loss": 0.1217921495437622, "memory(GiB)": 122.96, "step": 16340, "token_acc": 0.9565377532228361, "train_speed(iter/s)": 0.245355 }, { "epoch": 1.2459028889397057, "grad_norm": 0.8213858008384705, "learning_rate": 8.544623673355314e-05, "loss": 0.08639991283416748, "memory(GiB)": 122.96, "step": 16345, "token_acc": 0.9613460663938154, "train_speed(iter/s)": 0.245367 }, { "epoch": 1.2462840155499657, "grad_norm": 0.6500285863876343, "learning_rate": 8.543779103599964e-05, "loss": 0.16746288537979126, "memory(GiB)": 122.96, "step": 16350, "token_acc": 0.9406820365033621, "train_speed(iter/s)": 0.245387 }, { "epoch": 1.2466651421602255, "grad_norm": 0.950935423374176, "learning_rate": 8.54293433062533e-05, "loss": 0.12734637260437012, "memory(GiB)": 122.96, "step": 16355, "token_acc": 0.9531165311653117, "train_speed(iter/s)": 0.245401 }, { "epoch": 1.2470462687704855, "grad_norm": 0.7351865768432617, "learning_rate": 8.54208935447985e-05, "loss": 0.1124348759651184, "memory(GiB)": 122.96, "step": 16360, "token_acc": 0.9591836734693877, "train_speed(iter/s)": 0.24543 }, { "epoch": 1.2474273953807455, "grad_norm": 1.1125338077545166, "learning_rate": 8.541244175211984e-05, "loss": 0.15277912616729736, "memory(GiB)": 122.96, "step": 16365, "token_acc": 0.9322274881516588, "train_speed(iter/s)": 0.245455 }, { "epoch": 1.2478085219910053, "grad_norm": 1.076313853263855, "learning_rate": 8.540398792870199e-05, "loss": 0.2022336959838867, "memory(GiB)": 122.96, "step": 16370, "token_acc": 0.9228992203291944, "train_speed(iter/s)": 0.245477 }, { "epoch": 1.2481896486012654, "grad_norm": 1.180124282836914, "learning_rate": 8.539553207502971e-05, "loss": 0.10317001342773438, "memory(GiB)": 122.96, "step": 16375, "token_acc": 0.9590984974958264, "train_speed(iter/s)": 0.245501 }, { "epoch": 1.2485707752115252, "grad_norm": 1.0889081954956055, "learning_rate": 8.538707419158793e-05, "loss": 0.13459961414337157, "memory(GiB)": 122.96, "step": 16380, "token_acc": 0.9444880037635252, "train_speed(iter/s)": 0.245513 }, { "epoch": 1.2489519018217852, "grad_norm": 0.9878136515617371, "learning_rate": 8.537861427886167e-05, "loss": 0.15145862102508545, "memory(GiB)": 122.96, "step": 16385, "token_acc": 0.9526451089162495, "train_speed(iter/s)": 0.245528 }, { "epoch": 1.2493330284320452, "grad_norm": 0.6209480166435242, "learning_rate": 8.537015233733606e-05, "loss": 0.06706008911132813, "memory(GiB)": 122.96, "step": 16390, "token_acc": 0.9724349157733537, "train_speed(iter/s)": 0.245549 }, { "epoch": 1.249714155042305, "grad_norm": 0.9216794371604919, "learning_rate": 8.536168836749632e-05, "loss": 0.10744179487228393, "memory(GiB)": 122.96, "step": 16395, "token_acc": 0.9530596731280881, "train_speed(iter/s)": 0.245564 }, { "epoch": 1.250095281652565, "grad_norm": 0.5902736186981201, "learning_rate": 8.535322236982788e-05, "loss": 0.10428780317306519, "memory(GiB)": 122.96, "step": 16400, "token_acc": 0.9600868856910127, "train_speed(iter/s)": 0.245575 }, { "epoch": 1.250095281652565, "eval_loss": 0.10671170800924301, "eval_runtime": 160.9739, "eval_samples_per_second": 3.292, "eval_steps_per_second": 3.292, "eval_token_acc": 0.9503343172098067, "step": 16400 }, { "epoch": 1.2504764082628248, "grad_norm": 0.8466094732284546, "learning_rate": 8.534475434481617e-05, "loss": 0.09915532469749451, "memory(GiB)": 122.96, "step": 16405, "token_acc": 0.9505668706010914, "train_speed(iter/s)": 0.245 }, { "epoch": 1.2508575348730848, "grad_norm": 0.8198062777519226, "learning_rate": 8.533628429294685e-05, "loss": 0.1627612829208374, "memory(GiB)": 122.96, "step": 16410, "token_acc": 0.9393644617380026, "train_speed(iter/s)": 0.245022 }, { "epoch": 1.2512386614833448, "grad_norm": 0.989240288734436, "learning_rate": 8.532781221470559e-05, "loss": 0.14997178316116333, "memory(GiB)": 122.96, "step": 16415, "token_acc": 0.9427037686240141, "train_speed(iter/s)": 0.245034 }, { "epoch": 1.2516197880936046, "grad_norm": 0.9298478364944458, "learning_rate": 8.531933811057825e-05, "loss": 0.11156213283538818, "memory(GiB)": 122.96, "step": 16420, "token_acc": 0.9604992657856094, "train_speed(iter/s)": 0.245047 }, { "epoch": 1.2520009147038647, "grad_norm": 1.0287277698516846, "learning_rate": 8.531086198105074e-05, "loss": 0.12091947793960571, "memory(GiB)": 122.96, "step": 16425, "token_acc": 0.9555222388805598, "train_speed(iter/s)": 0.245057 }, { "epoch": 1.2523820413141245, "grad_norm": 0.6381069421768188, "learning_rate": 8.530238382660917e-05, "loss": 0.11415219306945801, "memory(GiB)": 122.96, "step": 16430, "token_acc": 0.9507007419620775, "train_speed(iter/s)": 0.245064 }, { "epoch": 1.2527631679243845, "grad_norm": 0.4479133188724518, "learning_rate": 8.529390364773974e-05, "loss": 0.10267646312713623, "memory(GiB)": 122.96, "step": 16435, "token_acc": 0.9621434284190883, "train_speed(iter/s)": 0.245087 }, { "epoch": 1.2531442945346445, "grad_norm": 0.6177681684494019, "learning_rate": 8.52854214449287e-05, "loss": 0.0666852593421936, "memory(GiB)": 122.96, "step": 16440, "token_acc": 0.9640237513098149, "train_speed(iter/s)": 0.245113 }, { "epoch": 1.2535254211449043, "grad_norm": 0.5901747345924377, "learning_rate": 8.527693721866247e-05, "loss": 0.08779544234275818, "memory(GiB)": 122.96, "step": 16445, "token_acc": 0.9554937413073713, "train_speed(iter/s)": 0.245139 }, { "epoch": 1.2539065477551643, "grad_norm": 0.8009762167930603, "learning_rate": 8.526845096942761e-05, "loss": 0.10611989498138427, "memory(GiB)": 122.96, "step": 16450, "token_acc": 0.9622106754841757, "train_speed(iter/s)": 0.245137 }, { "epoch": 1.254287674365424, "grad_norm": 0.8432754874229431, "learning_rate": 8.525996269771077e-05, "loss": 0.14922208786010743, "memory(GiB)": 122.96, "step": 16455, "token_acc": 0.936909059652742, "train_speed(iter/s)": 0.245151 }, { "epoch": 1.2546688009756841, "grad_norm": 0.42287787795066833, "learning_rate": 8.525147240399866e-05, "loss": 0.12019532918930054, "memory(GiB)": 122.96, "step": 16460, "token_acc": 0.9485928095005693, "train_speed(iter/s)": 0.245162 }, { "epoch": 1.2550499275859441, "grad_norm": 0.5572043657302856, "learning_rate": 8.524298008877822e-05, "loss": 0.12271336317062378, "memory(GiB)": 122.96, "step": 16465, "token_acc": 0.9433326728749752, "train_speed(iter/s)": 0.245184 }, { "epoch": 1.255431054196204, "grad_norm": 0.8416683673858643, "learning_rate": 8.523448575253641e-05, "loss": 0.11233663558959961, "memory(GiB)": 122.96, "step": 16470, "token_acc": 0.9510349750178444, "train_speed(iter/s)": 0.245195 }, { "epoch": 1.255812180806464, "grad_norm": 0.8764181733131409, "learning_rate": 8.522598939576036e-05, "loss": 0.17379262447357177, "memory(GiB)": 122.96, "step": 16475, "token_acc": 0.9307965711896299, "train_speed(iter/s)": 0.245215 }, { "epoch": 1.2561933074167237, "grad_norm": 1.2228213548660278, "learning_rate": 8.521749101893727e-05, "loss": 0.17483339309692383, "memory(GiB)": 122.96, "step": 16480, "token_acc": 0.9297376628141626, "train_speed(iter/s)": 0.245231 }, { "epoch": 1.2565744340269838, "grad_norm": 0.8729822635650635, "learning_rate": 8.52089906225545e-05, "loss": 0.1262011408805847, "memory(GiB)": 122.96, "step": 16485, "token_acc": 0.9547311095983663, "train_speed(iter/s)": 0.245244 }, { "epoch": 1.2569555606372438, "grad_norm": 0.7889236211776733, "learning_rate": 8.520048820709951e-05, "loss": 0.09612629413604737, "memory(GiB)": 122.96, "step": 16490, "token_acc": 0.9645352669742914, "train_speed(iter/s)": 0.245255 }, { "epoch": 1.2573366872475036, "grad_norm": 1.8017657995224, "learning_rate": 8.519198377305989e-05, "loss": 0.12849689722061158, "memory(GiB)": 122.96, "step": 16495, "token_acc": 0.93839019542895, "train_speed(iter/s)": 0.24528 }, { "epoch": 1.2577178138577636, "grad_norm": 0.678687572479248, "learning_rate": 8.518347732092329e-05, "loss": 0.12587478160858154, "memory(GiB)": 122.96, "step": 16500, "token_acc": 0.95273492286115, "train_speed(iter/s)": 0.245293 }, { "epoch": 1.2580989404680234, "grad_norm": 0.0745910257101059, "learning_rate": 8.517496885117756e-05, "loss": 0.06280009150505066, "memory(GiB)": 122.96, "step": 16505, "token_acc": 0.9671020803096275, "train_speed(iter/s)": 0.245319 }, { "epoch": 1.2584800670782834, "grad_norm": 1.2563490867614746, "learning_rate": 8.516645836431057e-05, "loss": 0.11339030265808106, "memory(GiB)": 122.96, "step": 16510, "token_acc": 0.9502164502164502, "train_speed(iter/s)": 0.245345 }, { "epoch": 1.2588611936885434, "grad_norm": 0.46298980712890625, "learning_rate": 8.515794586081041e-05, "loss": 0.13147248029708863, "memory(GiB)": 122.96, "step": 16515, "token_acc": 0.9391153512575889, "train_speed(iter/s)": 0.245364 }, { "epoch": 1.2592423202988032, "grad_norm": 0.6436245441436768, "learning_rate": 8.51494313411652e-05, "loss": 0.13074697256088258, "memory(GiB)": 122.96, "step": 16520, "token_acc": 0.9517058041648205, "train_speed(iter/s)": 0.245385 }, { "epoch": 1.2596234469090632, "grad_norm": 0.33527952432632446, "learning_rate": 8.51409148058632e-05, "loss": 0.14128496646881103, "memory(GiB)": 122.96, "step": 16525, "token_acc": 0.9370854652264544, "train_speed(iter/s)": 0.245406 }, { "epoch": 1.260004573519323, "grad_norm": 1.056339144706726, "learning_rate": 8.513239625539282e-05, "loss": 0.14635547399520873, "memory(GiB)": 122.96, "step": 16530, "token_acc": 0.9517494356659142, "train_speed(iter/s)": 0.24543 }, { "epoch": 1.260385700129583, "grad_norm": 0.8895571827888489, "learning_rate": 8.512387569024255e-05, "loss": 0.09188920855522156, "memory(GiB)": 122.96, "step": 16535, "token_acc": 0.9640861931364725, "train_speed(iter/s)": 0.245462 }, { "epoch": 1.260766826739843, "grad_norm": 0.8117167949676514, "learning_rate": 8.5115353110901e-05, "loss": 0.11151890754699707, "memory(GiB)": 122.96, "step": 16540, "token_acc": 0.9422891333719359, "train_speed(iter/s)": 0.245479 }, { "epoch": 1.2611479533501029, "grad_norm": 1.9728516340255737, "learning_rate": 8.510682851785692e-05, "loss": 0.09429635405540467, "memory(GiB)": 122.96, "step": 16545, "token_acc": 0.9603361344537815, "train_speed(iter/s)": 0.245492 }, { "epoch": 1.261529079960363, "grad_norm": 0.8058775663375854, "learning_rate": 8.509830191159912e-05, "loss": 0.17679688930511475, "memory(GiB)": 122.96, "step": 16550, "token_acc": 0.9270623742454729, "train_speed(iter/s)": 0.245507 }, { "epoch": 1.2619102065706227, "grad_norm": 1.322827935218811, "learning_rate": 8.508977329261658e-05, "loss": 0.1269286870956421, "memory(GiB)": 122.96, "step": 16555, "token_acc": 0.9547930283224401, "train_speed(iter/s)": 0.245532 }, { "epoch": 1.2622913331808827, "grad_norm": 0.6617803573608398, "learning_rate": 8.50812426613984e-05, "loss": 0.11586322784423828, "memory(GiB)": 122.96, "step": 16560, "token_acc": 0.9531333214983135, "train_speed(iter/s)": 0.245548 }, { "epoch": 1.2626724597911427, "grad_norm": 0.5419811010360718, "learning_rate": 8.507271001843374e-05, "loss": 0.12028144598007202, "memory(GiB)": 122.96, "step": 16565, "token_acc": 0.9575171712032562, "train_speed(iter/s)": 0.245552 }, { "epoch": 1.2630535864014025, "grad_norm": 1.1417500972747803, "learning_rate": 8.506417536421192e-05, "loss": 0.13043017387390138, "memory(GiB)": 122.96, "step": 16570, "token_acc": 0.9504040122596824, "train_speed(iter/s)": 0.245574 }, { "epoch": 1.2634347130116625, "grad_norm": 0.4405645728111267, "learning_rate": 8.505563869922235e-05, "loss": 0.1401545524597168, "memory(GiB)": 122.96, "step": 16575, "token_acc": 0.9485524938960586, "train_speed(iter/s)": 0.24559 }, { "epoch": 1.2638158396219223, "grad_norm": 0.7274104952812195, "learning_rate": 8.504710002395459e-05, "loss": 0.12750451564788817, "memory(GiB)": 122.96, "step": 16580, "token_acc": 0.9525946704067321, "train_speed(iter/s)": 0.245613 }, { "epoch": 1.2641969662321824, "grad_norm": 1.7580556869506836, "learning_rate": 8.503855933889829e-05, "loss": 0.18542473316192626, "memory(GiB)": 122.96, "step": 16585, "token_acc": 0.928035043804756, "train_speed(iter/s)": 0.245638 }, { "epoch": 1.2645780928424424, "grad_norm": 0.22889874875545502, "learning_rate": 8.503001664454317e-05, "loss": 0.10515034198760986, "memory(GiB)": 122.96, "step": 16590, "token_acc": 0.9576151455213338, "train_speed(iter/s)": 0.24566 }, { "epoch": 1.2649592194527022, "grad_norm": 0.9161299467086792, "learning_rate": 8.502147194137919e-05, "loss": 0.12585874795913696, "memory(GiB)": 122.96, "step": 16595, "token_acc": 0.9414176918570591, "train_speed(iter/s)": 0.245685 }, { "epoch": 1.2653403460629622, "grad_norm": 0.5459222197532654, "learning_rate": 8.50129252298963e-05, "loss": 0.1326700210571289, "memory(GiB)": 122.96, "step": 16600, "token_acc": 0.970125786163522, "train_speed(iter/s)": 0.245708 }, { "epoch": 1.2653403460629622, "eval_loss": 0.10843290388584137, "eval_runtime": 164.2228, "eval_samples_per_second": 3.227, "eval_steps_per_second": 3.227, "eval_token_acc": 0.9502063128727185, "step": 16600 }, { "epoch": 1.265721472673222, "grad_norm": 0.6280312538146973, "learning_rate": 8.500437651058463e-05, "loss": 0.09482347965240479, "memory(GiB)": 122.96, "step": 16605, "token_acc": 0.9504958407497548, "train_speed(iter/s)": 0.245132 }, { "epoch": 1.266102599283482, "grad_norm": 0.7071524858474731, "learning_rate": 8.499582578393442e-05, "loss": 0.12856109142303468, "memory(GiB)": 122.96, "step": 16610, "token_acc": 0.9546107440390692, "train_speed(iter/s)": 0.245156 }, { "epoch": 1.266483725893742, "grad_norm": 1.6478323936462402, "learning_rate": 8.498727305043599e-05, "loss": 0.13119884729385375, "memory(GiB)": 122.96, "step": 16615, "token_acc": 0.9456521739130435, "train_speed(iter/s)": 0.245179 }, { "epoch": 1.2668648525040018, "grad_norm": 0.7057093381881714, "learning_rate": 8.497871831057983e-05, "loss": 0.128168523311615, "memory(GiB)": 122.96, "step": 16620, "token_acc": 0.9482184055370417, "train_speed(iter/s)": 0.245202 }, { "epoch": 1.2672459791142616, "grad_norm": 1.0310853719711304, "learning_rate": 8.497016156485646e-05, "loss": 0.1395874261856079, "memory(GiB)": 122.96, "step": 16625, "token_acc": 0.9476098077262304, "train_speed(iter/s)": 0.245216 }, { "epoch": 1.2676271057245216, "grad_norm": 0.5219086408615112, "learning_rate": 8.496160281375664e-05, "loss": 0.14560294151306152, "memory(GiB)": 122.96, "step": 16630, "token_acc": 0.939164276935682, "train_speed(iter/s)": 0.24524 }, { "epoch": 1.2680082323347817, "grad_norm": 0.7037075161933899, "learning_rate": 8.495304205777112e-05, "loss": 0.15865329504013062, "memory(GiB)": 122.96, "step": 16635, "token_acc": 0.9334075723830735, "train_speed(iter/s)": 0.245261 }, { "epoch": 1.2683893589450417, "grad_norm": 1.0574370622634888, "learning_rate": 8.494447929739084e-05, "loss": 0.12828024625778198, "memory(GiB)": 122.96, "step": 16640, "token_acc": 0.9384018953262977, "train_speed(iter/s)": 0.245282 }, { "epoch": 1.2687704855553015, "grad_norm": 0.6939772367477417, "learning_rate": 8.493591453310685e-05, "loss": 0.12425237894058228, "memory(GiB)": 122.96, "step": 16645, "token_acc": 0.9527962085308057, "train_speed(iter/s)": 0.245298 }, { "epoch": 1.2691516121655613, "grad_norm": 0.6951548457145691, "learning_rate": 8.492734776541027e-05, "loss": 0.17778323888778685, "memory(GiB)": 122.96, "step": 16650, "token_acc": 0.9351767075627174, "train_speed(iter/s)": 0.24531 }, { "epoch": 1.2695327387758213, "grad_norm": 1.303305983543396, "learning_rate": 8.49187789947924e-05, "loss": 0.13065205812454223, "memory(GiB)": 122.96, "step": 16655, "token_acc": 0.9449127031908489, "train_speed(iter/s)": 0.245334 }, { "epoch": 1.2699138653860813, "grad_norm": 0.6236063241958618, "learning_rate": 8.491020822174456e-05, "loss": 0.09996622800827026, "memory(GiB)": 122.96, "step": 16660, "token_acc": 0.9535962877030162, "train_speed(iter/s)": 0.245353 }, { "epoch": 1.270294991996341, "grad_norm": 0.8411254286766052, "learning_rate": 8.490163544675832e-05, "loss": 0.09706242680549622, "memory(GiB)": 122.96, "step": 16665, "token_acc": 0.962457337883959, "train_speed(iter/s)": 0.245379 }, { "epoch": 1.2706761186066011, "grad_norm": 0.651681661605835, "learning_rate": 8.489306067032521e-05, "loss": 0.1211594820022583, "memory(GiB)": 122.96, "step": 16670, "token_acc": 0.956490727532097, "train_speed(iter/s)": 0.245389 }, { "epoch": 1.271057245216861, "grad_norm": 0.9033162593841553, "learning_rate": 8.488448389293701e-05, "loss": 0.11309378147125244, "memory(GiB)": 122.96, "step": 16675, "token_acc": 0.9561920808761584, "train_speed(iter/s)": 0.245405 }, { "epoch": 1.271438371827121, "grad_norm": 0.6026763319969177, "learning_rate": 8.487590511508556e-05, "loss": 0.08474704027175903, "memory(GiB)": 122.96, "step": 16680, "token_acc": 0.9651908396946565, "train_speed(iter/s)": 0.245418 }, { "epoch": 1.271819498437381, "grad_norm": 0.33365529775619507, "learning_rate": 8.486732433726279e-05, "loss": 0.07867011427879333, "memory(GiB)": 122.96, "step": 16685, "token_acc": 0.9666203059805285, "train_speed(iter/s)": 0.245446 }, { "epoch": 1.2722006250476408, "grad_norm": 2.1607868671417236, "learning_rate": 8.485874155996077e-05, "loss": 0.1455127477645874, "memory(GiB)": 122.96, "step": 16690, "token_acc": 0.9283065512978986, "train_speed(iter/s)": 0.245471 }, { "epoch": 1.2725817516579008, "grad_norm": 1.3021788597106934, "learning_rate": 8.485015678367167e-05, "loss": 0.10835864543914794, "memory(GiB)": 122.96, "step": 16695, "token_acc": 0.9634146341463414, "train_speed(iter/s)": 0.245492 }, { "epoch": 1.2729628782681606, "grad_norm": 0.9539653658866882, "learning_rate": 8.484157000888782e-05, "loss": 0.15383527278900147, "memory(GiB)": 122.96, "step": 16700, "token_acc": 0.9433748584371461, "train_speed(iter/s)": 0.245504 }, { "epoch": 1.2733440048784206, "grad_norm": 1.088435173034668, "learning_rate": 8.483298123610161e-05, "loss": 0.13168885707855224, "memory(GiB)": 122.96, "step": 16705, "token_acc": 0.9292035398230089, "train_speed(iter/s)": 0.245528 }, { "epoch": 1.2737251314886806, "grad_norm": 0.7620453238487244, "learning_rate": 8.482439046580558e-05, "loss": 0.10965490341186523, "memory(GiB)": 122.96, "step": 16710, "token_acc": 0.9517208413001912, "train_speed(iter/s)": 0.245555 }, { "epoch": 1.2741062580989404, "grad_norm": 1.4650856256484985, "learning_rate": 8.481579769849235e-05, "loss": 0.17378017902374268, "memory(GiB)": 122.96, "step": 16715, "token_acc": 0.9357118815213733, "train_speed(iter/s)": 0.24558 }, { "epoch": 1.2744873847092004, "grad_norm": 2.788140058517456, "learning_rate": 8.480720293465469e-05, "loss": 0.086276775598526, "memory(GiB)": 122.96, "step": 16720, "token_acc": 0.9664804469273743, "train_speed(iter/s)": 0.245603 }, { "epoch": 1.2748685113194602, "grad_norm": 0.9689181447029114, "learning_rate": 8.479860617478548e-05, "loss": 0.13692901134490967, "memory(GiB)": 122.96, "step": 16725, "token_acc": 0.9334056399132321, "train_speed(iter/s)": 0.245624 }, { "epoch": 1.2752496379297202, "grad_norm": 0.5657161474227905, "learning_rate": 8.479000741937769e-05, "loss": 0.13685139417648315, "memory(GiB)": 122.96, "step": 16730, "token_acc": 0.9551272831871135, "train_speed(iter/s)": 0.245635 }, { "epoch": 1.2756307645399803, "grad_norm": 0.6459820866584778, "learning_rate": 8.478140666892439e-05, "loss": 0.08421671986579896, "memory(GiB)": 122.96, "step": 16735, "token_acc": 0.9641909814323607, "train_speed(iter/s)": 0.245653 }, { "epoch": 1.27601189115024, "grad_norm": 0.5140261650085449, "learning_rate": 8.477280392391884e-05, "loss": 0.13019053936004638, "memory(GiB)": 122.96, "step": 16740, "token_acc": 0.9478367597422522, "train_speed(iter/s)": 0.245661 }, { "epoch": 1.2763930177605, "grad_norm": 0.9778750538825989, "learning_rate": 8.476419918485435e-05, "loss": 0.18440454006195067, "memory(GiB)": 122.96, "step": 16745, "token_acc": 0.9302653204977694, "train_speed(iter/s)": 0.24568 }, { "epoch": 1.2767741443707599, "grad_norm": 1.3635205030441284, "learning_rate": 8.475559245222438e-05, "loss": 0.14349088668823243, "memory(GiB)": 122.96, "step": 16750, "token_acc": 0.9518221976808393, "train_speed(iter/s)": 0.24569 }, { "epoch": 1.2771552709810199, "grad_norm": 0.5575041770935059, "learning_rate": 8.474698372652245e-05, "loss": 0.10252753496170045, "memory(GiB)": 122.96, "step": 16755, "token_acc": 0.9529956161714564, "train_speed(iter/s)": 0.245711 }, { "epoch": 1.27753639759128, "grad_norm": 0.8783716559410095, "learning_rate": 8.473837300824224e-05, "loss": 0.12319101095199585, "memory(GiB)": 122.96, "step": 16760, "token_acc": 0.9392131799460304, "train_speed(iter/s)": 0.245725 }, { "epoch": 1.2779175242015397, "grad_norm": 0.6588578820228577, "learning_rate": 8.472976029787756e-05, "loss": 0.10819822549819946, "memory(GiB)": 122.96, "step": 16765, "token_acc": 0.9613003095975232, "train_speed(iter/s)": 0.245741 }, { "epoch": 1.2782986508117997, "grad_norm": 0.6164048910140991, "learning_rate": 8.472114559592229e-05, "loss": 0.10308420658111572, "memory(GiB)": 122.96, "step": 16770, "token_acc": 0.959489993544222, "train_speed(iter/s)": 0.245751 }, { "epoch": 1.2786797774220595, "grad_norm": 1.179402232170105, "learning_rate": 8.471252890287045e-05, "loss": 0.08818424344062806, "memory(GiB)": 122.96, "step": 16775, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.245774 }, { "epoch": 1.2790609040323195, "grad_norm": 0.7468491792678833, "learning_rate": 8.470391021921615e-05, "loss": 0.13655853271484375, "memory(GiB)": 122.96, "step": 16780, "token_acc": 0.9432897862232779, "train_speed(iter/s)": 0.245797 }, { "epoch": 1.2794420306425796, "grad_norm": 0.7331514358520508, "learning_rate": 8.469528954545365e-05, "loss": 0.119598126411438, "memory(GiB)": 122.96, "step": 16785, "token_acc": 0.9605747327843, "train_speed(iter/s)": 0.245811 }, { "epoch": 1.2798231572528393, "grad_norm": 1.1030611991882324, "learning_rate": 8.468666688207732e-05, "loss": 0.254576301574707, "memory(GiB)": 122.96, "step": 16790, "token_acc": 0.913382416630576, "train_speed(iter/s)": 0.245828 }, { "epoch": 1.2802042838630994, "grad_norm": 0.3509121835231781, "learning_rate": 8.467804222958157e-05, "loss": 0.08735076785087585, "memory(GiB)": 122.96, "step": 16795, "token_acc": 0.9728183118741058, "train_speed(iter/s)": 0.24585 }, { "epoch": 1.2805854104733592, "grad_norm": 0.624954104423523, "learning_rate": 8.466941558846105e-05, "loss": 0.08531662225723266, "memory(GiB)": 122.96, "step": 16800, "token_acc": 0.9599097535577924, "train_speed(iter/s)": 0.245865 }, { "epoch": 1.2805854104733592, "eval_loss": 0.11012812703847885, "eval_runtime": 160.0252, "eval_samples_per_second": 3.312, "eval_steps_per_second": 3.312, "eval_token_acc": 0.9500105415336425, "step": 16800 }, { "epoch": 1.2809665370836192, "grad_norm": 1.1083186864852905, "learning_rate": 8.466078695921042e-05, "loss": 0.2117542505264282, "memory(GiB)": 122.96, "step": 16805, "token_acc": 0.9490462714542982, "train_speed(iter/s)": 0.245313 }, { "epoch": 1.2813476636938792, "grad_norm": 1.05453360080719, "learning_rate": 8.465215634232453e-05, "loss": 0.13501791954040526, "memory(GiB)": 122.96, "step": 16810, "token_acc": 0.9495321368308023, "train_speed(iter/s)": 0.245322 }, { "epoch": 1.281728790304139, "grad_norm": 0.6642823815345764, "learning_rate": 8.464352373829826e-05, "loss": 0.10566951036453247, "memory(GiB)": 122.96, "step": 16815, "token_acc": 0.9539170506912442, "train_speed(iter/s)": 0.245341 }, { "epoch": 1.282109916914399, "grad_norm": 1.0369282960891724, "learning_rate": 8.463488914762668e-05, "loss": 0.0880319595336914, "memory(GiB)": 122.96, "step": 16820, "token_acc": 0.9553307392996109, "train_speed(iter/s)": 0.245349 }, { "epoch": 1.2824910435246588, "grad_norm": 1.9384706020355225, "learning_rate": 8.462625257080491e-05, "loss": 0.14360315799713136, "memory(GiB)": 122.96, "step": 16825, "token_acc": 0.9445843828715366, "train_speed(iter/s)": 0.24537 }, { "epoch": 1.2828721701349188, "grad_norm": 0.633171558380127, "learning_rate": 8.461761400832826e-05, "loss": 0.11990576982498169, "memory(GiB)": 122.96, "step": 16830, "token_acc": 0.9652777777777778, "train_speed(iter/s)": 0.245385 }, { "epoch": 1.2832532967451789, "grad_norm": 0.4971645176410675, "learning_rate": 8.460897346069209e-05, "loss": 0.18707531690597534, "memory(GiB)": 122.96, "step": 16835, "token_acc": 0.9380812530592266, "train_speed(iter/s)": 0.245404 }, { "epoch": 1.2836344233554386, "grad_norm": 0.9122052788734436, "learning_rate": 8.460033092839189e-05, "loss": 0.13146095275878905, "memory(GiB)": 122.96, "step": 16840, "token_acc": 0.953405017921147, "train_speed(iter/s)": 0.245425 }, { "epoch": 1.2840155499656987, "grad_norm": 1.0855666399002075, "learning_rate": 8.459168641192328e-05, "loss": 0.12091307640075684, "memory(GiB)": 122.96, "step": 16845, "token_acc": 0.9475249628213299, "train_speed(iter/s)": 0.245443 }, { "epoch": 1.2843966765759585, "grad_norm": 0.23356075584888458, "learning_rate": 8.458303991178198e-05, "loss": 0.11455096006393432, "memory(GiB)": 122.96, "step": 16850, "token_acc": 0.9545115585384042, "train_speed(iter/s)": 0.245461 }, { "epoch": 1.2847778031862185, "grad_norm": 1.3910518884658813, "learning_rate": 8.457439142846381e-05, "loss": 0.15824854373931885, "memory(GiB)": 122.96, "step": 16855, "token_acc": 0.9372628383506197, "train_speed(iter/s)": 0.245482 }, { "epoch": 1.2851589297964785, "grad_norm": 0.6380828619003296, "learning_rate": 8.456574096246477e-05, "loss": 0.1476808786392212, "memory(GiB)": 122.96, "step": 16860, "token_acc": 0.9506670022652907, "train_speed(iter/s)": 0.245504 }, { "epoch": 1.2855400564067383, "grad_norm": 0.5515353679656982, "learning_rate": 8.455708851428086e-05, "loss": 0.1284482717514038, "memory(GiB)": 122.96, "step": 16865, "token_acc": 0.9490946361462248, "train_speed(iter/s)": 0.245518 }, { "epoch": 1.2859211830169983, "grad_norm": 0.8030123710632324, "learning_rate": 8.454843408440832e-05, "loss": 0.14678490161895752, "memory(GiB)": 122.96, "step": 16870, "token_acc": 0.9423900968100302, "train_speed(iter/s)": 0.245528 }, { "epoch": 1.2863023096272581, "grad_norm": 1.040271282196045, "learning_rate": 8.453977767334338e-05, "loss": 0.11741855144500732, "memory(GiB)": 122.96, "step": 16875, "token_acc": 0.9606645492303933, "train_speed(iter/s)": 0.245528 }, { "epoch": 1.2866834362375181, "grad_norm": 0.8346788883209229, "learning_rate": 8.453111928158248e-05, "loss": 0.09133844375610352, "memory(GiB)": 122.96, "step": 16880, "token_acc": 0.964770523594053, "train_speed(iter/s)": 0.24554 }, { "epoch": 1.2870645628477781, "grad_norm": 0.3464534282684326, "learning_rate": 8.452245890962214e-05, "loss": 0.0947684407234192, "memory(GiB)": 122.96, "step": 16885, "token_acc": 0.9433465085638999, "train_speed(iter/s)": 0.245565 }, { "epoch": 1.287445689458038, "grad_norm": 0.7286012768745422, "learning_rate": 8.451379655795898e-05, "loss": 0.15775563716888427, "memory(GiB)": 122.96, "step": 16890, "token_acc": 0.937419906023067, "train_speed(iter/s)": 0.245586 }, { "epoch": 1.287826816068298, "grad_norm": 1.0279159545898438, "learning_rate": 8.450513222708976e-05, "loss": 0.14544072151184081, "memory(GiB)": 122.96, "step": 16895, "token_acc": 0.9542450432130147, "train_speed(iter/s)": 0.245613 }, { "epoch": 1.2882079426785578, "grad_norm": 0.7253280878067017, "learning_rate": 8.449646591751133e-05, "loss": 0.11688371896743774, "memory(GiB)": 122.96, "step": 16900, "token_acc": 0.956463719766472, "train_speed(iter/s)": 0.245628 }, { "epoch": 1.2885890692888178, "grad_norm": 0.8149957656860352, "learning_rate": 8.448779762972066e-05, "loss": 0.07185850143432618, "memory(GiB)": 122.96, "step": 16905, "token_acc": 0.9642621653699064, "train_speed(iter/s)": 0.245639 }, { "epoch": 1.2889701958990778, "grad_norm": 0.8636221289634705, "learning_rate": 8.447912736421481e-05, "loss": 0.13652395009994506, "memory(GiB)": 122.96, "step": 16910, "token_acc": 0.9506204102304381, "train_speed(iter/s)": 0.245651 }, { "epoch": 1.2893513225093376, "grad_norm": 0.786990225315094, "learning_rate": 8.447045512149104e-05, "loss": 0.0927212417125702, "memory(GiB)": 122.96, "step": 16915, "token_acc": 0.9682692307692308, "train_speed(iter/s)": 0.245674 }, { "epoch": 1.2897324491195976, "grad_norm": 0.9049322009086609, "learning_rate": 8.446178090204664e-05, "loss": 0.13560017347335815, "memory(GiB)": 122.96, "step": 16920, "token_acc": 0.9399727148703957, "train_speed(iter/s)": 0.2457 }, { "epoch": 1.2901135757298574, "grad_norm": 0.8390566110610962, "learning_rate": 8.445310470637901e-05, "loss": 0.18702114820480348, "memory(GiB)": 122.96, "step": 16925, "token_acc": 0.9335153364380921, "train_speed(iter/s)": 0.245712 }, { "epoch": 1.2904947023401174, "grad_norm": 0.9412629008293152, "learning_rate": 8.44444265349857e-05, "loss": 0.12953122854232788, "memory(GiB)": 122.96, "step": 16930, "token_acc": 0.9558526348515467, "train_speed(iter/s)": 0.245718 }, { "epoch": 1.2908758289503774, "grad_norm": 1.0262689590454102, "learning_rate": 8.443574638836438e-05, "loss": 0.14410165548324586, "memory(GiB)": 122.96, "step": 16935, "token_acc": 0.9607949904710046, "train_speed(iter/s)": 0.245737 }, { "epoch": 1.2912569555606372, "grad_norm": 0.8348223567008972, "learning_rate": 8.442706426701281e-05, "loss": 0.11238853931427002, "memory(GiB)": 122.96, "step": 16940, "token_acc": 0.9424932458510228, "train_speed(iter/s)": 0.245762 }, { "epoch": 1.291638082170897, "grad_norm": 1.5163309574127197, "learning_rate": 8.441838017142886e-05, "loss": 0.1375640630722046, "memory(GiB)": 122.96, "step": 16945, "token_acc": 0.9357326478149101, "train_speed(iter/s)": 0.245772 }, { "epoch": 1.292019208781157, "grad_norm": 1.3778321743011475, "learning_rate": 8.440969410211053e-05, "loss": 0.14319937229156493, "memory(GiB)": 122.96, "step": 16950, "token_acc": 0.9387650525236997, "train_speed(iter/s)": 0.245794 }, { "epoch": 1.292400335391417, "grad_norm": 0.8878343105316162, "learning_rate": 8.440100605955593e-05, "loss": 0.16279075145721436, "memory(GiB)": 122.96, "step": 16955, "token_acc": 0.931673582295989, "train_speed(iter/s)": 0.245816 }, { "epoch": 1.292781462001677, "grad_norm": 1.2844645977020264, "learning_rate": 8.439231604426326e-05, "loss": 0.17660362720489503, "memory(GiB)": 122.96, "step": 16960, "token_acc": 0.9334148898986369, "train_speed(iter/s)": 0.245834 }, { "epoch": 1.293162588611937, "grad_norm": 0.5441691279411316, "learning_rate": 8.438362405673086e-05, "loss": 0.1585702896118164, "memory(GiB)": 122.96, "step": 16965, "token_acc": 0.9337474120082816, "train_speed(iter/s)": 0.245851 }, { "epoch": 1.2935437152221967, "grad_norm": 0.9319804906845093, "learning_rate": 8.43749300974572e-05, "loss": 0.1298724889755249, "memory(GiB)": 122.96, "step": 16970, "token_acc": 0.9465054314408691, "train_speed(iter/s)": 0.245873 }, { "epoch": 1.2939248418324567, "grad_norm": 0.8321642875671387, "learning_rate": 8.43662341669408e-05, "loss": 0.13857544660568238, "memory(GiB)": 122.96, "step": 16975, "token_acc": 0.94664120422377, "train_speed(iter/s)": 0.245876 }, { "epoch": 1.2943059684427167, "grad_norm": 0.6314399242401123, "learning_rate": 8.435753626568037e-05, "loss": 0.09951622486114502, "memory(GiB)": 122.96, "step": 16980, "token_acc": 0.9547935619314206, "train_speed(iter/s)": 0.24589 }, { "epoch": 1.2946870950529765, "grad_norm": 0.905633807182312, "learning_rate": 8.434883639417466e-05, "loss": 0.15711526870727538, "memory(GiB)": 122.96, "step": 16985, "token_acc": 0.9351305812973884, "train_speed(iter/s)": 0.245913 }, { "epoch": 1.2950682216632365, "grad_norm": 1.139641523361206, "learning_rate": 8.434013455292258e-05, "loss": 0.15649032592773438, "memory(GiB)": 122.96, "step": 16990, "token_acc": 0.9438172043010753, "train_speed(iter/s)": 0.245914 }, { "epoch": 1.2954493482734963, "grad_norm": 0.8122710585594177, "learning_rate": 8.433143074242314e-05, "loss": 0.1116984486579895, "memory(GiB)": 122.96, "step": 16995, "token_acc": 0.9445331205107741, "train_speed(iter/s)": 0.245941 }, { "epoch": 1.2958304748837564, "grad_norm": 0.5538355708122253, "learning_rate": 8.432272496317545e-05, "loss": 0.11734380722045898, "memory(GiB)": 122.96, "step": 17000, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.245945 }, { "epoch": 1.2958304748837564, "eval_loss": 0.10688108205795288, "eval_runtime": 160.107, "eval_samples_per_second": 3.31, "eval_steps_per_second": 3.31, "eval_token_acc": 0.9504322028793446, "step": 17000 }, { "epoch": 1.2962116014940164, "grad_norm": 1.0664434432983398, "learning_rate": 8.431401721567879e-05, "loss": 0.1707775354385376, "memory(GiB)": 122.96, "step": 17005, "token_acc": 0.9498527450951663, "train_speed(iter/s)": 0.245398 }, { "epoch": 1.2965927281042762, "grad_norm": 1.1330010890960693, "learning_rate": 8.430530750043247e-05, "loss": 0.12271435260772705, "memory(GiB)": 122.96, "step": 17010, "token_acc": 0.9568221070811744, "train_speed(iter/s)": 0.245401 }, { "epoch": 1.2969738547145362, "grad_norm": 0.7640411257743835, "learning_rate": 8.429659581793596e-05, "loss": 0.08081393241882324, "memory(GiB)": 122.96, "step": 17015, "token_acc": 0.9625036054225555, "train_speed(iter/s)": 0.245408 }, { "epoch": 1.297354981324796, "grad_norm": 1.002199649810791, "learning_rate": 8.428788216868882e-05, "loss": 0.10103360414505005, "memory(GiB)": 122.96, "step": 17020, "token_acc": 0.9570154095701541, "train_speed(iter/s)": 0.245431 }, { "epoch": 1.297736107935056, "grad_norm": 0.0024337118957191706, "learning_rate": 8.427916655319076e-05, "loss": 0.1190273642539978, "memory(GiB)": 122.96, "step": 17025, "token_acc": 0.9565954560868091, "train_speed(iter/s)": 0.245454 }, { "epoch": 1.298117234545316, "grad_norm": 0.6473399996757507, "learning_rate": 8.427044897194158e-05, "loss": 0.10979140996932983, "memory(GiB)": 122.96, "step": 17030, "token_acc": 0.957001239157373, "train_speed(iter/s)": 0.245459 }, { "epoch": 1.2984983611555758, "grad_norm": 1.0952626466751099, "learning_rate": 8.426172942544119e-05, "loss": 0.13432486057281495, "memory(GiB)": 122.96, "step": 17035, "token_acc": 0.9487134296792387, "train_speed(iter/s)": 0.245475 }, { "epoch": 1.2988794877658358, "grad_norm": 0.527126133441925, "learning_rate": 8.425300791418962e-05, "loss": 0.14489022493362427, "memory(GiB)": 122.96, "step": 17040, "token_acc": 0.9495308501563833, "train_speed(iter/s)": 0.245491 }, { "epoch": 1.2992606143760956, "grad_norm": 0.8874191045761108, "learning_rate": 8.424428443868697e-05, "loss": 0.133234703540802, "memory(GiB)": 122.96, "step": 17045, "token_acc": 0.9385635996538795, "train_speed(iter/s)": 0.245511 }, { "epoch": 1.2996417409863557, "grad_norm": 1.0849732160568237, "learning_rate": 8.423555899943354e-05, "loss": 0.16623787879943847, "memory(GiB)": 122.96, "step": 17050, "token_acc": 0.9336905559276625, "train_speed(iter/s)": 0.245523 }, { "epoch": 1.3000228675966157, "grad_norm": 0.21279145777225494, "learning_rate": 8.422683159692968e-05, "loss": 0.11043118238449097, "memory(GiB)": 122.96, "step": 17055, "token_acc": 0.9491763760546404, "train_speed(iter/s)": 0.245538 }, { "epoch": 1.3004039942068755, "grad_norm": 0.3098379075527191, "learning_rate": 8.421810223167583e-05, "loss": 0.14525763988494872, "memory(GiB)": 122.96, "step": 17060, "token_acc": 0.9443312966734555, "train_speed(iter/s)": 0.245561 }, { "epoch": 1.3007851208171355, "grad_norm": 0.5146898627281189, "learning_rate": 8.420937090417264e-05, "loss": 0.1777060627937317, "memory(GiB)": 122.96, "step": 17065, "token_acc": 0.9284403669724771, "train_speed(iter/s)": 0.24558 }, { "epoch": 1.3011662474273953, "grad_norm": 0.7406203150749207, "learning_rate": 8.420063761492077e-05, "loss": 0.10023574829101563, "memory(GiB)": 122.96, "step": 17070, "token_acc": 0.9596385542168675, "train_speed(iter/s)": 0.245579 }, { "epoch": 1.3015473740376553, "grad_norm": 0.6222447752952576, "learning_rate": 8.419190236442103e-05, "loss": 0.15357439517974852, "memory(GiB)": 122.96, "step": 17075, "token_acc": 0.9366591928251121, "train_speed(iter/s)": 0.245608 }, { "epoch": 1.3019285006479153, "grad_norm": 0.3130849003791809, "learning_rate": 8.418316515317437e-05, "loss": 0.07205913066864014, "memory(GiB)": 122.96, "step": 17080, "token_acc": 0.9748115147361206, "train_speed(iter/s)": 0.245616 }, { "epoch": 1.3023096272581751, "grad_norm": 2.3052546977996826, "learning_rate": 8.417442598168181e-05, "loss": 0.13468055725097655, "memory(GiB)": 122.96, "step": 17085, "token_acc": 0.9517464163133454, "train_speed(iter/s)": 0.245634 }, { "epoch": 1.3026907538684351, "grad_norm": 0.7789310216903687, "learning_rate": 8.416568485044452e-05, "loss": 0.12488923072814942, "memory(GiB)": 122.96, "step": 17090, "token_acc": 0.9546796575140242, "train_speed(iter/s)": 0.24564 }, { "epoch": 1.303071880478695, "grad_norm": 0.6070392727851868, "learning_rate": 8.415694175996375e-05, "loss": 0.1148412585258484, "memory(GiB)": 122.96, "step": 17095, "token_acc": 0.9512767066180302, "train_speed(iter/s)": 0.245661 }, { "epoch": 1.303453007088955, "grad_norm": 1.0394083261489868, "learning_rate": 8.414819671074088e-05, "loss": 0.14373077154159547, "memory(GiB)": 122.96, "step": 17100, "token_acc": 0.9395525940028557, "train_speed(iter/s)": 0.245688 }, { "epoch": 1.303834133699215, "grad_norm": 0.9584836959838867, "learning_rate": 8.413944970327739e-05, "loss": 0.11865272521972656, "memory(GiB)": 122.96, "step": 17105, "token_acc": 0.9585785916721169, "train_speed(iter/s)": 0.245706 }, { "epoch": 1.3042152603094748, "grad_norm": 1.0724223852157593, "learning_rate": 8.41307007380749e-05, "loss": 0.1674983024597168, "memory(GiB)": 122.96, "step": 17110, "token_acc": 0.9380607814761216, "train_speed(iter/s)": 0.245729 }, { "epoch": 1.3045963869197348, "grad_norm": 0.41320744156837463, "learning_rate": 8.412194981563508e-05, "loss": 0.1283259630203247, "memory(GiB)": 122.96, "step": 17115, "token_acc": 0.959330609067912, "train_speed(iter/s)": 0.245747 }, { "epoch": 1.3049775135299946, "grad_norm": 1.0701948404312134, "learning_rate": 8.41131969364598e-05, "loss": 0.15799010992050172, "memory(GiB)": 122.96, "step": 17120, "token_acc": 0.9441251596424011, "train_speed(iter/s)": 0.245771 }, { "epoch": 1.3053586401402546, "grad_norm": 0.7024646997451782, "learning_rate": 8.410444210105098e-05, "loss": 0.12450881004333496, "memory(GiB)": 122.96, "step": 17125, "token_acc": 0.9534198113207547, "train_speed(iter/s)": 0.245791 }, { "epoch": 1.3057397667505146, "grad_norm": 0.701589822769165, "learning_rate": 8.409568530991068e-05, "loss": 0.15855600833892822, "memory(GiB)": 122.96, "step": 17130, "token_acc": 0.9352139612718144, "train_speed(iter/s)": 0.245811 }, { "epoch": 1.3061208933607744, "grad_norm": 0.7844749093055725, "learning_rate": 8.408692656354103e-05, "loss": 0.11438111066818238, "memory(GiB)": 122.96, "step": 17135, "token_acc": 0.9487975174553918, "train_speed(iter/s)": 0.245828 }, { "epoch": 1.3065020199710344, "grad_norm": 1.646537184715271, "learning_rate": 8.407816586244432e-05, "loss": 0.17005476951599122, "memory(GiB)": 122.96, "step": 17140, "token_acc": 0.9437150498419645, "train_speed(iter/s)": 0.245833 }, { "epoch": 1.3068831465812942, "grad_norm": 0.5122559070587158, "learning_rate": 8.406940320712296e-05, "loss": 0.09763526916503906, "memory(GiB)": 122.96, "step": 17145, "token_acc": 0.961284645842499, "train_speed(iter/s)": 0.245843 }, { "epoch": 1.3072642731915542, "grad_norm": 1.6116046905517578, "learning_rate": 8.406063859807942e-05, "loss": 0.16987732648849488, "memory(GiB)": 122.96, "step": 17150, "token_acc": 0.9350282485875706, "train_speed(iter/s)": 0.245862 }, { "epoch": 1.3076453998018143, "grad_norm": 0.8747091889381409, "learning_rate": 8.405187203581633e-05, "loss": 0.10864295959472656, "memory(GiB)": 122.96, "step": 17155, "token_acc": 0.9460154241645244, "train_speed(iter/s)": 0.245881 }, { "epoch": 1.308026526412074, "grad_norm": 0.6038386225700378, "learning_rate": 8.404310352083637e-05, "loss": 0.09954867959022522, "memory(GiB)": 122.96, "step": 17160, "token_acc": 0.9625352112676057, "train_speed(iter/s)": 0.245903 }, { "epoch": 1.308407653022334, "grad_norm": 0.5138129591941833, "learning_rate": 8.403433305364243e-05, "loss": 0.13169536590576172, "memory(GiB)": 122.96, "step": 17165, "token_acc": 0.9560789306174411, "train_speed(iter/s)": 0.245915 }, { "epoch": 1.3087887796325939, "grad_norm": 0.9502174258232117, "learning_rate": 8.402556063473741e-05, "loss": 0.16813015937805176, "memory(GiB)": 122.96, "step": 17170, "token_acc": 0.9326923076923077, "train_speed(iter/s)": 0.245934 }, { "epoch": 1.309169906242854, "grad_norm": 1.1338709592819214, "learning_rate": 8.401678626462439e-05, "loss": 0.13977776765823363, "memory(GiB)": 122.96, "step": 17175, "token_acc": 0.9451974071891573, "train_speed(iter/s)": 0.245956 }, { "epoch": 1.309551032853114, "grad_norm": 1.739615559577942, "learning_rate": 8.400800994380655e-05, "loss": 0.1482320547103882, "memory(GiB)": 122.96, "step": 17180, "token_acc": 0.9427917620137299, "train_speed(iter/s)": 0.24598 }, { "epoch": 1.3099321594633737, "grad_norm": 0.5803881883621216, "learning_rate": 8.399923167278714e-05, "loss": 0.12828866243362427, "memory(GiB)": 122.96, "step": 17185, "token_acc": 0.9539267015706806, "train_speed(iter/s)": 0.245991 }, { "epoch": 1.3103132860736337, "grad_norm": 0.6725849509239197, "learning_rate": 8.399045145206956e-05, "loss": 0.11253666877746582, "memory(GiB)": 122.96, "step": 17190, "token_acc": 0.9557808759329672, "train_speed(iter/s)": 0.246003 }, { "epoch": 1.3106944126838935, "grad_norm": 0.7948446869850159, "learning_rate": 8.398166928215735e-05, "loss": 0.16044634580612183, "memory(GiB)": 122.96, "step": 17195, "token_acc": 0.9514348785871964, "train_speed(iter/s)": 0.246015 }, { "epoch": 1.3110755392941535, "grad_norm": 0.7779802083969116, "learning_rate": 8.397288516355408e-05, "loss": 0.10310243368148804, "memory(GiB)": 122.96, "step": 17200, "token_acc": 0.9596569869421165, "train_speed(iter/s)": 0.246029 }, { "epoch": 1.3110755392941535, "eval_loss": 0.10746937990188599, "eval_runtime": 159.9136, "eval_samples_per_second": 3.314, "eval_steps_per_second": 3.314, "eval_token_acc": 0.9510722245647852, "step": 17200 }, { "epoch": 1.3114566659044136, "grad_norm": 1.4999827146530151, "learning_rate": 8.396409909676351e-05, "loss": 0.20731382369995116, "memory(GiB)": 122.96, "step": 17205, "token_acc": 0.9505328687437127, "train_speed(iter/s)": 0.245484 }, { "epoch": 1.3118377925146734, "grad_norm": 1.0059967041015625, "learning_rate": 8.395531108228946e-05, "loss": 0.17043153047561646, "memory(GiB)": 122.96, "step": 17210, "token_acc": 0.9463869463869464, "train_speed(iter/s)": 0.2455 }, { "epoch": 1.3122189191249334, "grad_norm": 0.72733074426651, "learning_rate": 8.39465211206359e-05, "loss": 0.13085744380950928, "memory(GiB)": 122.96, "step": 17215, "token_acc": 0.9538401861908457, "train_speed(iter/s)": 0.245524 }, { "epoch": 1.3126000457351932, "grad_norm": 0.7875025868415833, "learning_rate": 8.393772921230689e-05, "loss": 0.10882532596588135, "memory(GiB)": 122.96, "step": 17220, "token_acc": 0.9548611111111112, "train_speed(iter/s)": 0.245546 }, { "epoch": 1.3129811723454532, "grad_norm": 0.8773347735404968, "learning_rate": 8.392893535780659e-05, "loss": 0.13828046321868898, "memory(GiB)": 122.96, "step": 17225, "token_acc": 0.9523579201934703, "train_speed(iter/s)": 0.245551 }, { "epoch": 1.3133622989557132, "grad_norm": 1.0035037994384766, "learning_rate": 8.39201395576393e-05, "loss": 0.1355830192565918, "memory(GiB)": 122.96, "step": 17230, "token_acc": 0.9423344140805928, "train_speed(iter/s)": 0.245558 }, { "epoch": 1.313743425565973, "grad_norm": 0.5424474477767944, "learning_rate": 8.391134181230942e-05, "loss": 0.119659161567688, "memory(GiB)": 122.96, "step": 17235, "token_acc": 0.9561262707330123, "train_speed(iter/s)": 0.245576 }, { "epoch": 1.3141245521762328, "grad_norm": 1.3532859086990356, "learning_rate": 8.390254212232145e-05, "loss": 0.17380475997924805, "memory(GiB)": 122.96, "step": 17240, "token_acc": 0.9346645647223681, "train_speed(iter/s)": 0.245591 }, { "epoch": 1.3145056787864928, "grad_norm": 1.08351469039917, "learning_rate": 8.389374048818001e-05, "loss": 0.13181604146957399, "memory(GiB)": 122.96, "step": 17245, "token_acc": 0.942057942057942, "train_speed(iter/s)": 0.24561 }, { "epoch": 1.3148868053967528, "grad_norm": 1.3166043758392334, "learning_rate": 8.388493691038985e-05, "loss": 0.12745643854141236, "memory(GiB)": 122.96, "step": 17250, "token_acc": 0.9507944643772425, "train_speed(iter/s)": 0.245624 }, { "epoch": 1.3152679320070129, "grad_norm": 1.2673962116241455, "learning_rate": 8.387613138945579e-05, "loss": 0.1515251636505127, "memory(GiB)": 122.96, "step": 17255, "token_acc": 0.946551724137931, "train_speed(iter/s)": 0.24565 }, { "epoch": 1.3156490586172727, "grad_norm": 0.6673775911331177, "learning_rate": 8.386732392588281e-05, "loss": 0.09816930294036866, "memory(GiB)": 122.96, "step": 17260, "token_acc": 0.9580152671755725, "train_speed(iter/s)": 0.245669 }, { "epoch": 1.3160301852275325, "grad_norm": 0.8394562005996704, "learning_rate": 8.385851452017597e-05, "loss": 0.1531757593154907, "memory(GiB)": 122.96, "step": 17265, "token_acc": 0.9429559204840103, "train_speed(iter/s)": 0.245681 }, { "epoch": 1.3164113118377925, "grad_norm": 0.6488535404205322, "learning_rate": 8.384970317284042e-05, "loss": 0.1264307141304016, "memory(GiB)": 122.96, "step": 17270, "token_acc": 0.9439882697947214, "train_speed(iter/s)": 0.245703 }, { "epoch": 1.3167924384480525, "grad_norm": 0.4511919617652893, "learning_rate": 8.38408898843815e-05, "loss": 0.11835004091262817, "memory(GiB)": 122.96, "step": 17275, "token_acc": 0.9543332194510876, "train_speed(iter/s)": 0.245706 }, { "epoch": 1.3171735650583123, "grad_norm": 0.6283314228057861, "learning_rate": 8.383207465530458e-05, "loss": 0.110160231590271, "memory(GiB)": 122.96, "step": 17280, "token_acc": 0.9602985548673971, "train_speed(iter/s)": 0.24572 }, { "epoch": 1.3175546916685723, "grad_norm": 0.7329379320144653, "learning_rate": 8.382325748611518e-05, "loss": 0.17345643043518066, "memory(GiB)": 122.96, "step": 17285, "token_acc": 0.9336140698538519, "train_speed(iter/s)": 0.245739 }, { "epoch": 1.317935818278832, "grad_norm": 0.946789562702179, "learning_rate": 8.381443837731892e-05, "loss": 0.11887201070785522, "memory(GiB)": 122.96, "step": 17290, "token_acc": 0.9526420737786641, "train_speed(iter/s)": 0.245765 }, { "epoch": 1.3183169448890921, "grad_norm": 0.7587791085243225, "learning_rate": 8.380561732942154e-05, "loss": 0.12729693651199342, "memory(GiB)": 122.96, "step": 17295, "token_acc": 0.9606316725978647, "train_speed(iter/s)": 0.245787 }, { "epoch": 1.3186980714993521, "grad_norm": 2.0177884101867676, "learning_rate": 8.379679434292889e-05, "loss": 0.1721964955329895, "memory(GiB)": 122.96, "step": 17300, "token_acc": 0.9388324367403865, "train_speed(iter/s)": 0.245805 }, { "epoch": 1.319079198109612, "grad_norm": 0.7249957323074341, "learning_rate": 8.37879694183469e-05, "loss": 0.12923588752746581, "memory(GiB)": 122.96, "step": 17305, "token_acc": 0.9524725803348085, "train_speed(iter/s)": 0.245822 }, { "epoch": 1.319460324719872, "grad_norm": 1.7513567209243774, "learning_rate": 8.377914255618166e-05, "loss": 0.13777538537979125, "memory(GiB)": 122.96, "step": 17310, "token_acc": 0.9516968561315844, "train_speed(iter/s)": 0.245838 }, { "epoch": 1.3198414513301318, "grad_norm": 1.5102633237838745, "learning_rate": 8.377031375693938e-05, "loss": 0.1403293490409851, "memory(GiB)": 122.96, "step": 17315, "token_acc": 0.9451219512195121, "train_speed(iter/s)": 0.245856 }, { "epoch": 1.3202225779403918, "grad_norm": 1.8501505851745605, "learning_rate": 8.37614830211263e-05, "loss": 0.16069526672363282, "memory(GiB)": 122.96, "step": 17320, "token_acc": 0.9356376638855781, "train_speed(iter/s)": 0.245878 }, { "epoch": 1.3206037045506518, "grad_norm": 1.1055574417114258, "learning_rate": 8.375265034924884e-05, "loss": 0.14554964303970336, "memory(GiB)": 122.96, "step": 17325, "token_acc": 0.9428632570083458, "train_speed(iter/s)": 0.245895 }, { "epoch": 1.3209848311609116, "grad_norm": 0.7933129668235779, "learning_rate": 8.374381574181351e-05, "loss": 0.10255119800567628, "memory(GiB)": 122.96, "step": 17330, "token_acc": 0.952840639341668, "train_speed(iter/s)": 0.245909 }, { "epoch": 1.3213659577711716, "grad_norm": 1.4013934135437012, "learning_rate": 8.373497919932694e-05, "loss": 0.1416211485862732, "memory(GiB)": 122.96, "step": 17335, "token_acc": 0.9552209233207847, "train_speed(iter/s)": 0.245928 }, { "epoch": 1.3217470843814314, "grad_norm": 0.9369850754737854, "learning_rate": 8.372614072229585e-05, "loss": 0.11370632648468018, "memory(GiB)": 122.96, "step": 17340, "token_acc": 0.9513126491646778, "train_speed(iter/s)": 0.245943 }, { "epoch": 1.3221282109916914, "grad_norm": 0.5698795914649963, "learning_rate": 8.371730031122712e-05, "loss": 0.14144572019577026, "memory(GiB)": 122.96, "step": 17345, "token_acc": 0.9459159617251421, "train_speed(iter/s)": 0.245953 }, { "epoch": 1.3225093376019514, "grad_norm": 1.8146551847457886, "learning_rate": 8.370845796662768e-05, "loss": 0.12035472393035888, "memory(GiB)": 122.96, "step": 17350, "token_acc": 0.9514237855946399, "train_speed(iter/s)": 0.245972 }, { "epoch": 1.3228904642122112, "grad_norm": 1.5464584827423096, "learning_rate": 8.36996136890046e-05, "loss": 0.20954666137695313, "memory(GiB)": 122.96, "step": 17355, "token_acc": 0.9348387096774193, "train_speed(iter/s)": 0.245985 }, { "epoch": 1.3232715908224713, "grad_norm": 0.8469153046607971, "learning_rate": 8.369076747886507e-05, "loss": 0.10908982753753663, "memory(GiB)": 122.96, "step": 17360, "token_acc": 0.9477425552353507, "train_speed(iter/s)": 0.246004 }, { "epoch": 1.323652717432731, "grad_norm": 1.2215858697891235, "learning_rate": 8.368191933671635e-05, "loss": 0.13885202407836914, "memory(GiB)": 122.96, "step": 17365, "token_acc": 0.9425490196078431, "train_speed(iter/s)": 0.246021 }, { "epoch": 1.324033844042991, "grad_norm": 1.1979280710220337, "learning_rate": 8.367306926306586e-05, "loss": 0.16453665494918823, "memory(GiB)": 122.96, "step": 17370, "token_acc": 0.9351984696317551, "train_speed(iter/s)": 0.246038 }, { "epoch": 1.324414970653251, "grad_norm": 0.8311096429824829, "learning_rate": 8.366421725842112e-05, "loss": 0.18164433240890504, "memory(GiB)": 122.96, "step": 17375, "token_acc": 0.9315191387559809, "train_speed(iter/s)": 0.24606 }, { "epoch": 1.3247960972635109, "grad_norm": 0.6123730540275574, "learning_rate": 8.365536332328973e-05, "loss": 0.10397469997406006, "memory(GiB)": 122.96, "step": 17380, "token_acc": 0.9628552971576227, "train_speed(iter/s)": 0.246081 }, { "epoch": 1.325177223873771, "grad_norm": 0.9071429371833801, "learning_rate": 8.364650745817946e-05, "loss": 0.25822536945343016, "memory(GiB)": 122.96, "step": 17385, "token_acc": 0.9069562665101262, "train_speed(iter/s)": 0.246103 }, { "epoch": 1.3255583504840307, "grad_norm": 1.339561104774475, "learning_rate": 8.36376496635981e-05, "loss": 0.14134405851364135, "memory(GiB)": 122.96, "step": 17390, "token_acc": 0.9507219802887921, "train_speed(iter/s)": 0.246119 }, { "epoch": 1.3259394770942907, "grad_norm": 1.0709000825881958, "learning_rate": 8.362878994005364e-05, "loss": 0.11098490953445435, "memory(GiB)": 122.96, "step": 17395, "token_acc": 0.9503765205638154, "train_speed(iter/s)": 0.246133 }, { "epoch": 1.3263206037045507, "grad_norm": 0.6154966354370117, "learning_rate": 8.361992828805415e-05, "loss": 0.100138258934021, "memory(GiB)": 122.96, "step": 17400, "token_acc": 0.9585445869370712, "train_speed(iter/s)": 0.246154 }, { "epoch": 1.3263206037045507, "eval_loss": 0.10663587599992752, "eval_runtime": 161.5073, "eval_samples_per_second": 3.282, "eval_steps_per_second": 3.282, "eval_token_acc": 0.9507108005541834, "step": 17400 }, { "epoch": 1.3267017303148105, "grad_norm": 1.4776822328567505, "learning_rate": 8.361106470810775e-05, "loss": 0.08831592202186585, "memory(GiB)": 122.96, "step": 17405, "token_acc": 0.9508884519519629, "train_speed(iter/s)": 0.245611 }, { "epoch": 1.3270828569250706, "grad_norm": 0.7651785016059875, "learning_rate": 8.360219920072279e-05, "loss": 0.11316902637481689, "memory(GiB)": 122.96, "step": 17410, "token_acc": 0.952078640180217, "train_speed(iter/s)": 0.245625 }, { "epoch": 1.3274639835353303, "grad_norm": 0.572742223739624, "learning_rate": 8.359333176640762e-05, "loss": 0.09311432838439941, "memory(GiB)": 122.96, "step": 17415, "token_acc": 0.9520977865873802, "train_speed(iter/s)": 0.245649 }, { "epoch": 1.3278451101455904, "grad_norm": 1.5521225929260254, "learning_rate": 8.358446240567079e-05, "loss": 0.08502548933029175, "memory(GiB)": 122.96, "step": 17420, "token_acc": 0.9636486031639179, "train_speed(iter/s)": 0.245673 }, { "epoch": 1.3282262367558504, "grad_norm": 1.3262386322021484, "learning_rate": 8.357559111902086e-05, "loss": 0.11562271118164062, "memory(GiB)": 122.96, "step": 17425, "token_acc": 0.952529994783516, "train_speed(iter/s)": 0.2457 }, { "epoch": 1.3286073633661102, "grad_norm": 1.201525092124939, "learning_rate": 8.356671790696661e-05, "loss": 0.13940199613571166, "memory(GiB)": 122.96, "step": 17430, "token_acc": 0.9548914575697772, "train_speed(iter/s)": 0.245714 }, { "epoch": 1.3289884899763702, "grad_norm": 0.8851081132888794, "learning_rate": 8.355784277001687e-05, "loss": 0.12636091709136962, "memory(GiB)": 122.96, "step": 17435, "token_acc": 0.950075075075075, "train_speed(iter/s)": 0.245721 }, { "epoch": 1.32936961658663, "grad_norm": 0.5397643446922302, "learning_rate": 8.354896570868056e-05, "loss": 0.09881922602653503, "memory(GiB)": 122.96, "step": 17440, "token_acc": 0.9639733444398167, "train_speed(iter/s)": 0.245735 }, { "epoch": 1.32975074319689, "grad_norm": 0.8318448066711426, "learning_rate": 8.354008672346676e-05, "loss": 0.12209821939468384, "memory(GiB)": 122.96, "step": 17445, "token_acc": 0.9447128287707998, "train_speed(iter/s)": 0.245756 }, { "epoch": 1.33013186980715, "grad_norm": 0.16590054333209991, "learning_rate": 8.353120581488462e-05, "loss": 0.08436434268951416, "memory(GiB)": 122.96, "step": 17450, "token_acc": 0.9566274827725983, "train_speed(iter/s)": 0.245781 }, { "epoch": 1.3305129964174098, "grad_norm": 0.8471395969390869, "learning_rate": 8.352232298344342e-05, "loss": 0.11898227930068969, "memory(GiB)": 122.96, "step": 17455, "token_acc": 0.9610238510762071, "train_speed(iter/s)": 0.245799 }, { "epoch": 1.3308941230276699, "grad_norm": 0.8134551644325256, "learning_rate": 8.351343822965258e-05, "loss": 0.13603322505950927, "memory(GiB)": 122.96, "step": 17460, "token_acc": 0.9490608992601024, "train_speed(iter/s)": 0.245818 }, { "epoch": 1.3312752496379296, "grad_norm": 1.4180384874343872, "learning_rate": 8.350455155402158e-05, "loss": 0.13956949710845948, "memory(GiB)": 122.96, "step": 17465, "token_acc": 0.9462264150943396, "train_speed(iter/s)": 0.245844 }, { "epoch": 1.3316563762481897, "grad_norm": 0.6852266192436218, "learning_rate": 8.349566295706002e-05, "loss": 0.1203418493270874, "memory(GiB)": 122.96, "step": 17470, "token_acc": 0.9619418639496554, "train_speed(iter/s)": 0.245865 }, { "epoch": 1.3320375028584497, "grad_norm": 1.3381239175796509, "learning_rate": 8.348677243927763e-05, "loss": 0.10267441272735596, "memory(GiB)": 122.96, "step": 17475, "token_acc": 0.9558166862514689, "train_speed(iter/s)": 0.245885 }, { "epoch": 1.3324186294687095, "grad_norm": 0.6857523322105408, "learning_rate": 8.347788000118424e-05, "loss": 0.10902951955795288, "memory(GiB)": 122.96, "step": 17480, "token_acc": 0.9621871116634121, "train_speed(iter/s)": 0.245897 }, { "epoch": 1.3327997560789695, "grad_norm": 0.5820972323417664, "learning_rate": 8.346898564328977e-05, "loss": 0.1461030960083008, "memory(GiB)": 122.96, "step": 17485, "token_acc": 0.9417204676321759, "train_speed(iter/s)": 0.24591 }, { "epoch": 1.3331808826892293, "grad_norm": 0.5850436091423035, "learning_rate": 8.346008936610432e-05, "loss": 0.17387588024139405, "memory(GiB)": 122.96, "step": 17490, "token_acc": 0.9303262182843335, "train_speed(iter/s)": 0.245935 }, { "epoch": 1.3335620092994893, "grad_norm": 1.5295518636703491, "learning_rate": 8.3451191170138e-05, "loss": 0.15591977834701537, "memory(GiB)": 122.96, "step": 17495, "token_acc": 0.9333731700029877, "train_speed(iter/s)": 0.245958 }, { "epoch": 1.3339431359097493, "grad_norm": 0.6870015859603882, "learning_rate": 8.344229105590109e-05, "loss": 0.11218595504760742, "memory(GiB)": 122.96, "step": 17500, "token_acc": 0.9491292392300642, "train_speed(iter/s)": 0.245983 }, { "epoch": 1.3343242625200091, "grad_norm": 0.9706870317459106, "learning_rate": 8.3433389023904e-05, "loss": 0.09720314741134643, "memory(GiB)": 122.96, "step": 17505, "token_acc": 0.9629339598016183, "train_speed(iter/s)": 0.245981 }, { "epoch": 1.3347053891302691, "grad_norm": 0.7524513602256775, "learning_rate": 8.342448507465719e-05, "loss": 0.12113804817199707, "memory(GiB)": 122.96, "step": 17510, "token_acc": 0.9536003080477474, "train_speed(iter/s)": 0.245995 }, { "epoch": 1.335086515740529, "grad_norm": 0.9725679159164429, "learning_rate": 8.341557920867127e-05, "loss": 0.11712092161178589, "memory(GiB)": 122.96, "step": 17515, "token_acc": 0.9570375696566601, "train_speed(iter/s)": 0.246009 }, { "epoch": 1.335467642350789, "grad_norm": 0.901344895362854, "learning_rate": 8.340667142645695e-05, "loss": 0.12351453304290771, "memory(GiB)": 122.96, "step": 17520, "token_acc": 0.9550861361771944, "train_speed(iter/s)": 0.246026 }, { "epoch": 1.335848768961049, "grad_norm": 0.759326159954071, "learning_rate": 8.339776172852505e-05, "loss": 0.13640780448913575, "memory(GiB)": 122.96, "step": 17525, "token_acc": 0.9488248673237301, "train_speed(iter/s)": 0.246042 }, { "epoch": 1.3362298955713088, "grad_norm": 0.8271862268447876, "learning_rate": 8.33888501153865e-05, "loss": 0.09537164568901062, "memory(GiB)": 122.96, "step": 17530, "token_acc": 0.960991615020051, "train_speed(iter/s)": 0.246064 }, { "epoch": 1.3366110221815688, "grad_norm": 0.16925379633903503, "learning_rate": 8.337993658755232e-05, "loss": 0.0796302616596222, "memory(GiB)": 122.96, "step": 17535, "token_acc": 0.9762633996937213, "train_speed(iter/s)": 0.246088 }, { "epoch": 1.3369921487918286, "grad_norm": 0.8744942545890808, "learning_rate": 8.33710211455337e-05, "loss": 0.10761260986328125, "memory(GiB)": 122.96, "step": 17540, "token_acc": 0.9479039479039479, "train_speed(iter/s)": 0.246112 }, { "epoch": 1.3373732754020886, "grad_norm": 1.3832414150238037, "learning_rate": 8.336210378984188e-05, "loss": 0.13142119646072387, "memory(GiB)": 122.96, "step": 17545, "token_acc": 0.9443316903450275, "train_speed(iter/s)": 0.246132 }, { "epoch": 1.3377544020123486, "grad_norm": 0.9809344410896301, "learning_rate": 8.335318452098822e-05, "loss": 0.18262282609939576, "memory(GiB)": 122.96, "step": 17550, "token_acc": 0.9328323156411461, "train_speed(iter/s)": 0.246147 }, { "epoch": 1.3381355286226084, "grad_norm": 0.8487640619277954, "learning_rate": 8.33442633394842e-05, "loss": 0.09845225811004639, "memory(GiB)": 122.96, "step": 17555, "token_acc": 0.9580756013745705, "train_speed(iter/s)": 0.246176 }, { "epoch": 1.3385166552328682, "grad_norm": 1.0003710985183716, "learning_rate": 8.333534024584142e-05, "loss": 0.19865424633026124, "memory(GiB)": 122.96, "step": 17560, "token_acc": 0.9179010795882501, "train_speed(iter/s)": 0.246195 }, { "epoch": 1.3388977818431282, "grad_norm": 0.8688986301422119, "learning_rate": 8.332641524057159e-05, "loss": 0.1393133282661438, "memory(GiB)": 122.96, "step": 17565, "token_acc": 0.953009828009828, "train_speed(iter/s)": 0.24621 }, { "epoch": 1.3392789084533883, "grad_norm": 0.7204969525337219, "learning_rate": 8.331748832418647e-05, "loss": 0.06464812159538269, "memory(GiB)": 122.96, "step": 17570, "token_acc": 0.9683530198350792, "train_speed(iter/s)": 0.246229 }, { "epoch": 1.3396600350636483, "grad_norm": 1.3241522312164307, "learning_rate": 8.330855949719802e-05, "loss": 0.18624211549758912, "memory(GiB)": 122.96, "step": 17575, "token_acc": 0.9295379537953795, "train_speed(iter/s)": 0.246241 }, { "epoch": 1.340041161673908, "grad_norm": 0.7863385677337646, "learning_rate": 8.329962876011825e-05, "loss": 0.11213642358779907, "memory(GiB)": 122.96, "step": 17580, "token_acc": 0.9536570622607294, "train_speed(iter/s)": 0.246253 }, { "epoch": 1.3404222882841679, "grad_norm": 1.5388987064361572, "learning_rate": 8.32906961134593e-05, "loss": 0.12429332733154297, "memory(GiB)": 122.96, "step": 17585, "token_acc": 0.9559293523969723, "train_speed(iter/s)": 0.246269 }, { "epoch": 1.340803414894428, "grad_norm": 0.8988392949104309, "learning_rate": 8.328176155773342e-05, "loss": 0.15549924373626708, "memory(GiB)": 122.96, "step": 17590, "token_acc": 0.9484193011647255, "train_speed(iter/s)": 0.24629 }, { "epoch": 1.341184541504688, "grad_norm": 0.6303218007087708, "learning_rate": 8.327282509345295e-05, "loss": 0.1806449770927429, "memory(GiB)": 122.96, "step": 17595, "token_acc": 0.9409682899207248, "train_speed(iter/s)": 0.246293 }, { "epoch": 1.3415656681149477, "grad_norm": 0.6814365386962891, "learning_rate": 8.326388672113038e-05, "loss": 0.12485001087188721, "memory(GiB)": 122.96, "step": 17600, "token_acc": 0.9519755511896966, "train_speed(iter/s)": 0.246314 }, { "epoch": 1.3415656681149477, "eval_loss": 0.10737795382738113, "eval_runtime": 162.55, "eval_samples_per_second": 3.261, "eval_steps_per_second": 3.261, "eval_token_acc": 0.950831275224384, "step": 17600 }, { "epoch": 1.3419467947252077, "grad_norm": 0.5256803035736084, "learning_rate": 8.325494644127825e-05, "loss": 0.13773317337036134, "memory(GiB)": 122.96, "step": 17605, "token_acc": 0.9505811008351063, "train_speed(iter/s)": 0.245772 }, { "epoch": 1.3423279213354675, "grad_norm": 0.686322808265686, "learning_rate": 8.324600425440928e-05, "loss": 0.11007115840911866, "memory(GiB)": 122.96, "step": 17610, "token_acc": 0.9596682387380062, "train_speed(iter/s)": 0.245783 }, { "epoch": 1.3427090479457275, "grad_norm": 0.8901757001876831, "learning_rate": 8.323706016103626e-05, "loss": 0.10439928770065307, "memory(GiB)": 122.96, "step": 17615, "token_acc": 0.9555349698934692, "train_speed(iter/s)": 0.245808 }, { "epoch": 1.3430901745559876, "grad_norm": 1.0986119508743286, "learning_rate": 8.322811416167204e-05, "loss": 0.2230586051940918, "memory(GiB)": 122.96, "step": 17620, "token_acc": 0.9224288204532248, "train_speed(iter/s)": 0.245831 }, { "epoch": 1.3434713011662474, "grad_norm": 0.8188419938087463, "learning_rate": 8.32191662568297e-05, "loss": 0.09669690132141114, "memory(GiB)": 122.96, "step": 17625, "token_acc": 0.9584639498432602, "train_speed(iter/s)": 0.245842 }, { "epoch": 1.3438524277765074, "grad_norm": 0.5431106090545654, "learning_rate": 8.321021644702232e-05, "loss": 0.0972038447856903, "memory(GiB)": 122.96, "step": 17630, "token_acc": 0.9629570998891879, "train_speed(iter/s)": 0.245854 }, { "epoch": 1.3442335543867672, "grad_norm": 0.45160844922065735, "learning_rate": 8.320126473276315e-05, "loss": 0.09916614890098571, "memory(GiB)": 122.96, "step": 17635, "token_acc": 0.9548566538296962, "train_speed(iter/s)": 0.245868 }, { "epoch": 1.3446146809970272, "grad_norm": 0.9227285981178284, "learning_rate": 8.319231111456552e-05, "loss": 0.1558426022529602, "memory(GiB)": 122.96, "step": 17640, "token_acc": 0.936279926335175, "train_speed(iter/s)": 0.24589 }, { "epoch": 1.3449958076072872, "grad_norm": 0.7520461082458496, "learning_rate": 8.318335559294286e-05, "loss": 0.14146947860717773, "memory(GiB)": 122.96, "step": 17645, "token_acc": 0.9474508439185662, "train_speed(iter/s)": 0.245901 }, { "epoch": 1.345376934217547, "grad_norm": 1.0533876419067383, "learning_rate": 8.317439816840877e-05, "loss": 0.1133010983467102, "memory(GiB)": 122.96, "step": 17650, "token_acc": 0.9504541701073493, "train_speed(iter/s)": 0.24592 }, { "epoch": 1.345758060827807, "grad_norm": 0.630739152431488, "learning_rate": 8.316543884147688e-05, "loss": 0.11152706146240235, "memory(GiB)": 122.96, "step": 17655, "token_acc": 0.9616939026533122, "train_speed(iter/s)": 0.245934 }, { "epoch": 1.3461391874380668, "grad_norm": 0.47246092557907104, "learning_rate": 8.315647761266097e-05, "loss": 0.09312688112258911, "memory(GiB)": 122.96, "step": 17660, "token_acc": 0.9671734623358673, "train_speed(iter/s)": 0.245948 }, { "epoch": 1.3465203140483268, "grad_norm": 1.2111607789993286, "learning_rate": 8.314751448247496e-05, "loss": 0.1276992678642273, "memory(GiB)": 122.96, "step": 17665, "token_acc": 0.9541191095038988, "train_speed(iter/s)": 0.245958 }, { "epoch": 1.3469014406585869, "grad_norm": 0.7736193537712097, "learning_rate": 8.313854945143277e-05, "loss": 0.11056758165359497, "memory(GiB)": 122.96, "step": 17670, "token_acc": 0.9531531531531532, "train_speed(iter/s)": 0.245975 }, { "epoch": 1.3472825672688467, "grad_norm": 1.0006659030914307, "learning_rate": 8.312958252004859e-05, "loss": 0.15911266803741456, "memory(GiB)": 122.96, "step": 17675, "token_acc": 0.94194428209624, "train_speed(iter/s)": 0.24599 }, { "epoch": 1.3476636938791067, "grad_norm": 0.8486997485160828, "learning_rate": 8.31206136888366e-05, "loss": 0.14367567300796508, "memory(GiB)": 122.96, "step": 17680, "token_acc": 0.9501241721854304, "train_speed(iter/s)": 0.246001 }, { "epoch": 1.3480448204893665, "grad_norm": 0.7927463054656982, "learning_rate": 8.311164295831109e-05, "loss": 0.10201640129089355, "memory(GiB)": 122.96, "step": 17685, "token_acc": 0.9566160520607375, "train_speed(iter/s)": 0.246019 }, { "epoch": 1.3484259470996265, "grad_norm": 0.692664384841919, "learning_rate": 8.310267032898652e-05, "loss": 0.13350664377212523, "memory(GiB)": 122.96, "step": 17690, "token_acc": 0.9575549450549451, "train_speed(iter/s)": 0.24602 }, { "epoch": 1.3488070737098865, "grad_norm": 0.5226743817329407, "learning_rate": 8.309369580137742e-05, "loss": 0.16792951822280883, "memory(GiB)": 122.96, "step": 17695, "token_acc": 0.9514369140925157, "train_speed(iter/s)": 0.246025 }, { "epoch": 1.3491882003201463, "grad_norm": 0.3264235258102417, "learning_rate": 8.308471937599845e-05, "loss": 0.10804102420806885, "memory(GiB)": 122.96, "step": 17700, "token_acc": 0.9477372847416901, "train_speed(iter/s)": 0.246042 }, { "epoch": 1.3495693269304063, "grad_norm": 1.317716121673584, "learning_rate": 8.307574105336434e-05, "loss": 0.1523658275604248, "memory(GiB)": 122.96, "step": 17705, "token_acc": 0.9404255319148936, "train_speed(iter/s)": 0.246056 }, { "epoch": 1.3499504535406661, "grad_norm": 1.0504631996154785, "learning_rate": 8.306676083398999e-05, "loss": 0.17230364084243774, "memory(GiB)": 122.96, "step": 17710, "token_acc": 0.9354215833512122, "train_speed(iter/s)": 0.246072 }, { "epoch": 1.3503315801509261, "grad_norm": 0.541141927242279, "learning_rate": 8.305777871839034e-05, "loss": 0.11566168069839478, "memory(GiB)": 122.96, "step": 17715, "token_acc": 0.9517754318618042, "train_speed(iter/s)": 0.246086 }, { "epoch": 1.3507127067611862, "grad_norm": 0.8101071715354919, "learning_rate": 8.304879470708049e-05, "loss": 0.1641600251197815, "memory(GiB)": 122.96, "step": 17720, "token_acc": 0.945438282647585, "train_speed(iter/s)": 0.246106 }, { "epoch": 1.351093833371446, "grad_norm": 0.6932691931724548, "learning_rate": 8.303980880057563e-05, "loss": 0.11383298635482789, "memory(GiB)": 122.96, "step": 17725, "token_acc": 0.9464173145618827, "train_speed(iter/s)": 0.246124 }, { "epoch": 1.351474959981706, "grad_norm": 0.7255609035491943, "learning_rate": 8.303082099939105e-05, "loss": 0.12572647333145143, "memory(GiB)": 122.96, "step": 17730, "token_acc": 0.949474224160041, "train_speed(iter/s)": 0.246144 }, { "epoch": 1.3518560865919658, "grad_norm": 0.6205984354019165, "learning_rate": 8.302183130404217e-05, "loss": 0.11141109466552734, "memory(GiB)": 122.96, "step": 17735, "token_acc": 0.951878585970757, "train_speed(iter/s)": 0.246161 }, { "epoch": 1.3522372132022258, "grad_norm": 0.9445013403892517, "learning_rate": 8.301283971504453e-05, "loss": 0.11060664653778077, "memory(GiB)": 122.96, "step": 17740, "token_acc": 0.9546429665951578, "train_speed(iter/s)": 0.246173 }, { "epoch": 1.3526183398124858, "grad_norm": 0.9685593247413635, "learning_rate": 8.30038462329137e-05, "loss": 0.09569424390792847, "memory(GiB)": 122.96, "step": 17745, "token_acc": 0.9633093525179857, "train_speed(iter/s)": 0.246191 }, { "epoch": 1.3529994664227456, "grad_norm": 1.2590841054916382, "learning_rate": 8.299485085816546e-05, "loss": 0.1558597207069397, "memory(GiB)": 122.96, "step": 17750, "token_acc": 0.9312452253628725, "train_speed(iter/s)": 0.246215 }, { "epoch": 1.3533805930330056, "grad_norm": 1.1839690208435059, "learning_rate": 8.298585359131564e-05, "loss": 0.10891849994659424, "memory(GiB)": 122.96, "step": 17755, "token_acc": 0.937113891968953, "train_speed(iter/s)": 0.24623 }, { "epoch": 1.3537617196432654, "grad_norm": 0.3947868347167969, "learning_rate": 8.297685443288017e-05, "loss": 0.055293101072311404, "memory(GiB)": 122.96, "step": 17760, "token_acc": 0.9725339139172667, "train_speed(iter/s)": 0.246244 }, { "epoch": 1.3541428462535254, "grad_norm": 0.8137479424476624, "learning_rate": 8.296785338337515e-05, "loss": 0.14944162368774414, "memory(GiB)": 122.96, "step": 17765, "token_acc": 0.9431429965836994, "train_speed(iter/s)": 0.246261 }, { "epoch": 1.3545239728637855, "grad_norm": 0.2736319899559021, "learning_rate": 8.295885044331672e-05, "loss": 0.0863140881061554, "memory(GiB)": 122.96, "step": 17770, "token_acc": 0.965359477124183, "train_speed(iter/s)": 0.246286 }, { "epoch": 1.3549050994740452, "grad_norm": 0.6915073990821838, "learning_rate": 8.294984561322116e-05, "loss": 0.11396056413650513, "memory(GiB)": 122.96, "step": 17775, "token_acc": 0.954119850187266, "train_speed(iter/s)": 0.246301 }, { "epoch": 1.3552862260843053, "grad_norm": 1.5455626249313354, "learning_rate": 8.294083889360488e-05, "loss": 0.14401098489761352, "memory(GiB)": 122.96, "step": 17780, "token_acc": 0.9491455347298787, "train_speed(iter/s)": 0.246308 }, { "epoch": 1.355667352694565, "grad_norm": 1.0948069095611572, "learning_rate": 8.293183028498433e-05, "loss": 0.09390276074409484, "memory(GiB)": 122.96, "step": 17785, "token_acc": 0.9537037037037037, "train_speed(iter/s)": 0.246332 }, { "epoch": 1.356048479304825, "grad_norm": 1.4408879280090332, "learning_rate": 8.292281978787615e-05, "loss": 0.1454519271850586, "memory(GiB)": 122.96, "step": 17790, "token_acc": 0.9407496977025392, "train_speed(iter/s)": 0.24635 }, { "epoch": 1.356429605915085, "grad_norm": 0.5982159972190857, "learning_rate": 8.291380740279704e-05, "loss": 0.16239447593688966, "memory(GiB)": 122.96, "step": 17795, "token_acc": 0.9379905808477237, "train_speed(iter/s)": 0.246363 }, { "epoch": 1.356810732525345, "grad_norm": 1.441665530204773, "learning_rate": 8.290479313026381e-05, "loss": 0.16269701719284058, "memory(GiB)": 122.96, "step": 17800, "token_acc": 0.9382474981693922, "train_speed(iter/s)": 0.24638 }, { "epoch": 1.356810732525345, "eval_loss": 0.10667567700147629, "eval_runtime": 163.39, "eval_samples_per_second": 3.244, "eval_steps_per_second": 3.244, "eval_token_acc": 0.9513357629058491, "step": 17800 }, { "epoch": 1.357191859135605, "grad_norm": 2.677741765975952, "learning_rate": 8.289577697079337e-05, "loss": 0.09886575341224671, "memory(GiB)": 122.96, "step": 17805, "token_acc": 0.9514716703458426, "train_speed(iter/s)": 0.245844 }, { "epoch": 1.3575729857458647, "grad_norm": 1.5536164045333862, "learning_rate": 8.288675892490279e-05, "loss": 0.08565916419029236, "memory(GiB)": 122.96, "step": 17810, "token_acc": 0.9609429978888107, "train_speed(iter/s)": 0.245866 }, { "epoch": 1.3579541123561247, "grad_norm": 1.26435124874115, "learning_rate": 8.28777389931092e-05, "loss": 0.14625492095947265, "memory(GiB)": 122.96, "step": 17815, "token_acc": 0.9401625215464171, "train_speed(iter/s)": 0.245885 }, { "epoch": 1.3583352389663848, "grad_norm": 0.07300937920808792, "learning_rate": 8.286871717592986e-05, "loss": 0.1096879243850708, "memory(GiB)": 122.96, "step": 17820, "token_acc": 0.9471436198611852, "train_speed(iter/s)": 0.245906 }, { "epoch": 1.3587163655766445, "grad_norm": 0.9479355216026306, "learning_rate": 8.285969347388209e-05, "loss": 0.1105151891708374, "memory(GiB)": 122.96, "step": 17825, "token_acc": 0.9541213063763608, "train_speed(iter/s)": 0.245929 }, { "epoch": 1.3590974921869046, "grad_norm": 0.4941056966781616, "learning_rate": 8.285066788748342e-05, "loss": 0.10193665027618408, "memory(GiB)": 122.96, "step": 17830, "token_acc": 0.9640866873065016, "train_speed(iter/s)": 0.245934 }, { "epoch": 1.3594786187971644, "grad_norm": 0.8482246994972229, "learning_rate": 8.284164041725138e-05, "loss": 0.1355947732925415, "memory(GiB)": 122.96, "step": 17835, "token_acc": 0.9485470541189016, "train_speed(iter/s)": 0.245951 }, { "epoch": 1.3598597454074244, "grad_norm": 1.2586345672607422, "learning_rate": 8.283261106370367e-05, "loss": 0.13523796796798707, "memory(GiB)": 122.96, "step": 17840, "token_acc": 0.9439274080967892, "train_speed(iter/s)": 0.245969 }, { "epoch": 1.3602408720176844, "grad_norm": 1.098524570465088, "learning_rate": 8.282357982735807e-05, "loss": 0.14252423048019408, "memory(GiB)": 122.96, "step": 17845, "token_acc": 0.9496782709123907, "train_speed(iter/s)": 0.245988 }, { "epoch": 1.3606219986279442, "grad_norm": 1.2601457834243774, "learning_rate": 8.281454670873248e-05, "loss": 0.14382587671279906, "memory(GiB)": 122.96, "step": 17850, "token_acc": 0.945500387897595, "train_speed(iter/s)": 0.246005 }, { "epoch": 1.3610031252382042, "grad_norm": 1.0263535976409912, "learning_rate": 8.280551170834494e-05, "loss": 0.12325994968414307, "memory(GiB)": 122.96, "step": 17855, "token_acc": 0.9524447421299397, "train_speed(iter/s)": 0.246022 }, { "epoch": 1.361384251848464, "grad_norm": 0.5292584300041199, "learning_rate": 8.279647482671352e-05, "loss": 0.1428394317626953, "memory(GiB)": 122.96, "step": 17860, "token_acc": 0.9557175884759025, "train_speed(iter/s)": 0.246034 }, { "epoch": 1.361765378458724, "grad_norm": 0.9273515343666077, "learning_rate": 8.278743606435648e-05, "loss": 0.08865985870361329, "memory(GiB)": 122.96, "step": 17865, "token_acc": 0.9612541422380831, "train_speed(iter/s)": 0.246052 }, { "epoch": 1.362146505068984, "grad_norm": 1.4182050228118896, "learning_rate": 8.27783954217921e-05, "loss": 0.1267813563346863, "memory(GiB)": 122.96, "step": 17870, "token_acc": 0.9525403103133556, "train_speed(iter/s)": 0.246064 }, { "epoch": 1.3625276316792438, "grad_norm": 1.503830075263977, "learning_rate": 8.276935289953888e-05, "loss": 0.15808428525924684, "memory(GiB)": 122.96, "step": 17875, "token_acc": 0.945475910693302, "train_speed(iter/s)": 0.246082 }, { "epoch": 1.3629087582895036, "grad_norm": 1.2763851881027222, "learning_rate": 8.276030849811533e-05, "loss": 0.11728420257568359, "memory(GiB)": 122.96, "step": 17880, "token_acc": 0.9543408360128617, "train_speed(iter/s)": 0.246093 }, { "epoch": 1.3632898848997637, "grad_norm": 0.7728233933448792, "learning_rate": 8.275126221804012e-05, "loss": 0.13222057819366456, "memory(GiB)": 122.96, "step": 17885, "token_acc": 0.9549034749034749, "train_speed(iter/s)": 0.246109 }, { "epoch": 1.3636710115100237, "grad_norm": 1.3834729194641113, "learning_rate": 8.2742214059832e-05, "loss": 0.12971363067626954, "memory(GiB)": 122.96, "step": 17890, "token_acc": 0.9503348214285714, "train_speed(iter/s)": 0.246126 }, { "epoch": 1.3640521381202835, "grad_norm": 0.8098664283752441, "learning_rate": 8.273316402400985e-05, "loss": 0.13942209482192994, "memory(GiB)": 122.96, "step": 17895, "token_acc": 0.9443159031312663, "train_speed(iter/s)": 0.24614 }, { "epoch": 1.3644332647305435, "grad_norm": 0.8736711740493774, "learning_rate": 8.272411211109264e-05, "loss": 0.16162443161010742, "memory(GiB)": 122.96, "step": 17900, "token_acc": 0.9360972913211719, "train_speed(iter/s)": 0.246139 }, { "epoch": 1.3648143913408033, "grad_norm": 1.2855279445648193, "learning_rate": 8.271505832159947e-05, "loss": 0.16058338880538942, "memory(GiB)": 122.96, "step": 17905, "token_acc": 0.9446550416982562, "train_speed(iter/s)": 0.246163 }, { "epoch": 1.3651955179510633, "grad_norm": 1.1717934608459473, "learning_rate": 8.270600265604951e-05, "loss": 0.094147789478302, "memory(GiB)": 122.96, "step": 17910, "token_acc": 0.9564208354822074, "train_speed(iter/s)": 0.246181 }, { "epoch": 1.3655766445613233, "grad_norm": 0.9194140434265137, "learning_rate": 8.269694511496208e-05, "loss": 0.1140640377998352, "memory(GiB)": 122.96, "step": 17915, "token_acc": 0.9580354367423065, "train_speed(iter/s)": 0.246202 }, { "epoch": 1.3659577711715831, "grad_norm": 0.8268420696258545, "learning_rate": 8.268788569885657e-05, "loss": 0.16119309663772582, "memory(GiB)": 122.96, "step": 17920, "token_acc": 0.9275291828793775, "train_speed(iter/s)": 0.246229 }, { "epoch": 1.3663388977818431, "grad_norm": 0.9691854119300842, "learning_rate": 8.267882440825252e-05, "loss": 0.15487890243530272, "memory(GiB)": 122.96, "step": 17925, "token_acc": 0.9382558879694463, "train_speed(iter/s)": 0.246241 }, { "epoch": 1.366720024392103, "grad_norm": 0.6109914779663086, "learning_rate": 8.266976124366952e-05, "loss": 0.07472390532493592, "memory(GiB)": 122.96, "step": 17930, "token_acc": 0.9679556032939491, "train_speed(iter/s)": 0.246253 }, { "epoch": 1.367101151002363, "grad_norm": 0.49206143617630005, "learning_rate": 8.266069620562733e-05, "loss": 0.11703864336013795, "memory(GiB)": 122.96, "step": 17935, "token_acc": 0.9456373551465576, "train_speed(iter/s)": 0.246267 }, { "epoch": 1.367482277612623, "grad_norm": 0.8435714840888977, "learning_rate": 8.265162929464578e-05, "loss": 0.13197627067565917, "memory(GiB)": 122.96, "step": 17940, "token_acc": 0.9541984732824428, "train_speed(iter/s)": 0.246289 }, { "epoch": 1.3678634042228828, "grad_norm": 0.974494218826294, "learning_rate": 8.26425605112448e-05, "loss": 0.13275246620178222, "memory(GiB)": 122.96, "step": 17945, "token_acc": 0.9594985535197685, "train_speed(iter/s)": 0.24631 }, { "epoch": 1.3682445308331428, "grad_norm": 0.5084572434425354, "learning_rate": 8.263348985594447e-05, "loss": 0.10422978401184083, "memory(GiB)": 122.96, "step": 17950, "token_acc": 0.9617696561673479, "train_speed(iter/s)": 0.246331 }, { "epoch": 1.3686256574434026, "grad_norm": 0.8850142955780029, "learning_rate": 8.262441732926491e-05, "loss": 0.10666424036026001, "memory(GiB)": 122.96, "step": 17955, "token_acc": 0.9582011569322635, "train_speed(iter/s)": 0.246348 }, { "epoch": 1.3690067840536626, "grad_norm": 1.0291752815246582, "learning_rate": 8.261534293172644e-05, "loss": 0.124328875541687, "memory(GiB)": 122.96, "step": 17960, "token_acc": 0.9590654952076677, "train_speed(iter/s)": 0.246365 }, { "epoch": 1.3693879106639226, "grad_norm": 1.0088142156600952, "learning_rate": 8.26062666638494e-05, "loss": 0.1829899549484253, "memory(GiB)": 122.96, "step": 17965, "token_acc": 0.9363027382771183, "train_speed(iter/s)": 0.246372 }, { "epoch": 1.3697690372741824, "grad_norm": 0.7081704139709473, "learning_rate": 8.259718852615428e-05, "loss": 0.11073747873306275, "memory(GiB)": 122.96, "step": 17970, "token_acc": 0.9655437624980773, "train_speed(iter/s)": 0.246378 }, { "epoch": 1.3701501638844424, "grad_norm": 0.8046636581420898, "learning_rate": 8.258810851916165e-05, "loss": 0.09961427450180053, "memory(GiB)": 122.96, "step": 17975, "token_acc": 0.9684716779479872, "train_speed(iter/s)": 0.246392 }, { "epoch": 1.3705312904947022, "grad_norm": 0.7290323972702026, "learning_rate": 8.257902664339223e-05, "loss": 0.13557759523391724, "memory(GiB)": 122.96, "step": 17980, "token_acc": 0.935324746505892, "train_speed(iter/s)": 0.246412 }, { "epoch": 1.3709124171049623, "grad_norm": 0.8848634958267212, "learning_rate": 8.256994289936684e-05, "loss": 0.1504672646522522, "memory(GiB)": 122.96, "step": 17985, "token_acc": 0.9508417508417508, "train_speed(iter/s)": 0.246427 }, { "epoch": 1.3712935437152223, "grad_norm": 1.135188102722168, "learning_rate": 8.256085728760634e-05, "loss": 0.14887114763259887, "memory(GiB)": 122.96, "step": 17990, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 0.246454 }, { "epoch": 1.371674670325482, "grad_norm": 0.3642294406890869, "learning_rate": 8.255176980863179e-05, "loss": 0.1219519853591919, "memory(GiB)": 122.96, "step": 17995, "token_acc": 0.9428571428571428, "train_speed(iter/s)": 0.24648 }, { "epoch": 1.372055796935742, "grad_norm": 1.0374178886413574, "learning_rate": 8.254268046296429e-05, "loss": 0.11976799964904786, "memory(GiB)": 122.96, "step": 18000, "token_acc": 0.9487672670047211, "train_speed(iter/s)": 0.246496 }, { "epoch": 1.372055796935742, "eval_loss": 0.10556968301534653, "eval_runtime": 160.9041, "eval_samples_per_second": 3.294, "eval_steps_per_second": 3.294, "eval_token_acc": 0.9516218902475755, "step": 18000 }, { "epoch": 1.3724369235460019, "grad_norm": 0.8936917185783386, "learning_rate": 8.253358925112508e-05, "loss": 0.1344006896018982, "memory(GiB)": 122.96, "step": 18005, "token_acc": 0.9509125098393633, "train_speed(iter/s)": 0.245969 }, { "epoch": 1.372818050156262, "grad_norm": 0.45324191451072693, "learning_rate": 8.252449617363551e-05, "loss": 0.10691550970077515, "memory(GiB)": 122.96, "step": 18010, "token_acc": 0.9641922436354985, "train_speed(iter/s)": 0.245974 }, { "epoch": 1.373199176766522, "grad_norm": 1.3490302562713623, "learning_rate": 8.251540123101702e-05, "loss": 0.12843425273895265, "memory(GiB)": 122.96, "step": 18015, "token_acc": 0.9534513274336284, "train_speed(iter/s)": 0.245983 }, { "epoch": 1.3735803033767817, "grad_norm": 0.6886975765228271, "learning_rate": 8.250630442379115e-05, "loss": 0.17647043466567994, "memory(GiB)": 122.96, "step": 18020, "token_acc": 0.9448818897637795, "train_speed(iter/s)": 0.246 }, { "epoch": 1.3739614299870417, "grad_norm": 0.8229357600212097, "learning_rate": 8.249720575247957e-05, "loss": 0.11848915815353393, "memory(GiB)": 122.96, "step": 18025, "token_acc": 0.955500797216988, "train_speed(iter/s)": 0.246003 }, { "epoch": 1.3743425565973015, "grad_norm": 0.6941283941268921, "learning_rate": 8.248810521760407e-05, "loss": 0.09479608535766601, "memory(GiB)": 122.96, "step": 18030, "token_acc": 0.9614359617494905, "train_speed(iter/s)": 0.246016 }, { "epoch": 1.3747236832075616, "grad_norm": 1.2863913774490356, "learning_rate": 8.247900281968645e-05, "loss": 0.15749361515045165, "memory(GiB)": 122.96, "step": 18035, "token_acc": 0.9438816855753647, "train_speed(iter/s)": 0.246033 }, { "epoch": 1.3751048098178216, "grad_norm": 0.5278708338737488, "learning_rate": 8.246989855924878e-05, "loss": 0.12265716791152954, "memory(GiB)": 122.96, "step": 18040, "token_acc": 0.9550858652575958, "train_speed(iter/s)": 0.246048 }, { "epoch": 1.3754859364280814, "grad_norm": 1.5825202465057373, "learning_rate": 8.246079243681309e-05, "loss": 0.16010622978210448, "memory(GiB)": 122.96, "step": 18045, "token_acc": 0.9347092677118476, "train_speed(iter/s)": 0.246064 }, { "epoch": 1.3758670630383414, "grad_norm": 1.084681510925293, "learning_rate": 8.245168445290158e-05, "loss": 0.20658977031707765, "memory(GiB)": 122.96, "step": 18050, "token_acc": 0.9391498881431767, "train_speed(iter/s)": 0.246087 }, { "epoch": 1.3762481896486012, "grad_norm": 1.1032503843307495, "learning_rate": 8.244257460803659e-05, "loss": 0.1630723237991333, "memory(GiB)": 122.96, "step": 18055, "token_acc": 0.9244808055380742, "train_speed(iter/s)": 0.246114 }, { "epoch": 1.3766293162588612, "grad_norm": 1.422139286994934, "learning_rate": 8.243346290274047e-05, "loss": 0.2012692928314209, "memory(GiB)": 122.96, "step": 18060, "token_acc": 0.9333964049195838, "train_speed(iter/s)": 0.246132 }, { "epoch": 1.3770104428691212, "grad_norm": 1.087647557258606, "learning_rate": 8.242434933753578e-05, "loss": 0.09678897261619568, "memory(GiB)": 122.96, "step": 18065, "token_acc": 0.954467564259486, "train_speed(iter/s)": 0.246147 }, { "epoch": 1.377391569479381, "grad_norm": 0.6008079648017883, "learning_rate": 8.241523391294513e-05, "loss": 0.12846691608428956, "memory(GiB)": 122.96, "step": 18070, "token_acc": 0.9601556708673091, "train_speed(iter/s)": 0.246156 }, { "epoch": 1.377772696089641, "grad_norm": 1.2556928396224976, "learning_rate": 8.240611662949122e-05, "loss": 0.10868494510650635, "memory(GiB)": 122.96, "step": 18075, "token_acc": 0.9469147005444646, "train_speed(iter/s)": 0.246182 }, { "epoch": 1.3781538226999008, "grad_norm": 0.8896218538284302, "learning_rate": 8.239699748769691e-05, "loss": 0.13121745586395264, "memory(GiB)": 122.96, "step": 18080, "token_acc": 0.9460830108100894, "train_speed(iter/s)": 0.246192 }, { "epoch": 1.3785349493101609, "grad_norm": 0.4368506371974945, "learning_rate": 8.238787648808512e-05, "loss": 0.10223814249038696, "memory(GiB)": 122.96, "step": 18085, "token_acc": 0.9565772669220945, "train_speed(iter/s)": 0.246204 }, { "epoch": 1.3789160759204209, "grad_norm": 0.7719905972480774, "learning_rate": 8.237875363117895e-05, "loss": 0.12547726631164552, "memory(GiB)": 122.96, "step": 18090, "token_acc": 0.956474428726877, "train_speed(iter/s)": 0.246213 }, { "epoch": 1.3792972025306807, "grad_norm": 0.803806722164154, "learning_rate": 8.236962891750149e-05, "loss": 0.12150166034698487, "memory(GiB)": 122.96, "step": 18095, "token_acc": 0.9447890818858561, "train_speed(iter/s)": 0.246237 }, { "epoch": 1.3796783291409407, "grad_norm": 1.0717142820358276, "learning_rate": 8.236050234757602e-05, "loss": 0.1570334553718567, "memory(GiB)": 122.96, "step": 18100, "token_acc": 0.9244069159630076, "train_speed(iter/s)": 0.246259 }, { "epoch": 1.3800594557512005, "grad_norm": 0.4425473213195801, "learning_rate": 8.235137392192592e-05, "loss": 0.09883411526679993, "memory(GiB)": 122.96, "step": 18105, "token_acc": 0.951958224543081, "train_speed(iter/s)": 0.24628 }, { "epoch": 1.3804405823614605, "grad_norm": 0.954807698726654, "learning_rate": 8.234224364107466e-05, "loss": 0.11676779985427857, "memory(GiB)": 122.96, "step": 18110, "token_acc": 0.9512258064516129, "train_speed(iter/s)": 0.246299 }, { "epoch": 1.3808217089717205, "grad_norm": 0.6995159387588501, "learning_rate": 8.233311150554582e-05, "loss": 0.11575464010238648, "memory(GiB)": 122.96, "step": 18115, "token_acc": 0.9480493228219122, "train_speed(iter/s)": 0.246315 }, { "epoch": 1.3812028355819803, "grad_norm": 0.4467090666294098, "learning_rate": 8.232397751586304e-05, "loss": 0.1280052900314331, "memory(GiB)": 122.96, "step": 18120, "token_acc": 0.9448345035105316, "train_speed(iter/s)": 0.246338 }, { "epoch": 1.3815839621922403, "grad_norm": 1.6978009939193726, "learning_rate": 8.231484167255019e-05, "loss": 0.12066982984542847, "memory(GiB)": 122.96, "step": 18125, "token_acc": 0.9557305110996386, "train_speed(iter/s)": 0.246343 }, { "epoch": 1.3819650888025001, "grad_norm": 0.8319315314292908, "learning_rate": 8.230570397613111e-05, "loss": 0.12117900848388671, "memory(GiB)": 122.96, "step": 18130, "token_acc": 0.9601789709172259, "train_speed(iter/s)": 0.246351 }, { "epoch": 1.3823462154127601, "grad_norm": 1.2421098947525024, "learning_rate": 8.229656442712983e-05, "loss": 0.18607721328735352, "memory(GiB)": 122.96, "step": 18135, "token_acc": 0.924357034795764, "train_speed(iter/s)": 0.246364 }, { "epoch": 1.3827273420230202, "grad_norm": 0.45954227447509766, "learning_rate": 8.228742302607045e-05, "loss": 0.08876882791519165, "memory(GiB)": 122.96, "step": 18140, "token_acc": 0.9598711914788209, "train_speed(iter/s)": 0.246386 }, { "epoch": 1.38310846863328, "grad_norm": 0.7441730499267578, "learning_rate": 8.22782797734772e-05, "loss": 0.16405327320098878, "memory(GiB)": 122.96, "step": 18145, "token_acc": 0.9367260390161154, "train_speed(iter/s)": 0.246402 }, { "epoch": 1.38348959524354, "grad_norm": 0.8882571458816528, "learning_rate": 8.226913466987438e-05, "loss": 0.09578163623809814, "memory(GiB)": 122.96, "step": 18150, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.24642 }, { "epoch": 1.3838707218537998, "grad_norm": 0.23473678529262543, "learning_rate": 8.225998771578645e-05, "loss": 0.10532549619674683, "memory(GiB)": 122.96, "step": 18155, "token_acc": 0.9447200566973778, "train_speed(iter/s)": 0.246444 }, { "epoch": 1.3842518484640598, "grad_norm": 0.8115023374557495, "learning_rate": 8.225083891173792e-05, "loss": 0.19637296199798585, "memory(GiB)": 122.96, "step": 18160, "token_acc": 0.9246612466124661, "train_speed(iter/s)": 0.246451 }, { "epoch": 1.3846329750743198, "grad_norm": 1.1889185905456543, "learning_rate": 8.224168825825345e-05, "loss": 0.11275131702423095, "memory(GiB)": 122.96, "step": 18165, "token_acc": 0.9499618029029794, "train_speed(iter/s)": 0.246475 }, { "epoch": 1.3850141016845796, "grad_norm": 0.8904452323913574, "learning_rate": 8.223253575585776e-05, "loss": 0.11512763500213623, "memory(GiB)": 122.96, "step": 18170, "token_acc": 0.9564375605033882, "train_speed(iter/s)": 0.246472 }, { "epoch": 1.3853952282948394, "grad_norm": 0.558654248714447, "learning_rate": 8.222338140507575e-05, "loss": 0.11917814016342163, "memory(GiB)": 122.96, "step": 18175, "token_acc": 0.9551599819738621, "train_speed(iter/s)": 0.246477 }, { "epoch": 1.3857763549050994, "grad_norm": 0.6444247961044312, "learning_rate": 8.221422520643234e-05, "loss": 0.11732649803161621, "memory(GiB)": 122.96, "step": 18180, "token_acc": 0.9503080898429735, "train_speed(iter/s)": 0.246489 }, { "epoch": 1.3861574815153594, "grad_norm": 0.9569051861763, "learning_rate": 8.22050671604526e-05, "loss": 0.11414519548416138, "memory(GiB)": 122.96, "step": 18185, "token_acc": 0.9591812447935261, "train_speed(iter/s)": 0.246498 }, { "epoch": 1.3865386081256195, "grad_norm": 1.148328185081482, "learning_rate": 8.219590726766171e-05, "loss": 0.14313658475875854, "memory(GiB)": 122.96, "step": 18190, "token_acc": 0.9452187379016648, "train_speed(iter/s)": 0.246513 }, { "epoch": 1.3869197347358793, "grad_norm": 0.9692733883857727, "learning_rate": 8.218674552858494e-05, "loss": 0.15794689655303956, "memory(GiB)": 122.96, "step": 18195, "token_acc": 0.9492404594294183, "train_speed(iter/s)": 0.246538 }, { "epoch": 1.387300861346139, "grad_norm": 0.6120030879974365, "learning_rate": 8.217758194374769e-05, "loss": 0.154066002368927, "memory(GiB)": 122.96, "step": 18200, "token_acc": 0.9381902093291578, "train_speed(iter/s)": 0.246552 }, { "epoch": 1.387300861346139, "eval_loss": 0.10513678193092346, "eval_runtime": 162.4132, "eval_samples_per_second": 3.263, "eval_steps_per_second": 3.263, "eval_token_acc": 0.952442623938317, "step": 18200 }, { "epoch": 1.387681987956399, "grad_norm": 1.1022573709487915, "learning_rate": 8.216841651367541e-05, "loss": 0.09991744756698609, "memory(GiB)": 122.96, "step": 18205, "token_acc": 0.9530609772609046, "train_speed(iter/s)": 0.246021 }, { "epoch": 1.388063114566659, "grad_norm": 0.5583586096763611, "learning_rate": 8.215924923889375e-05, "loss": 0.09317960143089295, "memory(GiB)": 122.96, "step": 18210, "token_acc": 0.9648814749780509, "train_speed(iter/s)": 0.246042 }, { "epoch": 1.388444241176919, "grad_norm": 0.12464092671871185, "learning_rate": 8.215008011992837e-05, "loss": 0.1480857253074646, "memory(GiB)": 122.96, "step": 18215, "token_acc": 0.9445672504676076, "train_speed(iter/s)": 0.246057 }, { "epoch": 1.388825367787179, "grad_norm": 0.6729146242141724, "learning_rate": 8.21409091573051e-05, "loss": 0.11179187297821044, "memory(GiB)": 122.96, "step": 18220, "token_acc": 0.9579610370587374, "train_speed(iter/s)": 0.246066 }, { "epoch": 1.3892064943974387, "grad_norm": 0.5150517821311951, "learning_rate": 8.213173635154985e-05, "loss": 0.0917933464050293, "memory(GiB)": 122.96, "step": 18225, "token_acc": 0.9647311827956989, "train_speed(iter/s)": 0.246079 }, { "epoch": 1.3895876210076987, "grad_norm": 0.6624912023544312, "learning_rate": 8.212256170318861e-05, "loss": 0.10810695886611939, "memory(GiB)": 122.96, "step": 18230, "token_acc": 0.9570655441972339, "train_speed(iter/s)": 0.246085 }, { "epoch": 1.3899687476179587, "grad_norm": 0.5920590162277222, "learning_rate": 8.211338521274754e-05, "loss": 0.1038819432258606, "memory(GiB)": 122.96, "step": 18235, "token_acc": 0.9689461513049223, "train_speed(iter/s)": 0.246098 }, { "epoch": 1.3903498742282185, "grad_norm": 1.125763177871704, "learning_rate": 8.210420688075284e-05, "loss": 0.12476167678833008, "memory(GiB)": 122.96, "step": 18240, "token_acc": 0.954566037735849, "train_speed(iter/s)": 0.246112 }, { "epoch": 1.3907310008384786, "grad_norm": 0.7007716298103333, "learning_rate": 8.209502670773085e-05, "loss": 0.0969914972782135, "memory(GiB)": 122.96, "step": 18245, "token_acc": 0.962416578854935, "train_speed(iter/s)": 0.246119 }, { "epoch": 1.3911121274487384, "grad_norm": 0.9235966801643372, "learning_rate": 8.208584469420804e-05, "loss": 0.13488421440124512, "memory(GiB)": 122.96, "step": 18250, "token_acc": 0.9404175463288764, "train_speed(iter/s)": 0.246135 }, { "epoch": 1.3914932540589984, "grad_norm": 0.5928240418434143, "learning_rate": 8.207666084071093e-05, "loss": 0.08261986970901489, "memory(GiB)": 122.96, "step": 18255, "token_acc": 0.9684904416611734, "train_speed(iter/s)": 0.246148 }, { "epoch": 1.3918743806692584, "grad_norm": 1.3564022779464722, "learning_rate": 8.206747514776616e-05, "loss": 0.08748424649238587, "memory(GiB)": 122.96, "step": 18260, "token_acc": 0.9510974201001156, "train_speed(iter/s)": 0.246171 }, { "epoch": 1.3922555072795182, "grad_norm": 1.1536822319030762, "learning_rate": 8.20582876159005e-05, "loss": 0.1377529978752136, "memory(GiB)": 122.96, "step": 18265, "token_acc": 0.9394889103182257, "train_speed(iter/s)": 0.246192 }, { "epoch": 1.3926366338897782, "grad_norm": 1.7612333297729492, "learning_rate": 8.204909824564082e-05, "loss": 0.1368294596672058, "memory(GiB)": 122.96, "step": 18270, "token_acc": 0.9411939411939412, "train_speed(iter/s)": 0.246213 }, { "epoch": 1.393017760500038, "grad_norm": 1.1100726127624512, "learning_rate": 8.20399070375141e-05, "loss": 0.08371630907058716, "memory(GiB)": 122.96, "step": 18275, "token_acc": 0.974389755902361, "train_speed(iter/s)": 0.246237 }, { "epoch": 1.393398887110298, "grad_norm": 1.144019365310669, "learning_rate": 8.203071399204737e-05, "loss": 0.1365604043006897, "memory(GiB)": 122.96, "step": 18280, "token_acc": 0.9365411436541143, "train_speed(iter/s)": 0.246259 }, { "epoch": 1.393780013720558, "grad_norm": 0.8807190656661987, "learning_rate": 8.202151910976785e-05, "loss": 0.11065781116485596, "memory(GiB)": 122.96, "step": 18285, "token_acc": 0.954520697167756, "train_speed(iter/s)": 0.246281 }, { "epoch": 1.3941611403308178, "grad_norm": 0.8500208854675293, "learning_rate": 8.201232239120279e-05, "loss": 0.10109958648681641, "memory(GiB)": 122.96, "step": 18290, "token_acc": 0.9564670357322597, "train_speed(iter/s)": 0.246302 }, { "epoch": 1.3945422669410779, "grad_norm": 0.7054122686386108, "learning_rate": 8.20031238368796e-05, "loss": 0.1048314094543457, "memory(GiB)": 122.96, "step": 18295, "token_acc": 0.9614573076382051, "train_speed(iter/s)": 0.246311 }, { "epoch": 1.3949233935513377, "grad_norm": 1.013264775276184, "learning_rate": 8.199392344732578e-05, "loss": 0.12618597745895385, "memory(GiB)": 122.96, "step": 18300, "token_acc": 0.9525449674703406, "train_speed(iter/s)": 0.246334 }, { "epoch": 1.3953045201615977, "grad_norm": 1.185962200164795, "learning_rate": 8.198472122306893e-05, "loss": 0.16850589513778685, "memory(GiB)": 122.96, "step": 18305, "token_acc": 0.938801261829653, "train_speed(iter/s)": 0.246348 }, { "epoch": 1.3956856467718577, "grad_norm": 0.9958047270774841, "learning_rate": 8.197551716463672e-05, "loss": 0.12407782077789306, "memory(GiB)": 122.96, "step": 18310, "token_acc": 0.9596340150699677, "train_speed(iter/s)": 0.246368 }, { "epoch": 1.3960667733821175, "grad_norm": 1.0537302494049072, "learning_rate": 8.1966311272557e-05, "loss": 0.14571125507354737, "memory(GiB)": 122.96, "step": 18315, "token_acc": 0.9358014966522253, "train_speed(iter/s)": 0.24639 }, { "epoch": 1.3964478999923775, "grad_norm": 1.4046000242233276, "learning_rate": 8.195710354735766e-05, "loss": 0.1646967887878418, "memory(GiB)": 122.96, "step": 18320, "token_acc": 0.9291666666666667, "train_speed(iter/s)": 0.246411 }, { "epoch": 1.3968290266026373, "grad_norm": 0.95902419090271, "learning_rate": 8.194789398956673e-05, "loss": 0.16405644416809081, "memory(GiB)": 122.96, "step": 18325, "token_acc": 0.93956795187312, "train_speed(iter/s)": 0.246431 }, { "epoch": 1.3972101532128973, "grad_norm": 0.5269853472709656, "learning_rate": 8.193868259971236e-05, "loss": 0.14841848611831665, "memory(GiB)": 122.96, "step": 18330, "token_acc": 0.9386781739722916, "train_speed(iter/s)": 0.246445 }, { "epoch": 1.3975912798231573, "grad_norm": 1.1013139486312866, "learning_rate": 8.192946937832273e-05, "loss": 0.09789665937423705, "memory(GiB)": 122.96, "step": 18335, "token_acc": 0.9562720848056537, "train_speed(iter/s)": 0.24647 }, { "epoch": 1.3979724064334171, "grad_norm": 0.7420839071273804, "learning_rate": 8.19202543259262e-05, "loss": 0.12652567625045777, "memory(GiB)": 122.96, "step": 18340, "token_acc": 0.9533906050082251, "train_speed(iter/s)": 0.246484 }, { "epoch": 1.3983535330436772, "grad_norm": 1.3114731311798096, "learning_rate": 8.191103744305123e-05, "loss": 0.11934573650360107, "memory(GiB)": 122.96, "step": 18345, "token_acc": 0.9542146755609461, "train_speed(iter/s)": 0.246507 }, { "epoch": 1.398734659653937, "grad_norm": 0.7111780643463135, "learning_rate": 8.190181873022634e-05, "loss": 0.09125722646713257, "memory(GiB)": 122.96, "step": 18350, "token_acc": 0.9627312658059364, "train_speed(iter/s)": 0.24651 }, { "epoch": 1.399115786264197, "grad_norm": 0.8680695295333862, "learning_rate": 8.18925981879802e-05, "loss": 0.15024205446243286, "memory(GiB)": 122.96, "step": 18355, "token_acc": 0.945898161244696, "train_speed(iter/s)": 0.246524 }, { "epoch": 1.399496912874457, "grad_norm": 0.7217831015586853, "learning_rate": 8.188337581684153e-05, "loss": 0.12381165027618408, "memory(GiB)": 122.96, "step": 18360, "token_acc": 0.9510852302805717, "train_speed(iter/s)": 0.246522 }, { "epoch": 1.3998780394847168, "grad_norm": 0.6787427663803101, "learning_rate": 8.187415161733924e-05, "loss": 0.1253532886505127, "memory(GiB)": 122.96, "step": 18365, "token_acc": 0.952762547448334, "train_speed(iter/s)": 0.246535 }, { "epoch": 1.4002591660949768, "grad_norm": 0.6483678817749023, "learning_rate": 8.186492559000225e-05, "loss": 0.10359303951263428, "memory(GiB)": 122.96, "step": 18370, "token_acc": 0.9552165354330708, "train_speed(iter/s)": 0.246553 }, { "epoch": 1.4006402927052366, "grad_norm": 1.2542468309402466, "learning_rate": 8.185569773535966e-05, "loss": 0.10218816995620728, "memory(GiB)": 122.96, "step": 18375, "token_acc": 0.9534168834239773, "train_speed(iter/s)": 0.246574 }, { "epoch": 1.4010214193154966, "grad_norm": 1.207767367362976, "learning_rate": 8.184646805394063e-05, "loss": 0.12504873275756836, "memory(GiB)": 122.96, "step": 18380, "token_acc": 0.9365455502896262, "train_speed(iter/s)": 0.246595 }, { "epoch": 1.4014025459257566, "grad_norm": 1.8677887916564941, "learning_rate": 8.183723654627444e-05, "loss": 0.1507176637649536, "memory(GiB)": 122.96, "step": 18385, "token_acc": 0.9484002509410289, "train_speed(iter/s)": 0.246607 }, { "epoch": 1.4017836725360164, "grad_norm": 0.9721167087554932, "learning_rate": 8.182800321289047e-05, "loss": 0.16029274463653564, "memory(GiB)": 122.96, "step": 18390, "token_acc": 0.9440866510538641, "train_speed(iter/s)": 0.246617 }, { "epoch": 1.4021647991462765, "grad_norm": 0.16145023703575134, "learning_rate": 8.181876805431823e-05, "loss": 0.10199804306030273, "memory(GiB)": 122.96, "step": 18395, "token_acc": 0.9563287768507013, "train_speed(iter/s)": 0.24663 }, { "epoch": 1.4025459257565362, "grad_norm": 1.8744113445281982, "learning_rate": 8.18095310710873e-05, "loss": 0.14973278045654298, "memory(GiB)": 122.96, "step": 18400, "token_acc": 0.943858168003377, "train_speed(iter/s)": 0.246646 }, { "epoch": 1.4025459257565362, "eval_loss": 0.10320473462343216, "eval_runtime": 160.3622, "eval_samples_per_second": 3.305, "eval_steps_per_second": 3.305, "eval_token_acc": 0.9527061622793808, "step": 18400 }, { "epoch": 1.4029270523667963, "grad_norm": 1.2142605781555176, "learning_rate": 8.180029226372736e-05, "loss": 0.08121050596237182, "memory(GiB)": 122.96, "step": 18405, "token_acc": 0.9529249180376071, "train_speed(iter/s)": 0.246134 }, { "epoch": 1.4033081789770563, "grad_norm": 1.3426215648651123, "learning_rate": 8.179105163276823e-05, "loss": 0.11242272853851318, "memory(GiB)": 122.96, "step": 18410, "token_acc": 0.9478949562317632, "train_speed(iter/s)": 0.246156 }, { "epoch": 1.403689305587316, "grad_norm": 0.9762861728668213, "learning_rate": 8.178180917873984e-05, "loss": 0.14818623065948486, "memory(GiB)": 122.96, "step": 18415, "token_acc": 0.9284578696343402, "train_speed(iter/s)": 0.246175 }, { "epoch": 1.404070432197576, "grad_norm": 0.7943832278251648, "learning_rate": 8.177256490217216e-05, "loss": 0.16130706071853637, "memory(GiB)": 122.96, "step": 18420, "token_acc": 0.9407176287051482, "train_speed(iter/s)": 0.246188 }, { "epoch": 1.404451558807836, "grad_norm": 0.4993910491466522, "learning_rate": 8.176331880359535e-05, "loss": 0.13296910524368286, "memory(GiB)": 122.96, "step": 18425, "token_acc": 0.9519466853735531, "train_speed(iter/s)": 0.246198 }, { "epoch": 1.404832685418096, "grad_norm": 0.64460289478302, "learning_rate": 8.175407088353957e-05, "loss": 0.1346402049064636, "memory(GiB)": 122.96, "step": 18430, "token_acc": 0.9527158461768667, "train_speed(iter/s)": 0.246212 }, { "epoch": 1.405213812028356, "grad_norm": 1.8868021965026855, "learning_rate": 8.174482114253522e-05, "loss": 0.1428571105003357, "memory(GiB)": 122.96, "step": 18435, "token_acc": 0.9436619718309859, "train_speed(iter/s)": 0.246238 }, { "epoch": 1.4055949386386157, "grad_norm": 0.6482090353965759, "learning_rate": 8.173556958111266e-05, "loss": 0.12975727319717406, "memory(GiB)": 122.96, "step": 18440, "token_acc": 0.9552290606200833, "train_speed(iter/s)": 0.246242 }, { "epoch": 1.4059760652488758, "grad_norm": 0.7293150424957275, "learning_rate": 8.172631619980246e-05, "loss": 0.1396550178527832, "memory(GiB)": 122.96, "step": 18445, "token_acc": 0.9524506217995611, "train_speed(iter/s)": 0.246256 }, { "epoch": 1.4063571918591355, "grad_norm": 1.014978289604187, "learning_rate": 8.171706099913527e-05, "loss": 0.12642989158630372, "memory(GiB)": 122.96, "step": 18450, "token_acc": 0.9482280431432973, "train_speed(iter/s)": 0.246275 }, { "epoch": 1.4067383184693956, "grad_norm": 1.206221580505371, "learning_rate": 8.170780397964182e-05, "loss": 0.10709239244461059, "memory(GiB)": 122.96, "step": 18455, "token_acc": 0.9652525252525253, "train_speed(iter/s)": 0.246294 }, { "epoch": 1.4071194450796556, "grad_norm": 0.9043061137199402, "learning_rate": 8.169854514185294e-05, "loss": 0.10309563875198365, "memory(GiB)": 122.96, "step": 18460, "token_acc": 0.9595172624503514, "train_speed(iter/s)": 0.246306 }, { "epoch": 1.4075005716899154, "grad_norm": 1.062195897102356, "learning_rate": 8.16892844862996e-05, "loss": 0.11574127674102783, "memory(GiB)": 122.96, "step": 18465, "token_acc": 0.9554785390562517, "train_speed(iter/s)": 0.246328 }, { "epoch": 1.4078816983001754, "grad_norm": 0.7399715781211853, "learning_rate": 8.168002201351283e-05, "loss": 0.14091129302978517, "memory(GiB)": 122.96, "step": 18470, "token_acc": 0.9486703772418058, "train_speed(iter/s)": 0.246343 }, { "epoch": 1.4082628249104352, "grad_norm": 0.40641507506370544, "learning_rate": 8.167075772402383e-05, "loss": 0.08318618535995484, "memory(GiB)": 122.96, "step": 18475, "token_acc": 0.9507684918347743, "train_speed(iter/s)": 0.24636 }, { "epoch": 1.4086439515206952, "grad_norm": 0.9433817267417908, "learning_rate": 8.166149161836385e-05, "loss": 0.09168183207511901, "memory(GiB)": 122.96, "step": 18480, "token_acc": 0.9647467036780014, "train_speed(iter/s)": 0.246363 }, { "epoch": 1.4090250781309552, "grad_norm": 0.9048735499382019, "learning_rate": 8.165222369706426e-05, "loss": 0.17582111358642577, "memory(GiB)": 122.96, "step": 18485, "token_acc": 0.9302117506710409, "train_speed(iter/s)": 0.246388 }, { "epoch": 1.409406204741215, "grad_norm": 0.6095515489578247, "learning_rate": 8.164295396065651e-05, "loss": 0.07898681759834289, "memory(GiB)": 122.96, "step": 18490, "token_acc": 0.9631639063392348, "train_speed(iter/s)": 0.246407 }, { "epoch": 1.4097873313514748, "grad_norm": 0.7602821588516235, "learning_rate": 8.16336824096722e-05, "loss": 0.11943141222000123, "memory(GiB)": 122.96, "step": 18495, "token_acc": 0.9526553672316385, "train_speed(iter/s)": 0.246402 }, { "epoch": 1.4101684579617348, "grad_norm": 0.5212048888206482, "learning_rate": 8.1624409044643e-05, "loss": 0.13339110612869262, "memory(GiB)": 122.96, "step": 18500, "token_acc": 0.9504244482173175, "train_speed(iter/s)": 0.246422 }, { "epoch": 1.4105495845719949, "grad_norm": 0.791562557220459, "learning_rate": 8.161513386610068e-05, "loss": 0.08556466698646545, "memory(GiB)": 122.96, "step": 18505, "token_acc": 0.961061189559264, "train_speed(iter/s)": 0.24644 }, { "epoch": 1.4109307111822549, "grad_norm": 1.1346789598464966, "learning_rate": 8.160585687457715e-05, "loss": 0.17056045532226563, "memory(GiB)": 122.96, "step": 18510, "token_acc": 0.9273207796498183, "train_speed(iter/s)": 0.24646 }, { "epoch": 1.4113118377925147, "grad_norm": 0.7100658416748047, "learning_rate": 8.159657807060441e-05, "loss": 0.11528911590576171, "memory(GiB)": 122.96, "step": 18515, "token_acc": 0.953519256308101, "train_speed(iter/s)": 0.246478 }, { "epoch": 1.4116929644027745, "grad_norm": 0.9273611307144165, "learning_rate": 8.158729745471454e-05, "loss": 0.10855063199996948, "memory(GiB)": 122.96, "step": 18520, "token_acc": 0.9600835945663532, "train_speed(iter/s)": 0.246496 }, { "epoch": 1.4120740910130345, "grad_norm": 0.6418458223342896, "learning_rate": 8.157801502743975e-05, "loss": 0.12325654029846192, "memory(GiB)": 122.96, "step": 18525, "token_acc": 0.9537140724150802, "train_speed(iter/s)": 0.246518 }, { "epoch": 1.4124552176232945, "grad_norm": 0.8599861264228821, "learning_rate": 8.156873078931233e-05, "loss": 0.10880006551742553, "memory(GiB)": 122.96, "step": 18530, "token_acc": 0.9562413634269922, "train_speed(iter/s)": 0.246531 }, { "epoch": 1.4128363442335543, "grad_norm": 0.7725469470024109, "learning_rate": 8.15594447408647e-05, "loss": 0.12699611186981202, "memory(GiB)": 122.96, "step": 18535, "token_acc": 0.9428571428571428, "train_speed(iter/s)": 0.246557 }, { "epoch": 1.4132174708438143, "grad_norm": 0.5519210696220398, "learning_rate": 8.155015688262934e-05, "loss": 0.13941999673843383, "memory(GiB)": 122.96, "step": 18540, "token_acc": 0.9533449799609496, "train_speed(iter/s)": 0.246563 }, { "epoch": 1.4135985974540741, "grad_norm": 0.29357147216796875, "learning_rate": 8.154086721513894e-05, "loss": 0.1513966679573059, "memory(GiB)": 122.96, "step": 18545, "token_acc": 0.9516941391941391, "train_speed(iter/s)": 0.246579 }, { "epoch": 1.4139797240643341, "grad_norm": 1.7423571348190308, "learning_rate": 8.153157573892614e-05, "loss": 0.12263892889022827, "memory(GiB)": 122.96, "step": 18550, "token_acc": 0.95578231292517, "train_speed(iter/s)": 0.246602 }, { "epoch": 1.4143608506745942, "grad_norm": 0.8322332501411438, "learning_rate": 8.152228245452381e-05, "loss": 0.08305848836898803, "memory(GiB)": 122.96, "step": 18555, "token_acc": 0.9636085626911315, "train_speed(iter/s)": 0.246623 }, { "epoch": 1.414741977284854, "grad_norm": 0.8521695733070374, "learning_rate": 8.151298736246486e-05, "loss": 0.16248252391815185, "memory(GiB)": 122.96, "step": 18560, "token_acc": 0.9456869009584664, "train_speed(iter/s)": 0.246631 }, { "epoch": 1.415123103895114, "grad_norm": 1.0960078239440918, "learning_rate": 8.150369046328233e-05, "loss": 0.08836096525192261, "memory(GiB)": 122.96, "step": 18565, "token_acc": 0.9554865424430642, "train_speed(iter/s)": 0.246652 }, { "epoch": 1.4155042305053738, "grad_norm": 1.0672720670700073, "learning_rate": 8.149439175750933e-05, "loss": 0.15664026737213135, "memory(GiB)": 122.96, "step": 18570, "token_acc": 0.9309090909090909, "train_speed(iter/s)": 0.246675 }, { "epoch": 1.4158853571156338, "grad_norm": 1.688234567642212, "learning_rate": 8.148509124567913e-05, "loss": 0.20674436092376708, "memory(GiB)": 122.96, "step": 18575, "token_acc": 0.9268852459016393, "train_speed(iter/s)": 0.246696 }, { "epoch": 1.4162664837258938, "grad_norm": 0.5362254977226257, "learning_rate": 8.147578892832504e-05, "loss": 0.16085220575332643, "memory(GiB)": 122.96, "step": 18580, "token_acc": 0.9428197293758184, "train_speed(iter/s)": 0.246707 }, { "epoch": 1.4166476103361536, "grad_norm": 0.518570601940155, "learning_rate": 8.146648480598054e-05, "loss": 0.14986461400985718, "memory(GiB)": 122.96, "step": 18585, "token_acc": 0.9420970266040689, "train_speed(iter/s)": 0.246719 }, { "epoch": 1.4170287369464136, "grad_norm": 1.705245852470398, "learning_rate": 8.145717887917915e-05, "loss": 0.12895435094833374, "memory(GiB)": 122.96, "step": 18590, "token_acc": 0.9496243923994697, "train_speed(iter/s)": 0.246744 }, { "epoch": 1.4174098635566734, "grad_norm": 0.6099913716316223, "learning_rate": 8.144787114845453e-05, "loss": 0.09701173305511475, "memory(GiB)": 122.96, "step": 18595, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 0.246766 }, { "epoch": 1.4177909901669334, "grad_norm": 1.1442960500717163, "learning_rate": 8.143856161434043e-05, "loss": 0.09798418283462525, "memory(GiB)": 122.96, "step": 18600, "token_acc": 0.9490534521158129, "train_speed(iter/s)": 0.246789 }, { "epoch": 1.4177909901669334, "eval_loss": 0.10525099188089371, "eval_runtime": 159.3102, "eval_samples_per_second": 3.327, "eval_steps_per_second": 3.327, "eval_token_acc": 0.9520059032588398, "step": 18600 }, { "epoch": 1.4181721167771935, "grad_norm": 0.6838528513908386, "learning_rate": 8.142925027737072e-05, "loss": 0.11573319435119629, "memory(GiB)": 122.96, "step": 18605, "token_acc": 0.9514552547239584, "train_speed(iter/s)": 0.246288 }, { "epoch": 1.4185532433874533, "grad_norm": 0.9842451214790344, "learning_rate": 8.141993713807934e-05, "loss": 0.11576131582260132, "memory(GiB)": 122.96, "step": 18610, "token_acc": 0.9599628999845417, "train_speed(iter/s)": 0.246297 }, { "epoch": 1.4189343699977133, "grad_norm": 1.5666344165802002, "learning_rate": 8.141062219700039e-05, "loss": 0.13617385625839235, "memory(GiB)": 122.96, "step": 18615, "token_acc": 0.9514661274014156, "train_speed(iter/s)": 0.246307 }, { "epoch": 1.419315496607973, "grad_norm": 1.377042293548584, "learning_rate": 8.140130545466802e-05, "loss": 0.11497244834899903, "memory(GiB)": 122.96, "step": 18620, "token_acc": 0.951213282247765, "train_speed(iter/s)": 0.246326 }, { "epoch": 1.419696623218233, "grad_norm": 0.6518558263778687, "learning_rate": 8.13919869116165e-05, "loss": 0.1697978138923645, "memory(GiB)": 122.96, "step": 18625, "token_acc": 0.940002419257288, "train_speed(iter/s)": 0.246329 }, { "epoch": 1.420077749828493, "grad_norm": 0.6899514198303223, "learning_rate": 8.13826665683802e-05, "loss": 0.1616070032119751, "memory(GiB)": 122.96, "step": 18630, "token_acc": 0.9433686518085495, "train_speed(iter/s)": 0.246353 }, { "epoch": 1.420458876438753, "grad_norm": 1.0554395914077759, "learning_rate": 8.13733444254936e-05, "loss": 0.14253787994384765, "memory(GiB)": 122.96, "step": 18635, "token_acc": 0.9472913616398243, "train_speed(iter/s)": 0.246368 }, { "epoch": 1.420840003049013, "grad_norm": 1.558193325996399, "learning_rate": 8.136402048349132e-05, "loss": 0.1398906707763672, "memory(GiB)": 122.96, "step": 18640, "token_acc": 0.947289905519642, "train_speed(iter/s)": 0.246387 }, { "epoch": 1.4212211296592727, "grad_norm": 0.6901738047599792, "learning_rate": 8.135469474290797e-05, "loss": 0.1218982219696045, "memory(GiB)": 122.96, "step": 18645, "token_acc": 0.9547489983502239, "train_speed(iter/s)": 0.246404 }, { "epoch": 1.4216022562695327, "grad_norm": 0.9578468203544617, "learning_rate": 8.13453672042784e-05, "loss": 0.10346074104309082, "memory(GiB)": 122.96, "step": 18650, "token_acc": 0.956984667802385, "train_speed(iter/s)": 0.246427 }, { "epoch": 1.4219833828797928, "grad_norm": 0.7844031453132629, "learning_rate": 8.13360378681375e-05, "loss": 0.10355440378189087, "memory(GiB)": 122.96, "step": 18655, "token_acc": 0.9595024587792884, "train_speed(iter/s)": 0.246432 }, { "epoch": 1.4223645094900526, "grad_norm": 1.0240603685379028, "learning_rate": 8.132670673502022e-05, "loss": 0.1594296932220459, "memory(GiB)": 122.96, "step": 18660, "token_acc": 0.9134020618556701, "train_speed(iter/s)": 0.246456 }, { "epoch": 1.4227456361003126, "grad_norm": 0.47735917568206787, "learning_rate": 8.131737380546169e-05, "loss": 0.11618781089782715, "memory(GiB)": 122.96, "step": 18665, "token_acc": 0.952177886079805, "train_speed(iter/s)": 0.246478 }, { "epoch": 1.4231267627105724, "grad_norm": 0.95379239320755, "learning_rate": 8.130803907999709e-05, "loss": 0.07237839698791504, "memory(GiB)": 122.96, "step": 18670, "token_acc": 0.9677043933663079, "train_speed(iter/s)": 0.246497 }, { "epoch": 1.4235078893208324, "grad_norm": 0.8878023624420166, "learning_rate": 8.129870255916176e-05, "loss": 0.13588590621948243, "memory(GiB)": 122.96, "step": 18675, "token_acc": 0.9458155501948413, "train_speed(iter/s)": 0.246513 }, { "epoch": 1.4238890159310924, "grad_norm": 1.6210001707077026, "learning_rate": 8.128936424349106e-05, "loss": 0.13079198598861694, "memory(GiB)": 122.96, "step": 18680, "token_acc": 0.9538310412573674, "train_speed(iter/s)": 0.246537 }, { "epoch": 1.4242701425413522, "grad_norm": 0.9744818210601807, "learning_rate": 8.128002413352053e-05, "loss": 0.10710233449935913, "memory(GiB)": 122.96, "step": 18685, "token_acc": 0.9540128512032254, "train_speed(iter/s)": 0.24654 }, { "epoch": 1.4246512691516122, "grad_norm": 1.2290986776351929, "learning_rate": 8.127068222978578e-05, "loss": 0.17551583051681519, "memory(GiB)": 122.96, "step": 18690, "token_acc": 0.9245283018867925, "train_speed(iter/s)": 0.246565 }, { "epoch": 1.425032395761872, "grad_norm": 0.6828672289848328, "learning_rate": 8.126133853282252e-05, "loss": 0.15813136100769043, "memory(GiB)": 122.96, "step": 18695, "token_acc": 0.9324384787472035, "train_speed(iter/s)": 0.246584 }, { "epoch": 1.425413522372132, "grad_norm": 0.7722671031951904, "learning_rate": 8.125199304316655e-05, "loss": 0.0733360230922699, "memory(GiB)": 122.96, "step": 18700, "token_acc": 0.9793561931420574, "train_speed(iter/s)": 0.246606 }, { "epoch": 1.425794648982392, "grad_norm": 0.7025227546691895, "learning_rate": 8.12426457613538e-05, "loss": 0.1174615740776062, "memory(GiB)": 122.96, "step": 18705, "token_acc": 0.9620211898940505, "train_speed(iter/s)": 0.246615 }, { "epoch": 1.4261757755926519, "grad_norm": 1.0486899614334106, "learning_rate": 8.123329668792032e-05, "loss": 0.15710290670394897, "memory(GiB)": 122.96, "step": 18710, "token_acc": 0.9391683057038493, "train_speed(iter/s)": 0.246627 }, { "epoch": 1.4265569022029119, "grad_norm": 1.3589959144592285, "learning_rate": 8.122394582340222e-05, "loss": 0.1665947437286377, "memory(GiB)": 122.96, "step": 18715, "token_acc": 0.9432163861285743, "train_speed(iter/s)": 0.246641 }, { "epoch": 1.4269380288131717, "grad_norm": 0.9270812273025513, "learning_rate": 8.121459316833571e-05, "loss": 0.11129704713821412, "memory(GiB)": 122.96, "step": 18720, "token_acc": 0.9608272506082725, "train_speed(iter/s)": 0.246658 }, { "epoch": 1.4273191554234317, "grad_norm": 0.7994949817657471, "learning_rate": 8.120523872325712e-05, "loss": 0.13685863018035888, "memory(GiB)": 122.96, "step": 18725, "token_acc": 0.9308652988403211, "train_speed(iter/s)": 0.246676 }, { "epoch": 1.4277002820336917, "grad_norm": 1.3021093606948853, "learning_rate": 8.119588248870293e-05, "loss": 0.09754078388214112, "memory(GiB)": 122.96, "step": 18730, "token_acc": 0.9688535453943009, "train_speed(iter/s)": 0.246685 }, { "epoch": 1.4280814086439515, "grad_norm": 1.257550835609436, "learning_rate": 8.118652446520963e-05, "loss": 0.11728484630584717, "memory(GiB)": 122.96, "step": 18735, "token_acc": 0.9530288909599255, "train_speed(iter/s)": 0.2467 }, { "epoch": 1.4284625352542115, "grad_norm": 0.2866252660751343, "learning_rate": 8.117716465331386e-05, "loss": 0.09095391631126404, "memory(GiB)": 122.96, "step": 18740, "token_acc": 0.962937822427369, "train_speed(iter/s)": 0.246711 }, { "epoch": 1.4288436618644713, "grad_norm": 1.6896525621414185, "learning_rate": 8.11678030535524e-05, "loss": 0.11995611190795899, "memory(GiB)": 122.96, "step": 18745, "token_acc": 0.9452054794520548, "train_speed(iter/s)": 0.246733 }, { "epoch": 1.4292247884747313, "grad_norm": 0.953292727470398, "learning_rate": 8.115843966646206e-05, "loss": 0.1408531427383423, "memory(GiB)": 122.96, "step": 18750, "token_acc": 0.9589912797548904, "train_speed(iter/s)": 0.246753 }, { "epoch": 1.4296059150849914, "grad_norm": 1.224972128868103, "learning_rate": 8.114907449257981e-05, "loss": 0.13542817831039428, "memory(GiB)": 122.96, "step": 18755, "token_acc": 0.9455965526528414, "train_speed(iter/s)": 0.246772 }, { "epoch": 1.4299870416952511, "grad_norm": 1.3824682235717773, "learning_rate": 8.113970753244269e-05, "loss": 0.14549415111541747, "memory(GiB)": 122.96, "step": 18760, "token_acc": 0.9528401386168449, "train_speed(iter/s)": 0.246784 }, { "epoch": 1.4303681683055112, "grad_norm": 0.7319364547729492, "learning_rate": 8.113033878658782e-05, "loss": 0.1010746717453003, "memory(GiB)": 122.96, "step": 18765, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.2468 }, { "epoch": 1.430749294915771, "grad_norm": 0.8387638926506042, "learning_rate": 8.112096825555251e-05, "loss": 0.12028862237930298, "memory(GiB)": 122.96, "step": 18770, "token_acc": 0.9659367396593674, "train_speed(iter/s)": 0.246819 }, { "epoch": 1.431130421526031, "grad_norm": 0.8554867506027222, "learning_rate": 8.111159593987407e-05, "loss": 0.11790206432342529, "memory(GiB)": 122.96, "step": 18775, "token_acc": 0.9475982532751092, "train_speed(iter/s)": 0.246834 }, { "epoch": 1.431511548136291, "grad_norm": 1.5390795469284058, "learning_rate": 8.110222184009e-05, "loss": 0.16302452087402344, "memory(GiB)": 122.96, "step": 18780, "token_acc": 0.942101226993865, "train_speed(iter/s)": 0.246859 }, { "epoch": 1.4318926747465508, "grad_norm": 1.7068110704421997, "learning_rate": 8.109284595673782e-05, "loss": 0.1819934606552124, "memory(GiB)": 122.96, "step": 18785, "token_acc": 0.9290465631929047, "train_speed(iter/s)": 0.246883 }, { "epoch": 1.4322738013568106, "grad_norm": 0.9326579570770264, "learning_rate": 8.108346829035522e-05, "loss": 0.11843564510345458, "memory(GiB)": 122.96, "step": 18790, "token_acc": 0.950561797752809, "train_speed(iter/s)": 0.246897 }, { "epoch": 1.4326549279670706, "grad_norm": 1.2024770975112915, "learning_rate": 8.107408884147998e-05, "loss": 0.14099421501159667, "memory(GiB)": 122.96, "step": 18795, "token_acc": 0.9606472968002943, "train_speed(iter/s)": 0.246906 }, { "epoch": 1.4330360545773306, "grad_norm": 1.2317966222763062, "learning_rate": 8.106470761064992e-05, "loss": 0.11027973890304565, "memory(GiB)": 122.96, "step": 18800, "token_acc": 0.9470515317867895, "train_speed(iter/s)": 0.24692 }, { "epoch": 1.4330360545773306, "eval_loss": 0.10415904968976974, "eval_runtime": 158.9709, "eval_samples_per_second": 3.334, "eval_steps_per_second": 3.334, "eval_token_acc": 0.9524275646045419, "step": 18800 }, { "epoch": 1.4334171811875906, "grad_norm": 0.6738716959953308, "learning_rate": 8.105532459840304e-05, "loss": 0.18317933082580568, "memory(GiB)": 122.96, "step": 18805, "token_acc": 0.9517214030786979, "train_speed(iter/s)": 0.246415 }, { "epoch": 1.4337983077978504, "grad_norm": 1.1211717128753662, "learning_rate": 8.10459398052774e-05, "loss": 0.12054901123046875, "memory(GiB)": 122.96, "step": 18810, "token_acc": 0.950733752620545, "train_speed(iter/s)": 0.246429 }, { "epoch": 1.4341794344081102, "grad_norm": 0.7109419703483582, "learning_rate": 8.10365532318112e-05, "loss": 0.12549219131469727, "memory(GiB)": 122.96, "step": 18815, "token_acc": 0.9477434679334917, "train_speed(iter/s)": 0.246443 }, { "epoch": 1.4345605610183703, "grad_norm": 0.6351598501205444, "learning_rate": 8.10271648785427e-05, "loss": 0.12163188457489013, "memory(GiB)": 122.96, "step": 18820, "token_acc": 0.9497509205111544, "train_speed(iter/s)": 0.24646 }, { "epoch": 1.4349416876286303, "grad_norm": 1.36000394821167, "learning_rate": 8.101777474601027e-05, "loss": 0.12906453609466553, "memory(GiB)": 122.96, "step": 18825, "token_acc": 0.9475908706677938, "train_speed(iter/s)": 0.246469 }, { "epoch": 1.43532281423889, "grad_norm": 0.6743232011795044, "learning_rate": 8.100838283475239e-05, "loss": 0.11569726467132568, "memory(GiB)": 122.96, "step": 18830, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.246479 }, { "epoch": 1.43570394084915, "grad_norm": 0.520671010017395, "learning_rate": 8.099898914530767e-05, "loss": 0.12249659299850464, "memory(GiB)": 122.96, "step": 18835, "token_acc": 0.951148377688662, "train_speed(iter/s)": 0.246498 }, { "epoch": 1.43608506745941, "grad_norm": 0.9329952001571655, "learning_rate": 8.098959367821478e-05, "loss": 0.11815061569213867, "memory(GiB)": 122.96, "step": 18840, "token_acc": 0.9549814356435643, "train_speed(iter/s)": 0.246505 }, { "epoch": 1.43646619406967, "grad_norm": 0.5914559364318848, "learning_rate": 8.098019643401246e-05, "loss": 0.10759944915771484, "memory(GiB)": 122.96, "step": 18845, "token_acc": 0.9509683746016181, "train_speed(iter/s)": 0.246522 }, { "epoch": 1.43684732067993, "grad_norm": 0.6574293375015259, "learning_rate": 8.097079741323968e-05, "loss": 0.1287701964378357, "memory(GiB)": 122.96, "step": 18850, "token_acc": 0.9619450317124736, "train_speed(iter/s)": 0.246526 }, { "epoch": 1.4372284472901897, "grad_norm": 1.2271397113800049, "learning_rate": 8.096139661643539e-05, "loss": 0.15353147983551024, "memory(GiB)": 122.96, "step": 18855, "token_acc": 0.9493518963034085, "train_speed(iter/s)": 0.246543 }, { "epoch": 1.4376095739004497, "grad_norm": 0.8273851871490479, "learning_rate": 8.095199404413867e-05, "loss": 0.09196496605873108, "memory(GiB)": 122.96, "step": 18860, "token_acc": 0.9653054958550814, "train_speed(iter/s)": 0.246555 }, { "epoch": 1.4379907005107095, "grad_norm": 0.31403011083602905, "learning_rate": 8.094258969688872e-05, "loss": 0.12972168922424315, "memory(GiB)": 122.96, "step": 18865, "token_acc": 0.9515151515151515, "train_speed(iter/s)": 0.246569 }, { "epoch": 1.4383718271209696, "grad_norm": 1.547602653503418, "learning_rate": 8.093318357522485e-05, "loss": 0.15748403072357178, "memory(GiB)": 122.96, "step": 18870, "token_acc": 0.9419542083198968, "train_speed(iter/s)": 0.246584 }, { "epoch": 1.4387529537312296, "grad_norm": 0.7255045175552368, "learning_rate": 8.092377567968643e-05, "loss": 0.12394015789031983, "memory(GiB)": 122.96, "step": 18875, "token_acc": 0.9547635366689513, "train_speed(iter/s)": 0.246594 }, { "epoch": 1.4391340803414894, "grad_norm": 0.581085205078125, "learning_rate": 8.091436601081301e-05, "loss": 0.17961875200271607, "memory(GiB)": 122.96, "step": 18880, "token_acc": 0.9493790496760259, "train_speed(iter/s)": 0.246601 }, { "epoch": 1.4395152069517494, "grad_norm": 0.641188383102417, "learning_rate": 8.090495456914414e-05, "loss": 0.1227304220199585, "memory(GiB)": 122.96, "step": 18885, "token_acc": 0.9578073664188138, "train_speed(iter/s)": 0.246612 }, { "epoch": 1.4398963335620092, "grad_norm": 1.1288622617721558, "learning_rate": 8.089554135521955e-05, "loss": 0.13889453411102295, "memory(GiB)": 122.96, "step": 18890, "token_acc": 0.9556601922699627, "train_speed(iter/s)": 0.246615 }, { "epoch": 1.4402774601722692, "grad_norm": 0.6036928296089172, "learning_rate": 8.088612636957904e-05, "loss": 0.10831539630889893, "memory(GiB)": 122.96, "step": 18895, "token_acc": 0.9655430711610486, "train_speed(iter/s)": 0.246631 }, { "epoch": 1.4406585867825292, "grad_norm": 0.7085936665534973, "learning_rate": 8.08767096127625e-05, "loss": 0.11340765953063965, "memory(GiB)": 122.96, "step": 18900, "token_acc": 0.9606768350810295, "train_speed(iter/s)": 0.24665 }, { "epoch": 1.441039713392789, "grad_norm": 0.8926483988761902, "learning_rate": 8.086729108530997e-05, "loss": 0.12060369253158569, "memory(GiB)": 122.96, "step": 18905, "token_acc": 0.9500085895894176, "train_speed(iter/s)": 0.246662 }, { "epoch": 1.441420840003049, "grad_norm": 0.8751965761184692, "learning_rate": 8.085787078776152e-05, "loss": 0.09916902780532837, "memory(GiB)": 122.96, "step": 18910, "token_acc": 0.9575066627183891, "train_speed(iter/s)": 0.246673 }, { "epoch": 1.4418019666133088, "grad_norm": 1.1364500522613525, "learning_rate": 8.084844872065737e-05, "loss": 0.13124603033065796, "memory(GiB)": 122.96, "step": 18915, "token_acc": 0.9583468922876668, "train_speed(iter/s)": 0.246691 }, { "epoch": 1.4421830932235689, "grad_norm": 1.1538490056991577, "learning_rate": 8.083902488453786e-05, "loss": 0.15582749843597413, "memory(GiB)": 122.96, "step": 18920, "token_acc": 0.9307137433561123, "train_speed(iter/s)": 0.246711 }, { "epoch": 1.4425642198338289, "grad_norm": 0.7574672102928162, "learning_rate": 8.082959927994339e-05, "loss": 0.13207271099090576, "memory(GiB)": 122.96, "step": 18925, "token_acc": 0.9500621118012422, "train_speed(iter/s)": 0.246733 }, { "epoch": 1.4429453464440887, "grad_norm": 0.7052353024482727, "learning_rate": 8.082017190741444e-05, "loss": 0.10468072891235351, "memory(GiB)": 122.96, "step": 18930, "token_acc": 0.9494015233949945, "train_speed(iter/s)": 0.246749 }, { "epoch": 1.4433264730543487, "grad_norm": 0.9169278144836426, "learning_rate": 8.081074276749167e-05, "loss": 0.09797731637954712, "memory(GiB)": 122.96, "step": 18935, "token_acc": 0.9592096876991715, "train_speed(iter/s)": 0.24677 }, { "epoch": 1.4437075996646085, "grad_norm": 1.780732274055481, "learning_rate": 8.080131186071577e-05, "loss": 0.16652768850326538, "memory(GiB)": 122.96, "step": 18940, "token_acc": 0.941617096276713, "train_speed(iter/s)": 0.246783 }, { "epoch": 1.4440887262748685, "grad_norm": 0.7023558616638184, "learning_rate": 8.079187918762759e-05, "loss": 0.18593850135803222, "memory(GiB)": 122.96, "step": 18945, "token_acc": 0.9136790810998956, "train_speed(iter/s)": 0.246803 }, { "epoch": 1.4444698528851285, "grad_norm": 1.960587978363037, "learning_rate": 8.078244474876802e-05, "loss": 0.13992748260498047, "memory(GiB)": 122.96, "step": 18950, "token_acc": 0.9525134483398257, "train_speed(iter/s)": 0.246814 }, { "epoch": 1.4448509794953883, "grad_norm": 1.0297163724899292, "learning_rate": 8.077300854467809e-05, "loss": 0.15598548650741578, "memory(GiB)": 122.96, "step": 18955, "token_acc": 0.9436513899323816, "train_speed(iter/s)": 0.246825 }, { "epoch": 1.4452321061056483, "grad_norm": 0.6106840968132019, "learning_rate": 8.076357057589892e-05, "loss": 0.10381536483764649, "memory(GiB)": 122.96, "step": 18960, "token_acc": 0.958816094375395, "train_speed(iter/s)": 0.24683 }, { "epoch": 1.4456132327159081, "grad_norm": 2.681159734725952, "learning_rate": 8.075413084297176e-05, "loss": 0.17083433866500855, "memory(GiB)": 122.96, "step": 18965, "token_acc": 0.9308755760368663, "train_speed(iter/s)": 0.246849 }, { "epoch": 1.4459943593261682, "grad_norm": 0.6529614925384521, "learning_rate": 8.074468934643788e-05, "loss": 0.14015731811523438, "memory(GiB)": 122.96, "step": 18970, "token_acc": 0.9474245115452931, "train_speed(iter/s)": 0.246861 }, { "epoch": 1.4463754859364282, "grad_norm": 0.8163145184516907, "learning_rate": 8.073524608683875e-05, "loss": 0.13044567108154298, "memory(GiB)": 122.96, "step": 18975, "token_acc": 0.952580560649725, "train_speed(iter/s)": 0.246879 }, { "epoch": 1.446756612546688, "grad_norm": 0.5419698357582092, "learning_rate": 8.07258010647159e-05, "loss": 0.15981061458587648, "memory(GiB)": 122.96, "step": 18980, "token_acc": 0.9437718590260963, "train_speed(iter/s)": 0.2469 }, { "epoch": 1.447137739156948, "grad_norm": 1.5064417123794556, "learning_rate": 8.071635428061094e-05, "loss": 0.10654369592666627, "memory(GiB)": 122.96, "step": 18985, "token_acc": 0.9634350888963435, "train_speed(iter/s)": 0.24692 }, { "epoch": 1.4475188657672078, "grad_norm": 1.3496054410934448, "learning_rate": 8.070690573506561e-05, "loss": 0.1434991717338562, "memory(GiB)": 122.96, "step": 18990, "token_acc": 0.9427596664139499, "train_speed(iter/s)": 0.246943 }, { "epoch": 1.4478999923774678, "grad_norm": 0.6815381050109863, "learning_rate": 8.069745542862171e-05, "loss": 0.12301586866378784, "memory(GiB)": 122.96, "step": 18995, "token_acc": 0.957037037037037, "train_speed(iter/s)": 0.246962 }, { "epoch": 1.4482811189877278, "grad_norm": 0.6960711479187012, "learning_rate": 8.068800336182121e-05, "loss": 0.07955302000045776, "memory(GiB)": 122.96, "step": 19000, "token_acc": 0.9598242310106717, "train_speed(iter/s)": 0.246981 }, { "epoch": 1.4482811189877278, "eval_loss": 0.10236881673336029, "eval_runtime": 158.6226, "eval_samples_per_second": 3.341, "eval_steps_per_second": 3.341, "eval_token_acc": 0.952864285284019, "step": 19000 }, { "epoch": 1.4486622455979876, "grad_norm": 0.9389320015907288, "learning_rate": 8.067854953520612e-05, "loss": 0.11166160106658936, "memory(GiB)": 122.96, "step": 19005, "token_acc": 0.9531217515359635, "train_speed(iter/s)": 0.246481 }, { "epoch": 1.4490433722082476, "grad_norm": 1.4802606105804443, "learning_rate": 8.066909394931859e-05, "loss": 0.12198755741119385, "memory(GiB)": 122.96, "step": 19010, "token_acc": 0.9535010940919038, "train_speed(iter/s)": 0.246497 }, { "epoch": 1.4494244988185074, "grad_norm": 1.1712068319320679, "learning_rate": 8.065963660470084e-05, "loss": 0.13377373218536376, "memory(GiB)": 122.96, "step": 19015, "token_acc": 0.9465648854961832, "train_speed(iter/s)": 0.246517 }, { "epoch": 1.4498056254287675, "grad_norm": 0.7430325746536255, "learning_rate": 8.06501775018952e-05, "loss": 0.10509793758392334, "memory(GiB)": 122.96, "step": 19020, "token_acc": 0.9599596716116953, "train_speed(iter/s)": 0.246525 }, { "epoch": 1.4501867520390275, "grad_norm": 1.1666769981384277, "learning_rate": 8.064071664144413e-05, "loss": 0.1798298716545105, "memory(GiB)": 122.96, "step": 19025, "token_acc": 0.9395912461566287, "train_speed(iter/s)": 0.246539 }, { "epoch": 1.4505678786492873, "grad_norm": 1.1944515705108643, "learning_rate": 8.063125402389013e-05, "loss": 0.10532540082931519, "memory(GiB)": 122.96, "step": 19030, "token_acc": 0.9612244897959183, "train_speed(iter/s)": 0.246564 }, { "epoch": 1.4509490052595473, "grad_norm": 1.3634631633758545, "learning_rate": 8.062178964977586e-05, "loss": 0.09269207715988159, "memory(GiB)": 122.96, "step": 19035, "token_acc": 0.9647340107591154, "train_speed(iter/s)": 0.246589 }, { "epoch": 1.451330131869807, "grad_norm": 1.131097435951233, "learning_rate": 8.061232351964408e-05, "loss": 0.12403786182403564, "memory(GiB)": 122.96, "step": 19040, "token_acc": 0.9534836890857833, "train_speed(iter/s)": 0.246592 }, { "epoch": 1.451711258480067, "grad_norm": 0.06138041988015175, "learning_rate": 8.060285563403759e-05, "loss": 0.07665469646453857, "memory(GiB)": 122.96, "step": 19045, "token_acc": 0.9620098039215687, "train_speed(iter/s)": 0.246614 }, { "epoch": 1.4520923850903271, "grad_norm": 0.4815457761287689, "learning_rate": 8.059338599349935e-05, "loss": 0.1626629948616028, "memory(GiB)": 122.96, "step": 19050, "token_acc": 0.9522924411400248, "train_speed(iter/s)": 0.246626 }, { "epoch": 1.452473511700587, "grad_norm": 1.0686595439910889, "learning_rate": 8.058391459857238e-05, "loss": 0.16177575588226317, "memory(GiB)": 122.96, "step": 19055, "token_acc": 0.932710793737984, "train_speed(iter/s)": 0.246648 }, { "epoch": 1.452854638310847, "grad_norm": 0.5887235403060913, "learning_rate": 8.057444144979984e-05, "loss": 0.08406713008880615, "memory(GiB)": 122.96, "step": 19060, "token_acc": 0.9643623872906827, "train_speed(iter/s)": 0.246658 }, { "epoch": 1.4532357649211067, "grad_norm": 0.807970404624939, "learning_rate": 8.056496654772499e-05, "loss": 0.12369675636291504, "memory(GiB)": 122.96, "step": 19065, "token_acc": 0.9601706970128022, "train_speed(iter/s)": 0.246678 }, { "epoch": 1.4536168915313668, "grad_norm": 0.6419606804847717, "learning_rate": 8.055548989289112e-05, "loss": 0.11172273159027099, "memory(GiB)": 122.96, "step": 19070, "token_acc": 0.9584256483774274, "train_speed(iter/s)": 0.246682 }, { "epoch": 1.4539980181416268, "grad_norm": 1.1831451654434204, "learning_rate": 8.05460114858417e-05, "loss": 0.16299207210540773, "memory(GiB)": 122.96, "step": 19075, "token_acc": 0.9439598778892281, "train_speed(iter/s)": 0.246697 }, { "epoch": 1.4543791447518866, "grad_norm": 2.044217824935913, "learning_rate": 8.053653132712027e-05, "loss": 0.13252384662628175, "memory(GiB)": 122.96, "step": 19080, "token_acc": 0.9541477073853692, "train_speed(iter/s)": 0.246717 }, { "epoch": 1.4547602713621466, "grad_norm": 1.104249358177185, "learning_rate": 8.05270494172705e-05, "loss": 0.10227638483047485, "memory(GiB)": 122.96, "step": 19085, "token_acc": 0.9574846206425154, "train_speed(iter/s)": 0.246724 }, { "epoch": 1.4551413979724064, "grad_norm": 0.9822995662689209, "learning_rate": 8.051756575683608e-05, "loss": 0.1559286594390869, "memory(GiB)": 122.96, "step": 19090, "token_acc": 0.951131221719457, "train_speed(iter/s)": 0.246741 }, { "epoch": 1.4555225245826664, "grad_norm": 1.4082252979278564, "learning_rate": 8.050808034636091e-05, "loss": 0.14878170490264891, "memory(GiB)": 122.96, "step": 19095, "token_acc": 0.9453237410071943, "train_speed(iter/s)": 0.246766 }, { "epoch": 1.4559036511929264, "grad_norm": 0.9668273329734802, "learning_rate": 8.049859318638888e-05, "loss": 0.09336472153663636, "memory(GiB)": 122.96, "step": 19100, "token_acc": 0.9616807926465322, "train_speed(iter/s)": 0.246778 }, { "epoch": 1.4562847778031862, "grad_norm": 0.807685911655426, "learning_rate": 8.048910427746407e-05, "loss": 0.10755785703659057, "memory(GiB)": 122.96, "step": 19105, "token_acc": 0.9490582191780822, "train_speed(iter/s)": 0.246796 }, { "epoch": 1.456665904413446, "grad_norm": 0.9314944744110107, "learning_rate": 8.047961362013062e-05, "loss": 0.12040450572967529, "memory(GiB)": 122.96, "step": 19110, "token_acc": 0.9518762532225723, "train_speed(iter/s)": 0.246807 }, { "epoch": 1.457047031023706, "grad_norm": 0.4960266351699829, "learning_rate": 8.047012121493277e-05, "loss": 0.1359207272529602, "memory(GiB)": 122.96, "step": 19115, "token_acc": 0.960494603374201, "train_speed(iter/s)": 0.246816 }, { "epoch": 1.457428157633966, "grad_norm": 1.0363414287567139, "learning_rate": 8.046062706241488e-05, "loss": 0.16500072479248046, "memory(GiB)": 122.96, "step": 19120, "token_acc": 0.9442728283701964, "train_speed(iter/s)": 0.246817 }, { "epoch": 1.457809284244226, "grad_norm": 0.8802107572555542, "learning_rate": 8.045113116312136e-05, "loss": 0.15310912132263182, "memory(GiB)": 122.96, "step": 19125, "token_acc": 0.9373231773667029, "train_speed(iter/s)": 0.246833 }, { "epoch": 1.4581904108544859, "grad_norm": 0.9917898178100586, "learning_rate": 8.044163351759679e-05, "loss": 0.11162164211273193, "memory(GiB)": 122.96, "step": 19130, "token_acc": 0.9593220338983051, "train_speed(iter/s)": 0.246853 }, { "epoch": 1.4585715374647457, "grad_norm": 0.9474581480026245, "learning_rate": 8.043213412638581e-05, "loss": 0.11770514249801636, "memory(GiB)": 122.96, "step": 19135, "token_acc": 0.9534438775510204, "train_speed(iter/s)": 0.246875 }, { "epoch": 1.4589526640750057, "grad_norm": 0.575875461101532, "learning_rate": 8.042263299003315e-05, "loss": 0.09289878606796265, "memory(GiB)": 122.96, "step": 19140, "token_acc": 0.957276901577161, "train_speed(iter/s)": 0.246886 }, { "epoch": 1.4593337906852657, "grad_norm": 0.735436737537384, "learning_rate": 8.041313010908367e-05, "loss": 0.1364153265953064, "memory(GiB)": 122.96, "step": 19145, "token_acc": 0.9473071808510638, "train_speed(iter/s)": 0.246905 }, { "epoch": 1.4597149172955255, "grad_norm": 0.838154137134552, "learning_rate": 8.04036254840823e-05, "loss": 0.16957075595855714, "memory(GiB)": 122.96, "step": 19150, "token_acc": 0.9326585416236315, "train_speed(iter/s)": 0.246918 }, { "epoch": 1.4600960439057855, "grad_norm": 1.3365490436553955, "learning_rate": 8.039411911557411e-05, "loss": 0.10395469665527343, "memory(GiB)": 122.96, "step": 19155, "token_acc": 0.9662110858010631, "train_speed(iter/s)": 0.24694 }, { "epoch": 1.4604771705160453, "grad_norm": 1.4630630016326904, "learning_rate": 8.038461100410424e-05, "loss": 0.17304645776748656, "memory(GiB)": 122.96, "step": 19160, "token_acc": 0.9246987951807228, "train_speed(iter/s)": 0.246961 }, { "epoch": 1.4608582971263053, "grad_norm": 1.1528059244155884, "learning_rate": 8.037510115021792e-05, "loss": 0.09160124063491822, "memory(GiB)": 122.96, "step": 19165, "token_acc": 0.9611374407582939, "train_speed(iter/s)": 0.246979 }, { "epoch": 1.4612394237365653, "grad_norm": 1.7567652463912964, "learning_rate": 8.036558955446052e-05, "loss": 0.13365966081619263, "memory(GiB)": 122.96, "step": 19170, "token_acc": 0.9434044405746626, "train_speed(iter/s)": 0.246996 }, { "epoch": 1.4616205503468251, "grad_norm": 0.7164626717567444, "learning_rate": 8.035607621737746e-05, "loss": 0.07561657428741456, "memory(GiB)": 122.96, "step": 19175, "token_acc": 0.9621442387399834, "train_speed(iter/s)": 0.247013 }, { "epoch": 1.4620016769570852, "grad_norm": 1.136659026145935, "learning_rate": 8.034656113951431e-05, "loss": 0.1305892586708069, "memory(GiB)": 122.96, "step": 19180, "token_acc": 0.9555845852384063, "train_speed(iter/s)": 0.24702 }, { "epoch": 1.462382803567345, "grad_norm": 2.721268653869629, "learning_rate": 8.033704432141668e-05, "loss": 0.17148168087005616, "memory(GiB)": 122.96, "step": 19185, "token_acc": 0.9401983218916857, "train_speed(iter/s)": 0.247034 }, { "epoch": 1.462763930177605, "grad_norm": 0.5779721140861511, "learning_rate": 8.032752576363036e-05, "loss": 0.122471022605896, "memory(GiB)": 122.96, "step": 19190, "token_acc": 0.9504008016032064, "train_speed(iter/s)": 0.247058 }, { "epoch": 1.463145056787865, "grad_norm": 5.59716272354126, "learning_rate": 8.031800546670119e-05, "loss": 0.13431329727172853, "memory(GiB)": 122.96, "step": 19195, "token_acc": 0.9508443908323281, "train_speed(iter/s)": 0.24708 }, { "epoch": 1.4635261833981248, "grad_norm": 0.9296948909759521, "learning_rate": 8.030848343117507e-05, "loss": 0.12604997158050538, "memory(GiB)": 122.96, "step": 19200, "token_acc": 0.950969942442976, "train_speed(iter/s)": 0.247095 }, { "epoch": 1.4635261833981248, "eval_loss": 0.10428343713283539, "eval_runtime": 157.0812, "eval_samples_per_second": 3.374, "eval_steps_per_second": 3.374, "eval_token_acc": 0.9523899162701042, "step": 19200 }, { "epoch": 1.4639073100083848, "grad_norm": 1.6003395318984985, "learning_rate": 8.029895965759808e-05, "loss": 0.10673586130142212, "memory(GiB)": 122.96, "step": 19205, "token_acc": 0.952639854231261, "train_speed(iter/s)": 0.246615 }, { "epoch": 1.4642884366186446, "grad_norm": 2.2689566612243652, "learning_rate": 8.028943414651636e-05, "loss": 0.1152036428451538, "memory(GiB)": 122.96, "step": 19210, "token_acc": 0.9425025329280649, "train_speed(iter/s)": 0.246635 }, { "epoch": 1.4646695632289046, "grad_norm": 0.6150451302528381, "learning_rate": 8.027990689847615e-05, "loss": 0.11118817329406738, "memory(GiB)": 122.96, "step": 19215, "token_acc": 0.9524413443246671, "train_speed(iter/s)": 0.24664 }, { "epoch": 1.4650506898391646, "grad_norm": 0.9661538004875183, "learning_rate": 8.027037791402381e-05, "loss": 0.14630751609802245, "memory(GiB)": 122.96, "step": 19220, "token_acc": 0.9491453893226419, "train_speed(iter/s)": 0.246655 }, { "epoch": 1.4654318164494244, "grad_norm": 0.802452027797699, "learning_rate": 8.026084719370574e-05, "loss": 0.12909675836563111, "memory(GiB)": 122.96, "step": 19225, "token_acc": 0.953958944281525, "train_speed(iter/s)": 0.246662 }, { "epoch": 1.4658129430596845, "grad_norm": 1.3821479082107544, "learning_rate": 8.025131473806855e-05, "loss": 0.1347055435180664, "memory(GiB)": 122.96, "step": 19230, "token_acc": 0.9501705191360363, "train_speed(iter/s)": 0.246674 }, { "epoch": 1.4661940696699443, "grad_norm": 0.643237292766571, "learning_rate": 8.024178054765881e-05, "loss": 0.12436169385910034, "memory(GiB)": 122.96, "step": 19235, "token_acc": 0.9388783868935098, "train_speed(iter/s)": 0.246695 }, { "epoch": 1.4665751962802043, "grad_norm": 1.0124833583831787, "learning_rate": 8.023224462302331e-05, "loss": 0.11001471281051636, "memory(GiB)": 122.96, "step": 19240, "token_acc": 0.9597222222222223, "train_speed(iter/s)": 0.246713 }, { "epoch": 1.4669563228904643, "grad_norm": 1.1995809078216553, "learning_rate": 8.022270696470887e-05, "loss": 0.12745610475540162, "memory(GiB)": 122.96, "step": 19245, "token_acc": 0.9467048710601719, "train_speed(iter/s)": 0.246736 }, { "epoch": 1.467337449500724, "grad_norm": 0.5778623223304749, "learning_rate": 8.021316757326244e-05, "loss": 0.10835487842559814, "memory(GiB)": 122.96, "step": 19250, "token_acc": 0.9595565592280846, "train_speed(iter/s)": 0.24675 }, { "epoch": 1.467718576110984, "grad_norm": 0.6737306118011475, "learning_rate": 8.020362644923105e-05, "loss": 0.09826849699020386, "memory(GiB)": 122.96, "step": 19255, "token_acc": 0.9500908940670963, "train_speed(iter/s)": 0.246764 }, { "epoch": 1.468099702721244, "grad_norm": 1.2376142740249634, "learning_rate": 8.019408359316185e-05, "loss": 0.1031100869178772, "memory(GiB)": 122.96, "step": 19260, "token_acc": 0.9601205246366536, "train_speed(iter/s)": 0.246773 }, { "epoch": 1.468480829331504, "grad_norm": 0.9293076395988464, "learning_rate": 8.018453900560208e-05, "loss": 0.06892385482788085, "memory(GiB)": 122.96, "step": 19265, "token_acc": 0.9670039595248571, "train_speed(iter/s)": 0.246799 }, { "epoch": 1.468861955941764, "grad_norm": 1.1473495960235596, "learning_rate": 8.017499268709906e-05, "loss": 0.17390531301498413, "memory(GiB)": 122.96, "step": 19270, "token_acc": 0.9244460588448965, "train_speed(iter/s)": 0.246819 }, { "epoch": 1.4692430825520237, "grad_norm": 0.7495232224464417, "learning_rate": 8.016544463820024e-05, "loss": 0.12031754255294799, "memory(GiB)": 122.96, "step": 19275, "token_acc": 0.945064761054042, "train_speed(iter/s)": 0.246842 }, { "epoch": 1.4696242091622838, "grad_norm": 1.3020985126495361, "learning_rate": 8.015589485945315e-05, "loss": 0.16225168704986573, "memory(GiB)": 122.96, "step": 19280, "token_acc": 0.9306384933394579, "train_speed(iter/s)": 0.246866 }, { "epoch": 1.4700053357725436, "grad_norm": 0.7558112144470215, "learning_rate": 8.014634335140544e-05, "loss": 0.09617698192596436, "memory(GiB)": 122.96, "step": 19285, "token_acc": 0.957438934122872, "train_speed(iter/s)": 0.246888 }, { "epoch": 1.4703864623828036, "grad_norm": 1.030594825744629, "learning_rate": 8.013679011460483e-05, "loss": 0.13217276334762573, "memory(GiB)": 122.96, "step": 19290, "token_acc": 0.9378084896347483, "train_speed(iter/s)": 0.246911 }, { "epoch": 1.4707675889930636, "grad_norm": 0.7262478470802307, "learning_rate": 8.012723514959916e-05, "loss": 0.16664459705352783, "memory(GiB)": 122.96, "step": 19295, "token_acc": 0.9466590736522399, "train_speed(iter/s)": 0.246929 }, { "epoch": 1.4711487156033234, "grad_norm": 1.2593241930007935, "learning_rate": 8.011767845693636e-05, "loss": 0.10207643508911132, "memory(GiB)": 122.96, "step": 19300, "token_acc": 0.948976948976949, "train_speed(iter/s)": 0.246949 }, { "epoch": 1.4715298422135834, "grad_norm": 0.9787925481796265, "learning_rate": 8.010812003716448e-05, "loss": 0.15673841238021852, "memory(GiB)": 122.96, "step": 19305, "token_acc": 0.9327036599763873, "train_speed(iter/s)": 0.246969 }, { "epoch": 1.4719109688238432, "grad_norm": 1.547568440437317, "learning_rate": 8.009855989083162e-05, "loss": 0.18790249824523925, "memory(GiB)": 122.96, "step": 19310, "token_acc": 0.9062233589087809, "train_speed(iter/s)": 0.246989 }, { "epoch": 1.4722920954341032, "grad_norm": 0.8740751147270203, "learning_rate": 8.008899801848602e-05, "loss": 0.10033726692199707, "memory(GiB)": 122.96, "step": 19315, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.247009 }, { "epoch": 1.4726732220443632, "grad_norm": 0.8829959630966187, "learning_rate": 8.007943442067603e-05, "loss": 0.13317534923553467, "memory(GiB)": 122.96, "step": 19320, "token_acc": 0.9479499854608898, "train_speed(iter/s)": 0.247028 }, { "epoch": 1.473054348654623, "grad_norm": 2.4803214073181152, "learning_rate": 8.006986909795004e-05, "loss": 0.1168176293373108, "memory(GiB)": 122.96, "step": 19325, "token_acc": 0.9554106635818698, "train_speed(iter/s)": 0.247031 }, { "epoch": 1.473435475264883, "grad_norm": 0.4304928481578827, "learning_rate": 8.00603020508566e-05, "loss": 0.12166771888732911, "memory(GiB)": 122.96, "step": 19330, "token_acc": 0.943013698630137, "train_speed(iter/s)": 0.247051 }, { "epoch": 1.4738166018751429, "grad_norm": 1.0527632236480713, "learning_rate": 8.005073327994434e-05, "loss": 0.13487184047698975, "memory(GiB)": 122.96, "step": 19335, "token_acc": 0.9475792988313857, "train_speed(iter/s)": 0.247072 }, { "epoch": 1.4741977284854029, "grad_norm": 0.5613982677459717, "learning_rate": 8.004116278576199e-05, "loss": 0.11813082695007324, "memory(GiB)": 122.96, "step": 19340, "token_acc": 0.951978417266187, "train_speed(iter/s)": 0.247087 }, { "epoch": 1.4745788550956629, "grad_norm": 0.9535490274429321, "learning_rate": 8.003159056885836e-05, "loss": 0.09012985825538636, "memory(GiB)": 122.96, "step": 19345, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.247113 }, { "epoch": 1.4749599817059227, "grad_norm": 0.9837018847465515, "learning_rate": 8.002201662978236e-05, "loss": 0.2295475959777832, "memory(GiB)": 122.96, "step": 19350, "token_acc": 0.9070306464074169, "train_speed(iter/s)": 0.247132 }, { "epoch": 1.4753411083161827, "grad_norm": 0.7316795587539673, "learning_rate": 8.001244096908303e-05, "loss": 0.13050657510757446, "memory(GiB)": 122.96, "step": 19355, "token_acc": 0.9529252519175816, "train_speed(iter/s)": 0.247144 }, { "epoch": 1.4757222349264425, "grad_norm": 1.4817649126052856, "learning_rate": 8.000286358730949e-05, "loss": 0.12510656118392943, "memory(GiB)": 122.96, "step": 19360, "token_acc": 0.9547977795400476, "train_speed(iter/s)": 0.247166 }, { "epoch": 1.4761033615367025, "grad_norm": 0.9311909675598145, "learning_rate": 7.999328448501095e-05, "loss": 0.11473931074142456, "memory(GiB)": 122.96, "step": 19365, "token_acc": 0.9558096415327565, "train_speed(iter/s)": 0.247187 }, { "epoch": 1.4764844881469625, "grad_norm": 0.6809597611427307, "learning_rate": 7.998370366273674e-05, "loss": 0.09254549145698547, "memory(GiB)": 122.96, "step": 19370, "token_acc": 0.9553030303030303, "train_speed(iter/s)": 0.247206 }, { "epoch": 1.4768656147572223, "grad_norm": 1.0189076662063599, "learning_rate": 7.997412112103626e-05, "loss": 0.19234459400177, "memory(GiB)": 122.96, "step": 19375, "token_acc": 0.9273927392739274, "train_speed(iter/s)": 0.247223 }, { "epoch": 1.4772467413674824, "grad_norm": 1.0204483270645142, "learning_rate": 7.996453686045904e-05, "loss": 0.12496333122253418, "memory(GiB)": 122.96, "step": 19380, "token_acc": 0.9535562037244784, "train_speed(iter/s)": 0.247237 }, { "epoch": 1.4776278679777421, "grad_norm": 0.7151778340339661, "learning_rate": 7.995495088155468e-05, "loss": 0.1400434970855713, "memory(GiB)": 122.96, "step": 19385, "token_acc": 0.9382544103992572, "train_speed(iter/s)": 0.247249 }, { "epoch": 1.4780089945880022, "grad_norm": 0.6130173206329346, "learning_rate": 7.994536318487288e-05, "loss": 0.14464077949523926, "memory(GiB)": 122.96, "step": 19390, "token_acc": 0.9429640718562874, "train_speed(iter/s)": 0.24726 }, { "epoch": 1.4783901211982622, "grad_norm": 0.7056736350059509, "learning_rate": 7.993577377096347e-05, "loss": 0.11020042896270751, "memory(GiB)": 122.96, "step": 19395, "token_acc": 0.9639193918185109, "train_speed(iter/s)": 0.247272 }, { "epoch": 1.478771247808522, "grad_norm": 1.6137927770614624, "learning_rate": 7.992618264037635e-05, "loss": 0.1082154631614685, "memory(GiB)": 122.96, "step": 19400, "token_acc": 0.9552488498536177, "train_speed(iter/s)": 0.247295 }, { "epoch": 1.478771247808522, "eval_loss": 0.10356870293617249, "eval_runtime": 159.8795, "eval_samples_per_second": 3.315, "eval_steps_per_second": 3.315, "eval_token_acc": 0.9527212216131559, "step": 19400 }, { "epoch": 1.479152374418782, "grad_norm": 1.0343507528305054, "learning_rate": 7.991658979366154e-05, "loss": 0.1110692024230957, "memory(GiB)": 122.96, "step": 19405, "token_acc": 0.9534910427193385, "train_speed(iter/s)": 0.246803 }, { "epoch": 1.4795335010290418, "grad_norm": 0.8005077242851257, "learning_rate": 7.990699523136914e-05, "loss": 0.16004748344421388, "memory(GiB)": 122.96, "step": 19410, "token_acc": 0.9387583892617449, "train_speed(iter/s)": 0.246823 }, { "epoch": 1.4799146276393018, "grad_norm": 1.129117727279663, "learning_rate": 7.989739895404933e-05, "loss": 0.13003225326538087, "memory(GiB)": 122.96, "step": 19415, "token_acc": 0.9507793273174734, "train_speed(iter/s)": 0.246834 }, { "epoch": 1.4802957542495618, "grad_norm": 0.6003535389900208, "learning_rate": 7.988780096225244e-05, "loss": 0.1324818730354309, "memory(GiB)": 122.96, "step": 19420, "token_acc": 0.9519028718188186, "train_speed(iter/s)": 0.246844 }, { "epoch": 1.4806768808598216, "grad_norm": 0.59591144323349, "learning_rate": 7.987820125652887e-05, "loss": 0.1294941186904907, "memory(GiB)": 122.96, "step": 19425, "token_acc": 0.9554375432925422, "train_speed(iter/s)": 0.246856 }, { "epoch": 1.4810580074700814, "grad_norm": 2.4959707260131836, "learning_rate": 7.98685998374291e-05, "loss": 0.14869287014007568, "memory(GiB)": 122.96, "step": 19430, "token_acc": 0.949187759727994, "train_speed(iter/s)": 0.246868 }, { "epoch": 1.4814391340803414, "grad_norm": 0.812667727470398, "learning_rate": 7.985899670550374e-05, "loss": 0.12696577310562135, "memory(GiB)": 122.96, "step": 19435, "token_acc": 0.9468791500664011, "train_speed(iter/s)": 0.246881 }, { "epoch": 1.4818202606906015, "grad_norm": 0.5335561037063599, "learning_rate": 7.984939186130348e-05, "loss": 0.10233520269393921, "memory(GiB)": 122.96, "step": 19440, "token_acc": 0.9535018889857599, "train_speed(iter/s)": 0.246902 }, { "epoch": 1.4822013873008615, "grad_norm": 0.6089061498641968, "learning_rate": 7.983978530537912e-05, "loss": 0.08428755402565002, "memory(GiB)": 122.96, "step": 19445, "token_acc": 0.9699627538147303, "train_speed(iter/s)": 0.246908 }, { "epoch": 1.4825825139111213, "grad_norm": 0.8938884735107422, "learning_rate": 7.983017703828154e-05, "loss": 0.10936611890792847, "memory(GiB)": 122.96, "step": 19450, "token_acc": 0.9556562328390994, "train_speed(iter/s)": 0.24692 }, { "epoch": 1.482963640521381, "grad_norm": 1.2218573093414307, "learning_rate": 7.982056706056173e-05, "loss": 0.11943564414978028, "memory(GiB)": 122.96, "step": 19455, "token_acc": 0.9487014656724093, "train_speed(iter/s)": 0.246927 }, { "epoch": 1.483344767131641, "grad_norm": 0.7786802053451538, "learning_rate": 7.98109553727708e-05, "loss": 0.1626746416091919, "memory(GiB)": 122.96, "step": 19460, "token_acc": 0.9345140992775577, "train_speed(iter/s)": 0.246946 }, { "epoch": 1.4837258937419011, "grad_norm": 0.197900652885437, "learning_rate": 7.98013419754599e-05, "loss": 0.14463427066802978, "memory(GiB)": 122.96, "step": 19465, "token_acc": 0.9485726280436608, "train_speed(iter/s)": 0.246959 }, { "epoch": 1.484107020352161, "grad_norm": 0.8000593185424805, "learning_rate": 7.979172686918035e-05, "loss": 0.1879422187805176, "memory(GiB)": 122.96, "step": 19470, "token_acc": 0.9295370749692748, "train_speed(iter/s)": 0.246973 }, { "epoch": 1.484488146962421, "grad_norm": 0.037604451179504395, "learning_rate": 7.97821100544835e-05, "loss": 0.11854655742645263, "memory(GiB)": 122.96, "step": 19475, "token_acc": 0.9548678272082527, "train_speed(iter/s)": 0.246993 }, { "epoch": 1.4848692735726807, "grad_norm": 0.9423521757125854, "learning_rate": 7.977249153192086e-05, "loss": 0.1625124216079712, "memory(GiB)": 122.96, "step": 19480, "token_acc": 0.9467198260239217, "train_speed(iter/s)": 0.247012 }, { "epoch": 1.4852504001829407, "grad_norm": 0.6165697574615479, "learning_rate": 7.976287130204398e-05, "loss": 0.10783276557922364, "memory(GiB)": 122.96, "step": 19485, "token_acc": 0.9574660633484163, "train_speed(iter/s)": 0.247022 }, { "epoch": 1.4856315267932008, "grad_norm": 0.767785906791687, "learning_rate": 7.975324936540455e-05, "loss": 0.10678926706314087, "memory(GiB)": 122.96, "step": 19490, "token_acc": 0.9594909481986019, "train_speed(iter/s)": 0.247032 }, { "epoch": 1.4860126534034606, "grad_norm": 0.8805824518203735, "learning_rate": 7.974362572255435e-05, "loss": 0.11128337383270263, "memory(GiB)": 122.96, "step": 19495, "token_acc": 0.9471677559912854, "train_speed(iter/s)": 0.24704 }, { "epoch": 1.4863937800137206, "grad_norm": 1.2872785329818726, "learning_rate": 7.973400037404524e-05, "loss": 0.15685967206954957, "memory(GiB)": 122.96, "step": 19500, "token_acc": 0.9356808731808732, "train_speed(iter/s)": 0.247051 }, { "epoch": 1.4867749066239804, "grad_norm": 0.9299206733703613, "learning_rate": 7.972437332042917e-05, "loss": 0.1419074058532715, "memory(GiB)": 122.96, "step": 19505, "token_acc": 0.9431232659532303, "train_speed(iter/s)": 0.24706 }, { "epoch": 1.4871560332342404, "grad_norm": 1.0724431276321411, "learning_rate": 7.971474456225825e-05, "loss": 0.11269270181655884, "memory(GiB)": 122.96, "step": 19510, "token_acc": 0.9545516769336071, "train_speed(iter/s)": 0.247075 }, { "epoch": 1.4875371598445004, "grad_norm": 0.3817760944366455, "learning_rate": 7.970511410008462e-05, "loss": 0.09447119235992432, "memory(GiB)": 122.96, "step": 19515, "token_acc": 0.9635985439417577, "train_speed(iter/s)": 0.247085 }, { "epoch": 1.4879182864547602, "grad_norm": 1.4868746995925903, "learning_rate": 7.969548193446053e-05, "loss": 0.11578338146209717, "memory(GiB)": 122.96, "step": 19520, "token_acc": 0.9522998296422487, "train_speed(iter/s)": 0.247099 }, { "epoch": 1.4882994130650202, "grad_norm": 0.6867897510528564, "learning_rate": 7.968584806593838e-05, "loss": 0.12062606811523438, "memory(GiB)": 122.96, "step": 19525, "token_acc": 0.9537524231514816, "train_speed(iter/s)": 0.247118 }, { "epoch": 1.48868053967528, "grad_norm": 0.8785312175750732, "learning_rate": 7.967621249507059e-05, "loss": 0.11990485191345215, "memory(GiB)": 122.96, "step": 19530, "token_acc": 0.957963620230701, "train_speed(iter/s)": 0.247115 }, { "epoch": 1.48906166628554, "grad_norm": 1.009458303451538, "learning_rate": 7.966657522240972e-05, "loss": 0.13346171379089355, "memory(GiB)": 122.96, "step": 19535, "token_acc": 0.9560102301790281, "train_speed(iter/s)": 0.247136 }, { "epoch": 1.4894427928958, "grad_norm": 1.6763949394226074, "learning_rate": 7.965693624850843e-05, "loss": 0.1697704792022705, "memory(GiB)": 122.96, "step": 19540, "token_acc": 0.9331395348837209, "train_speed(iter/s)": 0.247147 }, { "epoch": 1.4898239195060599, "grad_norm": 1.018202781677246, "learning_rate": 7.964729557391948e-05, "loss": 0.1612674593925476, "memory(GiB)": 122.96, "step": 19545, "token_acc": 0.9403169886082219, "train_speed(iter/s)": 0.247157 }, { "epoch": 1.4902050461163199, "grad_norm": 0.5618013739585876, "learning_rate": 7.963765319919571e-05, "loss": 0.05081660747528076, "memory(GiB)": 122.96, "step": 19550, "token_acc": 0.9751234333459932, "train_speed(iter/s)": 0.247167 }, { "epoch": 1.4905861727265797, "grad_norm": 0.8601058125495911, "learning_rate": 7.962800912489006e-05, "loss": 0.19733173847198487, "memory(GiB)": 122.96, "step": 19555, "token_acc": 0.9362486828240253, "train_speed(iter/s)": 0.24718 }, { "epoch": 1.4909672993368397, "grad_norm": 1.852170467376709, "learning_rate": 7.961836335155557e-05, "loss": 0.10446784496307374, "memory(GiB)": 122.96, "step": 19560, "token_acc": 0.962796664528544, "train_speed(iter/s)": 0.247194 }, { "epoch": 1.4913484259470997, "grad_norm": 0.8012587428092957, "learning_rate": 7.96087158797454e-05, "loss": 0.10556679964065552, "memory(GiB)": 122.96, "step": 19565, "token_acc": 0.9559113300492611, "train_speed(iter/s)": 0.247204 }, { "epoch": 1.4917295525573595, "grad_norm": 0.5750257968902588, "learning_rate": 7.959906671001278e-05, "loss": 0.10351088047027587, "memory(GiB)": 122.96, "step": 19570, "token_acc": 0.9665882352941176, "train_speed(iter/s)": 0.247222 }, { "epoch": 1.4921106791676195, "grad_norm": 0.7876543402671814, "learning_rate": 7.958941584291104e-05, "loss": 0.1303958296775818, "memory(GiB)": 122.96, "step": 19575, "token_acc": 0.9361069836552749, "train_speed(iter/s)": 0.247235 }, { "epoch": 1.4924918057778793, "grad_norm": 0.5471728444099426, "learning_rate": 7.957976327899359e-05, "loss": 0.10810225009918213, "memory(GiB)": 122.96, "step": 19580, "token_acc": 0.9621023513139696, "train_speed(iter/s)": 0.247249 }, { "epoch": 1.4928729323881393, "grad_norm": 0.9433920979499817, "learning_rate": 7.957010901881401e-05, "loss": 0.14671926498413085, "memory(GiB)": 122.96, "step": 19585, "token_acc": 0.9508825786646201, "train_speed(iter/s)": 0.247258 }, { "epoch": 1.4932540589983994, "grad_norm": 0.9087037444114685, "learning_rate": 7.956045306292588e-05, "loss": 0.0765055239200592, "memory(GiB)": 122.96, "step": 19590, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.247271 }, { "epoch": 1.4936351856086592, "grad_norm": 0.7251271605491638, "learning_rate": 7.955079541188296e-05, "loss": 0.13198179006576538, "memory(GiB)": 122.96, "step": 19595, "token_acc": 0.9511557512383049, "train_speed(iter/s)": 0.247272 }, { "epoch": 1.4940163122189192, "grad_norm": 0.9423989057540894, "learning_rate": 7.954113606623905e-05, "loss": 0.14341285228729247, "memory(GiB)": 122.96, "step": 19600, "token_acc": 0.945958757999526, "train_speed(iter/s)": 0.247283 }, { "epoch": 1.4940163122189192, "eval_loss": 0.1026344820857048, "eval_runtime": 214.9155, "eval_samples_per_second": 2.466, "eval_steps_per_second": 2.466, "eval_token_acc": 0.9530525269562075, "step": 19600 }, { "epoch": 1.494397438829179, "grad_norm": 0.5532892346382141, "learning_rate": 7.953147502654808e-05, "loss": 0.1197356939315796, "memory(GiB)": 122.96, "step": 19605, "token_acc": 0.9530506067362813, "train_speed(iter/s)": 0.246631 }, { "epoch": 1.494778565439439, "grad_norm": 0.7363724112510681, "learning_rate": 7.952181229336404e-05, "loss": 0.13328335285186768, "memory(GiB)": 122.96, "step": 19610, "token_acc": 0.9489695780176644, "train_speed(iter/s)": 0.246636 }, { "epoch": 1.495159692049699, "grad_norm": 1.2684293985366821, "learning_rate": 7.951214786724108e-05, "loss": 0.14898955821990967, "memory(GiB)": 122.96, "step": 19615, "token_acc": 0.9245949926362298, "train_speed(iter/s)": 0.246653 }, { "epoch": 1.4955408186599588, "grad_norm": 2.0893633365631104, "learning_rate": 7.950248174873338e-05, "loss": 0.11479883193969727, "memory(GiB)": 122.96, "step": 19620, "token_acc": 0.9569454287739192, "train_speed(iter/s)": 0.246661 }, { "epoch": 1.4959219452702188, "grad_norm": 1.064025640487671, "learning_rate": 7.949281393839527e-05, "loss": 0.14143116474151612, "memory(GiB)": 122.96, "step": 19625, "token_acc": 0.946818123144443, "train_speed(iter/s)": 0.246669 }, { "epoch": 1.4963030718804786, "grad_norm": 0.9734113812446594, "learning_rate": 7.948314443678114e-05, "loss": 0.10418089628219604, "memory(GiB)": 122.96, "step": 19630, "token_acc": 0.9596990865126276, "train_speed(iter/s)": 0.246688 }, { "epoch": 1.4966841984907386, "grad_norm": 0.7173066139221191, "learning_rate": 7.94734732444455e-05, "loss": 0.13902422189712524, "memory(GiB)": 122.96, "step": 19635, "token_acc": 0.9387966804979253, "train_speed(iter/s)": 0.246703 }, { "epoch": 1.4970653251009987, "grad_norm": 0.914384663105011, "learning_rate": 7.946380036194295e-05, "loss": 0.1380342960357666, "memory(GiB)": 122.96, "step": 19640, "token_acc": 0.9442446043165468, "train_speed(iter/s)": 0.246722 }, { "epoch": 1.4974464517112585, "grad_norm": 0.9852327704429626, "learning_rate": 7.945412578982815e-05, "loss": 0.12994029521942138, "memory(GiB)": 122.96, "step": 19645, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.24673 }, { "epoch": 1.4978275783215185, "grad_norm": 0.6682660579681396, "learning_rate": 7.944444952865595e-05, "loss": 0.06821726560592652, "memory(GiB)": 122.96, "step": 19650, "token_acc": 0.9621044885945548, "train_speed(iter/s)": 0.246746 }, { "epoch": 1.4982087049317783, "grad_norm": 1.0375949144363403, "learning_rate": 7.943477157898121e-05, "loss": 0.1410351276397705, "memory(GiB)": 122.96, "step": 19655, "token_acc": 0.9431068601583114, "train_speed(iter/s)": 0.24676 }, { "epoch": 1.4985898315420383, "grad_norm": 0.8390384316444397, "learning_rate": 7.94250919413589e-05, "loss": 0.10949827432632446, "memory(GiB)": 122.96, "step": 19660, "token_acc": 0.9622335495829472, "train_speed(iter/s)": 0.246774 }, { "epoch": 1.4989709581522983, "grad_norm": 0.6838564276695251, "learning_rate": 7.94154106163441e-05, "loss": 0.15448163747787474, "memory(GiB)": 122.96, "step": 19665, "token_acc": 0.9331372549019608, "train_speed(iter/s)": 0.246783 }, { "epoch": 1.499352084762558, "grad_norm": 1.2503373622894287, "learning_rate": 7.940572760449203e-05, "loss": 0.12280181646347046, "memory(GiB)": 122.96, "step": 19670, "token_acc": 0.9567415730337079, "train_speed(iter/s)": 0.246789 }, { "epoch": 1.4997332113728181, "grad_norm": 1.2787450551986694, "learning_rate": 7.939604290635792e-05, "loss": 0.14259564876556396, "memory(GiB)": 122.96, "step": 19675, "token_acc": 0.9538258575197889, "train_speed(iter/s)": 0.2468 }, { "epoch": 1.500114337983078, "grad_norm": 1.0289186239242554, "learning_rate": 7.93863565224972e-05, "loss": 0.14326307773590088, "memory(GiB)": 122.96, "step": 19680, "token_acc": 0.9477780567411499, "train_speed(iter/s)": 0.246812 }, { "epoch": 1.500495464593338, "grad_norm": 1.0400272607803345, "learning_rate": 7.937666845346528e-05, "loss": 0.1190578818321228, "memory(GiB)": 122.96, "step": 19685, "token_acc": 0.9530888668648827, "train_speed(iter/s)": 0.246816 }, { "epoch": 1.500876591203598, "grad_norm": 0.571942925453186, "learning_rate": 7.936697869981773e-05, "loss": 0.07677346467971802, "memory(GiB)": 122.96, "step": 19690, "token_acc": 0.9575846833578793, "train_speed(iter/s)": 0.246832 }, { "epoch": 1.5012577178138578, "grad_norm": 0.9892421364784241, "learning_rate": 7.935728726211026e-05, "loss": 0.13357781171798705, "memory(GiB)": 122.96, "step": 19695, "token_acc": 0.9488345650938033, "train_speed(iter/s)": 0.246843 }, { "epoch": 1.5016388444241175, "grad_norm": 1.0298819541931152, "learning_rate": 7.934759414089859e-05, "loss": 0.14054994583129882, "memory(GiB)": 122.96, "step": 19700, "token_acc": 0.9533502968617472, "train_speed(iter/s)": 0.246857 }, { "epoch": 1.5020199710343776, "grad_norm": 0.9911893606185913, "learning_rate": 7.933789933673859e-05, "loss": 0.15960516929626464, "memory(GiB)": 122.96, "step": 19705, "token_acc": 0.9409698770768608, "train_speed(iter/s)": 0.246865 }, { "epoch": 1.5024010976446376, "grad_norm": 0.5609776973724365, "learning_rate": 7.932820285018619e-05, "loss": 0.13781036138534547, "memory(GiB)": 122.96, "step": 19710, "token_acc": 0.9537267080745342, "train_speed(iter/s)": 0.246879 }, { "epoch": 1.5027822242548976, "grad_norm": 1.6884427070617676, "learning_rate": 7.931850468179747e-05, "loss": 0.154878568649292, "memory(GiB)": 122.96, "step": 19715, "token_acc": 0.9413767939674045, "train_speed(iter/s)": 0.24689 }, { "epoch": 1.5031633508651574, "grad_norm": 0.6800521612167358, "learning_rate": 7.930880483212858e-05, "loss": 0.10867469310760498, "memory(GiB)": 122.96, "step": 19720, "token_acc": 0.9628665625542252, "train_speed(iter/s)": 0.246898 }, { "epoch": 1.5035444774754172, "grad_norm": 1.1414752006530762, "learning_rate": 7.929910330173571e-05, "loss": 0.1524043083190918, "memory(GiB)": 122.96, "step": 19725, "token_acc": 0.9510050251256281, "train_speed(iter/s)": 0.246912 }, { "epoch": 1.5039256040856772, "grad_norm": 0.4729847311973572, "learning_rate": 7.928940009117524e-05, "loss": 0.11313719749450683, "memory(GiB)": 122.96, "step": 19730, "token_acc": 0.9556848701880036, "train_speed(iter/s)": 0.24693 }, { "epoch": 1.5043067306959372, "grad_norm": 0.9326164126396179, "learning_rate": 7.92796952010036e-05, "loss": 0.090743088722229, "memory(GiB)": 122.96, "step": 19735, "token_acc": 0.9662710805746408, "train_speed(iter/s)": 0.246935 }, { "epoch": 1.5046878573061973, "grad_norm": 0.6476622223854065, "learning_rate": 7.92699886317773e-05, "loss": 0.172224760055542, "memory(GiB)": 122.96, "step": 19740, "token_acc": 0.9256668124180149, "train_speed(iter/s)": 0.246949 }, { "epoch": 1.505068983916457, "grad_norm": 0.5700234174728394, "learning_rate": 7.9260280384053e-05, "loss": 0.13455065488815307, "memory(GiB)": 122.96, "step": 19745, "token_acc": 0.9432114882506527, "train_speed(iter/s)": 0.246947 }, { "epoch": 1.5054501105267168, "grad_norm": 1.6151906251907349, "learning_rate": 7.925057045838738e-05, "loss": 0.166504430770874, "memory(GiB)": 122.96, "step": 19750, "token_acc": 0.9259962049335864, "train_speed(iter/s)": 0.246967 }, { "epoch": 1.5058312371369769, "grad_norm": 0.7164655327796936, "learning_rate": 7.92408588553373e-05, "loss": 0.11083769798278809, "memory(GiB)": 122.96, "step": 19755, "token_acc": 0.9594258373205742, "train_speed(iter/s)": 0.246977 }, { "epoch": 1.5062123637472369, "grad_norm": 0.8635382056236267, "learning_rate": 7.923114557545966e-05, "loss": 0.1281970262527466, "memory(GiB)": 122.96, "step": 19760, "token_acc": 0.959216528038637, "train_speed(iter/s)": 0.246992 }, { "epoch": 1.506593490357497, "grad_norm": 1.3610992431640625, "learning_rate": 7.922143061931148e-05, "loss": 0.16386263370513915, "memory(GiB)": 122.96, "step": 19765, "token_acc": 0.9484969053934571, "train_speed(iter/s)": 0.247002 }, { "epoch": 1.5069746169677567, "grad_norm": 1.4102039337158203, "learning_rate": 7.921171398744985e-05, "loss": 0.1787477970123291, "memory(GiB)": 122.96, "step": 19770, "token_acc": 0.9304115088658415, "train_speed(iter/s)": 0.247018 }, { "epoch": 1.5073557435780165, "grad_norm": 1.0997235774993896, "learning_rate": 7.920199568043197e-05, "loss": 0.1746086597442627, "memory(GiB)": 122.96, "step": 19775, "token_acc": 0.9540958660918083, "train_speed(iter/s)": 0.247029 }, { "epoch": 1.5077368701882765, "grad_norm": 1.0602174997329712, "learning_rate": 7.919227569881516e-05, "loss": 0.11254098415374755, "memory(GiB)": 122.96, "step": 19780, "token_acc": 0.9547553093259464, "train_speed(iter/s)": 0.247045 }, { "epoch": 1.5081179967985365, "grad_norm": 0.6758850812911987, "learning_rate": 7.918255404315681e-05, "loss": 0.12972047328948974, "memory(GiB)": 122.96, "step": 19785, "token_acc": 0.9575185434929198, "train_speed(iter/s)": 0.247058 }, { "epoch": 1.5084991234087965, "grad_norm": 0.5171512365341187, "learning_rate": 7.917283071401442e-05, "loss": 0.10074751377105713, "memory(GiB)": 122.96, "step": 19790, "token_acc": 0.9550561797752809, "train_speed(iter/s)": 0.247073 }, { "epoch": 1.5088802500190563, "grad_norm": 0.9484192728996277, "learning_rate": 7.916310571194555e-05, "loss": 0.17800108194351197, "memory(GiB)": 122.96, "step": 19795, "token_acc": 0.9264190609670637, "train_speed(iter/s)": 0.247087 }, { "epoch": 1.5092613766293161, "grad_norm": 1.1067534685134888, "learning_rate": 7.91533790375079e-05, "loss": 0.12665514945983886, "memory(GiB)": 122.96, "step": 19800, "token_acc": 0.9539589841097379, "train_speed(iter/s)": 0.247089 }, { "epoch": 1.5092613766293161, "eval_loss": 0.10344883054494858, "eval_runtime": 221.3235, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.9526459249442805, "step": 19800 }, { "epoch": 1.5096425032395762, "grad_norm": 1.7629859447479248, "learning_rate": 7.914365069125927e-05, "loss": 0.06470956802368164, "memory(GiB)": 122.96, "step": 19805, "token_acc": 0.9531598838481855, "train_speed(iter/s)": 0.246421 }, { "epoch": 1.5100236298498362, "grad_norm": 0.5553001165390015, "learning_rate": 7.913392067375748e-05, "loss": 0.10130875110626221, "memory(GiB)": 122.96, "step": 19810, "token_acc": 0.9612034837688044, "train_speed(iter/s)": 0.246429 }, { "epoch": 1.5104047564600962, "grad_norm": 0.595329999923706, "learning_rate": 7.912418898556056e-05, "loss": 0.12129310369491578, "memory(GiB)": 122.96, "step": 19815, "token_acc": 0.9584345479082321, "train_speed(iter/s)": 0.246443 }, { "epoch": 1.510785883070356, "grad_norm": 0.5989564657211304, "learning_rate": 7.911445562722654e-05, "loss": 0.08824927210807801, "memory(GiB)": 122.96, "step": 19820, "token_acc": 0.9641970198675497, "train_speed(iter/s)": 0.246452 }, { "epoch": 1.5111670096806158, "grad_norm": 0.6907153129577637, "learning_rate": 7.910472059931362e-05, "loss": 0.1513066291809082, "memory(GiB)": 122.96, "step": 19825, "token_acc": 0.9443944574410653, "train_speed(iter/s)": 0.246456 }, { "epoch": 1.5115481362908758, "grad_norm": 0.7042458653450012, "learning_rate": 7.909498390238003e-05, "loss": 0.12620601654052735, "memory(GiB)": 122.96, "step": 19830, "token_acc": 0.9533938066937754, "train_speed(iter/s)": 0.24647 }, { "epoch": 1.5119292629011358, "grad_norm": 1.309507131576538, "learning_rate": 7.908524553698412e-05, "loss": 0.14071328639984132, "memory(GiB)": 122.96, "step": 19835, "token_acc": 0.9402332361516035, "train_speed(iter/s)": 0.246487 }, { "epoch": 1.5123103895113958, "grad_norm": 0.42026466131210327, "learning_rate": 7.907550550368436e-05, "loss": 0.11528303623199462, "memory(GiB)": 122.96, "step": 19840, "token_acc": 0.9587301587301588, "train_speed(iter/s)": 0.246493 }, { "epoch": 1.5126915161216556, "grad_norm": 1.0084787607192993, "learning_rate": 7.906576380303928e-05, "loss": 0.08393782377243042, "memory(GiB)": 122.96, "step": 19845, "token_acc": 0.9676624576532183, "train_speed(iter/s)": 0.246511 }, { "epoch": 1.5130726427319154, "grad_norm": 0.8755314350128174, "learning_rate": 7.905602043560753e-05, "loss": 0.1195113182067871, "memory(GiB)": 122.96, "step": 19850, "token_acc": 0.9364274570982839, "train_speed(iter/s)": 0.246526 }, { "epoch": 1.5134537693421755, "grad_norm": 0.4037114381790161, "learning_rate": 7.904627540194784e-05, "loss": 0.0884668469429016, "memory(GiB)": 122.96, "step": 19855, "token_acc": 0.9564174330267893, "train_speed(iter/s)": 0.246544 }, { "epoch": 1.5138348959524355, "grad_norm": 0.8069786429405212, "learning_rate": 7.903652870261906e-05, "loss": 0.1453533411026001, "memory(GiB)": 122.96, "step": 19860, "token_acc": 0.9477557027225901, "train_speed(iter/s)": 0.246553 }, { "epoch": 1.5142160225626953, "grad_norm": 1.2896838188171387, "learning_rate": 7.90267803381801e-05, "loss": 0.15717616081237792, "memory(GiB)": 122.96, "step": 19865, "token_acc": 0.9295173002990175, "train_speed(iter/s)": 0.24657 }, { "epoch": 1.5145971491729553, "grad_norm": 0.8219897747039795, "learning_rate": 7.901703030918999e-05, "loss": 0.08251654505729675, "memory(GiB)": 122.96, "step": 19870, "token_acc": 0.9635773530472412, "train_speed(iter/s)": 0.246586 }, { "epoch": 1.514978275783215, "grad_norm": 1.0965030193328857, "learning_rate": 7.900727861620782e-05, "loss": 0.14219316244125366, "memory(GiB)": 122.96, "step": 19875, "token_acc": 0.9346542198514003, "train_speed(iter/s)": 0.246597 }, { "epoch": 1.515359402393475, "grad_norm": 1.559840440750122, "learning_rate": 7.899752525979287e-05, "loss": 0.14786453247070314, "memory(GiB)": 122.96, "step": 19880, "token_acc": 0.9452887537993921, "train_speed(iter/s)": 0.246617 }, { "epoch": 1.5157405290037351, "grad_norm": 0.6014488935470581, "learning_rate": 7.898777024050439e-05, "loss": 0.06071994304656982, "memory(GiB)": 122.96, "step": 19885, "token_acc": 0.966282165039929, "train_speed(iter/s)": 0.246639 }, { "epoch": 1.516121655613995, "grad_norm": 0.6659355759620667, "learning_rate": 7.897801355890182e-05, "loss": 0.12213494777679443, "memory(GiB)": 122.96, "step": 19890, "token_acc": 0.9601900739176347, "train_speed(iter/s)": 0.246641 }, { "epoch": 1.516502782224255, "grad_norm": 1.918127179145813, "learning_rate": 7.896825521554463e-05, "loss": 0.17732417583465576, "memory(GiB)": 122.96, "step": 19895, "token_acc": 0.942842430484037, "train_speed(iter/s)": 0.246647 }, { "epoch": 1.5168839088345147, "grad_norm": 0.7422630786895752, "learning_rate": 7.895849521099245e-05, "loss": 0.12009958028793336, "memory(GiB)": 122.96, "step": 19900, "token_acc": 0.95625, "train_speed(iter/s)": 0.246652 }, { "epoch": 1.5172650354447748, "grad_norm": 0.893923282623291, "learning_rate": 7.894873354580494e-05, "loss": 0.1087106704711914, "memory(GiB)": 122.96, "step": 19905, "token_acc": 0.9470391993327774, "train_speed(iter/s)": 0.246671 }, { "epoch": 1.5176461620550348, "grad_norm": 0.7962908744812012, "learning_rate": 7.893897022054191e-05, "loss": 0.1157883882522583, "memory(GiB)": 122.96, "step": 19910, "token_acc": 0.953660797034291, "train_speed(iter/s)": 0.246677 }, { "epoch": 1.5180272886652946, "grad_norm": 0.8339529037475586, "learning_rate": 7.892920523576322e-05, "loss": 0.13793869018554689, "memory(GiB)": 122.96, "step": 19915, "token_acc": 0.9509782369751594, "train_speed(iter/s)": 0.246679 }, { "epoch": 1.5184084152755546, "grad_norm": 0.5754239559173584, "learning_rate": 7.891943859202886e-05, "loss": 0.10501706600189209, "memory(GiB)": 122.96, "step": 19920, "token_acc": 0.9563342318059299, "train_speed(iter/s)": 0.246684 }, { "epoch": 1.5187895418858144, "grad_norm": 1.144008994102478, "learning_rate": 7.89096702898989e-05, "loss": 0.09556471109390259, "memory(GiB)": 122.96, "step": 19925, "token_acc": 0.9557595993322203, "train_speed(iter/s)": 0.246702 }, { "epoch": 1.5191706684960744, "grad_norm": 0.7938567399978638, "learning_rate": 7.889990032993351e-05, "loss": 0.12240054607391357, "memory(GiB)": 122.96, "step": 19930, "token_acc": 0.9648703956343793, "train_speed(iter/s)": 0.246717 }, { "epoch": 1.5195517951063344, "grad_norm": 0.8805842399597168, "learning_rate": 7.889012871269294e-05, "loss": 0.14045388698577882, "memory(GiB)": 122.96, "step": 19935, "token_acc": 0.9472774416594641, "train_speed(iter/s)": 0.246729 }, { "epoch": 1.5199329217165942, "grad_norm": 0.5368992686271667, "learning_rate": 7.888035543873757e-05, "loss": 0.12152795791625977, "memory(GiB)": 122.96, "step": 19940, "token_acc": 0.9595177853847299, "train_speed(iter/s)": 0.246733 }, { "epoch": 1.520314048326854, "grad_norm": 0.7631328105926514, "learning_rate": 7.887058050862782e-05, "loss": 0.1380767822265625, "memory(GiB)": 122.96, "step": 19945, "token_acc": 0.9518090605047127, "train_speed(iter/s)": 0.246738 }, { "epoch": 1.520695174937114, "grad_norm": 0.4443354606628418, "learning_rate": 7.886080392292427e-05, "loss": 0.10610029697418213, "memory(GiB)": 122.96, "step": 19950, "token_acc": 0.9418269230769231, "train_speed(iter/s)": 0.246756 }, { "epoch": 1.521076301547374, "grad_norm": 0.8930485844612122, "learning_rate": 7.885102568218754e-05, "loss": 0.16015876531600953, "memory(GiB)": 122.96, "step": 19955, "token_acc": 0.9316123188405797, "train_speed(iter/s)": 0.246767 }, { "epoch": 1.521457428157634, "grad_norm": 1.225701928138733, "learning_rate": 7.884124578697836e-05, "loss": 0.17236390113830566, "memory(GiB)": 122.96, "step": 19960, "token_acc": 0.9354561101549054, "train_speed(iter/s)": 0.246784 }, { "epoch": 1.5218385547678939, "grad_norm": 0.6648107171058655, "learning_rate": 7.883146423785759e-05, "loss": 0.12917861938476563, "memory(GiB)": 122.96, "step": 19965, "token_acc": 0.9461122047244095, "train_speed(iter/s)": 0.246797 }, { "epoch": 1.5222196813781537, "grad_norm": 1.532604694366455, "learning_rate": 7.882168103538614e-05, "loss": 0.13182382583618163, "memory(GiB)": 122.96, "step": 19970, "token_acc": 0.9516650501131587, "train_speed(iter/s)": 0.246803 }, { "epoch": 1.5226008079884137, "grad_norm": 0.5979650616645813, "learning_rate": 7.881189618012501e-05, "loss": 0.11065940856933594, "memory(GiB)": 122.96, "step": 19975, "token_acc": 0.9589864719776681, "train_speed(iter/s)": 0.246817 }, { "epoch": 1.5229819345986737, "grad_norm": 1.0075050592422485, "learning_rate": 7.880210967263535e-05, "loss": 0.07949135899543762, "memory(GiB)": 122.96, "step": 19980, "token_acc": 0.9605445957210336, "train_speed(iter/s)": 0.24683 }, { "epoch": 1.5233630612089337, "grad_norm": 1.3120921850204468, "learning_rate": 7.879232151347837e-05, "loss": 0.12599412202835084, "memory(GiB)": 122.96, "step": 19985, "token_acc": 0.9491468718634996, "train_speed(iter/s)": 0.246834 }, { "epoch": 1.5237441878191935, "grad_norm": 1.3930038213729858, "learning_rate": 7.878253170321534e-05, "loss": 0.14369891881942748, "memory(GiB)": 122.96, "step": 19990, "token_acc": 0.9559837369630546, "train_speed(iter/s)": 0.246845 }, { "epoch": 1.5241253144294533, "grad_norm": 0.6779171824455261, "learning_rate": 7.877274024240772e-05, "loss": 0.09963648915290832, "memory(GiB)": 122.96, "step": 19995, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.246855 }, { "epoch": 1.5245064410397133, "grad_norm": 1.5286799669265747, "learning_rate": 7.876294713161694e-05, "loss": 0.13486528396606445, "memory(GiB)": 122.96, "step": 20000, "token_acc": 0.9574747096290745, "train_speed(iter/s)": 0.246865 }, { "epoch": 1.5245064410397133, "eval_loss": 0.09980521351099014, "eval_runtime": 218.895, "eval_samples_per_second": 2.421, "eval_steps_per_second": 2.421, "eval_token_acc": 0.9535946629721102, "step": 20000 }, { "epoch": 1.5248875676499734, "grad_norm": 1.0427583456039429, "learning_rate": 7.875315237140462e-05, "loss": 0.16351816654205323, "memory(GiB)": 122.96, "step": 20005, "token_acc": 0.9534204292584882, "train_speed(iter/s)": 0.246202 }, { "epoch": 1.5252686942602334, "grad_norm": 0.9279622435569763, "learning_rate": 7.874335596233245e-05, "loss": 0.1350972294807434, "memory(GiB)": 122.96, "step": 20010, "token_acc": 0.9446019067250708, "train_speed(iter/s)": 0.246208 }, { "epoch": 1.5256498208704932, "grad_norm": 1.3505561351776123, "learning_rate": 7.87335579049622e-05, "loss": 0.13941578865051268, "memory(GiB)": 122.96, "step": 20015, "token_acc": 0.9334628460417679, "train_speed(iter/s)": 0.246226 }, { "epoch": 1.526030947480753, "grad_norm": 0.6336546540260315, "learning_rate": 7.872375819985575e-05, "loss": 0.12051969766616821, "memory(GiB)": 122.96, "step": 20020, "token_acc": 0.9616204690831557, "train_speed(iter/s)": 0.246238 }, { "epoch": 1.526412074091013, "grad_norm": 0.7159956693649292, "learning_rate": 7.871395684757505e-05, "loss": 0.09127166867256165, "memory(GiB)": 122.96, "step": 20025, "token_acc": 0.9533545057562525, "train_speed(iter/s)": 0.24625 }, { "epoch": 1.526793200701273, "grad_norm": 0.3750533163547516, "learning_rate": 7.870415384868218e-05, "loss": 0.09657259583473206, "memory(GiB)": 122.96, "step": 20030, "token_acc": 0.9586501901140685, "train_speed(iter/s)": 0.246265 }, { "epoch": 1.527174327311533, "grad_norm": 1.0353111028671265, "learning_rate": 7.869434920373929e-05, "loss": 0.13254839181900024, "memory(GiB)": 122.96, "step": 20035, "token_acc": 0.955221329687233, "train_speed(iter/s)": 0.246279 }, { "epoch": 1.5275554539217928, "grad_norm": 1.0133919715881348, "learning_rate": 7.868454291330864e-05, "loss": 0.13081047534942628, "memory(GiB)": 122.96, "step": 20040, "token_acc": 0.9447674418604651, "train_speed(iter/s)": 0.246284 }, { "epoch": 1.5279365805320526, "grad_norm": 0.5247119069099426, "learning_rate": 7.867473497795255e-05, "loss": 0.1046711802482605, "memory(GiB)": 122.96, "step": 20045, "token_acc": 0.9602190368819455, "train_speed(iter/s)": 0.246288 }, { "epoch": 1.5283177071423126, "grad_norm": 1.0129351615905762, "learning_rate": 7.86649253982335e-05, "loss": 0.11597826480865478, "memory(GiB)": 122.96, "step": 20050, "token_acc": 0.9630872483221476, "train_speed(iter/s)": 0.246302 }, { "epoch": 1.5286988337525727, "grad_norm": 0.47850099205970764, "learning_rate": 7.865511417471398e-05, "loss": 0.0985840380191803, "memory(GiB)": 122.96, "step": 20055, "token_acc": 0.9497655726724715, "train_speed(iter/s)": 0.24631 }, { "epoch": 1.5290799603628327, "grad_norm": 0.7165057063102722, "learning_rate": 7.864530130795663e-05, "loss": 0.132874059677124, "memory(GiB)": 122.96, "step": 20060, "token_acc": 0.9460912302257718, "train_speed(iter/s)": 0.246318 }, { "epoch": 1.5294610869730925, "grad_norm": 1.003940224647522, "learning_rate": 7.863548679852419e-05, "loss": 0.11425788402557373, "memory(GiB)": 122.96, "step": 20065, "token_acc": 0.9625085207907293, "train_speed(iter/s)": 0.246329 }, { "epoch": 1.5298422135833523, "grad_norm": 0.9472588300704956, "learning_rate": 7.862567064697948e-05, "loss": 0.11797839403152466, "memory(GiB)": 122.96, "step": 20070, "token_acc": 0.9489768076398363, "train_speed(iter/s)": 0.246343 }, { "epoch": 1.5302233401936123, "grad_norm": 0.7316445112228394, "learning_rate": 7.861585285388538e-05, "loss": 0.10134296417236328, "memory(GiB)": 122.96, "step": 20075, "token_acc": 0.9595070422535211, "train_speed(iter/s)": 0.246343 }, { "epoch": 1.5306044668038723, "grad_norm": 0.8136548399925232, "learning_rate": 7.860603341980491e-05, "loss": 0.11671496629714966, "memory(GiB)": 122.96, "step": 20080, "token_acc": 0.9531203785862171, "train_speed(iter/s)": 0.246349 }, { "epoch": 1.5309855934141323, "grad_norm": 1.5228571891784668, "learning_rate": 7.859621234530118e-05, "loss": 0.1400763511657715, "memory(GiB)": 122.96, "step": 20085, "token_acc": 0.9529757531227039, "train_speed(iter/s)": 0.246346 }, { "epoch": 1.5313667200243921, "grad_norm": 1.6831352710723877, "learning_rate": 7.858638963093739e-05, "loss": 0.16669340133666993, "memory(GiB)": 122.96, "step": 20090, "token_acc": 0.937200956937799, "train_speed(iter/s)": 0.24636 }, { "epoch": 1.531747846634652, "grad_norm": 0.802392303943634, "learning_rate": 7.85765652772768e-05, "loss": 0.050355559587478636, "memory(GiB)": 122.96, "step": 20095, "token_acc": 0.9795819154107924, "train_speed(iter/s)": 0.24638 }, { "epoch": 1.532128973244912, "grad_norm": 1.094305396080017, "learning_rate": 7.856673928488279e-05, "loss": 0.11495417356491089, "memory(GiB)": 122.96, "step": 20100, "token_acc": 0.9563466494845361, "train_speed(iter/s)": 0.246388 }, { "epoch": 1.532510099855172, "grad_norm": 0.7223188281059265, "learning_rate": 7.855691165431886e-05, "loss": 0.14258017539978027, "memory(GiB)": 122.96, "step": 20105, "token_acc": 0.9469838572642311, "train_speed(iter/s)": 0.246396 }, { "epoch": 1.532891226465432, "grad_norm": 0.9584779143333435, "learning_rate": 7.854708238614857e-05, "loss": 0.12313451766967773, "memory(GiB)": 122.96, "step": 20110, "token_acc": 0.949468085106383, "train_speed(iter/s)": 0.246412 }, { "epoch": 1.5332723530756918, "grad_norm": 0.697218120098114, "learning_rate": 7.853725148093557e-05, "loss": 0.11974886655807496, "memory(GiB)": 122.96, "step": 20115, "token_acc": 0.9582420415001933, "train_speed(iter/s)": 0.246417 }, { "epoch": 1.5336534796859516, "grad_norm": 1.0408462285995483, "learning_rate": 7.852741893924362e-05, "loss": 0.10541114807128907, "memory(GiB)": 122.96, "step": 20120, "token_acc": 0.9675020655466814, "train_speed(iter/s)": 0.246428 }, { "epoch": 1.5340346062962116, "grad_norm": 0.6189677119255066, "learning_rate": 7.85175847616366e-05, "loss": 0.09472081065177917, "memory(GiB)": 122.96, "step": 20125, "token_acc": 0.9610284605433377, "train_speed(iter/s)": 0.246437 }, { "epoch": 1.5344157329064716, "grad_norm": 1.2660642862319946, "learning_rate": 7.850774894867841e-05, "loss": 0.1448038101196289, "memory(GiB)": 122.96, "step": 20130, "token_acc": 0.9532821824381926, "train_speed(iter/s)": 0.246445 }, { "epoch": 1.5347968595167316, "grad_norm": 1.3050897121429443, "learning_rate": 7.849791150093313e-05, "loss": 0.095840322971344, "memory(GiB)": 122.96, "step": 20135, "token_acc": 0.9655744504355039, "train_speed(iter/s)": 0.246455 }, { "epoch": 1.5351779861269914, "grad_norm": 1.060912847518921, "learning_rate": 7.848807241896485e-05, "loss": 0.11860479116439819, "memory(GiB)": 122.96, "step": 20140, "token_acc": 0.9619218826605654, "train_speed(iter/s)": 0.246458 }, { "epoch": 1.5355591127372512, "grad_norm": 0.29924488067626953, "learning_rate": 7.847823170333783e-05, "loss": 0.13371775150299073, "memory(GiB)": 122.96, "step": 20145, "token_acc": 0.9391241141195712, "train_speed(iter/s)": 0.246465 }, { "epoch": 1.5359402393475112, "grad_norm": 0.638933002948761, "learning_rate": 7.846838935461637e-05, "loss": 0.1501001834869385, "memory(GiB)": 122.96, "step": 20150, "token_acc": 0.9536407766990291, "train_speed(iter/s)": 0.246476 }, { "epoch": 1.5363213659577712, "grad_norm": 0.9239512085914612, "learning_rate": 7.84585453733649e-05, "loss": 0.12085071802139283, "memory(GiB)": 122.96, "step": 20155, "token_acc": 0.9536002482929857, "train_speed(iter/s)": 0.246482 }, { "epoch": 1.5367024925680313, "grad_norm": 0.7460366487503052, "learning_rate": 7.844869976014793e-05, "loss": 0.08245945572853089, "memory(GiB)": 122.96, "step": 20160, "token_acc": 0.969331158238173, "train_speed(iter/s)": 0.246487 }, { "epoch": 1.537083619178291, "grad_norm": 0.3189503252506256, "learning_rate": 7.843885251553002e-05, "loss": 0.13522355556488036, "memory(GiB)": 122.96, "step": 20165, "token_acc": 0.9434980124929018, "train_speed(iter/s)": 0.246501 }, { "epoch": 1.5374647457885509, "grad_norm": 0.7383835315704346, "learning_rate": 7.84290036400759e-05, "loss": 0.1545405864715576, "memory(GiB)": 122.96, "step": 20170, "token_acc": 0.9442487364068004, "train_speed(iter/s)": 0.246509 }, { "epoch": 1.5378458723988109, "grad_norm": 1.2051562070846558, "learning_rate": 7.841915313435036e-05, "loss": 0.10371520519256591, "memory(GiB)": 122.96, "step": 20175, "token_acc": 0.9559489501852614, "train_speed(iter/s)": 0.246527 }, { "epoch": 1.538226999009071, "grad_norm": 1.144430160522461, "learning_rate": 7.840930099891824e-05, "loss": 0.14800443649291992, "memory(GiB)": 122.96, "step": 20180, "token_acc": 0.9489953632148377, "train_speed(iter/s)": 0.246541 }, { "epoch": 1.5386081256193307, "grad_norm": 0.6102069616317749, "learning_rate": 7.839944723434458e-05, "loss": 0.10440422296524048, "memory(GiB)": 122.96, "step": 20185, "token_acc": 0.9584139264990329, "train_speed(iter/s)": 0.246538 }, { "epoch": 1.5389892522295907, "grad_norm": 1.077303171157837, "learning_rate": 7.838959184119438e-05, "loss": 0.10544029474258423, "memory(GiB)": 122.96, "step": 20190, "token_acc": 0.9534090909090909, "train_speed(iter/s)": 0.246557 }, { "epoch": 1.5393703788398505, "grad_norm": 0.6479737162590027, "learning_rate": 7.837973482003284e-05, "loss": 0.09800500869750976, "memory(GiB)": 122.96, "step": 20195, "token_acc": 0.9662089391491653, "train_speed(iter/s)": 0.246563 }, { "epoch": 1.5397515054501105, "grad_norm": 0.894629180431366, "learning_rate": 7.836987617142522e-05, "loss": 0.10362192392349243, "memory(GiB)": 122.96, "step": 20200, "token_acc": 0.9600333055786844, "train_speed(iter/s)": 0.246576 }, { "epoch": 1.5397515054501105, "eval_loss": 0.09949084371328354, "eval_runtime": 220.2188, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "eval_token_acc": 0.953677489307873, "step": 20200 }, { "epoch": 1.5401326320603705, "grad_norm": 0.3351824879646301, "learning_rate": 7.836001589593683e-05, "loss": 0.07862884402275086, "memory(GiB)": 122.96, "step": 20205, "token_acc": 0.9544074693382516, "train_speed(iter/s)": 0.245921 }, { "epoch": 1.5405137586706303, "grad_norm": 1.110632061958313, "learning_rate": 7.835015399413314e-05, "loss": 0.10955632925033569, "memory(GiB)": 122.96, "step": 20210, "token_acc": 0.9600696770735629, "train_speed(iter/s)": 0.245927 }, { "epoch": 1.5408948852808904, "grad_norm": 0.8664857149124146, "learning_rate": 7.834029046657969e-05, "loss": 0.11455215215682983, "memory(GiB)": 122.96, "step": 20215, "token_acc": 0.9418789808917197, "train_speed(iter/s)": 0.245942 }, { "epoch": 1.5412760118911502, "grad_norm": 0.8753293752670288, "learning_rate": 7.833042531384209e-05, "loss": 0.14046541452407837, "memory(GiB)": 122.96, "step": 20220, "token_acc": 0.9419804741980474, "train_speed(iter/s)": 0.245955 }, { "epoch": 1.5416571385014102, "grad_norm": 0.9860674738883972, "learning_rate": 7.832055853648607e-05, "loss": 0.15805554389953613, "memory(GiB)": 122.96, "step": 20225, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.245963 }, { "epoch": 1.5420382651116702, "grad_norm": 1.1086833477020264, "learning_rate": 7.831069013507745e-05, "loss": 0.14654064178466797, "memory(GiB)": 122.96, "step": 20230, "token_acc": 0.9422556971803785, "train_speed(iter/s)": 0.245975 }, { "epoch": 1.54241939172193, "grad_norm": 1.4926705360412598, "learning_rate": 7.830082011018212e-05, "loss": 0.11448988914489747, "memory(GiB)": 122.96, "step": 20235, "token_acc": 0.9663665704093766, "train_speed(iter/s)": 0.245983 }, { "epoch": 1.54280051833219, "grad_norm": 0.679052472114563, "learning_rate": 7.829094846236608e-05, "loss": 0.12560720443725587, "memory(GiB)": 122.96, "step": 20240, "token_acc": 0.9578141420207176, "train_speed(iter/s)": 0.245986 }, { "epoch": 1.5431816449424498, "grad_norm": 0.6988186836242676, "learning_rate": 7.828107519219545e-05, "loss": 0.13093364238739014, "memory(GiB)": 122.96, "step": 20245, "token_acc": 0.9502868068833652, "train_speed(iter/s)": 0.246001 }, { "epoch": 1.5435627715527098, "grad_norm": 0.6624094843864441, "learning_rate": 7.82712003002364e-05, "loss": 0.0833030879497528, "memory(GiB)": 122.96, "step": 20250, "token_acc": 0.9670095778644909, "train_speed(iter/s)": 0.246006 }, { "epoch": 1.5439438981629698, "grad_norm": 1.5297411680221558, "learning_rate": 7.826132378705518e-05, "loss": 0.15886261463165283, "memory(GiB)": 122.96, "step": 20255, "token_acc": 0.9508134695421869, "train_speed(iter/s)": 0.246012 }, { "epoch": 1.5443250247732296, "grad_norm": 1.0863755941390991, "learning_rate": 7.825144565321822e-05, "loss": 0.14241271018981932, "memory(GiB)": 122.96, "step": 20260, "token_acc": 0.9525147928994083, "train_speed(iter/s)": 0.246023 }, { "epoch": 1.5447061513834894, "grad_norm": 0.8025416135787964, "learning_rate": 7.824156589929193e-05, "loss": 0.13128538131713868, "memory(GiB)": 122.96, "step": 20265, "token_acc": 0.9482948294829483, "train_speed(iter/s)": 0.246024 }, { "epoch": 1.5450872779937495, "grad_norm": 0.7588363289833069, "learning_rate": 7.823168452584291e-05, "loss": 0.14459774494171143, "memory(GiB)": 122.96, "step": 20270, "token_acc": 0.9373315363881402, "train_speed(iter/s)": 0.246041 }, { "epoch": 1.5454684046040095, "grad_norm": 0.9858483672142029, "learning_rate": 7.822180153343776e-05, "loss": 0.14615097045898437, "memory(GiB)": 122.96, "step": 20275, "token_acc": 0.9418269230769231, "train_speed(iter/s)": 0.246057 }, { "epoch": 1.5458495312142695, "grad_norm": 0.7694418430328369, "learning_rate": 7.82119169226433e-05, "loss": 0.12661248445510864, "memory(GiB)": 122.96, "step": 20280, "token_acc": 0.9445916646591183, "train_speed(iter/s)": 0.246068 }, { "epoch": 1.5462306578245293, "grad_norm": 0.6671009063720703, "learning_rate": 7.820203069402631e-05, "loss": 0.11800553798675537, "memory(GiB)": 122.96, "step": 20285, "token_acc": 0.9579511614055986, "train_speed(iter/s)": 0.246073 }, { "epoch": 1.546611784434789, "grad_norm": 1.6451328992843628, "learning_rate": 7.819214284815373e-05, "loss": 0.11763629913330079, "memory(GiB)": 122.96, "step": 20290, "token_acc": 0.9555199358845923, "train_speed(iter/s)": 0.246081 }, { "epoch": 1.546992911045049, "grad_norm": 0.8875593543052673, "learning_rate": 7.818225338559257e-05, "loss": 0.0714452862739563, "memory(GiB)": 122.96, "step": 20295, "token_acc": 0.9728872223824633, "train_speed(iter/s)": 0.246096 }, { "epoch": 1.5473740376553091, "grad_norm": 1.113953709602356, "learning_rate": 7.817236230690999e-05, "loss": 0.09304338097572326, "memory(GiB)": 122.96, "step": 20300, "token_acc": 0.9561042524005487, "train_speed(iter/s)": 0.246113 }, { "epoch": 1.5477551642655691, "grad_norm": 0.7504996657371521, "learning_rate": 7.816246961267315e-05, "loss": 0.11942956447601319, "memory(GiB)": 122.96, "step": 20305, "token_acc": 0.9483532934131736, "train_speed(iter/s)": 0.246127 }, { "epoch": 1.548136290875829, "grad_norm": 0.8645687699317932, "learning_rate": 7.815257530344938e-05, "loss": 0.11602596044540406, "memory(GiB)": 122.96, "step": 20310, "token_acc": 0.951325220870145, "train_speed(iter/s)": 0.246137 }, { "epoch": 1.5485174174860887, "grad_norm": 0.58597332239151, "learning_rate": 7.814267937980603e-05, "loss": 0.12744873762130737, "memory(GiB)": 122.96, "step": 20315, "token_acc": 0.9494235775381182, "train_speed(iter/s)": 0.246147 }, { "epoch": 1.5488985440963488, "grad_norm": 0.5203577280044556, "learning_rate": 7.813278184231065e-05, "loss": 0.13020039796829225, "memory(GiB)": 122.96, "step": 20320, "token_acc": 0.9497214683616626, "train_speed(iter/s)": 0.246149 }, { "epoch": 1.5492796707066088, "grad_norm": 1.3316888809204102, "learning_rate": 7.812288269153076e-05, "loss": 0.1589285612106323, "memory(GiB)": 122.96, "step": 20325, "token_acc": 0.955716222322639, "train_speed(iter/s)": 0.246158 }, { "epoch": 1.5496607973168688, "grad_norm": 0.6558024287223816, "learning_rate": 7.811298192803407e-05, "loss": 0.13084545135498046, "memory(GiB)": 122.96, "step": 20330, "token_acc": 0.9482501861504096, "train_speed(iter/s)": 0.246171 }, { "epoch": 1.5500419239271286, "grad_norm": 0.772087812423706, "learning_rate": 7.810307955238831e-05, "loss": 0.14036908149719238, "memory(GiB)": 122.96, "step": 20335, "token_acc": 0.9391684193496198, "train_speed(iter/s)": 0.246181 }, { "epoch": 1.5504230505373884, "grad_norm": 1.0317705869674683, "learning_rate": 7.809317556516135e-05, "loss": 0.1289795994758606, "memory(GiB)": 122.96, "step": 20340, "token_acc": 0.9468509984639016, "train_speed(iter/s)": 0.246197 }, { "epoch": 1.5508041771476484, "grad_norm": 1.314374566078186, "learning_rate": 7.808326996692116e-05, "loss": 0.1446303367614746, "memory(GiB)": 122.96, "step": 20345, "token_acc": 0.9512534818941504, "train_speed(iter/s)": 0.246209 }, { "epoch": 1.5511853037579084, "grad_norm": 0.8148555755615234, "learning_rate": 7.807336275823576e-05, "loss": 0.1161515474319458, "memory(GiB)": 122.96, "step": 20350, "token_acc": 0.9390439525184472, "train_speed(iter/s)": 0.24622 }, { "epoch": 1.5515664303681684, "grad_norm": 0.771094024181366, "learning_rate": 7.806345393967327e-05, "loss": 0.20508849620819092, "memory(GiB)": 122.96, "step": 20355, "token_acc": 0.9269553975436329, "train_speed(iter/s)": 0.246232 }, { "epoch": 1.5519475569784282, "grad_norm": 0.6842091679573059, "learning_rate": 7.805354351180192e-05, "loss": 0.12329316139221191, "memory(GiB)": 122.96, "step": 20360, "token_acc": 0.959165815954499, "train_speed(iter/s)": 0.24624 }, { "epoch": 1.552328683588688, "grad_norm": 0.7424257397651672, "learning_rate": 7.804363147519006e-05, "loss": 0.09223066568374634, "memory(GiB)": 122.96, "step": 20365, "token_acc": 0.9575612671846981, "train_speed(iter/s)": 0.24625 }, { "epoch": 1.552709810198948, "grad_norm": 0.6114147305488586, "learning_rate": 7.803371783040605e-05, "loss": 0.1305585265159607, "memory(GiB)": 122.96, "step": 20370, "token_acc": 0.9541335563920257, "train_speed(iter/s)": 0.246252 }, { "epoch": 1.553090936809208, "grad_norm": 0.86359041929245, "learning_rate": 7.802380257801843e-05, "loss": 0.13010406494140625, "memory(GiB)": 122.96, "step": 20375, "token_acc": 0.958300395256917, "train_speed(iter/s)": 0.246256 }, { "epoch": 1.553472063419468, "grad_norm": 0.4465113878250122, "learning_rate": 7.801388571859577e-05, "loss": 0.10984261035919189, "memory(GiB)": 122.96, "step": 20380, "token_acc": 0.9614594850734481, "train_speed(iter/s)": 0.246264 }, { "epoch": 1.5538531900297279, "grad_norm": 1.3323390483856201, "learning_rate": 7.800396725270675e-05, "loss": 0.14129831790924072, "memory(GiB)": 122.96, "step": 20385, "token_acc": 0.9375109706863262, "train_speed(iter/s)": 0.246273 }, { "epoch": 1.5542343166399877, "grad_norm": 1.0194460153579712, "learning_rate": 7.79940471809202e-05, "loss": 0.1262844443321228, "memory(GiB)": 122.96, "step": 20390, "token_acc": 0.9443878293783666, "train_speed(iter/s)": 0.246276 }, { "epoch": 1.5546154432502477, "grad_norm": 1.1700646877288818, "learning_rate": 7.798412550380492e-05, "loss": 0.17530068159103393, "memory(GiB)": 122.96, "step": 20395, "token_acc": 0.9373977086743044, "train_speed(iter/s)": 0.246284 }, { "epoch": 1.5549965698605077, "grad_norm": 0.6830636262893677, "learning_rate": 7.79742022219299e-05, "loss": 0.11403472423553467, "memory(GiB)": 122.96, "step": 20400, "token_acc": 0.9486501793468001, "train_speed(iter/s)": 0.246295 }, { "epoch": 1.5549965698605077, "eval_loss": 0.09771151095628738, "eval_runtime": 221.6361, "eval_samples_per_second": 2.391, "eval_steps_per_second": 2.391, "eval_token_acc": 0.9539636166495994, "step": 20400 }, { "epoch": 1.5553776964707677, "grad_norm": 1.5672130584716797, "learning_rate": 7.796427733586422e-05, "loss": 0.14692559242248535, "memory(GiB)": 122.96, "step": 20405, "token_acc": 0.9534975894845317, "train_speed(iter/s)": 0.245644 }, { "epoch": 1.5557588230810275, "grad_norm": 0.9636224508285522, "learning_rate": 7.795435084617699e-05, "loss": 0.09436936974525452, "memory(GiB)": 122.96, "step": 20410, "token_acc": 0.9514285714285714, "train_speed(iter/s)": 0.245658 }, { "epoch": 1.5561399496912873, "grad_norm": 0.9270995259284973, "learning_rate": 7.794442275343748e-05, "loss": 0.10201631784439087, "memory(GiB)": 122.96, "step": 20415, "token_acc": 0.9603230337078652, "train_speed(iter/s)": 0.245664 }, { "epoch": 1.5565210763015473, "grad_norm": 1.011034369468689, "learning_rate": 7.793449305821499e-05, "loss": 0.09668781757354736, "memory(GiB)": 122.96, "step": 20420, "token_acc": 0.9625256673511293, "train_speed(iter/s)": 0.245677 }, { "epoch": 1.5569022029118074, "grad_norm": 1.719583511352539, "learning_rate": 7.792456176107896e-05, "loss": 0.09334666728973388, "memory(GiB)": 122.96, "step": 20425, "token_acc": 0.9604768692497084, "train_speed(iter/s)": 0.245676 }, { "epoch": 1.5572833295220674, "grad_norm": 0.6798993945121765, "learning_rate": 7.79146288625989e-05, "loss": 0.10918974876403809, "memory(GiB)": 122.96, "step": 20430, "token_acc": 0.9518205731690433, "train_speed(iter/s)": 0.245686 }, { "epoch": 1.5576644561323272, "grad_norm": 0.6507182717323303, "learning_rate": 7.790469436334442e-05, "loss": 0.12821564674377442, "memory(GiB)": 122.96, "step": 20435, "token_acc": 0.9455395244868929, "train_speed(iter/s)": 0.245685 }, { "epoch": 1.558045582742587, "grad_norm": 0.7128759622573853, "learning_rate": 7.789475826388519e-05, "loss": 0.10457488298416137, "memory(GiB)": 122.96, "step": 20440, "token_acc": 0.9502840909090909, "train_speed(iter/s)": 0.245701 }, { "epoch": 1.558426709352847, "grad_norm": 0.23801110684871674, "learning_rate": 7.788482056479104e-05, "loss": 0.0810682713985443, "memory(GiB)": 122.96, "step": 20445, "token_acc": 0.9634680134680135, "train_speed(iter/s)": 0.24571 }, { "epoch": 1.558807835963107, "grad_norm": 0.7529171109199524, "learning_rate": 7.787488126663183e-05, "loss": 0.09499533176422119, "memory(GiB)": 122.96, "step": 20450, "token_acc": 0.9625514403292181, "train_speed(iter/s)": 0.245718 }, { "epoch": 1.559188962573367, "grad_norm": 0.6019001007080078, "learning_rate": 7.786494036997754e-05, "loss": 0.16096495389938353, "memory(GiB)": 122.96, "step": 20455, "token_acc": 0.950214387803716, "train_speed(iter/s)": 0.245729 }, { "epoch": 1.5595700891836268, "grad_norm": 0.7061730623245239, "learning_rate": 7.78549978753982e-05, "loss": 0.13868269920349122, "memory(GiB)": 122.96, "step": 20460, "token_acc": 0.9477531131564699, "train_speed(iter/s)": 0.245744 }, { "epoch": 1.5599512157938866, "grad_norm": 0.707014799118042, "learning_rate": 7.784505378346402e-05, "loss": 0.13204164505004884, "memory(GiB)": 122.96, "step": 20465, "token_acc": 0.9571034189849225, "train_speed(iter/s)": 0.245735 }, { "epoch": 1.5603323424041466, "grad_norm": 0.9085134267807007, "learning_rate": 7.783510809474522e-05, "loss": 0.13544275760650634, "memory(GiB)": 122.96, "step": 20470, "token_acc": 0.945273631840796, "train_speed(iter/s)": 0.245747 }, { "epoch": 1.5607134690144067, "grad_norm": 2.0543816089630127, "learning_rate": 7.782516080981214e-05, "loss": 0.1424916625022888, "memory(GiB)": 122.96, "step": 20475, "token_acc": 0.9488989271597967, "train_speed(iter/s)": 0.245753 }, { "epoch": 1.5610945956246665, "grad_norm": 1.0858070850372314, "learning_rate": 7.78152119292352e-05, "loss": 0.11884559392929077, "memory(GiB)": 122.96, "step": 20480, "token_acc": 0.9438669438669439, "train_speed(iter/s)": 0.245771 }, { "epoch": 1.5614757222349265, "grad_norm": 0.7581131458282471, "learning_rate": 7.780526145358496e-05, "loss": 0.12399560213088989, "memory(GiB)": 122.96, "step": 20485, "token_acc": 0.9427042342200536, "train_speed(iter/s)": 0.245779 }, { "epoch": 1.5618568488451863, "grad_norm": 1.1300866603851318, "learning_rate": 7.779530938343198e-05, "loss": 0.09838156700134278, "memory(GiB)": 122.96, "step": 20490, "token_acc": 0.9652198107957707, "train_speed(iter/s)": 0.245793 }, { "epoch": 1.5622379754554463, "grad_norm": 0.7925743460655212, "learning_rate": 7.778535571934702e-05, "loss": 0.16609673500061034, "memory(GiB)": 122.96, "step": 20495, "token_acc": 0.9350500715307583, "train_speed(iter/s)": 0.245808 }, { "epoch": 1.5626191020657063, "grad_norm": 0.7945945858955383, "learning_rate": 7.777540046190083e-05, "loss": 0.14994269609451294, "memory(GiB)": 122.96, "step": 20500, "token_acc": 0.9257748776508973, "train_speed(iter/s)": 0.245821 }, { "epoch": 1.563000228675966, "grad_norm": 0.822462797164917, "learning_rate": 7.776544361166431e-05, "loss": 0.154620897769928, "memory(GiB)": 122.96, "step": 20505, "token_acc": 0.9416574933165592, "train_speed(iter/s)": 0.245822 }, { "epoch": 1.5633813552862261, "grad_norm": 0.7540737390518188, "learning_rate": 7.775548516920847e-05, "loss": 0.118274986743927, "memory(GiB)": 122.96, "step": 20510, "token_acc": 0.955232789494628, "train_speed(iter/s)": 0.245831 }, { "epoch": 1.563762481896486, "grad_norm": 0.6483361721038818, "learning_rate": 7.774552513510434e-05, "loss": 0.12014776468276978, "memory(GiB)": 122.96, "step": 20515, "token_acc": 0.9550882658359294, "train_speed(iter/s)": 0.245845 }, { "epoch": 1.564143608506746, "grad_norm": 0.5103482604026794, "learning_rate": 7.77355635099231e-05, "loss": 0.11107646226882935, "memory(GiB)": 122.96, "step": 20520, "token_acc": 0.9500192233756247, "train_speed(iter/s)": 0.245854 }, { "epoch": 1.564524735117006, "grad_norm": 0.6634088158607483, "learning_rate": 7.772560029423601e-05, "loss": 0.09796789884567261, "memory(GiB)": 122.96, "step": 20525, "token_acc": 0.9694431869624265, "train_speed(iter/s)": 0.245863 }, { "epoch": 1.5649058617272658, "grad_norm": 1.8865690231323242, "learning_rate": 7.77156354886144e-05, "loss": 0.12131747007369995, "memory(GiB)": 122.96, "step": 20530, "token_acc": 0.9557938299473289, "train_speed(iter/s)": 0.245874 }, { "epoch": 1.5652869883375258, "grad_norm": 0.4541265368461609, "learning_rate": 7.770566909362972e-05, "loss": 0.0987938940525055, "memory(GiB)": 122.96, "step": 20535, "token_acc": 0.9633286318758815, "train_speed(iter/s)": 0.24588 }, { "epoch": 1.5656681149477856, "grad_norm": 0.8200079798698425, "learning_rate": 7.769570110985348e-05, "loss": 0.10751773118972778, "memory(GiB)": 122.96, "step": 20540, "token_acc": 0.964881028120626, "train_speed(iter/s)": 0.245883 }, { "epoch": 1.5660492415580456, "grad_norm": 0.8397249579429626, "learning_rate": 7.76857315378573e-05, "loss": 0.11798399686813354, "memory(GiB)": 122.96, "step": 20545, "token_acc": 0.9525673497582317, "train_speed(iter/s)": 0.245896 }, { "epoch": 1.5664303681683056, "grad_norm": 0.9297528266906738, "learning_rate": 7.767576037821289e-05, "loss": 0.11577843427658081, "memory(GiB)": 122.96, "step": 20550, "token_acc": 0.9617235345581803, "train_speed(iter/s)": 0.245908 }, { "epoch": 1.5668114947785654, "grad_norm": 1.9333354234695435, "learning_rate": 7.766578763149207e-05, "loss": 0.18470335006713867, "memory(GiB)": 122.96, "step": 20555, "token_acc": 0.925050641458474, "train_speed(iter/s)": 0.245926 }, { "epoch": 1.5671926213888252, "grad_norm": 1.2132799625396729, "learning_rate": 7.76558132982667e-05, "loss": 0.15248109102249147, "memory(GiB)": 122.96, "step": 20560, "token_acc": 0.9378453038674033, "train_speed(iter/s)": 0.245938 }, { "epoch": 1.5675737479990852, "grad_norm": 0.7555849552154541, "learning_rate": 7.764583737910878e-05, "loss": 0.11216704845428467, "memory(GiB)": 122.96, "step": 20565, "token_acc": 0.9554089709762533, "train_speed(iter/s)": 0.245951 }, { "epoch": 1.5679548746093452, "grad_norm": 0.7011930346488953, "learning_rate": 7.763585987459039e-05, "loss": 0.1146240234375, "memory(GiB)": 122.96, "step": 20570, "token_acc": 0.9597523219814241, "train_speed(iter/s)": 0.245957 }, { "epoch": 1.5683360012196053, "grad_norm": 0.3983931541442871, "learning_rate": 7.762588078528367e-05, "loss": 0.07913058996200562, "memory(GiB)": 122.96, "step": 20575, "token_acc": 0.9615598885793872, "train_speed(iter/s)": 0.24597 }, { "epoch": 1.568717127829865, "grad_norm": 1.6100703477859497, "learning_rate": 7.761590011176089e-05, "loss": 0.1341947555541992, "memory(GiB)": 122.96, "step": 20580, "token_acc": 0.9433035714285715, "train_speed(iter/s)": 0.245986 }, { "epoch": 1.5690982544401249, "grad_norm": 1.7765450477600098, "learning_rate": 7.760591785459438e-05, "loss": 0.130311918258667, "memory(GiB)": 122.96, "step": 20585, "token_acc": 0.9407303931351028, "train_speed(iter/s)": 0.245998 }, { "epoch": 1.5694793810503849, "grad_norm": 0.8172553181648254, "learning_rate": 7.759593401435661e-05, "loss": 0.08370200395584107, "memory(GiB)": 122.96, "step": 20590, "token_acc": 0.9521885521885521, "train_speed(iter/s)": 0.24601 }, { "epoch": 1.5698605076606449, "grad_norm": 1.365710973739624, "learning_rate": 7.758594859162007e-05, "loss": 0.14865376949310302, "memory(GiB)": 122.96, "step": 20595, "token_acc": 0.9477255032810589, "train_speed(iter/s)": 0.246013 }, { "epoch": 1.570241634270905, "grad_norm": 1.4803544282913208, "learning_rate": 7.75759615869574e-05, "loss": 0.132806396484375, "memory(GiB)": 122.96, "step": 20600, "token_acc": 0.9496957403651115, "train_speed(iter/s)": 0.246024 }, { "epoch": 1.570241634270905, "eval_loss": 0.100025475025177, "eval_runtime": 220.3654, "eval_samples_per_second": 2.405, "eval_steps_per_second": 2.405, "eval_token_acc": 0.954588579001265, "step": 20600 }, { "epoch": 1.5706227608811647, "grad_norm": 0.5783364176750183, "learning_rate": 7.75659730009413e-05, "loss": 0.08317768573760986, "memory(GiB)": 122.96, "step": 20605, "token_acc": 0.9551393916667869, "train_speed(iter/s)": 0.245389 }, { "epoch": 1.5710038874914245, "grad_norm": 1.2214646339416504, "learning_rate": 7.755598283414455e-05, "loss": 0.15190064907073975, "memory(GiB)": 122.96, "step": 20610, "token_acc": 0.9497925311203319, "train_speed(iter/s)": 0.245406 }, { "epoch": 1.5713850141016845, "grad_norm": 0.9062264561653137, "learning_rate": 7.75459910871401e-05, "loss": 0.16756923198699952, "memory(GiB)": 122.96, "step": 20615, "token_acc": 0.9272624220668808, "train_speed(iter/s)": 0.245418 }, { "epoch": 1.5717661407119445, "grad_norm": 0.12439700216054916, "learning_rate": 7.753599776050087e-05, "loss": 0.10593580007553101, "memory(GiB)": 122.96, "step": 20620, "token_acc": 0.9475497702909648, "train_speed(iter/s)": 0.245438 }, { "epoch": 1.5721472673222046, "grad_norm": 0.8501498103141785, "learning_rate": 7.752600285479994e-05, "loss": 0.15638244152069092, "memory(GiB)": 122.96, "step": 20625, "token_acc": 0.9282670454545454, "train_speed(iter/s)": 0.245447 }, { "epoch": 1.5725283939324644, "grad_norm": 0.6716774106025696, "learning_rate": 7.75160063706105e-05, "loss": 0.12386003732681275, "memory(GiB)": 122.96, "step": 20630, "token_acc": 0.9584285061671997, "train_speed(iter/s)": 0.245458 }, { "epoch": 1.5729095205427241, "grad_norm": 0.9133725166320801, "learning_rate": 7.750600830850578e-05, "loss": 0.13842649459838868, "memory(GiB)": 122.96, "step": 20635, "token_acc": 0.9465688842325825, "train_speed(iter/s)": 0.245471 }, { "epoch": 1.5732906471529842, "grad_norm": 1.2853803634643555, "learning_rate": 7.749600866905913e-05, "loss": 0.12650713920593262, "memory(GiB)": 122.96, "step": 20640, "token_acc": 0.9388573337788172, "train_speed(iter/s)": 0.245489 }, { "epoch": 1.5736717737632442, "grad_norm": 0.9152953624725342, "learning_rate": 7.748600745284396e-05, "loss": 0.11206097602844238, "memory(GiB)": 122.96, "step": 20645, "token_acc": 0.9528985507246377, "train_speed(iter/s)": 0.245492 }, { "epoch": 1.5740529003735042, "grad_norm": 1.8143556118011475, "learning_rate": 7.747600466043384e-05, "loss": 0.11714316606521606, "memory(GiB)": 122.96, "step": 20650, "token_acc": 0.9478210173444105, "train_speed(iter/s)": 0.245501 }, { "epoch": 1.574434026983764, "grad_norm": 0.5015907883644104, "learning_rate": 7.746600029240234e-05, "loss": 0.11515450477600098, "memory(GiB)": 122.96, "step": 20655, "token_acc": 0.9554274735830932, "train_speed(iter/s)": 0.245512 }, { "epoch": 1.5748151535940238, "grad_norm": 0.8459840416908264, "learning_rate": 7.745599434932319e-05, "loss": 0.13441752195358275, "memory(GiB)": 122.96, "step": 20660, "token_acc": 0.9456540373735642, "train_speed(iter/s)": 0.245516 }, { "epoch": 1.5751962802042838, "grad_norm": 1.6829885244369507, "learning_rate": 7.744598683177015e-05, "loss": 0.1081918478012085, "memory(GiB)": 122.96, "step": 20665, "token_acc": 0.960492413398225, "train_speed(iter/s)": 0.245528 }, { "epoch": 1.5755774068145438, "grad_norm": 0.6918957829475403, "learning_rate": 7.743597774031717e-05, "loss": 0.09591987133026122, "memory(GiB)": 122.96, "step": 20670, "token_acc": 0.9625374027528426, "train_speed(iter/s)": 0.245525 }, { "epoch": 1.5759585334248039, "grad_norm": 0.4692426919937134, "learning_rate": 7.742596707553815e-05, "loss": 0.11996984481811523, "memory(GiB)": 122.96, "step": 20675, "token_acc": 0.9552818315704914, "train_speed(iter/s)": 0.245527 }, { "epoch": 1.5763396600350637, "grad_norm": 0.6507902145385742, "learning_rate": 7.741595483800721e-05, "loss": 0.09089620709419251, "memory(GiB)": 122.96, "step": 20680, "token_acc": 0.9595134955014994, "train_speed(iter/s)": 0.245534 }, { "epoch": 1.5767207866453234, "grad_norm": 0.6726707220077515, "learning_rate": 7.740594102829848e-05, "loss": 0.11457359790802002, "memory(GiB)": 122.96, "step": 20685, "token_acc": 0.9593358999037536, "train_speed(iter/s)": 0.245546 }, { "epoch": 1.5771019132555835, "grad_norm": 0.7749638557434082, "learning_rate": 7.739592564698621e-05, "loss": 0.09378604292869568, "memory(GiB)": 122.96, "step": 20690, "token_acc": 0.9682210708117444, "train_speed(iter/s)": 0.245548 }, { "epoch": 1.5774830398658435, "grad_norm": 0.5079705715179443, "learning_rate": 7.738590869464474e-05, "loss": 0.07211803793907165, "memory(GiB)": 122.96, "step": 20695, "token_acc": 0.9638326585695006, "train_speed(iter/s)": 0.245562 }, { "epoch": 1.5778641664761035, "grad_norm": 1.094911813735962, "learning_rate": 7.73758901718485e-05, "loss": 0.10836167335510254, "memory(GiB)": 122.96, "step": 20700, "token_acc": 0.962747175141243, "train_speed(iter/s)": 0.245568 }, { "epoch": 1.5782452930863633, "grad_norm": 0.7047770619392395, "learning_rate": 7.736587007917198e-05, "loss": 0.10340862274169922, "memory(GiB)": 122.96, "step": 20705, "token_acc": 0.9541984732824428, "train_speed(iter/s)": 0.245583 }, { "epoch": 1.578626419696623, "grad_norm": 0.5577402114868164, "learning_rate": 7.735584841718981e-05, "loss": 0.09742294549942017, "memory(GiB)": 122.96, "step": 20710, "token_acc": 0.9578016241299304, "train_speed(iter/s)": 0.245593 }, { "epoch": 1.5790075463068831, "grad_norm": 1.1945008039474487, "learning_rate": 7.73458251864767e-05, "loss": 0.1780307412147522, "memory(GiB)": 122.96, "step": 20715, "token_acc": 0.9453531598513011, "train_speed(iter/s)": 0.245602 }, { "epoch": 1.5793886729171431, "grad_norm": 1.10934579372406, "learning_rate": 7.733580038760739e-05, "loss": 0.11831210851669312, "memory(GiB)": 122.96, "step": 20720, "token_acc": 0.949410163339383, "train_speed(iter/s)": 0.245615 }, { "epoch": 1.5797697995274032, "grad_norm": 0.9987887144088745, "learning_rate": 7.732577402115679e-05, "loss": 0.0815968632698059, "memory(GiB)": 122.96, "step": 20725, "token_acc": 0.9653708668453976, "train_speed(iter/s)": 0.245627 }, { "epoch": 1.580150926137663, "grad_norm": 1.2202340364456177, "learning_rate": 7.731574608769987e-05, "loss": 0.09722353219985962, "memory(GiB)": 122.96, "step": 20730, "token_acc": 0.9539249146757679, "train_speed(iter/s)": 0.245637 }, { "epoch": 1.5805320527479227, "grad_norm": 0.8471435308456421, "learning_rate": 7.730571658781165e-05, "loss": 0.1377018451690674, "memory(GiB)": 122.96, "step": 20735, "token_acc": 0.9464581140021056, "train_speed(iter/s)": 0.245644 }, { "epoch": 1.5809131793581828, "grad_norm": 0.7374142408370972, "learning_rate": 7.729568552206732e-05, "loss": 0.13249988555908204, "memory(GiB)": 122.96, "step": 20740, "token_acc": 0.950739667530883, "train_speed(iter/s)": 0.245653 }, { "epoch": 1.5812943059684428, "grad_norm": 0.8034853339195251, "learning_rate": 7.728565289104207e-05, "loss": 0.1009147047996521, "memory(GiB)": 122.96, "step": 20745, "token_acc": 0.9587203302373581, "train_speed(iter/s)": 0.245654 }, { "epoch": 1.5816754325787028, "grad_norm": 0.8192576766014099, "learning_rate": 7.727561869531126e-05, "loss": 0.09572315812110901, "memory(GiB)": 122.96, "step": 20750, "token_acc": 0.9628647214854111, "train_speed(iter/s)": 0.245673 }, { "epoch": 1.5820565591889626, "grad_norm": 1.0575897693634033, "learning_rate": 7.726558293545029e-05, "loss": 0.14186527729034423, "memory(GiB)": 122.96, "step": 20755, "token_acc": 0.9397826474256191, "train_speed(iter/s)": 0.245679 }, { "epoch": 1.5824376857992224, "grad_norm": 1.1924842596054077, "learning_rate": 7.725554561203467e-05, "loss": 0.18795114755630493, "memory(GiB)": 122.96, "step": 20760, "token_acc": 0.9336392516766678, "train_speed(iter/s)": 0.24569 }, { "epoch": 1.5828188124094824, "grad_norm": 0.727869987487793, "learning_rate": 7.724550672563999e-05, "loss": 0.19045872688293458, "memory(GiB)": 122.96, "step": 20765, "token_acc": 0.9321734402609759, "train_speed(iter/s)": 0.245694 }, { "epoch": 1.5831999390197424, "grad_norm": 0.2743649482727051, "learning_rate": 7.723546627684193e-05, "loss": 0.09919785261154175, "memory(GiB)": 122.96, "step": 20770, "token_acc": 0.9596015495296071, "train_speed(iter/s)": 0.245697 }, { "epoch": 1.5835810656300024, "grad_norm": 1.0975489616394043, "learning_rate": 7.722542426621627e-05, "loss": 0.10497939586639404, "memory(GiB)": 122.96, "step": 20775, "token_acc": 0.9531308586426697, "train_speed(iter/s)": 0.245712 }, { "epoch": 1.5839621922402622, "grad_norm": 1.3196781873703003, "learning_rate": 7.721538069433887e-05, "loss": 0.16957426071166992, "memory(GiB)": 122.96, "step": 20780, "token_acc": 0.9370191447486134, "train_speed(iter/s)": 0.24572 }, { "epoch": 1.584343318850522, "grad_norm": 0.9964106678962708, "learning_rate": 7.720533556178568e-05, "loss": 0.1129598617553711, "memory(GiB)": 122.96, "step": 20785, "token_acc": 0.9455414674703804, "train_speed(iter/s)": 0.245733 }, { "epoch": 1.584724445460782, "grad_norm": 0.5133894085884094, "learning_rate": 7.719528886913274e-05, "loss": 0.08703320622444152, "memory(GiB)": 122.96, "step": 20790, "token_acc": 0.9678053204353083, "train_speed(iter/s)": 0.245736 }, { "epoch": 1.585105572071042, "grad_norm": 1.0179443359375, "learning_rate": 7.718524061695618e-05, "loss": 0.13606297969818115, "memory(GiB)": 122.96, "step": 20795, "token_acc": 0.940584478420747, "train_speed(iter/s)": 0.245748 }, { "epoch": 1.5854866986813019, "grad_norm": 0.5480346083641052, "learning_rate": 7.717519080583224e-05, "loss": 0.12875640392303467, "memory(GiB)": 122.96, "step": 20800, "token_acc": 0.9453843324751747, "train_speed(iter/s)": 0.245757 }, { "epoch": 1.5854866986813019, "eval_loss": 0.09927195310592651, "eval_runtime": 221.4724, "eval_samples_per_second": 2.393, "eval_steps_per_second": 2.393, "eval_token_acc": 0.9543702186615264, "step": 20800 }, { "epoch": 1.585867825291562, "grad_norm": 1.0272544622421265, "learning_rate": 7.71651394363372e-05, "loss": 0.13165522813796998, "memory(GiB)": 122.96, "step": 20805, "token_acc": 0.9542325446152727, "train_speed(iter/s)": 0.245124 }, { "epoch": 1.5862489519018217, "grad_norm": 1.2003084421157837, "learning_rate": 7.715508650904749e-05, "loss": 0.22343883514404297, "memory(GiB)": 122.96, "step": 20810, "token_acc": 0.9091894822445107, "train_speed(iter/s)": 0.245137 }, { "epoch": 1.5866300785120817, "grad_norm": 1.2003456354141235, "learning_rate": 7.714503202453958e-05, "loss": 0.16920464038848876, "memory(GiB)": 122.96, "step": 20815, "token_acc": 0.9418020108275329, "train_speed(iter/s)": 0.245148 }, { "epoch": 1.5870112051223417, "grad_norm": 1.6364136934280396, "learning_rate": 7.713497598339005e-05, "loss": 0.14751338958740234, "memory(GiB)": 122.96, "step": 20820, "token_acc": 0.9439986341130272, "train_speed(iter/s)": 0.245159 }, { "epoch": 1.5873923317326015, "grad_norm": 1.2855488061904907, "learning_rate": 7.712491838617557e-05, "loss": 0.10106512308120727, "memory(GiB)": 122.96, "step": 20825, "token_acc": 0.9555654299044657, "train_speed(iter/s)": 0.245173 }, { "epoch": 1.5877734583428615, "grad_norm": 1.134937047958374, "learning_rate": 7.71148592334729e-05, "loss": 0.1403339385986328, "memory(GiB)": 122.96, "step": 20830, "token_acc": 0.9502410468319559, "train_speed(iter/s)": 0.245179 }, { "epoch": 1.5881545849531213, "grad_norm": 0.7597388029098511, "learning_rate": 7.710479852585888e-05, "loss": 0.11871968507766724, "memory(GiB)": 122.96, "step": 20835, "token_acc": 0.9539335296592343, "train_speed(iter/s)": 0.245189 }, { "epoch": 1.5885357115633814, "grad_norm": 0.7001367211341858, "learning_rate": 7.709473626391044e-05, "loss": 0.08914567232131958, "memory(GiB)": 122.96, "step": 20840, "token_acc": 0.9583697743571803, "train_speed(iter/s)": 0.245197 }, { "epoch": 1.5889168381736414, "grad_norm": 0.8192274570465088, "learning_rate": 7.70846724482046e-05, "loss": 0.10207723379135132, "memory(GiB)": 122.96, "step": 20845, "token_acc": 0.9629365645046329, "train_speed(iter/s)": 0.24521 }, { "epoch": 1.5892979647839012, "grad_norm": 0.4913978576660156, "learning_rate": 7.707460707931851e-05, "loss": 0.11911356449127197, "memory(GiB)": 122.96, "step": 20850, "token_acc": 0.9532724505327245, "train_speed(iter/s)": 0.245218 }, { "epoch": 1.5896790913941612, "grad_norm": 0.5263283848762512, "learning_rate": 7.706454015782933e-05, "loss": 0.13822083473205565, "memory(GiB)": 122.96, "step": 20855, "token_acc": 0.950909780136467, "train_speed(iter/s)": 0.245225 }, { "epoch": 1.590060218004421, "grad_norm": 0.6441394686698914, "learning_rate": 7.705447168431437e-05, "loss": 0.09559541940689087, "memory(GiB)": 122.96, "step": 20860, "token_acc": 0.9586638830897704, "train_speed(iter/s)": 0.245237 }, { "epoch": 1.590441344614681, "grad_norm": 1.0356595516204834, "learning_rate": 7.7044401659351e-05, "loss": 0.08366692066192627, "memory(GiB)": 122.96, "step": 20865, "token_acc": 0.9692671394799054, "train_speed(iter/s)": 0.245255 }, { "epoch": 1.590822471224941, "grad_norm": 0.6904747486114502, "learning_rate": 7.703433008351671e-05, "loss": 0.13917274475097657, "memory(GiB)": 122.96, "step": 20870, "token_acc": 0.9461871281773931, "train_speed(iter/s)": 0.245258 }, { "epoch": 1.5912035978352008, "grad_norm": 0.8436818718910217, "learning_rate": 7.702425695738901e-05, "loss": 0.09327720403671265, "memory(GiB)": 122.96, "step": 20875, "token_acc": 0.9665311229277448, "train_speed(iter/s)": 0.245273 }, { "epoch": 1.5915847244454606, "grad_norm": 0.5542105436325073, "learning_rate": 7.701418228154562e-05, "loss": 0.09700791835784912, "memory(GiB)": 122.96, "step": 20880, "token_acc": 0.9582624826093677, "train_speed(iter/s)": 0.245276 }, { "epoch": 1.5919658510557206, "grad_norm": 1.124210238456726, "learning_rate": 7.700410605656423e-05, "loss": 0.14567749500274657, "memory(GiB)": 122.96, "step": 20885, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.24529 }, { "epoch": 1.5923469776659807, "grad_norm": 0.9775960445404053, "learning_rate": 7.699402828302267e-05, "loss": 0.12584342956542968, "memory(GiB)": 122.96, "step": 20890, "token_acc": 0.9486511627906977, "train_speed(iter/s)": 0.245298 }, { "epoch": 1.5927281042762407, "grad_norm": 0.47661611437797546, "learning_rate": 7.698394896149885e-05, "loss": 0.09772167801856994, "memory(GiB)": 122.96, "step": 20895, "token_acc": 0.9482407676650189, "train_speed(iter/s)": 0.245312 }, { "epoch": 1.5931092308865005, "grad_norm": 0.942423939704895, "learning_rate": 7.697386809257076e-05, "loss": 0.09949523210525513, "memory(GiB)": 122.96, "step": 20900, "token_acc": 0.9535673839184597, "train_speed(iter/s)": 0.24532 }, { "epoch": 1.5934903574967603, "grad_norm": 1.0262631177902222, "learning_rate": 7.696378567681655e-05, "loss": 0.12186434268951415, "memory(GiB)": 122.96, "step": 20905, "token_acc": 0.9564701226751088, "train_speed(iter/s)": 0.245329 }, { "epoch": 1.5938714841070203, "grad_norm": 0.9496902823448181, "learning_rate": 7.695370171481433e-05, "loss": 0.1473616600036621, "memory(GiB)": 122.96, "step": 20910, "token_acc": 0.9468249870934434, "train_speed(iter/s)": 0.245339 }, { "epoch": 1.5942526107172803, "grad_norm": 1.1006805896759033, "learning_rate": 7.69436162071424e-05, "loss": 0.13541456460952758, "memory(GiB)": 122.96, "step": 20915, "token_acc": 0.9422147509981088, "train_speed(iter/s)": 0.245349 }, { "epoch": 1.5946337373275403, "grad_norm": 0.7859416604042053, "learning_rate": 7.693352915437913e-05, "loss": 0.10025631189346314, "memory(GiB)": 122.96, "step": 20920, "token_acc": 0.9350356740518213, "train_speed(iter/s)": 0.245365 }, { "epoch": 1.5950148639378001, "grad_norm": 0.8449248671531677, "learning_rate": 7.692344055710293e-05, "loss": 0.13025113344192504, "memory(GiB)": 122.96, "step": 20925, "token_acc": 0.9537704918032787, "train_speed(iter/s)": 0.245375 }, { "epoch": 1.59539599054806, "grad_norm": 1.1783325672149658, "learning_rate": 7.691335041589236e-05, "loss": 0.11162058115005494, "memory(GiB)": 122.96, "step": 20930, "token_acc": 0.9654107473749228, "train_speed(iter/s)": 0.245382 }, { "epoch": 1.59577711715832, "grad_norm": 0.6656584143638611, "learning_rate": 7.690325873132604e-05, "loss": 0.1297250747680664, "memory(GiB)": 122.96, "step": 20935, "token_acc": 0.9530685920577617, "train_speed(iter/s)": 0.24538 }, { "epoch": 1.59615824376858, "grad_norm": 1.3154218196868896, "learning_rate": 7.68931655039827e-05, "loss": 0.13031408786773682, "memory(GiB)": 122.96, "step": 20940, "token_acc": 0.9463642908567315, "train_speed(iter/s)": 0.245396 }, { "epoch": 1.59653937037884, "grad_norm": 1.0300265550613403, "learning_rate": 7.688307073444108e-05, "loss": 0.08854464888572693, "memory(GiB)": 122.96, "step": 20945, "token_acc": 0.962640385056154, "train_speed(iter/s)": 0.245407 }, { "epoch": 1.5969204969890998, "grad_norm": 0.4914930760860443, "learning_rate": 7.687297442328011e-05, "loss": 0.12869787216186523, "memory(GiB)": 122.96, "step": 20950, "token_acc": 0.9597253155159614, "train_speed(iter/s)": 0.245418 }, { "epoch": 1.5973016235993596, "grad_norm": 0.8184056282043457, "learning_rate": 7.686287657107878e-05, "loss": 0.11275973320007324, "memory(GiB)": 122.96, "step": 20955, "token_acc": 0.9455723542116631, "train_speed(iter/s)": 0.245435 }, { "epoch": 1.5976827502096196, "grad_norm": 0.7182145714759827, "learning_rate": 7.685277717841613e-05, "loss": 0.1738657236099243, "memory(GiB)": 122.96, "step": 20960, "token_acc": 0.9275862068965517, "train_speed(iter/s)": 0.245449 }, { "epoch": 1.5980638768198796, "grad_norm": 1.1600788831710815, "learning_rate": 7.684267624587132e-05, "loss": 0.1301661491394043, "memory(GiB)": 122.96, "step": 20965, "token_acc": 0.9602868906893346, "train_speed(iter/s)": 0.245453 }, { "epoch": 1.5984450034301396, "grad_norm": 1.7993288040161133, "learning_rate": 7.68325737740236e-05, "loss": 0.10327138900756835, "memory(GiB)": 122.96, "step": 20970, "token_acc": 0.9644332262505737, "train_speed(iter/s)": 0.245467 }, { "epoch": 1.5988261300403994, "grad_norm": 0.6049738526344299, "learning_rate": 7.682246976345229e-05, "loss": 0.13669487237930297, "memory(GiB)": 122.96, "step": 20975, "token_acc": 0.9361179361179361, "train_speed(iter/s)": 0.245486 }, { "epoch": 1.5992072566506592, "grad_norm": 0.980158805847168, "learning_rate": 7.681236421473682e-05, "loss": 0.14166605472564697, "memory(GiB)": 122.96, "step": 20980, "token_acc": 0.9447611558875565, "train_speed(iter/s)": 0.2455 }, { "epoch": 1.5995883832609192, "grad_norm": 1.5580737590789795, "learning_rate": 7.680225712845666e-05, "loss": 0.13606619834899902, "memory(GiB)": 122.96, "step": 20985, "token_acc": 0.9390023261824761, "train_speed(iter/s)": 0.245513 }, { "epoch": 1.5999695098711793, "grad_norm": 0.9977888464927673, "learning_rate": 7.679214850519145e-05, "loss": 0.10841109752655029, "memory(GiB)": 122.96, "step": 20990, "token_acc": 0.9572671775555267, "train_speed(iter/s)": 0.245516 }, { "epoch": 1.6003506364814393, "grad_norm": 1.015363097190857, "learning_rate": 7.678203834552085e-05, "loss": 0.15654139518737792, "memory(GiB)": 122.96, "step": 20995, "token_acc": 0.9287598944591029, "train_speed(iter/s)": 0.245532 }, { "epoch": 1.600731763091699, "grad_norm": 1.2107189893722534, "learning_rate": 7.677192665002464e-05, "loss": 0.13278648853302003, "memory(GiB)": 122.96, "step": 21000, "token_acc": 0.9505084745762712, "train_speed(iter/s)": 0.245544 }, { "epoch": 1.600731763091699, "eval_loss": 0.10052026808261871, "eval_runtime": 223.829, "eval_samples_per_second": 2.368, "eval_steps_per_second": 2.368, "eval_token_acc": 0.9547316426721282, "step": 21000 }, { "epoch": 1.6011128897019589, "grad_norm": 1.122336745262146, "learning_rate": 7.676181341928266e-05, "loss": 0.14723405838012696, "memory(GiB)": 122.96, "step": 21005, "token_acc": 0.9542368947811204, "train_speed(iter/s)": 0.244907 }, { "epoch": 1.6014940163122189, "grad_norm": 0.7543533444404602, "learning_rate": 7.675169865387488e-05, "loss": 0.16861164569854736, "memory(GiB)": 122.96, "step": 21010, "token_acc": 0.9438870308435526, "train_speed(iter/s)": 0.244917 }, { "epoch": 1.601875142922479, "grad_norm": 0.7721652984619141, "learning_rate": 7.674158235438133e-05, "loss": 0.13729619979858398, "memory(GiB)": 122.96, "step": 21015, "token_acc": 0.9435674822415154, "train_speed(iter/s)": 0.244926 }, { "epoch": 1.602256269532739, "grad_norm": 1.132580280303955, "learning_rate": 7.673146452138212e-05, "loss": 0.15076131820678712, "memory(GiB)": 122.96, "step": 21020, "token_acc": 0.9414612676056338, "train_speed(iter/s)": 0.244936 }, { "epoch": 1.6026373961429987, "grad_norm": 0.991367518901825, "learning_rate": 7.672134515545746e-05, "loss": 0.14036763906478883, "memory(GiB)": 122.96, "step": 21025, "token_acc": 0.9468047126026419, "train_speed(iter/s)": 0.24494 }, { "epoch": 1.6030185227532585, "grad_norm": 0.4407329857349396, "learning_rate": 7.671122425718768e-05, "loss": 0.09789620041847229, "memory(GiB)": 122.96, "step": 21030, "token_acc": 0.9601235607975288, "train_speed(iter/s)": 0.244946 }, { "epoch": 1.6033996493635185, "grad_norm": 0.6290179491043091, "learning_rate": 7.670110182715312e-05, "loss": 0.11151365041732789, "memory(GiB)": 122.96, "step": 21035, "token_acc": 0.9493100944081336, "train_speed(iter/s)": 0.24495 }, { "epoch": 1.6037807759737785, "grad_norm": 1.0370190143585205, "learning_rate": 7.66909778659343e-05, "loss": 0.11002825498580933, "memory(GiB)": 122.96, "step": 21040, "token_acc": 0.9534064212999217, "train_speed(iter/s)": 0.244958 }, { "epoch": 1.6041619025840386, "grad_norm": 0.7112976908683777, "learning_rate": 7.668085237411175e-05, "loss": 0.12406430244445801, "memory(GiB)": 122.96, "step": 21045, "token_acc": 0.9523599470666079, "train_speed(iter/s)": 0.244965 }, { "epoch": 1.6045430291942984, "grad_norm": 0.5956043004989624, "learning_rate": 7.667072535226613e-05, "loss": 0.1274664282798767, "memory(GiB)": 122.96, "step": 21050, "token_acc": 0.9508543244671481, "train_speed(iter/s)": 0.244975 }, { "epoch": 1.6049241558045582, "grad_norm": 0.8544282913208008, "learning_rate": 7.666059680097819e-05, "loss": 0.12154606580734253, "memory(GiB)": 122.96, "step": 21055, "token_acc": 0.9459706959706959, "train_speed(iter/s)": 0.244977 }, { "epoch": 1.6053052824148182, "grad_norm": 0.5605180263519287, "learning_rate": 7.665046672082874e-05, "loss": 0.11159722805023194, "memory(GiB)": 122.96, "step": 21060, "token_acc": 0.9491415613864593, "train_speed(iter/s)": 0.244987 }, { "epoch": 1.6056864090250782, "grad_norm": 0.9872449040412903, "learning_rate": 7.664033511239868e-05, "loss": 0.11699113845825196, "memory(GiB)": 122.96, "step": 21065, "token_acc": 0.957613586877631, "train_speed(iter/s)": 0.244992 }, { "epoch": 1.6060675356353382, "grad_norm": 0.4923055171966553, "learning_rate": 7.663020197626905e-05, "loss": 0.14478111267089844, "memory(GiB)": 122.96, "step": 21070, "token_acc": 0.9429963459196102, "train_speed(iter/s)": 0.245007 }, { "epoch": 1.606448662245598, "grad_norm": 1.112716794013977, "learning_rate": 7.662006731302089e-05, "loss": 0.11040792465209961, "memory(GiB)": 122.96, "step": 21075, "token_acc": 0.9521674140508222, "train_speed(iter/s)": 0.245016 }, { "epoch": 1.6068297888558578, "grad_norm": 0.5777724385261536, "learning_rate": 7.660993112323542e-05, "loss": 0.150126051902771, "memory(GiB)": 122.96, "step": 21080, "token_acc": 0.9428783382789317, "train_speed(iter/s)": 0.245021 }, { "epoch": 1.6072109154661178, "grad_norm": 0.967249870300293, "learning_rate": 7.659979340749388e-05, "loss": 0.11430567502975464, "memory(GiB)": 122.96, "step": 21085, "token_acc": 0.9476036751924509, "train_speed(iter/s)": 0.245036 }, { "epoch": 1.6075920420763778, "grad_norm": 1.223449945449829, "learning_rate": 7.658965416637762e-05, "loss": 0.15256857872009277, "memory(GiB)": 122.96, "step": 21090, "token_acc": 0.9439760674462877, "train_speed(iter/s)": 0.245051 }, { "epoch": 1.6079731686866379, "grad_norm": 1.2616904973983765, "learning_rate": 7.65795134004681e-05, "loss": 0.16777535676956176, "memory(GiB)": 122.96, "step": 21095, "token_acc": 0.9315505057503117, "train_speed(iter/s)": 0.245049 }, { "epoch": 1.6083542952968977, "grad_norm": 1.0931214094161987, "learning_rate": 7.656937111034683e-05, "loss": 0.15341413021087646, "memory(GiB)": 122.96, "step": 21100, "token_acc": 0.9608837377761681, "train_speed(iter/s)": 0.245055 }, { "epoch": 1.6087354219071575, "grad_norm": 1.544132947921753, "learning_rate": 7.655922729659541e-05, "loss": 0.09863582253456116, "memory(GiB)": 122.96, "step": 21105, "token_acc": 0.9552074745809288, "train_speed(iter/s)": 0.245071 }, { "epoch": 1.6091165485174175, "grad_norm": 0.9167470335960388, "learning_rate": 7.654908195979556e-05, "loss": 0.12964112758636476, "memory(GiB)": 122.96, "step": 21110, "token_acc": 0.9624034911043975, "train_speed(iter/s)": 0.245084 }, { "epoch": 1.6094976751276775, "grad_norm": 1.9055020809173584, "learning_rate": 7.653893510052908e-05, "loss": 0.1473854660987854, "memory(GiB)": 122.96, "step": 21115, "token_acc": 0.9472944483485594, "train_speed(iter/s)": 0.245097 }, { "epoch": 1.6098788017379373, "grad_norm": 1.1501610279083252, "learning_rate": 7.65287867193778e-05, "loss": 0.14407827854156494, "memory(GiB)": 122.96, "step": 21120, "token_acc": 0.9469573706475757, "train_speed(iter/s)": 0.245103 }, { "epoch": 1.6102599283481973, "grad_norm": 0.4311082661151886, "learning_rate": 7.651863681692373e-05, "loss": 0.10268464088439941, "memory(GiB)": 122.96, "step": 21125, "token_acc": 0.9654559810501382, "train_speed(iter/s)": 0.245112 }, { "epoch": 1.610641054958457, "grad_norm": 0.804528534412384, "learning_rate": 7.650848539374888e-05, "loss": 0.13145983219146729, "memory(GiB)": 122.96, "step": 21130, "token_acc": 0.9493009277407553, "train_speed(iter/s)": 0.245117 }, { "epoch": 1.6110221815687171, "grad_norm": 0.5702223181724548, "learning_rate": 7.649833245043541e-05, "loss": 0.13181718587875366, "memory(GiB)": 122.96, "step": 21135, "token_acc": 0.9620296271071003, "train_speed(iter/s)": 0.245124 }, { "epoch": 1.6114033081789771, "grad_norm": 0.8335277438163757, "learning_rate": 7.648817798756555e-05, "loss": 0.13570324182510377, "memory(GiB)": 122.96, "step": 21140, "token_acc": 0.9595505617977528, "train_speed(iter/s)": 0.245133 }, { "epoch": 1.611784434789237, "grad_norm": 0.7673050761222839, "learning_rate": 7.647802200572161e-05, "loss": 0.1288021445274353, "memory(GiB)": 122.96, "step": 21145, "token_acc": 0.952629745733194, "train_speed(iter/s)": 0.245147 }, { "epoch": 1.612165561399497, "grad_norm": 1.2397074699401855, "learning_rate": 7.646786450548598e-05, "loss": 0.1488107681274414, "memory(GiB)": 122.96, "step": 21150, "token_acc": 0.9431363838294091, "train_speed(iter/s)": 0.245165 }, { "epoch": 1.6125466880097568, "grad_norm": 0.6371222138404846, "learning_rate": 7.645770548744113e-05, "loss": 0.10398995876312256, "memory(GiB)": 122.96, "step": 21155, "token_acc": 0.9560029054685067, "train_speed(iter/s)": 0.245159 }, { "epoch": 1.6129278146200168, "grad_norm": 0.7567844986915588, "learning_rate": 7.644754495216966e-05, "loss": 0.14142324924468994, "memory(GiB)": 122.96, "step": 21160, "token_acc": 0.9477182896155228, "train_speed(iter/s)": 0.24517 }, { "epoch": 1.6133089412302768, "grad_norm": 0.5750652551651001, "learning_rate": 7.643738290025421e-05, "loss": 0.13432736396789552, "memory(GiB)": 122.96, "step": 21165, "token_acc": 0.9485326008385138, "train_speed(iter/s)": 0.245172 }, { "epoch": 1.6136900678405366, "grad_norm": 0.7844276428222656, "learning_rate": 7.642721933227754e-05, "loss": 0.11795873641967773, "memory(GiB)": 122.96, "step": 21170, "token_acc": 0.9554448595544486, "train_speed(iter/s)": 0.245175 }, { "epoch": 1.6140711944507966, "grad_norm": 0.5134640336036682, "learning_rate": 7.64170542488225e-05, "loss": 0.13315727710723876, "memory(GiB)": 122.96, "step": 21175, "token_acc": 0.9443011861784425, "train_speed(iter/s)": 0.245183 }, { "epoch": 1.6144523210610564, "grad_norm": 0.8794071078300476, "learning_rate": 7.640688765047197e-05, "loss": 0.07707971334457397, "memory(GiB)": 122.96, "step": 21180, "token_acc": 0.9681479578731056, "train_speed(iter/s)": 0.245197 }, { "epoch": 1.6148334476713164, "grad_norm": 1.7058745622634888, "learning_rate": 7.639671953780898e-05, "loss": 0.14568166732788085, "memory(GiB)": 122.96, "step": 21185, "token_acc": 0.9473479696910821, "train_speed(iter/s)": 0.245206 }, { "epoch": 1.6152145742815764, "grad_norm": 0.9609273076057434, "learning_rate": 7.638654991141661e-05, "loss": 0.11351219415664673, "memory(GiB)": 122.96, "step": 21190, "token_acc": 0.9492322932144626, "train_speed(iter/s)": 0.245217 }, { "epoch": 1.6155957008918362, "grad_norm": 1.1888633966445923, "learning_rate": 7.637637877187807e-05, "loss": 0.1600504159927368, "memory(GiB)": 122.96, "step": 21195, "token_acc": 0.944389471601029, "train_speed(iter/s)": 0.245225 }, { "epoch": 1.615976827502096, "grad_norm": 0.7389683723449707, "learning_rate": 7.636620611977658e-05, "loss": 0.09423256516456605, "memory(GiB)": 122.96, "step": 21200, "token_acc": 0.9624149659863945, "train_speed(iter/s)": 0.245232 }, { "epoch": 1.615976827502096, "eval_loss": 0.09900600463151932, "eval_runtime": 220.2684, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "eval_token_acc": 0.9547542316727908, "step": 21200 }, { "epoch": 1.616357954112356, "grad_norm": 1.3629077672958374, "learning_rate": 7.635603195569553e-05, "loss": 0.1286097764968872, "memory(GiB)": 122.96, "step": 21205, "token_acc": 0.954643212055591, "train_speed(iter/s)": 0.244627 }, { "epoch": 1.616739080722616, "grad_norm": 1.0419262647628784, "learning_rate": 7.634585628021838e-05, "loss": 0.11296499967575073, "memory(GiB)": 122.96, "step": 21210, "token_acc": 0.9527116402116402, "train_speed(iter/s)": 0.244632 }, { "epoch": 1.617120207332876, "grad_norm": 0.49775269627571106, "learning_rate": 7.633567909392861e-05, "loss": 0.10281252861022949, "memory(GiB)": 122.96, "step": 21215, "token_acc": 0.9698397737983034, "train_speed(iter/s)": 0.244642 }, { "epoch": 1.6175013339431359, "grad_norm": 0.5361588001251221, "learning_rate": 7.632550039740987e-05, "loss": 0.09547773599624634, "memory(GiB)": 122.96, "step": 21220, "token_acc": 0.9616613418530351, "train_speed(iter/s)": 0.244645 }, { "epoch": 1.6178824605533957, "grad_norm": 0.8589800596237183, "learning_rate": 7.631532019124584e-05, "loss": 0.12309856414794922, "memory(GiB)": 122.96, "step": 21225, "token_acc": 0.9599839131309069, "train_speed(iter/s)": 0.244656 }, { "epoch": 1.6182635871636557, "grad_norm": 0.8987138271331787, "learning_rate": 7.63051384760203e-05, "loss": 0.11287986040115357, "memory(GiB)": 122.96, "step": 21230, "token_acc": 0.9579905992949471, "train_speed(iter/s)": 0.244658 }, { "epoch": 1.6186447137739157, "grad_norm": 0.5775141716003418, "learning_rate": 7.629495525231717e-05, "loss": 0.09903315901756286, "memory(GiB)": 122.96, "step": 21235, "token_acc": 0.9571865443425076, "train_speed(iter/s)": 0.244667 }, { "epoch": 1.6190258403841757, "grad_norm": 0.7958876490592957, "learning_rate": 7.628477052072037e-05, "loss": 0.095722496509552, "memory(GiB)": 122.96, "step": 21240, "token_acc": 0.9584086799276673, "train_speed(iter/s)": 0.244684 }, { "epoch": 1.6194069669944355, "grad_norm": 1.7569881677627563, "learning_rate": 7.627458428181394e-05, "loss": 0.07642971277236939, "memory(GiB)": 122.96, "step": 21245, "token_acc": 0.9665877400684031, "train_speed(iter/s)": 0.244701 }, { "epoch": 1.6197880936046953, "grad_norm": 1.4220664501190186, "learning_rate": 7.626439653618205e-05, "loss": 0.12150869369506836, "memory(GiB)": 122.96, "step": 21250, "token_acc": 0.9541530788413581, "train_speed(iter/s)": 0.24471 }, { "epoch": 1.6201692202149554, "grad_norm": 1.1206270456314087, "learning_rate": 7.625420728440888e-05, "loss": 0.14630979299545288, "memory(GiB)": 122.96, "step": 21255, "token_acc": 0.9336047604134043, "train_speed(iter/s)": 0.244727 }, { "epoch": 1.6205503468252154, "grad_norm": 0.7649474740028381, "learning_rate": 7.624401652707877e-05, "loss": 0.11858011484146118, "memory(GiB)": 122.96, "step": 21260, "token_acc": 0.9568916619398752, "train_speed(iter/s)": 0.24474 }, { "epoch": 1.6209314734354754, "grad_norm": 0.6453062891960144, "learning_rate": 7.62338242647761e-05, "loss": 0.14061179161071777, "memory(GiB)": 122.96, "step": 21265, "token_acc": 0.9502454991816693, "train_speed(iter/s)": 0.244755 }, { "epoch": 1.6213126000457352, "grad_norm": 0.5856008529663086, "learning_rate": 7.622363049808535e-05, "loss": 0.0705721616744995, "memory(GiB)": 122.96, "step": 21270, "token_acc": 0.9676514584891548, "train_speed(iter/s)": 0.244761 }, { "epoch": 1.621693726655995, "grad_norm": 0.8195800185203552, "learning_rate": 7.621343522759106e-05, "loss": 0.13694562911987304, "memory(GiB)": 122.96, "step": 21275, "token_acc": 0.9542657575253617, "train_speed(iter/s)": 0.24477 }, { "epoch": 1.622074853266255, "grad_norm": 1.1551761627197266, "learning_rate": 7.620323845387793e-05, "loss": 0.1722763419151306, "memory(GiB)": 122.96, "step": 21280, "token_acc": 0.9428515318146111, "train_speed(iter/s)": 0.244781 }, { "epoch": 1.622455979876515, "grad_norm": 0.6181943416595459, "learning_rate": 7.619304017753069e-05, "loss": 0.12748149633407593, "memory(GiB)": 122.96, "step": 21285, "token_acc": 0.9546363409147713, "train_speed(iter/s)": 0.244779 }, { "epoch": 1.622837106486775, "grad_norm": 0.7684677243232727, "learning_rate": 7.618284039913411e-05, "loss": 0.1301543354988098, "memory(GiB)": 122.96, "step": 21290, "token_acc": 0.9524568393094289, "train_speed(iter/s)": 0.244777 }, { "epoch": 1.6232182330970348, "grad_norm": 0.8061923980712891, "learning_rate": 7.617263911927315e-05, "loss": 0.10601390600204467, "memory(GiB)": 122.96, "step": 21295, "token_acc": 0.9584499461786867, "train_speed(iter/s)": 0.244789 }, { "epoch": 1.6235993597072946, "grad_norm": 0.9281865954399109, "learning_rate": 7.616243633853279e-05, "loss": 0.10014996528625489, "memory(GiB)": 122.96, "step": 21300, "token_acc": 0.9649978786593126, "train_speed(iter/s)": 0.244799 }, { "epoch": 1.6239804863175547, "grad_norm": 0.7492789626121521, "learning_rate": 7.615223205749812e-05, "loss": 0.10963995456695556, "memory(GiB)": 122.96, "step": 21305, "token_acc": 0.9483484814896919, "train_speed(iter/s)": 0.244811 }, { "epoch": 1.6243616129278147, "grad_norm": 1.1577093601226807, "learning_rate": 7.614202627675428e-05, "loss": 0.1461019515991211, "memory(GiB)": 122.96, "step": 21310, "token_acc": 0.9537982349887524, "train_speed(iter/s)": 0.244816 }, { "epoch": 1.6247427395380747, "grad_norm": 1.3702080249786377, "learning_rate": 7.613181899688658e-05, "loss": 0.16041842699050904, "memory(GiB)": 122.96, "step": 21315, "token_acc": 0.9365603226989365, "train_speed(iter/s)": 0.244826 }, { "epoch": 1.6251238661483345, "grad_norm": 0.6809081435203552, "learning_rate": 7.612161021848032e-05, "loss": 0.12530640363693238, "memory(GiB)": 122.96, "step": 21320, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.244838 }, { "epoch": 1.6255049927585943, "grad_norm": 0.9984610080718994, "learning_rate": 7.611139994212092e-05, "loss": 0.113567054271698, "memory(GiB)": 122.96, "step": 21325, "token_acc": 0.9479539883085046, "train_speed(iter/s)": 0.244847 }, { "epoch": 1.6258861193688543, "grad_norm": 3.0776031017303467, "learning_rate": 7.610118816839391e-05, "loss": 0.11254376173019409, "memory(GiB)": 122.96, "step": 21330, "token_acc": 0.9586170212765958, "train_speed(iter/s)": 0.244844 }, { "epoch": 1.6262672459791143, "grad_norm": 1.1202110052108765, "learning_rate": 7.60909748978849e-05, "loss": 0.10457342863082886, "memory(GiB)": 122.96, "step": 21335, "token_acc": 0.9453386988598256, "train_speed(iter/s)": 0.244851 }, { "epoch": 1.6266483725893743, "grad_norm": 1.0235071182250977, "learning_rate": 7.608076013117953e-05, "loss": 0.14262826442718507, "memory(GiB)": 122.96, "step": 21340, "token_acc": 0.9418409403144558, "train_speed(iter/s)": 0.244858 }, { "epoch": 1.6270294991996341, "grad_norm": 0.7261878848075867, "learning_rate": 7.60705438688636e-05, "loss": 0.1276843786239624, "memory(GiB)": 122.96, "step": 21345, "token_acc": 0.9607666580910728, "train_speed(iter/s)": 0.244865 }, { "epoch": 1.627410625809894, "grad_norm": 0.9381394982337952, "learning_rate": 7.606032611152296e-05, "loss": 0.08540889024734497, "memory(GiB)": 122.96, "step": 21350, "token_acc": 0.9670465807730426, "train_speed(iter/s)": 0.244879 }, { "epoch": 1.627791752420154, "grad_norm": 1.2409436702728271, "learning_rate": 7.605010685974357e-05, "loss": 0.10020071268081665, "memory(GiB)": 122.96, "step": 21355, "token_acc": 0.9513274336283186, "train_speed(iter/s)": 0.244889 }, { "epoch": 1.628172879030414, "grad_norm": 0.8086562752723694, "learning_rate": 7.603988611411142e-05, "loss": 0.16432666778564453, "memory(GiB)": 122.96, "step": 21360, "token_acc": 0.9372227908563631, "train_speed(iter/s)": 0.244904 }, { "epoch": 1.628554005640674, "grad_norm": 1.0228774547576904, "learning_rate": 7.602966387521266e-05, "loss": 0.171173095703125, "memory(GiB)": 122.96, "step": 21365, "token_acc": 0.929341105811694, "train_speed(iter/s)": 0.244913 }, { "epoch": 1.6289351322509338, "grad_norm": 0.8838344216346741, "learning_rate": 7.601944014363346e-05, "loss": 0.10213818550109863, "memory(GiB)": 122.96, "step": 21370, "token_acc": 0.9609137055837563, "train_speed(iter/s)": 0.244922 }, { "epoch": 1.6293162588611936, "grad_norm": 0.9852144718170166, "learning_rate": 7.600921491996011e-05, "loss": 0.1176137924194336, "memory(GiB)": 122.96, "step": 21375, "token_acc": 0.9523202911737944, "train_speed(iter/s)": 0.244931 }, { "epoch": 1.6296973854714536, "grad_norm": 1.081342101097107, "learning_rate": 7.599898820477898e-05, "loss": 0.12937740087509156, "memory(GiB)": 122.96, "step": 21380, "token_acc": 0.952491516342204, "train_speed(iter/s)": 0.244939 }, { "epoch": 1.6300785120817136, "grad_norm": 1.3874026536941528, "learning_rate": 7.598875999867655e-05, "loss": 0.13756983280181884, "memory(GiB)": 122.96, "step": 21385, "token_acc": 0.9450343535290443, "train_speed(iter/s)": 0.244953 }, { "epoch": 1.6304596386919736, "grad_norm": 0.4755589962005615, "learning_rate": 7.59785303022393e-05, "loss": 0.13523530960083008, "memory(GiB)": 122.96, "step": 21390, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.244956 }, { "epoch": 1.6308407653022334, "grad_norm": 0.8706849217414856, "learning_rate": 7.59682991160539e-05, "loss": 0.14328677654266359, "memory(GiB)": 122.96, "step": 21395, "token_acc": 0.9475465313028765, "train_speed(iter/s)": 0.244968 }, { "epoch": 1.6312218919124932, "grad_norm": 0.7776228785514832, "learning_rate": 7.595806644070707e-05, "loss": 0.12998690605163574, "memory(GiB)": 122.96, "step": 21400, "token_acc": 0.9417013682331945, "train_speed(iter/s)": 0.244982 }, { "epoch": 1.6312218919124932, "eval_loss": 0.10011614114046097, "eval_runtime": 182.1958, "eval_samples_per_second": 2.909, "eval_steps_per_second": 2.909, "eval_token_acc": 0.9545434009999397, "step": 21400 }, { "epoch": 1.6316030185227532, "grad_norm": 0.8067070245742798, "learning_rate": 7.594783227678559e-05, "loss": 0.1630992293357849, "memory(GiB)": 122.96, "step": 21405, "token_acc": 0.9542650776453593, "train_speed(iter/s)": 0.244477 }, { "epoch": 1.6319841451330133, "grad_norm": 0.8528085947036743, "learning_rate": 7.593759662487632e-05, "loss": 0.12093262672424317, "memory(GiB)": 122.96, "step": 21410, "token_acc": 0.9493136219640972, "train_speed(iter/s)": 0.244493 }, { "epoch": 1.632365271743273, "grad_norm": 1.3266654014587402, "learning_rate": 7.592735948556627e-05, "loss": 0.14022598266601563, "memory(GiB)": 122.96, "step": 21415, "token_acc": 0.9422850412249706, "train_speed(iter/s)": 0.244509 }, { "epoch": 1.632746398353533, "grad_norm": 0.6950247287750244, "learning_rate": 7.591712085944246e-05, "loss": 0.11454125642776489, "memory(GiB)": 122.96, "step": 21420, "token_acc": 0.9466274752475248, "train_speed(iter/s)": 0.244517 }, { "epoch": 1.6331275249637929, "grad_norm": 0.5531617999076843, "learning_rate": 7.590688074709204e-05, "loss": 0.10621529817581177, "memory(GiB)": 122.96, "step": 21425, "token_acc": 0.9522106631989596, "train_speed(iter/s)": 0.244522 }, { "epoch": 1.633508651574053, "grad_norm": 0.8101058602333069, "learning_rate": 7.589663914910224e-05, "loss": 0.15404900312423705, "memory(GiB)": 122.96, "step": 21430, "token_acc": 0.9402173913043478, "train_speed(iter/s)": 0.244529 }, { "epoch": 1.633889778184313, "grad_norm": 1.553690791130066, "learning_rate": 7.588639606606034e-05, "loss": 0.1705829381942749, "memory(GiB)": 122.96, "step": 21435, "token_acc": 0.9309806371018113, "train_speed(iter/s)": 0.244542 }, { "epoch": 1.6342709047945727, "grad_norm": 0.7113982439041138, "learning_rate": 7.587615149855378e-05, "loss": 0.0901265025138855, "memory(GiB)": 122.96, "step": 21440, "token_acc": 0.9694537923893347, "train_speed(iter/s)": 0.244554 }, { "epoch": 1.6346520314048327, "grad_norm": 0.9756497144699097, "learning_rate": 7.586590544716999e-05, "loss": 0.16844894886016845, "memory(GiB)": 122.96, "step": 21445, "token_acc": 0.9398359161349134, "train_speed(iter/s)": 0.244562 }, { "epoch": 1.6350331580150925, "grad_norm": 1.076591968536377, "learning_rate": 7.585565791249657e-05, "loss": 0.15703319311141967, "memory(GiB)": 122.96, "step": 21450, "token_acc": 0.9385176184690158, "train_speed(iter/s)": 0.244573 }, { "epoch": 1.6354142846253525, "grad_norm": 0.6599459648132324, "learning_rate": 7.584540889512115e-05, "loss": 0.11745790243148804, "memory(GiB)": 122.96, "step": 21455, "token_acc": 0.9555316863587541, "train_speed(iter/s)": 0.244584 }, { "epoch": 1.6357954112356126, "grad_norm": 0.6298982501029968, "learning_rate": 7.583515839563147e-05, "loss": 0.14805114269256592, "memory(GiB)": 122.96, "step": 21460, "token_acc": 0.9355504587155963, "train_speed(iter/s)": 0.244594 }, { "epoch": 1.6361765378458724, "grad_norm": 0.8138508796691895, "learning_rate": 7.582490641461533e-05, "loss": 0.07657910585403442, "memory(GiB)": 122.96, "step": 21465, "token_acc": 0.9584728734092431, "train_speed(iter/s)": 0.244603 }, { "epoch": 1.6365576644561324, "grad_norm": 1.0701544284820557, "learning_rate": 7.581465295266065e-05, "loss": 0.13010581731796264, "memory(GiB)": 122.96, "step": 21470, "token_acc": 0.9565374864179645, "train_speed(iter/s)": 0.244612 }, { "epoch": 1.6369387910663922, "grad_norm": 0.7332397699356079, "learning_rate": 7.580439801035544e-05, "loss": 0.11193017959594727, "memory(GiB)": 122.96, "step": 21475, "token_acc": 0.9555628165332462, "train_speed(iter/s)": 0.244615 }, { "epoch": 1.6373199176766522, "grad_norm": 0.8801243901252747, "learning_rate": 7.579414158828774e-05, "loss": 0.13390134572982787, "memory(GiB)": 122.96, "step": 21480, "token_acc": 0.948900651465798, "train_speed(iter/s)": 0.244624 }, { "epoch": 1.6377010442869122, "grad_norm": 1.5497931241989136, "learning_rate": 7.578388368704571e-05, "loss": 0.1721155524253845, "memory(GiB)": 122.96, "step": 21485, "token_acc": 0.9395017793594306, "train_speed(iter/s)": 0.244632 }, { "epoch": 1.638082170897172, "grad_norm": 0.5797527432441711, "learning_rate": 7.577362430721761e-05, "loss": 0.08758810758590699, "memory(GiB)": 122.96, "step": 21490, "token_acc": 0.9654958677685951, "train_speed(iter/s)": 0.244636 }, { "epoch": 1.6384632975074318, "grad_norm": 1.1434603929519653, "learning_rate": 7.576336344939177e-05, "loss": 0.215606164932251, "memory(GiB)": 122.96, "step": 21495, "token_acc": 0.9304247231862502, "train_speed(iter/s)": 0.244643 }, { "epoch": 1.6388444241176918, "grad_norm": 0.7495840191841125, "learning_rate": 7.575310111415656e-05, "loss": 0.1450747013092041, "memory(GiB)": 122.96, "step": 21500, "token_acc": 0.9501011463250169, "train_speed(iter/s)": 0.244649 }, { "epoch": 1.6392255507279518, "grad_norm": 0.3513880968093872, "learning_rate": 7.574283730210055e-05, "loss": 0.12653601169586182, "memory(GiB)": 122.96, "step": 21505, "token_acc": 0.9543973941368078, "train_speed(iter/s)": 0.244659 }, { "epoch": 1.6396066773382119, "grad_norm": 0.6488733887672424, "learning_rate": 7.573257201381228e-05, "loss": 0.10124579668045045, "memory(GiB)": 122.96, "step": 21510, "token_acc": 0.9590661976335145, "train_speed(iter/s)": 0.244674 }, { "epoch": 1.6399878039484717, "grad_norm": 1.209588646888733, "learning_rate": 7.572230524988039e-05, "loss": 0.10848412513732911, "memory(GiB)": 122.96, "step": 21515, "token_acc": 0.9616678858814923, "train_speed(iter/s)": 0.244676 }, { "epoch": 1.6403689305587315, "grad_norm": 1.6341767311096191, "learning_rate": 7.571203701089368e-05, "loss": 0.14244494438171387, "memory(GiB)": 122.96, "step": 21520, "token_acc": 0.9506195672276678, "train_speed(iter/s)": 0.244685 }, { "epoch": 1.6407500571689915, "grad_norm": 1.3060890436172485, "learning_rate": 7.570176729744096e-05, "loss": 0.14231444597244264, "memory(GiB)": 122.96, "step": 21525, "token_acc": 0.9588439306358382, "train_speed(iter/s)": 0.244685 }, { "epoch": 1.6411311837792515, "grad_norm": 1.2117254734039307, "learning_rate": 7.569149611011115e-05, "loss": 0.12864036560058595, "memory(GiB)": 122.96, "step": 21530, "token_acc": 0.9534450651769087, "train_speed(iter/s)": 0.2447 }, { "epoch": 1.6415123103895115, "grad_norm": 0.8329848051071167, "learning_rate": 7.568122344949327e-05, "loss": 0.11292275190353393, "memory(GiB)": 122.96, "step": 21535, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.244711 }, { "epoch": 1.6418934369997713, "grad_norm": 0.864229679107666, "learning_rate": 7.56709493161764e-05, "loss": 0.10160844326019287, "memory(GiB)": 122.96, "step": 21540, "token_acc": 0.9539877300613497, "train_speed(iter/s)": 0.244727 }, { "epoch": 1.642274563610031, "grad_norm": 1.9140585660934448, "learning_rate": 7.566067371074971e-05, "loss": 0.14728922843933107, "memory(GiB)": 122.96, "step": 21545, "token_acc": 0.9496607572773036, "train_speed(iter/s)": 0.24474 }, { "epoch": 1.6426556902202911, "grad_norm": 1.5366543531417847, "learning_rate": 7.565039663380246e-05, "loss": 0.145271897315979, "memory(GiB)": 122.96, "step": 21550, "token_acc": 0.9537619699042408, "train_speed(iter/s)": 0.244753 }, { "epoch": 1.6430368168305511, "grad_norm": 0.6745632886886597, "learning_rate": 7.564011808592399e-05, "loss": 0.09180662631988526, "memory(GiB)": 122.96, "step": 21555, "token_acc": 0.9668346610307267, "train_speed(iter/s)": 0.244763 }, { "epoch": 1.6434179434408112, "grad_norm": 0.6236698031425476, "learning_rate": 7.562983806770377e-05, "loss": 0.09430302381515503, "memory(GiB)": 122.96, "step": 21560, "token_acc": 0.9663710273466372, "train_speed(iter/s)": 0.244766 }, { "epoch": 1.643799070051071, "grad_norm": 1.2128705978393555, "learning_rate": 7.561955657973123e-05, "loss": 0.17546656131744384, "memory(GiB)": 122.96, "step": 21565, "token_acc": 0.9476035743298131, "train_speed(iter/s)": 0.244784 }, { "epoch": 1.6441801966613308, "grad_norm": 0.9031055569648743, "learning_rate": 7.560927362259602e-05, "loss": 0.11155133247375489, "memory(GiB)": 122.96, "step": 21570, "token_acc": 0.9590658619784833, "train_speed(iter/s)": 0.244795 }, { "epoch": 1.6445613232715908, "grad_norm": 1.6655750274658203, "learning_rate": 7.559898919688783e-05, "loss": 0.09753894805908203, "memory(GiB)": 122.96, "step": 21575, "token_acc": 0.9537170263788969, "train_speed(iter/s)": 0.244807 }, { "epoch": 1.6449424498818508, "grad_norm": 1.46859610080719, "learning_rate": 7.558870330319638e-05, "loss": 0.09476051330566407, "memory(GiB)": 122.96, "step": 21580, "token_acc": 0.9632066276803118, "train_speed(iter/s)": 0.244819 }, { "epoch": 1.6453235764921108, "grad_norm": 0.9113039970397949, "learning_rate": 7.557841594211156e-05, "loss": 0.13042519092559815, "memory(GiB)": 122.96, "step": 21585, "token_acc": 0.952893436838391, "train_speed(iter/s)": 0.244826 }, { "epoch": 1.6457047031023706, "grad_norm": 1.0898996591567993, "learning_rate": 7.556812711422327e-05, "loss": 0.12430645227432251, "memory(GiB)": 122.96, "step": 21590, "token_acc": 0.9559812512736906, "train_speed(iter/s)": 0.244838 }, { "epoch": 1.6460858297126304, "grad_norm": 0.7254810929298401, "learning_rate": 7.555783682012155e-05, "loss": 0.12870981693267822, "memory(GiB)": 122.96, "step": 21595, "token_acc": 0.9497016197783461, "train_speed(iter/s)": 0.244848 }, { "epoch": 1.6464669563228904, "grad_norm": 0.932910680770874, "learning_rate": 7.554754506039649e-05, "loss": 0.13001458644866942, "memory(GiB)": 122.96, "step": 21600, "token_acc": 0.9569555302166477, "train_speed(iter/s)": 0.244863 }, { "epoch": 1.6464669563228904, "eval_loss": 0.09887096285820007, "eval_runtime": 207.0601, "eval_samples_per_second": 2.56, "eval_steps_per_second": 2.56, "eval_token_acc": 0.954588579001265, "step": 21600 }, { "epoch": 1.6468480829331504, "grad_norm": 1.1083868741989136, "learning_rate": 7.553725183563827e-05, "loss": 0.12438379526138306, "memory(GiB)": 122.96, "step": 21605, "token_acc": 0.9543340841386325, "train_speed(iter/s)": 0.244302 }, { "epoch": 1.6472292095434105, "grad_norm": 0.5768564343452454, "learning_rate": 7.552695714643716e-05, "loss": 0.1377341032028198, "memory(GiB)": 122.96, "step": 21610, "token_acc": 0.9521018015441808, "train_speed(iter/s)": 0.244309 }, { "epoch": 1.6476103361536703, "grad_norm": 0.38703370094299316, "learning_rate": 7.551666099338352e-05, "loss": 0.09801957607269288, "memory(GiB)": 122.96, "step": 21615, "token_acc": 0.9567879006121714, "train_speed(iter/s)": 0.244325 }, { "epoch": 1.64799146276393, "grad_norm": 0.6239352226257324, "learning_rate": 7.55063633770678e-05, "loss": 0.13844904899597169, "memory(GiB)": 122.96, "step": 21620, "token_acc": 0.9550858652575958, "train_speed(iter/s)": 0.244333 }, { "epoch": 1.64837258937419, "grad_norm": 1.1297690868377686, "learning_rate": 7.549606429808049e-05, "loss": 0.1955648183822632, "memory(GiB)": 122.96, "step": 21625, "token_acc": 0.9281226903178122, "train_speed(iter/s)": 0.244343 }, { "epoch": 1.64875371598445, "grad_norm": 0.5057218670845032, "learning_rate": 7.548576375701222e-05, "loss": 0.12583799362182618, "memory(GiB)": 122.96, "step": 21630, "token_acc": 0.9550173010380623, "train_speed(iter/s)": 0.24435 }, { "epoch": 1.64913484259471, "grad_norm": 0.742306113243103, "learning_rate": 7.547546175445363e-05, "loss": 0.1303316593170166, "memory(GiB)": 122.96, "step": 21635, "token_acc": 0.947278589038498, "train_speed(iter/s)": 0.244356 }, { "epoch": 1.64951596920497, "grad_norm": 0.7356579899787903, "learning_rate": 7.546515829099557e-05, "loss": 0.12348310947418213, "memory(GiB)": 122.96, "step": 21640, "token_acc": 0.9536571747627024, "train_speed(iter/s)": 0.244363 }, { "epoch": 1.6498970958152297, "grad_norm": 1.1050792932510376, "learning_rate": 7.545485336722884e-05, "loss": 0.1148659348487854, "memory(GiB)": 122.96, "step": 21645, "token_acc": 0.9590163934426229, "train_speed(iter/s)": 0.244377 }, { "epoch": 1.6502782224254897, "grad_norm": 1.558332920074463, "learning_rate": 7.54445469837444e-05, "loss": 0.15727694034576417, "memory(GiB)": 122.96, "step": 21650, "token_acc": 0.94533702677747, "train_speed(iter/s)": 0.244383 }, { "epoch": 1.6506593490357497, "grad_norm": 1.296372890472412, "learning_rate": 7.543423914113326e-05, "loss": 0.12738709449768065, "memory(GiB)": 122.96, "step": 21655, "token_acc": 0.9526274915868496, "train_speed(iter/s)": 0.244397 }, { "epoch": 1.6510404756460098, "grad_norm": 1.2693886756896973, "learning_rate": 7.542392983998654e-05, "loss": 0.1318354368209839, "memory(GiB)": 122.96, "step": 21660, "token_acc": 0.9641273679967755, "train_speed(iter/s)": 0.244405 }, { "epoch": 1.6514216022562695, "grad_norm": 0.5409135818481445, "learning_rate": 7.541361908089544e-05, "loss": 0.13069435358047485, "memory(GiB)": 122.96, "step": 21665, "token_acc": 0.9529502506748939, "train_speed(iter/s)": 0.244413 }, { "epoch": 1.6518027288665293, "grad_norm": 1.3257619142532349, "learning_rate": 7.540330686445119e-05, "loss": 0.1214802622795105, "memory(GiB)": 122.96, "step": 21670, "token_acc": 0.9369455006337135, "train_speed(iter/s)": 0.244427 }, { "epoch": 1.6521838554767894, "grad_norm": 0.9389927983283997, "learning_rate": 7.53929931912452e-05, "loss": 0.10654599666595459, "memory(GiB)": 122.96, "step": 21675, "token_acc": 0.9618723404255319, "train_speed(iter/s)": 0.244432 }, { "epoch": 1.6525649820870494, "grad_norm": 0.710505485534668, "learning_rate": 7.538267806186888e-05, "loss": 0.11157717704772949, "memory(GiB)": 122.96, "step": 21680, "token_acc": 0.9517730496453901, "train_speed(iter/s)": 0.244438 }, { "epoch": 1.6529461086973094, "grad_norm": 0.6849519610404968, "learning_rate": 7.537236147691376e-05, "loss": 0.09394598603248597, "memory(GiB)": 122.96, "step": 21685, "token_acc": 0.9669509594882729, "train_speed(iter/s)": 0.244447 }, { "epoch": 1.6533272353075692, "grad_norm": 0.744071364402771, "learning_rate": 7.536204343697144e-05, "loss": 0.12443406581878662, "memory(GiB)": 122.96, "step": 21690, "token_acc": 0.9564116985376828, "train_speed(iter/s)": 0.244463 }, { "epoch": 1.653708361917829, "grad_norm": 1.4571157693862915, "learning_rate": 7.535172394263363e-05, "loss": 0.18266754150390624, "memory(GiB)": 122.96, "step": 21695, "token_acc": 0.9325933946940985, "train_speed(iter/s)": 0.244476 }, { "epoch": 1.654089488528089, "grad_norm": 0.6303447484970093, "learning_rate": 7.53414029944921e-05, "loss": 0.1275927186012268, "memory(GiB)": 122.96, "step": 21700, "token_acc": 0.9372801875732708, "train_speed(iter/s)": 0.244491 }, { "epoch": 1.654470615138349, "grad_norm": 1.0497785806655884, "learning_rate": 7.533108059313872e-05, "loss": 0.15408782958984374, "memory(GiB)": 122.96, "step": 21705, "token_acc": 0.9301167911162167, "train_speed(iter/s)": 0.244503 }, { "epoch": 1.654851741748609, "grad_norm": 0.6926878094673157, "learning_rate": 7.532075673916541e-05, "loss": 0.14043618440628053, "memory(GiB)": 122.96, "step": 21710, "token_acc": 0.9475571492604213, "train_speed(iter/s)": 0.244518 }, { "epoch": 1.6552328683588688, "grad_norm": 1.8521127700805664, "learning_rate": 7.531043143316421e-05, "loss": 0.1173696756362915, "memory(GiB)": 122.96, "step": 21715, "token_acc": 0.9551468215553405, "train_speed(iter/s)": 0.244538 }, { "epoch": 1.6556139949691286, "grad_norm": 1.1897748708724976, "learning_rate": 7.530010467572721e-05, "loss": 0.11866803169250488, "memory(GiB)": 122.96, "step": 21720, "token_acc": 0.9536224617698671, "train_speed(iter/s)": 0.244552 }, { "epoch": 1.6559951215793887, "grad_norm": 1.2291014194488525, "learning_rate": 7.528977646744662e-05, "loss": 0.12303717136383056, "memory(GiB)": 122.96, "step": 21725, "token_acc": 0.9437603993344426, "train_speed(iter/s)": 0.244571 }, { "epoch": 1.6563762481896487, "grad_norm": 0.8321824073791504, "learning_rate": 7.52794468089147e-05, "loss": 0.1278010606765747, "memory(GiB)": 122.96, "step": 21730, "token_acc": 0.9410016977928692, "train_speed(iter/s)": 0.244587 }, { "epoch": 1.6567573747999085, "grad_norm": 0.8199297785758972, "learning_rate": 7.526911570072382e-05, "loss": 0.10078651905059814, "memory(GiB)": 122.96, "step": 21735, "token_acc": 0.9575535665852997, "train_speed(iter/s)": 0.244591 }, { "epoch": 1.6571385014101685, "grad_norm": 0.8343227505683899, "learning_rate": 7.525878314346643e-05, "loss": 0.123862886428833, "memory(GiB)": 122.96, "step": 21740, "token_acc": 0.9483216237314598, "train_speed(iter/s)": 0.244603 }, { "epoch": 1.6575196280204283, "grad_norm": 0.5537599921226501, "learning_rate": 7.524844913773503e-05, "loss": 0.15853463411331176, "memory(GiB)": 122.96, "step": 21745, "token_acc": 0.943023854470568, "train_speed(iter/s)": 0.244617 }, { "epoch": 1.6579007546306883, "grad_norm": 0.8122696876525879, "learning_rate": 7.523811368412223e-05, "loss": 0.1201167106628418, "memory(GiB)": 122.96, "step": 21750, "token_acc": 0.9523184601924759, "train_speed(iter/s)": 0.244626 }, { "epoch": 1.6582818812409483, "grad_norm": 1.562400460243225, "learning_rate": 7.522777678322074e-05, "loss": 0.10144906044006348, "memory(GiB)": 122.96, "step": 21755, "token_acc": 0.9589983489268025, "train_speed(iter/s)": 0.244642 }, { "epoch": 1.6586630078512081, "grad_norm": 1.2185258865356445, "learning_rate": 7.521743843562335e-05, "loss": 0.16420718431472778, "memory(GiB)": 122.96, "step": 21760, "token_acc": 0.9528225806451613, "train_speed(iter/s)": 0.244652 }, { "epoch": 1.6590441344614681, "grad_norm": 0.5832086205482483, "learning_rate": 7.520709864192286e-05, "loss": 0.10363712310791015, "memory(GiB)": 122.96, "step": 21765, "token_acc": 0.9604093272940781, "train_speed(iter/s)": 0.244665 }, { "epoch": 1.659425261071728, "grad_norm": 0.8978058099746704, "learning_rate": 7.519675740271223e-05, "loss": 0.15683286190032958, "memory(GiB)": 122.96, "step": 21770, "token_acc": 0.9470213733515234, "train_speed(iter/s)": 0.244678 }, { "epoch": 1.659806387681988, "grad_norm": 0.7217133641242981, "learning_rate": 7.51864147185845e-05, "loss": 0.11647899150848388, "memory(GiB)": 122.96, "step": 21775, "token_acc": 0.9601152226900066, "train_speed(iter/s)": 0.244692 }, { "epoch": 1.660187514292248, "grad_norm": 1.983233094215393, "learning_rate": 7.517607059013278e-05, "loss": 0.11535542011260987, "memory(GiB)": 122.96, "step": 21780, "token_acc": 0.9439201941224049, "train_speed(iter/s)": 0.24471 }, { "epoch": 1.6605686409025078, "grad_norm": 0.7077997326850891, "learning_rate": 7.516572501795023e-05, "loss": 0.1139068365097046, "memory(GiB)": 122.96, "step": 21785, "token_acc": 0.9641756988020537, "train_speed(iter/s)": 0.244714 }, { "epoch": 1.6609497675127678, "grad_norm": 1.2410058975219727, "learning_rate": 7.515537800263013e-05, "loss": 0.12087781429290771, "memory(GiB)": 122.96, "step": 21790, "token_acc": 0.9548387096774194, "train_speed(iter/s)": 0.244733 }, { "epoch": 1.6613308941230276, "grad_norm": 0.7073180079460144, "learning_rate": 7.514502954476583e-05, "loss": 0.0870409607887268, "memory(GiB)": 122.96, "step": 21795, "token_acc": 0.9525954525954526, "train_speed(iter/s)": 0.244747 }, { "epoch": 1.6617120207332876, "grad_norm": 0.8622203469276428, "learning_rate": 7.51346796449508e-05, "loss": 0.06582088470458984, "memory(GiB)": 122.96, "step": 21800, "token_acc": 0.9633737965676015, "train_speed(iter/s)": 0.244758 }, { "epoch": 1.6617120207332876, "eval_loss": 0.09743323922157288, "eval_runtime": 214.5864, "eval_samples_per_second": 2.47, "eval_steps_per_second": 2.47, "eval_token_acc": 0.9556653213661828, "step": 21800 }, { "epoch": 1.6620931473435476, "grad_norm": 0.9450846314430237, "learning_rate": 7.51243283037785e-05, "loss": 0.1388264298439026, "memory(GiB)": 122.96, "step": 21805, "token_acc": 0.9555004172297422, "train_speed(iter/s)": 0.244184 }, { "epoch": 1.6624742739538074, "grad_norm": 1.2931909561157227, "learning_rate": 7.511397552184257e-05, "loss": 0.15622087717056274, "memory(GiB)": 122.96, "step": 21810, "token_acc": 0.9237192507334687, "train_speed(iter/s)": 0.244198 }, { "epoch": 1.6628554005640672, "grad_norm": 1.2839829921722412, "learning_rate": 7.510362129973669e-05, "loss": 0.10870131254196166, "memory(GiB)": 122.96, "step": 21815, "token_acc": 0.9602771362586605, "train_speed(iter/s)": 0.244207 }, { "epoch": 1.6632365271743272, "grad_norm": 0.7214338779449463, "learning_rate": 7.509326563805464e-05, "loss": 0.12651748657226564, "memory(GiB)": 122.96, "step": 21820, "token_acc": 0.9546433185015071, "train_speed(iter/s)": 0.244211 }, { "epoch": 1.6636176537845873, "grad_norm": 0.8214506506919861, "learning_rate": 7.508290853739024e-05, "loss": 0.11749675273895263, "memory(GiB)": 122.96, "step": 21825, "token_acc": 0.9419400105244694, "train_speed(iter/s)": 0.244224 }, { "epoch": 1.6639987803948473, "grad_norm": 0.7231277823448181, "learning_rate": 7.507254999833744e-05, "loss": 0.14022430181503295, "memory(GiB)": 122.96, "step": 21830, "token_acc": 0.9463383838383839, "train_speed(iter/s)": 0.244239 }, { "epoch": 1.664379907005107, "grad_norm": 0.4340151846408844, "learning_rate": 7.506219002149024e-05, "loss": 0.09152722358703613, "memory(GiB)": 122.96, "step": 21835, "token_acc": 0.9509632224168126, "train_speed(iter/s)": 0.244254 }, { "epoch": 1.6647610336153669, "grad_norm": 1.802154541015625, "learning_rate": 7.505182860744273e-05, "loss": 0.13155027627944946, "memory(GiB)": 122.96, "step": 21840, "token_acc": 0.9517777777777777, "train_speed(iter/s)": 0.244265 }, { "epoch": 1.6651421602256269, "grad_norm": 0.7061432600021362, "learning_rate": 7.504146575678914e-05, "loss": 0.09170042872428893, "memory(GiB)": 122.96, "step": 21845, "token_acc": 0.9651776292648611, "train_speed(iter/s)": 0.244274 }, { "epoch": 1.665523286835887, "grad_norm": 0.9703760743141174, "learning_rate": 7.503110147012368e-05, "loss": 0.12609469890594482, "memory(GiB)": 122.96, "step": 21850, "token_acc": 0.9466928151185594, "train_speed(iter/s)": 0.244282 }, { "epoch": 1.665904413446147, "grad_norm": 0.6366762518882751, "learning_rate": 7.502073574804071e-05, "loss": 0.11363066434860229, "memory(GiB)": 122.96, "step": 21855, "token_acc": 0.952128939138802, "train_speed(iter/s)": 0.244294 }, { "epoch": 1.6662855400564067, "grad_norm": 0.758083164691925, "learning_rate": 7.501036859113464e-05, "loss": 0.11495914459228515, "memory(GiB)": 122.96, "step": 21860, "token_acc": 0.9611980939414567, "train_speed(iter/s)": 0.244296 }, { "epoch": 1.6666666666666665, "grad_norm": 0.6450614929199219, "learning_rate": 7.500000000000001e-05, "loss": 0.1382339358329773, "memory(GiB)": 122.96, "step": 21865, "token_acc": 0.9483043837882548, "train_speed(iter/s)": 0.244305 }, { "epoch": 1.6670477932769265, "grad_norm": 0.6210350394248962, "learning_rate": 7.498962997523139e-05, "loss": 0.1344504714012146, "memory(GiB)": 122.96, "step": 21870, "token_acc": 0.9641860465116279, "train_speed(iter/s)": 0.244313 }, { "epoch": 1.6674289198871866, "grad_norm": 1.373059868812561, "learning_rate": 7.497925851742344e-05, "loss": 0.11739833354949951, "memory(GiB)": 122.96, "step": 21875, "token_acc": 0.9525641025641025, "train_speed(iter/s)": 0.24432 }, { "epoch": 1.6678100464974466, "grad_norm": 1.4337891340255737, "learning_rate": 7.496888562717094e-05, "loss": 0.13731260299682618, "memory(GiB)": 122.96, "step": 21880, "token_acc": 0.9467321496033245, "train_speed(iter/s)": 0.244336 }, { "epoch": 1.6681911731077064, "grad_norm": 0.6601489186286926, "learning_rate": 7.495851130506874e-05, "loss": 0.12720019817352296, "memory(GiB)": 122.96, "step": 21885, "token_acc": 0.9507766794432116, "train_speed(iter/s)": 0.244344 }, { "epoch": 1.6685722997179662, "grad_norm": 1.2852245569229126, "learning_rate": 7.494813555171174e-05, "loss": 0.11445858478546142, "memory(GiB)": 122.96, "step": 21890, "token_acc": 0.9501607717041801, "train_speed(iter/s)": 0.244356 }, { "epoch": 1.6689534263282262, "grad_norm": 1.206202507019043, "learning_rate": 7.493775836769491e-05, "loss": 0.1125169038772583, "memory(GiB)": 122.96, "step": 21895, "token_acc": 0.961662817551963, "train_speed(iter/s)": 0.244369 }, { "epoch": 1.6693345529384862, "grad_norm": 0.7558403015136719, "learning_rate": 7.492737975361338e-05, "loss": 0.11994736194610596, "memory(GiB)": 122.96, "step": 21900, "token_acc": 0.9527466036621383, "train_speed(iter/s)": 0.244377 }, { "epoch": 1.6697156795487462, "grad_norm": 0.6452611684799194, "learning_rate": 7.49169997100623e-05, "loss": 0.14391658306121827, "memory(GiB)": 122.96, "step": 21905, "token_acc": 0.9478484565014031, "train_speed(iter/s)": 0.244375 }, { "epoch": 1.670096806159006, "grad_norm": 1.3184242248535156, "learning_rate": 7.490661823763691e-05, "loss": 0.16561946868896485, "memory(GiB)": 122.96, "step": 21910, "token_acc": 0.9410161572902472, "train_speed(iter/s)": 0.244382 }, { "epoch": 1.6704779327692658, "grad_norm": 1.054750680923462, "learning_rate": 7.489623533693255e-05, "loss": 0.1015552282333374, "memory(GiB)": 122.96, "step": 21915, "token_acc": 0.962681409813407, "train_speed(iter/s)": 0.244398 }, { "epoch": 1.6708590593795258, "grad_norm": 0.628390908241272, "learning_rate": 7.488585100854462e-05, "loss": 0.07151715755462647, "memory(GiB)": 122.96, "step": 21920, "token_acc": 0.9616893607200554, "train_speed(iter/s)": 0.244411 }, { "epoch": 1.6712401859897859, "grad_norm": 0.993069589138031, "learning_rate": 7.487546525306862e-05, "loss": 0.1222212553024292, "memory(GiB)": 122.96, "step": 21925, "token_acc": 0.9565650527993624, "train_speed(iter/s)": 0.244423 }, { "epoch": 1.6716213126000459, "grad_norm": 0.694136381149292, "learning_rate": 7.486507807110013e-05, "loss": 0.13150326013565064, "memory(GiB)": 122.96, "step": 21930, "token_acc": 0.9498040397949955, "train_speed(iter/s)": 0.244433 }, { "epoch": 1.6720024392103057, "grad_norm": 1.414908766746521, "learning_rate": 7.485468946323481e-05, "loss": 0.15290470123291017, "memory(GiB)": 122.96, "step": 21935, "token_acc": 0.9352593108249216, "train_speed(iter/s)": 0.244448 }, { "epoch": 1.6723835658205655, "grad_norm": 1.2081456184387207, "learning_rate": 7.484429943006838e-05, "loss": 0.14979668855667114, "memory(GiB)": 122.96, "step": 21940, "token_acc": 0.9406408094435076, "train_speed(iter/s)": 0.244464 }, { "epoch": 1.6727646924308255, "grad_norm": 0.8129240274429321, "learning_rate": 7.483390797219665e-05, "loss": 0.1436695337295532, "memory(GiB)": 122.96, "step": 21945, "token_acc": 0.9520119970007498, "train_speed(iter/s)": 0.244474 }, { "epoch": 1.6731458190410855, "grad_norm": 0.8009470105171204, "learning_rate": 7.482351509021556e-05, "loss": 0.14666352272033692, "memory(GiB)": 122.96, "step": 21950, "token_acc": 0.954515491100857, "train_speed(iter/s)": 0.244489 }, { "epoch": 1.6735269456513455, "grad_norm": 0.865957498550415, "learning_rate": 7.481312078472107e-05, "loss": 0.09005358219146728, "memory(GiB)": 122.96, "step": 21955, "token_acc": 0.9607072691552063, "train_speed(iter/s)": 0.244502 }, { "epoch": 1.6739080722616053, "grad_norm": 0.8928372263908386, "learning_rate": 7.480272505630926e-05, "loss": 0.07678576111793518, "memory(GiB)": 122.96, "step": 21960, "token_acc": 0.9644424934152765, "train_speed(iter/s)": 0.244514 }, { "epoch": 1.6742891988718651, "grad_norm": 1.0130984783172607, "learning_rate": 7.479232790557624e-05, "loss": 0.10805299282073974, "memory(GiB)": 122.96, "step": 21965, "token_acc": 0.9478131212723658, "train_speed(iter/s)": 0.244531 }, { "epoch": 1.6746703254821251, "grad_norm": 0.8640418648719788, "learning_rate": 7.47819293331183e-05, "loss": 0.09124443531036378, "memory(GiB)": 122.96, "step": 21970, "token_acc": 0.9631200442559469, "train_speed(iter/s)": 0.244544 }, { "epoch": 1.6750514520923852, "grad_norm": 0.9873640537261963, "learning_rate": 7.477152933953169e-05, "loss": 0.13813778162002563, "memory(GiB)": 122.96, "step": 21975, "token_acc": 0.9439231456657731, "train_speed(iter/s)": 0.244555 }, { "epoch": 1.6754325787026452, "grad_norm": 2.879181146621704, "learning_rate": 7.476112792541283e-05, "loss": 0.17924799919128417, "memory(GiB)": 122.96, "step": 21980, "token_acc": 0.9236852510873863, "train_speed(iter/s)": 0.244572 }, { "epoch": 1.675813705312905, "grad_norm": 0.7690970301628113, "learning_rate": 7.47507250913582e-05, "loss": 0.1089483618736267, "memory(GiB)": 122.96, "step": 21985, "token_acc": 0.95836947094536, "train_speed(iter/s)": 0.244578 }, { "epoch": 1.6761948319231648, "grad_norm": 0.8396589756011963, "learning_rate": 7.474032083796434e-05, "loss": 0.1182061791419983, "memory(GiB)": 122.96, "step": 21990, "token_acc": 0.9556701030927836, "train_speed(iter/s)": 0.244583 }, { "epoch": 1.6765759585334248, "grad_norm": 0.7920475006103516, "learning_rate": 7.472991516582788e-05, "loss": 0.13067274093627929, "memory(GiB)": 122.96, "step": 21995, "token_acc": 0.9523026315789473, "train_speed(iter/s)": 0.24459 }, { "epoch": 1.6769570851436848, "grad_norm": 0.6595046520233154, "learning_rate": 7.471950807554556e-05, "loss": 0.10622811317443848, "memory(GiB)": 122.96, "step": 22000, "token_acc": 0.9548540393754243, "train_speed(iter/s)": 0.244601 }, { "epoch": 1.6769570851436848, "eval_loss": 0.09792107343673706, "eval_runtime": 215.4706, "eval_samples_per_second": 2.46, "eval_steps_per_second": 2.46, "eval_token_acc": 0.95561261369797, "step": 22000 }, { "epoch": 1.6773382117539448, "grad_norm": 0.8987069129943848, "learning_rate": 7.470909956771415e-05, "loss": 0.11626861095428467, "memory(GiB)": 122.96, "step": 22005, "token_acc": 0.9555086424912278, "train_speed(iter/s)": 0.244024 }, { "epoch": 1.6777193383642046, "grad_norm": 0.8981941342353821, "learning_rate": 7.469868964293054e-05, "loss": 0.09300388097763061, "memory(GiB)": 122.96, "step": 22010, "token_acc": 0.9652059740686033, "train_speed(iter/s)": 0.24403 }, { "epoch": 1.6781004649744644, "grad_norm": 1.1720300912857056, "learning_rate": 7.46882783017917e-05, "loss": 0.1261923909187317, "memory(GiB)": 122.96, "step": 22015, "token_acc": 0.9577539398755132, "train_speed(iter/s)": 0.244034 }, { "epoch": 1.6784815915847244, "grad_norm": 0.639354944229126, "learning_rate": 7.467786554489469e-05, "loss": 0.0931730568408966, "memory(GiB)": 122.96, "step": 22020, "token_acc": 0.9638834257107098, "train_speed(iter/s)": 0.244043 }, { "epoch": 1.6788627181949844, "grad_norm": 0.8716281652450562, "learning_rate": 7.466745137283659e-05, "loss": 0.11817564964294433, "memory(GiB)": 122.96, "step": 22025, "token_acc": 0.9533799533799534, "train_speed(iter/s)": 0.244053 }, { "epoch": 1.6792438448052445, "grad_norm": 0.8675052523612976, "learning_rate": 7.465703578621461e-05, "loss": 0.122315514087677, "memory(GiB)": 122.96, "step": 22030, "token_acc": 0.9538062691491869, "train_speed(iter/s)": 0.244066 }, { "epoch": 1.6796249714155043, "grad_norm": 0.0795363038778305, "learning_rate": 7.464661878562608e-05, "loss": 0.16214258670806886, "memory(GiB)": 122.96, "step": 22035, "token_acc": 0.9367179634690062, "train_speed(iter/s)": 0.244073 }, { "epoch": 1.680006098025764, "grad_norm": 1.2207492589950562, "learning_rate": 7.463620037166834e-05, "loss": 0.09691034555435181, "memory(GiB)": 122.96, "step": 22040, "token_acc": 0.9513205592957017, "train_speed(iter/s)": 0.244092 }, { "epoch": 1.680387224636024, "grad_norm": 0.9237349033355713, "learning_rate": 7.462578054493881e-05, "loss": 0.09470806121826172, "memory(GiB)": 122.96, "step": 22045, "token_acc": 0.961455525606469, "train_speed(iter/s)": 0.244106 }, { "epoch": 1.680768351246284, "grad_norm": 1.1010830402374268, "learning_rate": 7.461535930603506e-05, "loss": 0.14014571905136108, "memory(GiB)": 122.96, "step": 22050, "token_acc": 0.9586708546867251, "train_speed(iter/s)": 0.244114 }, { "epoch": 1.681149477856544, "grad_norm": 0.662788987159729, "learning_rate": 7.460493665555466e-05, "loss": 0.1259116768836975, "memory(GiB)": 122.96, "step": 22055, "token_acc": 0.9524044389642417, "train_speed(iter/s)": 0.244125 }, { "epoch": 1.681530604466804, "grad_norm": 0.5422541499137878, "learning_rate": 7.459451259409535e-05, "loss": 0.09109730124473572, "memory(GiB)": 122.96, "step": 22060, "token_acc": 0.9641559699685154, "train_speed(iter/s)": 0.24414 }, { "epoch": 1.6819117310770637, "grad_norm": 1.3159769773483276, "learning_rate": 7.458408712225486e-05, "loss": 0.1162842869758606, "memory(GiB)": 122.96, "step": 22065, "token_acc": 0.9713954387321222, "train_speed(iter/s)": 0.244154 }, { "epoch": 1.6822928576873237, "grad_norm": 2.453864097595215, "learning_rate": 7.457366024063107e-05, "loss": 0.12165601253509521, "memory(GiB)": 122.96, "step": 22070, "token_acc": 0.956415620641562, "train_speed(iter/s)": 0.24416 }, { "epoch": 1.6826739842975837, "grad_norm": 1.7615344524383545, "learning_rate": 7.456323194982188e-05, "loss": 0.12992022037506104, "memory(GiB)": 122.96, "step": 22075, "token_acc": 0.9413461538461538, "train_speed(iter/s)": 0.244179 }, { "epoch": 1.6830551109078435, "grad_norm": 1.1103910207748413, "learning_rate": 7.455280225042534e-05, "loss": 0.10277031660079956, "memory(GiB)": 122.96, "step": 22080, "token_acc": 0.9538116591928251, "train_speed(iter/s)": 0.244192 }, { "epoch": 1.6834362375181036, "grad_norm": 1.1069875955581665, "learning_rate": 7.454237114303952e-05, "loss": 0.13763662576675414, "memory(GiB)": 122.96, "step": 22085, "token_acc": 0.946753986332574, "train_speed(iter/s)": 0.244205 }, { "epoch": 1.6838173641283634, "grad_norm": 1.166082739830017, "learning_rate": 7.453193862826262e-05, "loss": 0.13625700473785402, "memory(GiB)": 122.96, "step": 22090, "token_acc": 0.9449838187702265, "train_speed(iter/s)": 0.244211 }, { "epoch": 1.6841984907386234, "grad_norm": 0.7058888673782349, "learning_rate": 7.452150470669288e-05, "loss": 0.11636195182800294, "memory(GiB)": 122.96, "step": 22095, "token_acc": 0.9438555162936788, "train_speed(iter/s)": 0.244227 }, { "epoch": 1.6845796173488834, "grad_norm": 0.6302140951156616, "learning_rate": 7.451106937892862e-05, "loss": 0.12472261190414428, "memory(GiB)": 122.96, "step": 22100, "token_acc": 0.9321957790749887, "train_speed(iter/s)": 0.244245 }, { "epoch": 1.6849607439591432, "grad_norm": 0.7425611019134521, "learning_rate": 7.45006326455683e-05, "loss": 0.1278509020805359, "memory(GiB)": 122.96, "step": 22105, "token_acc": 0.9501529602303401, "train_speed(iter/s)": 0.244252 }, { "epoch": 1.6853418705694032, "grad_norm": 0.5087275505065918, "learning_rate": 7.449019450721039e-05, "loss": 0.10229157209396363, "memory(GiB)": 122.96, "step": 22110, "token_acc": 0.953646748681898, "train_speed(iter/s)": 0.244264 }, { "epoch": 1.685722997179663, "grad_norm": 0.9393994808197021, "learning_rate": 7.447975496445346e-05, "loss": 0.12978110313415528, "memory(GiB)": 122.96, "step": 22115, "token_acc": 0.9477138018628282, "train_speed(iter/s)": 0.244272 }, { "epoch": 1.686104123789923, "grad_norm": 0.9580907225608826, "learning_rate": 7.44693140178962e-05, "loss": 0.1026904821395874, "memory(GiB)": 122.96, "step": 22120, "token_acc": 0.9628865979381444, "train_speed(iter/s)": 0.244288 }, { "epoch": 1.686485250400183, "grad_norm": 1.6943522691726685, "learning_rate": 7.445887166813733e-05, "loss": 0.1538945436477661, "memory(GiB)": 122.96, "step": 22125, "token_acc": 0.9528106786990036, "train_speed(iter/s)": 0.244291 }, { "epoch": 1.6868663770104428, "grad_norm": 0.5352094769477844, "learning_rate": 7.444842791577567e-05, "loss": 0.11455568075180053, "memory(GiB)": 122.96, "step": 22130, "token_acc": 0.9568541780447842, "train_speed(iter/s)": 0.244305 }, { "epoch": 1.6872475036207026, "grad_norm": 1.7479264736175537, "learning_rate": 7.443798276141011e-05, "loss": 0.20240287780761718, "memory(GiB)": 122.96, "step": 22135, "token_acc": 0.9296745070087907, "train_speed(iter/s)": 0.244316 }, { "epoch": 1.6876286302309627, "grad_norm": 0.8587324619293213, "learning_rate": 7.442753620563965e-05, "loss": 0.09306294918060302, "memory(GiB)": 122.96, "step": 22140, "token_acc": 0.9613431613431613, "train_speed(iter/s)": 0.244324 }, { "epoch": 1.6880097568412227, "grad_norm": 0.7959678769111633, "learning_rate": 7.441708824906335e-05, "loss": 0.12130887508392334, "memory(GiB)": 122.96, "step": 22145, "token_acc": 0.9505365526492288, "train_speed(iter/s)": 0.244334 }, { "epoch": 1.6883908834514827, "grad_norm": 0.37279853224754333, "learning_rate": 7.440663889228034e-05, "loss": 0.10801413059234619, "memory(GiB)": 122.96, "step": 22150, "token_acc": 0.9649856859722528, "train_speed(iter/s)": 0.244345 }, { "epoch": 1.6887720100617425, "grad_norm": 0.8536563515663147, "learning_rate": 7.439618813588987e-05, "loss": 0.14450722932815552, "memory(GiB)": 122.96, "step": 22155, "token_acc": 0.9488752556237219, "train_speed(iter/s)": 0.244356 }, { "epoch": 1.6891531366720023, "grad_norm": 0.5939992070198059, "learning_rate": 7.43857359804912e-05, "loss": 0.099187570810318, "memory(GiB)": 122.96, "step": 22160, "token_acc": 0.956495022735652, "train_speed(iter/s)": 0.24436 }, { "epoch": 1.6895342632822623, "grad_norm": 0.7151081562042236, "learning_rate": 7.437528242668376e-05, "loss": 0.12114461660385131, "memory(GiB)": 122.96, "step": 22165, "token_acc": 0.9527859746571776, "train_speed(iter/s)": 0.24437 }, { "epoch": 1.6899153898925223, "grad_norm": 1.2903563976287842, "learning_rate": 7.436482747506696e-05, "loss": 0.12722303867340087, "memory(GiB)": 122.96, "step": 22170, "token_acc": 0.9584905660377359, "train_speed(iter/s)": 0.244379 }, { "epoch": 1.6902965165027823, "grad_norm": 0.6569268107414246, "learning_rate": 7.43543711262404e-05, "loss": 0.11991429328918457, "memory(GiB)": 122.96, "step": 22175, "token_acc": 0.9540487531521434, "train_speed(iter/s)": 0.244385 }, { "epoch": 1.6906776431130421, "grad_norm": 0.8035945892333984, "learning_rate": 7.434391338080367e-05, "loss": 0.07817199230194091, "memory(GiB)": 122.96, "step": 22180, "token_acc": 0.9678270042194093, "train_speed(iter/s)": 0.2444 }, { "epoch": 1.691058769723302, "grad_norm": 0.8482024073600769, "learning_rate": 7.433345423935645e-05, "loss": 0.12436277866363525, "memory(GiB)": 122.96, "step": 22185, "token_acc": 0.9518115942028985, "train_speed(iter/s)": 0.244409 }, { "epoch": 1.691439896333562, "grad_norm": 1.5713300704956055, "learning_rate": 7.432299370249857e-05, "loss": 0.1504884123802185, "memory(GiB)": 122.96, "step": 22190, "token_acc": 0.9470004877255731, "train_speed(iter/s)": 0.244418 }, { "epoch": 1.691821022943822, "grad_norm": 0.15686501562595367, "learning_rate": 7.431253177082987e-05, "loss": 0.11961193084716797, "memory(GiB)": 122.96, "step": 22195, "token_acc": 0.9415041782729805, "train_speed(iter/s)": 0.244432 }, { "epoch": 1.692202149554082, "grad_norm": 1.1346592903137207, "learning_rate": 7.430206844495029e-05, "loss": 0.13513612747192383, "memory(GiB)": 122.96, "step": 22200, "token_acc": 0.9471808165910564, "train_speed(iter/s)": 0.244446 }, { "epoch": 1.692202149554082, "eval_loss": 0.09610839188098907, "eval_runtime": 211.4938, "eval_samples_per_second": 2.506, "eval_steps_per_second": 2.506, "eval_token_acc": 0.955710499367508, "step": 22200 }, { "epoch": 1.6925832761643418, "grad_norm": 0.901281476020813, "learning_rate": 7.429160372545988e-05, "loss": 0.15954294204711914, "memory(GiB)": 122.96, "step": 22205, "token_acc": 0.9551400093534432, "train_speed(iter/s)": 0.243894 }, { "epoch": 1.6929644027746016, "grad_norm": 0.8349506855010986, "learning_rate": 7.428113761295871e-05, "loss": 0.10224639177322388, "memory(GiB)": 122.96, "step": 22210, "token_acc": 0.9571903063107393, "train_speed(iter/s)": 0.243892 }, { "epoch": 1.6933455293848616, "grad_norm": 0.31291085481643677, "learning_rate": 7.427067010804697e-05, "loss": 0.11800258159637451, "memory(GiB)": 122.96, "step": 22215, "token_acc": 0.9505441741357235, "train_speed(iter/s)": 0.243899 }, { "epoch": 1.6937266559951216, "grad_norm": 1.3745156526565552, "learning_rate": 7.426020121132493e-05, "loss": 0.15416754484176637, "memory(GiB)": 122.96, "step": 22220, "token_acc": 0.9528998891762098, "train_speed(iter/s)": 0.243904 }, { "epoch": 1.6941077826053816, "grad_norm": 1.5538796186447144, "learning_rate": 7.424973092339295e-05, "loss": 0.11397719383239746, "memory(GiB)": 122.96, "step": 22225, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.243917 }, { "epoch": 1.6944889092156414, "grad_norm": 0.9324905276298523, "learning_rate": 7.423925924485142e-05, "loss": 0.11091266870498658, "memory(GiB)": 122.96, "step": 22230, "token_acc": 0.9556741227170121, "train_speed(iter/s)": 0.243927 }, { "epoch": 1.6948700358259012, "grad_norm": 1.007796287536621, "learning_rate": 7.422878617630084e-05, "loss": 0.09732442498207092, "memory(GiB)": 122.96, "step": 22235, "token_acc": 0.9648780487804878, "train_speed(iter/s)": 0.243942 }, { "epoch": 1.6952511624361613, "grad_norm": 0.7965775728225708, "learning_rate": 7.421831171834184e-05, "loss": 0.13282551765441894, "memory(GiB)": 122.96, "step": 22240, "token_acc": 0.9496417197452229, "train_speed(iter/s)": 0.243951 }, { "epoch": 1.6956322890464213, "grad_norm": 0.6971874237060547, "learning_rate": 7.420783587157504e-05, "loss": 0.0819025456905365, "memory(GiB)": 122.96, "step": 22245, "token_acc": 0.9705414012738853, "train_speed(iter/s)": 0.243958 }, { "epoch": 1.6960134156566813, "grad_norm": 0.22044554352760315, "learning_rate": 7.419735863660119e-05, "loss": 0.15140118598937988, "memory(GiB)": 122.96, "step": 22250, "token_acc": 0.9267015706806283, "train_speed(iter/s)": 0.243975 }, { "epoch": 1.696394542266941, "grad_norm": 0.818751335144043, "learning_rate": 7.41868800140211e-05, "loss": 0.12273628711700439, "memory(GiB)": 122.96, "step": 22255, "token_acc": 0.9550765740215542, "train_speed(iter/s)": 0.243975 }, { "epoch": 1.6967756688772009, "grad_norm": 0.6140573024749756, "learning_rate": 7.417640000443569e-05, "loss": 0.10792572498321533, "memory(GiB)": 122.96, "step": 22260, "token_acc": 0.9582052858020897, "train_speed(iter/s)": 0.243994 }, { "epoch": 1.697156795487461, "grad_norm": 0.8561453223228455, "learning_rate": 7.416591860844593e-05, "loss": 0.13172705173492433, "memory(GiB)": 122.96, "step": 22265, "token_acc": 0.9483933787731256, "train_speed(iter/s)": 0.244002 }, { "epoch": 1.697537922097721, "grad_norm": 0.5916081070899963, "learning_rate": 7.415543582665288e-05, "loss": 0.105859375, "memory(GiB)": 122.96, "step": 22270, "token_acc": 0.9611369639039115, "train_speed(iter/s)": 0.244007 }, { "epoch": 1.697919048707981, "grad_norm": 0.6242464780807495, "learning_rate": 7.41449516596577e-05, "loss": 0.09797279238700866, "memory(GiB)": 122.96, "step": 22275, "token_acc": 0.962390158172232, "train_speed(iter/s)": 0.244013 }, { "epoch": 1.6983001753182407, "grad_norm": 0.4717639684677124, "learning_rate": 7.413446610806156e-05, "loss": 0.1359849214553833, "memory(GiB)": 122.96, "step": 22280, "token_acc": 0.9533471800607126, "train_speed(iter/s)": 0.244021 }, { "epoch": 1.6986813019285005, "grad_norm": 1.0444062948226929, "learning_rate": 7.41239791724658e-05, "loss": 0.16668587923049927, "memory(GiB)": 122.96, "step": 22285, "token_acc": 0.9285110697988845, "train_speed(iter/s)": 0.244031 }, { "epoch": 1.6990624285387605, "grad_norm": 1.4865206480026245, "learning_rate": 7.411349085347177e-05, "loss": 0.11280014514923095, "memory(GiB)": 122.96, "step": 22290, "token_acc": 0.9541971438523162, "train_speed(iter/s)": 0.244036 }, { "epoch": 1.6994435551490206, "grad_norm": 1.8622158765792847, "learning_rate": 7.410300115168096e-05, "loss": 0.10683449506759643, "memory(GiB)": 122.96, "step": 22295, "token_acc": 0.9624977243764792, "train_speed(iter/s)": 0.244046 }, { "epoch": 1.6998246817592806, "grad_norm": 0.9815818667411804, "learning_rate": 7.409251006769489e-05, "loss": 0.08318931460380555, "memory(GiB)": 122.96, "step": 22300, "token_acc": 0.9667405764966741, "train_speed(iter/s)": 0.244056 }, { "epoch": 1.7002058083695404, "grad_norm": 0.23504580557346344, "learning_rate": 7.408201760211515e-05, "loss": 0.08081969618797302, "memory(GiB)": 122.96, "step": 22305, "token_acc": 0.9653940611743693, "train_speed(iter/s)": 0.244063 }, { "epoch": 1.7005869349798002, "grad_norm": 1.767377257347107, "learning_rate": 7.407152375554346e-05, "loss": 0.14219188690185547, "memory(GiB)": 122.96, "step": 22310, "token_acc": 0.9625881631401411, "train_speed(iter/s)": 0.244066 }, { "epoch": 1.7009680615900602, "grad_norm": 0.20735037326812744, "learning_rate": 7.406102852858159e-05, "loss": 0.14146239757537843, "memory(GiB)": 122.96, "step": 22315, "token_acc": 0.9434300035803795, "train_speed(iter/s)": 0.244079 }, { "epoch": 1.7013491882003202, "grad_norm": 1.169028878211975, "learning_rate": 7.40505319218314e-05, "loss": 0.1335224151611328, "memory(GiB)": 122.96, "step": 22320, "token_acc": 0.9615085536547434, "train_speed(iter/s)": 0.244087 }, { "epoch": 1.7017303148105802, "grad_norm": 0.7683916687965393, "learning_rate": 7.40400339358948e-05, "loss": 0.0711044728755951, "memory(GiB)": 122.96, "step": 22325, "token_acc": 0.9715025906735751, "train_speed(iter/s)": 0.24409 }, { "epoch": 1.70211144142084, "grad_norm": 0.7115213871002197, "learning_rate": 7.402953457137381e-05, "loss": 0.12271498441696167, "memory(GiB)": 122.96, "step": 22330, "token_acc": 0.9523996852871754, "train_speed(iter/s)": 0.244106 }, { "epoch": 1.7024925680310998, "grad_norm": 1.102571964263916, "learning_rate": 7.401903382887054e-05, "loss": 0.11673427820205688, "memory(GiB)": 122.96, "step": 22335, "token_acc": 0.9465034965034965, "train_speed(iter/s)": 0.244115 }, { "epoch": 1.7028736946413598, "grad_norm": 0.965969443321228, "learning_rate": 7.400853170898713e-05, "loss": 0.1211472511291504, "memory(GiB)": 122.96, "step": 22340, "token_acc": 0.9536095908261663, "train_speed(iter/s)": 0.244127 }, { "epoch": 1.7032548212516199, "grad_norm": 0.5471957325935364, "learning_rate": 7.399802821232583e-05, "loss": 0.11171510219573974, "memory(GiB)": 122.96, "step": 22345, "token_acc": 0.9608329201784829, "train_speed(iter/s)": 0.24413 }, { "epoch": 1.7036359478618797, "grad_norm": 0.7011304497718811, "learning_rate": 7.3987523339489e-05, "loss": 0.19800195693969727, "memory(GiB)": 122.96, "step": 22350, "token_acc": 0.9483096521313081, "train_speed(iter/s)": 0.244136 }, { "epoch": 1.7040170744721397, "grad_norm": 0.7659358978271484, "learning_rate": 7.3977017091079e-05, "loss": 0.10736374855041504, "memory(GiB)": 122.96, "step": 22355, "token_acc": 0.956989247311828, "train_speed(iter/s)": 0.244145 }, { "epoch": 1.7043982010823995, "grad_norm": 1.086513638496399, "learning_rate": 7.396650946769834e-05, "loss": 0.10072449445724488, "memory(GiB)": 122.96, "step": 22360, "token_acc": 0.9466286799620133, "train_speed(iter/s)": 0.244156 }, { "epoch": 1.7047793276926595, "grad_norm": 1.0456907749176025, "learning_rate": 7.39560004699496e-05, "loss": 0.14515092372894287, "memory(GiB)": 122.96, "step": 22365, "token_acc": 0.9467672413793103, "train_speed(iter/s)": 0.244167 }, { "epoch": 1.7051604543029195, "grad_norm": 0.5895593166351318, "learning_rate": 7.394549009843538e-05, "loss": 0.09802674055099488, "memory(GiB)": 122.96, "step": 22370, "token_acc": 0.9653781512605042, "train_speed(iter/s)": 0.24418 }, { "epoch": 1.7055415809131793, "grad_norm": 0.43304556608200073, "learning_rate": 7.393497835375844e-05, "loss": 0.11079618930816651, "memory(GiB)": 122.96, "step": 22375, "token_acc": 0.9591509097395647, "train_speed(iter/s)": 0.244184 }, { "epoch": 1.7059227075234393, "grad_norm": 0.6907081007957458, "learning_rate": 7.392446523652155e-05, "loss": 0.11029367446899414, "memory(GiB)": 122.96, "step": 22380, "token_acc": 0.9527005433045701, "train_speed(iter/s)": 0.244198 }, { "epoch": 1.7063038341336991, "grad_norm": 0.7933582663536072, "learning_rate": 7.391395074732762e-05, "loss": 0.13040144443511964, "memory(GiB)": 122.96, "step": 22385, "token_acc": 0.9475598666262504, "train_speed(iter/s)": 0.244212 }, { "epoch": 1.7066849607439591, "grad_norm": 1.1873550415039062, "learning_rate": 7.390343488677958e-05, "loss": 0.11156731843948364, "memory(GiB)": 122.96, "step": 22390, "token_acc": 0.960822722820764, "train_speed(iter/s)": 0.244224 }, { "epoch": 1.7070660873542192, "grad_norm": 1.1339590549468994, "learning_rate": 7.389291765548047e-05, "loss": 0.11877338886260987, "memory(GiB)": 122.96, "step": 22395, "token_acc": 0.9545816733067729, "train_speed(iter/s)": 0.244237 }, { "epoch": 1.707447213964479, "grad_norm": 1.3135501146316528, "learning_rate": 7.388239905403341e-05, "loss": 0.1170524001121521, "memory(GiB)": 122.96, "step": 22400, "token_acc": 0.9482236298540347, "train_speed(iter/s)": 0.244251 }, { "epoch": 1.707447213964479, "eval_loss": 0.09982047230005264, "eval_runtime": 215.3445, "eval_samples_per_second": 2.461, "eval_steps_per_second": 2.461, "eval_token_acc": 0.9560719233781098, "step": 22400 }, { "epoch": 1.707828340574739, "grad_norm": 0.8551361560821533, "learning_rate": 7.387187908304159e-05, "loss": 0.1549553632736206, "memory(GiB)": 122.96, "step": 22405, "token_acc": 0.9555658327460653, "train_speed(iter/s)": 0.243693 }, { "epoch": 1.7082094671849988, "grad_norm": 0.650139331817627, "learning_rate": 7.386135774310829e-05, "loss": 0.13076605796813964, "memory(GiB)": 122.96, "step": 22410, "token_acc": 0.9498283098054178, "train_speed(iter/s)": 0.243703 }, { "epoch": 1.7085905937952588, "grad_norm": 1.4320175647735596, "learning_rate": 7.385083503483684e-05, "loss": 0.10523022413253784, "memory(GiB)": 122.96, "step": 22415, "token_acc": 0.9615463284925343, "train_speed(iter/s)": 0.243712 }, { "epoch": 1.7089717204055188, "grad_norm": 1.0035210847854614, "learning_rate": 7.38403109588307e-05, "loss": 0.1581351637840271, "memory(GiB)": 122.96, "step": 22420, "token_acc": 0.9334934417143913, "train_speed(iter/s)": 0.243718 }, { "epoch": 1.7093528470157786, "grad_norm": 0.5308611989021301, "learning_rate": 7.382978551569334e-05, "loss": 0.08971482515335083, "memory(GiB)": 122.96, "step": 22425, "token_acc": 0.9672426205903528, "train_speed(iter/s)": 0.243722 }, { "epoch": 1.7097339736260384, "grad_norm": 0.6965824961662292, "learning_rate": 7.381925870602838e-05, "loss": 0.1550302505493164, "memory(GiB)": 122.96, "step": 22430, "token_acc": 0.9359389895138227, "train_speed(iter/s)": 0.243731 }, { "epoch": 1.7101151002362984, "grad_norm": 0.8184794187545776, "learning_rate": 7.380873053043947e-05, "loss": 0.10501192808151245, "memory(GiB)": 122.96, "step": 22435, "token_acc": 0.9604810996563574, "train_speed(iter/s)": 0.243747 }, { "epoch": 1.7104962268465584, "grad_norm": 0.9442630410194397, "learning_rate": 7.379820098953036e-05, "loss": 0.10234864950180053, "memory(GiB)": 122.96, "step": 22440, "token_acc": 0.9603639728562615, "train_speed(iter/s)": 0.243752 }, { "epoch": 1.7108773534568185, "grad_norm": 0.7673889398574829, "learning_rate": 7.378767008390483e-05, "loss": 0.13648335933685302, "memory(GiB)": 122.96, "step": 22445, "token_acc": 0.9489330208461824, "train_speed(iter/s)": 0.243758 }, { "epoch": 1.7112584800670783, "grad_norm": 0.955918550491333, "learning_rate": 7.377713781416683e-05, "loss": 0.09668587446212769, "memory(GiB)": 122.96, "step": 22450, "token_acc": 0.9577647823261858, "train_speed(iter/s)": 0.243777 }, { "epoch": 1.711639606677338, "grad_norm": 1.0928490161895752, "learning_rate": 7.376660418092031e-05, "loss": 0.1915527105331421, "memory(GiB)": 122.96, "step": 22455, "token_acc": 0.9459745762711864, "train_speed(iter/s)": 0.243784 }, { "epoch": 1.712020733287598, "grad_norm": 0.7215978503227234, "learning_rate": 7.375606918476931e-05, "loss": 0.11463272571563721, "memory(GiB)": 122.96, "step": 22460, "token_acc": 0.96133871898442, "train_speed(iter/s)": 0.243787 }, { "epoch": 1.712401859897858, "grad_norm": 0.9757073521614075, "learning_rate": 7.374553282631803e-05, "loss": 0.10513441562652588, "memory(GiB)": 122.96, "step": 22465, "token_acc": 0.9386642435256701, "train_speed(iter/s)": 0.243803 }, { "epoch": 1.712782986508118, "grad_norm": 0.3961370587348938, "learning_rate": 7.37349951061706e-05, "loss": 0.12731605768203735, "memory(GiB)": 122.96, "step": 22470, "token_acc": 0.9478054567022538, "train_speed(iter/s)": 0.243807 }, { "epoch": 1.713164113118378, "grad_norm": 1.1209912300109863, "learning_rate": 7.372445602493135e-05, "loss": 0.16317073106765748, "memory(GiB)": 122.96, "step": 22475, "token_acc": 0.9447290793355069, "train_speed(iter/s)": 0.243816 }, { "epoch": 1.7135452397286377, "grad_norm": 0.9164186120033264, "learning_rate": 7.371391558320463e-05, "loss": 0.10938284397125245, "memory(GiB)": 122.96, "step": 22480, "token_acc": 0.9523225241016652, "train_speed(iter/s)": 0.243825 }, { "epoch": 1.7139263663388977, "grad_norm": 0.5511829257011414, "learning_rate": 7.370337378159492e-05, "loss": 0.10844937562942505, "memory(GiB)": 122.96, "step": 22485, "token_acc": 0.9636846767050488, "train_speed(iter/s)": 0.243828 }, { "epoch": 1.7143074929491577, "grad_norm": 0.5276740193367004, "learning_rate": 7.369283062070672e-05, "loss": 0.10638012886047363, "memory(GiB)": 122.96, "step": 22490, "token_acc": 0.9614574898785425, "train_speed(iter/s)": 0.243835 }, { "epoch": 1.7146886195594178, "grad_norm": 0.5546761751174927, "learning_rate": 7.368228610114462e-05, "loss": 0.12493609189987183, "memory(GiB)": 122.96, "step": 22495, "token_acc": 0.9532785241628233, "train_speed(iter/s)": 0.243847 }, { "epoch": 1.7150697461696776, "grad_norm": 1.475494146347046, "learning_rate": 7.367174022351332e-05, "loss": 0.1224939465522766, "memory(GiB)": 122.96, "step": 22500, "token_acc": 0.947860583734769, "train_speed(iter/s)": 0.243859 }, { "epoch": 1.7154508727799374, "grad_norm": 0.7512075304985046, "learning_rate": 7.366119298841758e-05, "loss": 0.13880496025085448, "memory(GiB)": 122.96, "step": 22505, "token_acc": 0.9482412060301507, "train_speed(iter/s)": 0.243864 }, { "epoch": 1.7158319993901974, "grad_norm": 2.183732271194458, "learning_rate": 7.365064439646219e-05, "loss": 0.14416224956512452, "memory(GiB)": 122.96, "step": 22510, "token_acc": 0.9470734744707348, "train_speed(iter/s)": 0.24388 }, { "epoch": 1.7162131260004574, "grad_norm": 1.0501822233200073, "learning_rate": 7.364009444825212e-05, "loss": 0.13086936473846436, "memory(GiB)": 122.96, "step": 22515, "token_acc": 0.9456706281833617, "train_speed(iter/s)": 0.243892 }, { "epoch": 1.7165942526107174, "grad_norm": 1.2555909156799316, "learning_rate": 7.362954314439233e-05, "loss": 0.12604081630706787, "memory(GiB)": 122.96, "step": 22520, "token_acc": 0.9485998526160648, "train_speed(iter/s)": 0.243902 }, { "epoch": 1.7169753792209772, "grad_norm": 0.6557461023330688, "learning_rate": 7.36189904854879e-05, "loss": 0.10930535793304444, "memory(GiB)": 122.96, "step": 22525, "token_acc": 0.9562766605728215, "train_speed(iter/s)": 0.243906 }, { "epoch": 1.717356505831237, "grad_norm": 1.1487514972686768, "learning_rate": 7.360843647214397e-05, "loss": 0.125740122795105, "memory(GiB)": 122.96, "step": 22530, "token_acc": 0.9625580350456792, "train_speed(iter/s)": 0.243913 }, { "epoch": 1.717737632441497, "grad_norm": 0.6364356279373169, "learning_rate": 7.359788110496576e-05, "loss": 0.13984442949295045, "memory(GiB)": 122.96, "step": 22535, "token_acc": 0.9422590292155509, "train_speed(iter/s)": 0.243925 }, { "epoch": 1.718118759051757, "grad_norm": 1.0086814165115356, "learning_rate": 7.358732438455859e-05, "loss": 0.12949551343917848, "memory(GiB)": 122.96, "step": 22540, "token_acc": 0.950544844928751, "train_speed(iter/s)": 0.243936 }, { "epoch": 1.718499885662017, "grad_norm": 0.5343924164772034, "learning_rate": 7.357676631152781e-05, "loss": 0.08846315145492553, "memory(GiB)": 122.96, "step": 22545, "token_acc": 0.96878612716763, "train_speed(iter/s)": 0.243937 }, { "epoch": 1.7188810122722769, "grad_norm": 1.6344157457351685, "learning_rate": 7.356620688647889e-05, "loss": 0.17456142902374266, "memory(GiB)": 122.96, "step": 22550, "token_acc": 0.9512756570113179, "train_speed(iter/s)": 0.243943 }, { "epoch": 1.7192621388825367, "grad_norm": 0.452778697013855, "learning_rate": 7.355564611001737e-05, "loss": 0.11692919731140136, "memory(GiB)": 122.96, "step": 22555, "token_acc": 0.9442128887463931, "train_speed(iter/s)": 0.243957 }, { "epoch": 1.7196432654927967, "grad_norm": 0.7473217248916626, "learning_rate": 7.354508398274886e-05, "loss": 0.14423227310180664, "memory(GiB)": 122.96, "step": 22560, "token_acc": 0.9468256525942637, "train_speed(iter/s)": 0.24397 }, { "epoch": 1.7200243921030567, "grad_norm": 0.14946900308132172, "learning_rate": 7.353452050527903e-05, "loss": 0.11053715944290161, "memory(GiB)": 122.96, "step": 22565, "token_acc": 0.9476861167002012, "train_speed(iter/s)": 0.243979 }, { "epoch": 1.7204055187133167, "grad_norm": 0.6566641926765442, "learning_rate": 7.352395567821368e-05, "loss": 0.09059802293777466, "memory(GiB)": 122.96, "step": 22570, "token_acc": 0.9588945251005773, "train_speed(iter/s)": 0.243989 }, { "epoch": 1.7207866453235765, "grad_norm": 0.7004576921463013, "learning_rate": 7.351338950215865e-05, "loss": 0.12408561706542968, "memory(GiB)": 122.96, "step": 22575, "token_acc": 0.954600241060667, "train_speed(iter/s)": 0.243991 }, { "epoch": 1.7211677719338363, "grad_norm": 0.3545568883419037, "learning_rate": 7.350282197771983e-05, "loss": 0.07770583629608155, "memory(GiB)": 122.96, "step": 22580, "token_acc": 0.9628221377270807, "train_speed(iter/s)": 0.244009 }, { "epoch": 1.7215488985440963, "grad_norm": 0.6408150792121887, "learning_rate": 7.349225310550322e-05, "loss": 0.1329951286315918, "memory(GiB)": 122.96, "step": 22585, "token_acc": 0.949645518996546, "train_speed(iter/s)": 0.244017 }, { "epoch": 1.7219300251543563, "grad_norm": 0.33633318543434143, "learning_rate": 7.348168288611495e-05, "loss": 0.09994817972183227, "memory(GiB)": 122.96, "step": 22590, "token_acc": 0.9428571428571428, "train_speed(iter/s)": 0.244034 }, { "epoch": 1.7223111517646164, "grad_norm": 0.9432477355003357, "learning_rate": 7.34711113201611e-05, "loss": 0.13830338716506957, "memory(GiB)": 122.96, "step": 22595, "token_acc": 0.9362041467304625, "train_speed(iter/s)": 0.244049 }, { "epoch": 1.7226922783748762, "grad_norm": 0.8772575855255127, "learning_rate": 7.346053840824796e-05, "loss": 0.13516937494277953, "memory(GiB)": 122.96, "step": 22600, "token_acc": 0.9416904083570751, "train_speed(iter/s)": 0.244059 }, { "epoch": 1.7226922783748762, "eval_loss": 0.09773040562868118, "eval_runtime": 214.8438, "eval_samples_per_second": 2.467, "eval_steps_per_second": 2.467, "eval_token_acc": 0.9564333473887116, "step": 22600 }, { "epoch": 1.723073404985136, "grad_norm": 1.1286256313323975, "learning_rate": 7.34499641509818e-05, "loss": 0.11407146453857422, "memory(GiB)": 122.96, "step": 22605, "token_acc": 0.9562376807742564, "train_speed(iter/s)": 0.243509 }, { "epoch": 1.723454531595396, "grad_norm": 0.5700121521949768, "learning_rate": 7.343938854896903e-05, "loss": 0.16036680936813355, "memory(GiB)": 122.96, "step": 22610, "token_acc": 0.9532235459004905, "train_speed(iter/s)": 0.243515 }, { "epoch": 1.723835658205656, "grad_norm": 1.745046615600586, "learning_rate": 7.342881160281606e-05, "loss": 0.09160689115524293, "memory(GiB)": 122.96, "step": 22615, "token_acc": 0.9585881045025014, "train_speed(iter/s)": 0.243531 }, { "epoch": 1.724216784815916, "grad_norm": 0.6543465256690979, "learning_rate": 7.34182333131295e-05, "loss": 0.08871396780014038, "memory(GiB)": 122.96, "step": 22620, "token_acc": 0.9625658731600945, "train_speed(iter/s)": 0.243538 }, { "epoch": 1.7245979114261758, "grad_norm": 0.7879281640052795, "learning_rate": 7.340765368051594e-05, "loss": 0.08861007690429687, "memory(GiB)": 122.96, "step": 22625, "token_acc": 0.9605967245013783, "train_speed(iter/s)": 0.243543 }, { "epoch": 1.7249790380364356, "grad_norm": 1.0128053426742554, "learning_rate": 7.339707270558205e-05, "loss": 0.13329391479492186, "memory(GiB)": 122.96, "step": 22630, "token_acc": 0.9457928802588996, "train_speed(iter/s)": 0.24356 }, { "epoch": 1.7253601646466956, "grad_norm": 0.7527912855148315, "learning_rate": 7.338649038893461e-05, "loss": 0.15672676563262938, "memory(GiB)": 122.96, "step": 22635, "token_acc": 0.9456984667802385, "train_speed(iter/s)": 0.243571 }, { "epoch": 1.7257412912569556, "grad_norm": 0.8982603549957275, "learning_rate": 7.337590673118049e-05, "loss": 0.15307868719100953, "memory(GiB)": 122.96, "step": 22640, "token_acc": 0.9367588932806324, "train_speed(iter/s)": 0.243584 }, { "epoch": 1.7261224178672157, "grad_norm": 0.6650236248970032, "learning_rate": 7.33653217329266e-05, "loss": 0.13341115713119506, "memory(GiB)": 122.96, "step": 22645, "token_acc": 0.9388532000951701, "train_speed(iter/s)": 0.243594 }, { "epoch": 1.7265035444774754, "grad_norm": 0.9003525376319885, "learning_rate": 7.335473539477992e-05, "loss": 0.10499167442321777, "memory(GiB)": 122.96, "step": 22650, "token_acc": 0.9672565138637313, "train_speed(iter/s)": 0.243595 }, { "epoch": 1.7268846710877352, "grad_norm": 1.134199619293213, "learning_rate": 7.334414771734754e-05, "loss": 0.11812107563018799, "memory(GiB)": 122.96, "step": 22655, "token_acc": 0.9574383452665075, "train_speed(iter/s)": 0.243612 }, { "epoch": 1.7272657976979953, "grad_norm": 0.9332714676856995, "learning_rate": 7.333355870123664e-05, "loss": 0.10398601293563843, "memory(GiB)": 122.96, "step": 22660, "token_acc": 0.963509635096351, "train_speed(iter/s)": 0.243613 }, { "epoch": 1.7276469243082553, "grad_norm": 1.9689130783081055, "learning_rate": 7.332296834705441e-05, "loss": 0.08426344394683838, "memory(GiB)": 122.96, "step": 22665, "token_acc": 0.9603638726445743, "train_speed(iter/s)": 0.243629 }, { "epoch": 1.728028050918515, "grad_norm": 1.2048760652542114, "learning_rate": 7.33123766554082e-05, "loss": 0.105281662940979, "memory(GiB)": 122.96, "step": 22670, "token_acc": 0.9565525383707202, "train_speed(iter/s)": 0.24364 }, { "epoch": 1.728409177528775, "grad_norm": 1.0969287157058716, "learning_rate": 7.330178362690536e-05, "loss": 0.13224349021911622, "memory(GiB)": 122.96, "step": 22675, "token_acc": 0.9508141682054726, "train_speed(iter/s)": 0.243646 }, { "epoch": 1.728790304139035, "grad_norm": 0.6675103902816772, "learning_rate": 7.329118926215335e-05, "loss": 0.11445480585098267, "memory(GiB)": 122.96, "step": 22680, "token_acc": 0.9559214020180563, "train_speed(iter/s)": 0.243654 }, { "epoch": 1.729171430749295, "grad_norm": 1.0770151615142822, "learning_rate": 7.328059356175971e-05, "loss": 0.09671254158020019, "memory(GiB)": 122.96, "step": 22685, "token_acc": 0.9566146612524044, "train_speed(iter/s)": 0.243664 }, { "epoch": 1.729552557359555, "grad_norm": 0.6346368789672852, "learning_rate": 7.32699965263321e-05, "loss": 0.0964931607246399, "memory(GiB)": 122.96, "step": 22690, "token_acc": 0.9590513833992095, "train_speed(iter/s)": 0.24367 }, { "epoch": 1.7299336839698147, "grad_norm": 1.2033483982086182, "learning_rate": 7.325939815647816e-05, "loss": 0.13338063955307006, "memory(GiB)": 122.96, "step": 22695, "token_acc": 0.95005291005291, "train_speed(iter/s)": 0.243682 }, { "epoch": 1.7303148105800747, "grad_norm": 0.562324583530426, "learning_rate": 7.324879845280566e-05, "loss": 0.10261318683624268, "memory(GiB)": 122.96, "step": 22700, "token_acc": 0.9676969092721834, "train_speed(iter/s)": 0.243689 }, { "epoch": 1.7306959371903345, "grad_norm": 0.888490617275238, "learning_rate": 7.323819741592248e-05, "loss": 0.11691510677337646, "memory(GiB)": 122.96, "step": 22705, "token_acc": 0.9555803571428572, "train_speed(iter/s)": 0.243702 }, { "epoch": 1.7310770638005946, "grad_norm": 0.8532058000564575, "learning_rate": 7.32275950464365e-05, "loss": 0.10857983827590942, "memory(GiB)": 122.96, "step": 22710, "token_acc": 0.9535832200427268, "train_speed(iter/s)": 0.243713 }, { "epoch": 1.7314581904108546, "grad_norm": 0.8169603943824768, "learning_rate": 7.321699134495575e-05, "loss": 0.08248804807662964, "memory(GiB)": 122.96, "step": 22715, "token_acc": 0.9670908293111014, "train_speed(iter/s)": 0.243723 }, { "epoch": 1.7318393170211144, "grad_norm": 0.683448076248169, "learning_rate": 7.320638631208827e-05, "loss": 0.11732048988342285, "memory(GiB)": 122.96, "step": 22720, "token_acc": 0.9590747330960854, "train_speed(iter/s)": 0.24373 }, { "epoch": 1.7322204436313744, "grad_norm": 0.8265263438224792, "learning_rate": 7.319577994844224e-05, "loss": 0.17483806610107422, "memory(GiB)": 122.96, "step": 22725, "token_acc": 0.9271570014144271, "train_speed(iter/s)": 0.243743 }, { "epoch": 1.7326015702416342, "grad_norm": 0.687034010887146, "learning_rate": 7.318517225462586e-05, "loss": 0.12077195644378662, "memory(GiB)": 122.96, "step": 22730, "token_acc": 0.9548975727319676, "train_speed(iter/s)": 0.243752 }, { "epoch": 1.7329826968518942, "grad_norm": 0.6395902037620544, "learning_rate": 7.317456323124742e-05, "loss": 0.13604586124420165, "memory(GiB)": 122.96, "step": 22735, "token_acc": 0.9445828144458281, "train_speed(iter/s)": 0.243762 }, { "epoch": 1.7333638234621542, "grad_norm": 0.8575174808502197, "learning_rate": 7.316395287891537e-05, "loss": 0.10479742288589478, "memory(GiB)": 122.96, "step": 22740, "token_acc": 0.9632248939179632, "train_speed(iter/s)": 0.243774 }, { "epoch": 1.733744950072414, "grad_norm": 1.4003039598464966, "learning_rate": 7.315334119823808e-05, "loss": 0.10418202877044677, "memory(GiB)": 122.96, "step": 22745, "token_acc": 0.9591013824884793, "train_speed(iter/s)": 0.243779 }, { "epoch": 1.7341260766826738, "grad_norm": 0.5381571650505066, "learning_rate": 7.314272818982414e-05, "loss": 0.11316345930099488, "memory(GiB)": 122.96, "step": 22750, "token_acc": 0.9491476451892517, "train_speed(iter/s)": 0.24379 }, { "epoch": 1.7345072032929338, "grad_norm": 1.196371078491211, "learning_rate": 7.313211385428211e-05, "loss": 0.07108050584793091, "memory(GiB)": 122.96, "step": 22755, "token_acc": 0.974561089215335, "train_speed(iter/s)": 0.243804 }, { "epoch": 1.7348883299031939, "grad_norm": 1.557081937789917, "learning_rate": 7.312149819222072e-05, "loss": 0.1411288261413574, "memory(GiB)": 122.96, "step": 22760, "token_acc": 0.9570167286245354, "train_speed(iter/s)": 0.243812 }, { "epoch": 1.7352694565134539, "grad_norm": 0.7887213826179504, "learning_rate": 7.31108812042487e-05, "loss": 0.11124507188796998, "memory(GiB)": 122.96, "step": 22765, "token_acc": 0.963031045751634, "train_speed(iter/s)": 0.243819 }, { "epoch": 1.7356505831237137, "grad_norm": 0.9597247838973999, "learning_rate": 7.310026289097487e-05, "loss": 0.16476042270660402, "memory(GiB)": 122.96, "step": 22770, "token_acc": 0.9365392195506503, "train_speed(iter/s)": 0.243827 }, { "epoch": 1.7360317097339735, "grad_norm": 0.4580088257789612, "learning_rate": 7.308964325300818e-05, "loss": 0.15317448377609252, "memory(GiB)": 122.96, "step": 22775, "token_acc": 0.9529448426301028, "train_speed(iter/s)": 0.243836 }, { "epoch": 1.7364128363442335, "grad_norm": 0.6057664155960083, "learning_rate": 7.307902229095761e-05, "loss": 0.07210761308670044, "memory(GiB)": 122.96, "step": 22780, "token_acc": 0.9691221879135421, "train_speed(iter/s)": 0.243849 }, { "epoch": 1.7367939629544935, "grad_norm": 0.4495570957660675, "learning_rate": 7.306840000543219e-05, "loss": 0.10301439762115479, "memory(GiB)": 122.96, "step": 22785, "token_acc": 0.9579632918886916, "train_speed(iter/s)": 0.243863 }, { "epoch": 1.7371750895647535, "grad_norm": 0.9155391454696655, "learning_rate": 7.305777639704109e-05, "loss": 0.14858872890472413, "memory(GiB)": 122.96, "step": 22790, "token_acc": 0.9505547515677761, "train_speed(iter/s)": 0.243874 }, { "epoch": 1.7375562161750133, "grad_norm": 0.7508010268211365, "learning_rate": 7.304715146639351e-05, "loss": 0.1141050934791565, "memory(GiB)": 122.96, "step": 22795, "token_acc": 0.9502099580083984, "train_speed(iter/s)": 0.243889 }, { "epoch": 1.7379373427852731, "grad_norm": 1.0222442150115967, "learning_rate": 7.303652521409874e-05, "loss": 0.09204410910606384, "memory(GiB)": 122.96, "step": 22800, "token_acc": 0.9593094944512947, "train_speed(iter/s)": 0.243904 }, { "epoch": 1.7379373427852731, "eval_loss": 0.098355732858181, "eval_runtime": 218.5864, "eval_samples_per_second": 2.425, "eval_steps_per_second": 2.425, "eval_token_acc": 0.9557255587012831, "step": 22800 }, { "epoch": 1.7383184693955331, "grad_norm": 0.7180343866348267, "learning_rate": 7.302589764076617e-05, "loss": 0.11628261804580689, "memory(GiB)": 122.96, "step": 22805, "token_acc": 0.9559503125292883, "train_speed(iter/s)": 0.243338 }, { "epoch": 1.7386995960057932, "grad_norm": 0.7704737782478333, "learning_rate": 7.301526874700522e-05, "loss": 0.11225894689559937, "memory(GiB)": 122.96, "step": 22810, "token_acc": 0.9478512795750845, "train_speed(iter/s)": 0.243352 }, { "epoch": 1.7390807226160532, "grad_norm": 0.806197464466095, "learning_rate": 7.30046385334254e-05, "loss": 0.1656572103500366, "memory(GiB)": 122.96, "step": 22815, "token_acc": 0.9410203479799469, "train_speed(iter/s)": 0.243357 }, { "epoch": 1.739461849226313, "grad_norm": 1.2615532875061035, "learning_rate": 7.299400700063632e-05, "loss": 0.14788055419921875, "memory(GiB)": 122.96, "step": 22820, "token_acc": 0.9429539678057046, "train_speed(iter/s)": 0.24337 }, { "epoch": 1.7398429758365728, "grad_norm": 1.1928491592407227, "learning_rate": 7.298337414924764e-05, "loss": 0.10777961015701294, "memory(GiB)": 122.96, "step": 22825, "token_acc": 0.9539708265802269, "train_speed(iter/s)": 0.243385 }, { "epoch": 1.7402241024468328, "grad_norm": 0.8378008008003235, "learning_rate": 7.29727399798691e-05, "loss": 0.10058319568634033, "memory(GiB)": 122.96, "step": 22830, "token_acc": 0.965925163944966, "train_speed(iter/s)": 0.243385 }, { "epoch": 1.7406052290570928, "grad_norm": 0.8875649571418762, "learning_rate": 7.296210449311056e-05, "loss": 0.13904275894165039, "memory(GiB)": 122.96, "step": 22835, "token_acc": 0.9481591546970447, "train_speed(iter/s)": 0.243391 }, { "epoch": 1.7409863556673528, "grad_norm": 0.8345692157745361, "learning_rate": 7.295146768958186e-05, "loss": 0.127626633644104, "memory(GiB)": 122.96, "step": 22840, "token_acc": 0.9466917529005958, "train_speed(iter/s)": 0.243398 }, { "epoch": 1.7413674822776126, "grad_norm": 1.2623095512390137, "learning_rate": 7.2940829569893e-05, "loss": 0.16688673496246337, "memory(GiB)": 122.96, "step": 22845, "token_acc": 0.9320235756385069, "train_speed(iter/s)": 0.243413 }, { "epoch": 1.7417486088878724, "grad_norm": 1.1911647319793701, "learning_rate": 7.293019013465403e-05, "loss": 0.10168817043304443, "memory(GiB)": 122.96, "step": 22850, "token_acc": 0.9593926553672316, "train_speed(iter/s)": 0.243428 }, { "epoch": 1.7421297354981324, "grad_norm": 0.6654606461524963, "learning_rate": 7.291954938447504e-05, "loss": 0.08263660669326782, "memory(GiB)": 122.96, "step": 22855, "token_acc": 0.9608032128514056, "train_speed(iter/s)": 0.243434 }, { "epoch": 1.7425108621083925, "grad_norm": 1.3501689434051514, "learning_rate": 7.290890731996628e-05, "loss": 0.1415271759033203, "memory(GiB)": 122.96, "step": 22860, "token_acc": 0.9386484884410196, "train_speed(iter/s)": 0.243447 }, { "epoch": 1.7428919887186525, "grad_norm": 1.8662434816360474, "learning_rate": 7.289826394173799e-05, "loss": 0.09292722940444946, "memory(GiB)": 122.96, "step": 22865, "token_acc": 0.9605633802816902, "train_speed(iter/s)": 0.243453 }, { "epoch": 1.7432731153289123, "grad_norm": 1.0406643152236938, "learning_rate": 7.28876192504005e-05, "loss": 0.12798954248428346, "memory(GiB)": 122.96, "step": 22870, "token_acc": 0.9462607274213323, "train_speed(iter/s)": 0.243464 }, { "epoch": 1.743654241939172, "grad_norm": 0.9909408092498779, "learning_rate": 7.28769732465643e-05, "loss": 0.1123960018157959, "memory(GiB)": 122.96, "step": 22875, "token_acc": 0.9566514842154861, "train_speed(iter/s)": 0.243467 }, { "epoch": 1.744035368549432, "grad_norm": 0.9126935601234436, "learning_rate": 7.28663259308398e-05, "loss": 0.12252837419509888, "memory(GiB)": 122.96, "step": 22880, "token_acc": 0.9507375053701848, "train_speed(iter/s)": 0.243475 }, { "epoch": 1.744416495159692, "grad_norm": 1.252841591835022, "learning_rate": 7.285567730383766e-05, "loss": 0.09498708248138428, "memory(GiB)": 122.96, "step": 22885, "token_acc": 0.9610887483227909, "train_speed(iter/s)": 0.243484 }, { "epoch": 1.7447976217699521, "grad_norm": 0.7043788433074951, "learning_rate": 7.284502736616847e-05, "loss": 0.13110564947128295, "memory(GiB)": 122.96, "step": 22890, "token_acc": 0.9596258000984736, "train_speed(iter/s)": 0.243493 }, { "epoch": 1.745178748380212, "grad_norm": 0.6210152506828308, "learning_rate": 7.283437611844298e-05, "loss": 0.0880037784576416, "memory(GiB)": 122.96, "step": 22895, "token_acc": 0.9693769799366421, "train_speed(iter/s)": 0.243501 }, { "epoch": 1.7455598749904717, "grad_norm": 1.2591511011123657, "learning_rate": 7.282372356127198e-05, "loss": 0.11742782592773438, "memory(GiB)": 122.96, "step": 22900, "token_acc": 0.9519945909398242, "train_speed(iter/s)": 0.243512 }, { "epoch": 1.7459410016007317, "grad_norm": 1.084974765777588, "learning_rate": 7.281306969526635e-05, "loss": 0.15707681179046631, "memory(GiB)": 122.96, "step": 22905, "token_acc": 0.9381036861817466, "train_speed(iter/s)": 0.24352 }, { "epoch": 1.7463221282109918, "grad_norm": 0.9754509329795837, "learning_rate": 7.280241452103704e-05, "loss": 0.1408408522605896, "memory(GiB)": 122.96, "step": 22910, "token_acc": 0.9566384472434442, "train_speed(iter/s)": 0.243527 }, { "epoch": 1.7467032548212518, "grad_norm": 1.4260382652282715, "learning_rate": 7.279175803919508e-05, "loss": 0.11025205850601197, "memory(GiB)": 122.96, "step": 22915, "token_acc": 0.9523207513093733, "train_speed(iter/s)": 0.243534 }, { "epoch": 1.7470843814315116, "grad_norm": 0.8159790635108948, "learning_rate": 7.278110025035157e-05, "loss": 0.10179712772369384, "memory(GiB)": 122.96, "step": 22920, "token_acc": 0.9624716920090586, "train_speed(iter/s)": 0.243546 }, { "epoch": 1.7474655080417714, "grad_norm": 1.6694326400756836, "learning_rate": 7.277044115511764e-05, "loss": 0.16485157012939453, "memory(GiB)": 122.96, "step": 22925, "token_acc": 0.9351598173515981, "train_speed(iter/s)": 0.243559 }, { "epoch": 1.7478466346520314, "grad_norm": 1.1610618829727173, "learning_rate": 7.275978075410461e-05, "loss": 0.17536184787750245, "memory(GiB)": 122.96, "step": 22930, "token_acc": 0.9230359520639148, "train_speed(iter/s)": 0.243573 }, { "epoch": 1.7482277612622914, "grad_norm": 1.0727465152740479, "learning_rate": 7.274911904792376e-05, "loss": 0.1388644814491272, "memory(GiB)": 122.96, "step": 22935, "token_acc": 0.9512663085188028, "train_speed(iter/s)": 0.243583 }, { "epoch": 1.7486088878725514, "grad_norm": 0.8794253468513489, "learning_rate": 7.273845603718651e-05, "loss": 0.10299206972122192, "memory(GiB)": 122.96, "step": 22940, "token_acc": 0.9579764453961456, "train_speed(iter/s)": 0.243596 }, { "epoch": 1.7489900144828112, "grad_norm": 0.5958268046379089, "learning_rate": 7.272779172250431e-05, "loss": 0.11765452623367309, "memory(GiB)": 122.96, "step": 22945, "token_acc": 0.9559085751337332, "train_speed(iter/s)": 0.243599 }, { "epoch": 1.749371141093071, "grad_norm": 1.1148196458816528, "learning_rate": 7.271712610448874e-05, "loss": 0.15339465141296388, "memory(GiB)": 122.96, "step": 22950, "token_acc": 0.936069827789573, "train_speed(iter/s)": 0.24361 }, { "epoch": 1.749752267703331, "grad_norm": 1.299093246459961, "learning_rate": 7.270645918375141e-05, "loss": 0.10864996910095215, "memory(GiB)": 122.96, "step": 22955, "token_acc": 0.9565391548167435, "train_speed(iter/s)": 0.24362 }, { "epoch": 1.750133394313591, "grad_norm": 1.5057036876678467, "learning_rate": 7.2695790960904e-05, "loss": 0.12369999885559083, "memory(GiB)": 122.96, "step": 22960, "token_acc": 0.9577613516367476, "train_speed(iter/s)": 0.243632 }, { "epoch": 1.750514520923851, "grad_norm": 0.8272818326950073, "learning_rate": 7.268512143655832e-05, "loss": 0.08364887237548828, "memory(GiB)": 122.96, "step": 22965, "token_acc": 0.9532114707362066, "train_speed(iter/s)": 0.243644 }, { "epoch": 1.7508956475341109, "grad_norm": 0.7223031520843506, "learning_rate": 7.267445061132618e-05, "loss": 0.12007169723510742, "memory(GiB)": 122.96, "step": 22970, "token_acc": 0.9567030784508441, "train_speed(iter/s)": 0.243656 }, { "epoch": 1.7512767741443707, "grad_norm": 1.2888460159301758, "learning_rate": 7.266377848581953e-05, "loss": 0.11781548261642456, "memory(GiB)": 122.96, "step": 22975, "token_acc": 0.9577586206896552, "train_speed(iter/s)": 0.243672 }, { "epoch": 1.7516579007546307, "grad_norm": 0.9592128396034241, "learning_rate": 7.265310506065035e-05, "loss": 0.1283315896987915, "memory(GiB)": 122.96, "step": 22980, "token_acc": 0.9473393481863377, "train_speed(iter/s)": 0.24368 }, { "epoch": 1.7520390273648907, "grad_norm": 0.6162494421005249, "learning_rate": 7.264243033643073e-05, "loss": 0.10888078212738037, "memory(GiB)": 122.96, "step": 22985, "token_acc": 0.9611595301250474, "train_speed(iter/s)": 0.24369 }, { "epoch": 1.7524201539751505, "grad_norm": 0.7538749575614929, "learning_rate": 7.26317543137728e-05, "loss": 0.11367861032485962, "memory(GiB)": 122.96, "step": 22990, "token_acc": 0.9567632850241546, "train_speed(iter/s)": 0.243699 }, { "epoch": 1.7528012805854105, "grad_norm": 0.8800696730613708, "learning_rate": 7.262107699328877e-05, "loss": 0.14430720806121827, "memory(GiB)": 122.96, "step": 22995, "token_acc": 0.946441672780631, "train_speed(iter/s)": 0.243712 }, { "epoch": 1.7531824071956703, "grad_norm": 0.6020234227180481, "learning_rate": 7.261039837559096e-05, "loss": 0.10905044078826905, "memory(GiB)": 122.96, "step": 23000, "token_acc": 0.9601913548607551, "train_speed(iter/s)": 0.243723 }, { "epoch": 1.7531824071956703, "eval_loss": 0.09414472430944443, "eval_runtime": 217.1729, "eval_samples_per_second": 2.44, "eval_steps_per_second": 2.44, "eval_token_acc": 0.9572616107463405, "step": 23000 }, { "epoch": 1.7535635338059303, "grad_norm": 0.5681965947151184, "learning_rate": 7.259971846129175e-05, "loss": 0.08677439689636231, "memory(GiB)": 122.96, "step": 23005, "token_acc": 0.9580854744125217, "train_speed(iter/s)": 0.243167 }, { "epoch": 1.7539446604161903, "grad_norm": 0.8315051198005676, "learning_rate": 7.258903725100352e-05, "loss": 0.12059934139251709, "memory(GiB)": 122.96, "step": 23010, "token_acc": 0.9505718954248366, "train_speed(iter/s)": 0.243182 }, { "epoch": 1.7543257870264501, "grad_norm": 1.5355359315872192, "learning_rate": 7.257835474533884e-05, "loss": 0.10595240592956542, "memory(GiB)": 122.96, "step": 23015, "token_acc": 0.9564787339268052, "train_speed(iter/s)": 0.243202 }, { "epoch": 1.7547069136367102, "grad_norm": 0.6026448011398315, "learning_rate": 7.25676709449103e-05, "loss": 0.17497695684432985, "memory(GiB)": 122.96, "step": 23020, "token_acc": 0.9359504132231405, "train_speed(iter/s)": 0.243208 }, { "epoch": 1.75508804024697, "grad_norm": 0.8504495620727539, "learning_rate": 7.255698585033057e-05, "loss": 0.08585066199302674, "memory(GiB)": 122.96, "step": 23025, "token_acc": 0.9586776859504132, "train_speed(iter/s)": 0.243217 }, { "epoch": 1.75546916685723, "grad_norm": 0.9149502515792847, "learning_rate": 7.254629946221236e-05, "loss": 0.12331409454345703, "memory(GiB)": 122.96, "step": 23030, "token_acc": 0.9616561289006731, "train_speed(iter/s)": 0.243227 }, { "epoch": 1.75585029346749, "grad_norm": 1.2212228775024414, "learning_rate": 7.253561178116851e-05, "loss": 0.1307743787765503, "memory(GiB)": 122.96, "step": 23035, "token_acc": 0.9593856655290103, "train_speed(iter/s)": 0.243232 }, { "epoch": 1.7562314200777498, "grad_norm": 0.5449307560920715, "learning_rate": 7.252492280781191e-05, "loss": 0.11745294332504272, "memory(GiB)": 122.96, "step": 23040, "token_acc": 0.9514149114872723, "train_speed(iter/s)": 0.243239 }, { "epoch": 1.7566125466880096, "grad_norm": 0.7834076881408691, "learning_rate": 7.25142325427555e-05, "loss": 0.12032512426376343, "memory(GiB)": 122.96, "step": 23045, "token_acc": 0.9535319467865843, "train_speed(iter/s)": 0.243249 }, { "epoch": 1.7569936732982696, "grad_norm": 1.2563124895095825, "learning_rate": 7.250354098661234e-05, "loss": 0.12421555519104004, "memory(GiB)": 122.96, "step": 23050, "token_acc": 0.9611127729584206, "train_speed(iter/s)": 0.243261 }, { "epoch": 1.7573747999085296, "grad_norm": 0.7794398665428162, "learning_rate": 7.249284813999554e-05, "loss": 0.10142576694488525, "memory(GiB)": 122.96, "step": 23055, "token_acc": 0.9564187588820464, "train_speed(iter/s)": 0.24327 }, { "epoch": 1.7577559265187896, "grad_norm": 1.3774030208587646, "learning_rate": 7.248215400351826e-05, "loss": 0.11984881162643432, "memory(GiB)": 122.96, "step": 23060, "token_acc": 0.9533397251518057, "train_speed(iter/s)": 0.243281 }, { "epoch": 1.7581370531290494, "grad_norm": 1.6078723669052124, "learning_rate": 7.24714585777938e-05, "loss": 0.14329617023468016, "memory(GiB)": 122.96, "step": 23065, "token_acc": 0.9471879286694102, "train_speed(iter/s)": 0.243286 }, { "epoch": 1.7585181797393092, "grad_norm": 1.7443180084228516, "learning_rate": 7.246076186343546e-05, "loss": 0.11465667486190796, "memory(GiB)": 122.96, "step": 23070, "token_acc": 0.9494794856093081, "train_speed(iter/s)": 0.243302 }, { "epoch": 1.7588993063495693, "grad_norm": 0.812034010887146, "learning_rate": 7.245006386105666e-05, "loss": 0.10880122184753419, "memory(GiB)": 122.96, "step": 23075, "token_acc": 0.964403427818062, "train_speed(iter/s)": 0.243306 }, { "epoch": 1.7592804329598293, "grad_norm": 0.910630464553833, "learning_rate": 7.243936457127088e-05, "loss": 0.1273535132408142, "memory(GiB)": 122.96, "step": 23080, "token_acc": 0.9467826086956522, "train_speed(iter/s)": 0.243321 }, { "epoch": 1.7596615595700893, "grad_norm": 0.9725729823112488, "learning_rate": 7.242866399469167e-05, "loss": 0.06961270570755004, "memory(GiB)": 122.96, "step": 23085, "token_acc": 0.9705756929637527, "train_speed(iter/s)": 0.243327 }, { "epoch": 1.760042686180349, "grad_norm": 1.0774139165878296, "learning_rate": 7.241796213193266e-05, "loss": 0.12175641059875489, "memory(GiB)": 122.96, "step": 23090, "token_acc": 0.9611266294227188, "train_speed(iter/s)": 0.243336 }, { "epoch": 1.7604238127906089, "grad_norm": 1.4455894231796265, "learning_rate": 7.240725898360756e-05, "loss": 0.127077317237854, "memory(GiB)": 122.96, "step": 23095, "token_acc": 0.9515669515669516, "train_speed(iter/s)": 0.243348 }, { "epoch": 1.760804939400869, "grad_norm": 0.6113494038581848, "learning_rate": 7.239655455033014e-05, "loss": 0.12384322881698609, "memory(GiB)": 122.96, "step": 23100, "token_acc": 0.9503035754441196, "train_speed(iter/s)": 0.243358 }, { "epoch": 1.761186066011129, "grad_norm": 0.5890191793441772, "learning_rate": 7.238584883271425e-05, "loss": 0.07339001297950745, "memory(GiB)": 122.96, "step": 23105, "token_acc": 0.9707668090847763, "train_speed(iter/s)": 0.243367 }, { "epoch": 1.761567192621389, "grad_norm": 0.6603773236274719, "learning_rate": 7.23751418313738e-05, "loss": 0.1056405782699585, "memory(GiB)": 122.96, "step": 23110, "token_acc": 0.9543881083282427, "train_speed(iter/s)": 0.24337 }, { "epoch": 1.7619483192316487, "grad_norm": 0.9181042313575745, "learning_rate": 7.236443354692281e-05, "loss": 0.09902899861335754, "memory(GiB)": 122.96, "step": 23115, "token_acc": 0.9537933817594835, "train_speed(iter/s)": 0.24338 }, { "epoch": 1.7623294458419085, "grad_norm": 0.8303046226501465, "learning_rate": 7.235372397997534e-05, "loss": 0.1574738383293152, "memory(GiB)": 122.96, "step": 23120, "token_acc": 0.9408666507062485, "train_speed(iter/s)": 0.243392 }, { "epoch": 1.7627105724521686, "grad_norm": 0.9477664232254028, "learning_rate": 7.234301313114553e-05, "loss": 0.191995108127594, "memory(GiB)": 122.96, "step": 23125, "token_acc": 0.9344384429130191, "train_speed(iter/s)": 0.243403 }, { "epoch": 1.7630916990624286, "grad_norm": 1.047004222869873, "learning_rate": 7.23323010010476e-05, "loss": 0.10680942535400391, "memory(GiB)": 122.96, "step": 23130, "token_acc": 0.9594699061292103, "train_speed(iter/s)": 0.243406 }, { "epoch": 1.7634728256726886, "grad_norm": 0.9576881527900696, "learning_rate": 7.232158759029585e-05, "loss": 0.059962570667266846, "memory(GiB)": 122.96, "step": 23135, "token_acc": 0.9649621212121212, "train_speed(iter/s)": 0.243423 }, { "epoch": 1.7638539522829484, "grad_norm": 0.9916583895683289, "learning_rate": 7.231087289950464e-05, "loss": 0.11216645240783692, "memory(GiB)": 122.96, "step": 23140, "token_acc": 0.9584812623274162, "train_speed(iter/s)": 0.243421 }, { "epoch": 1.7642350788932082, "grad_norm": 0.8796592354774475, "learning_rate": 7.230015692928838e-05, "loss": 0.11570451259613038, "memory(GiB)": 122.96, "step": 23145, "token_acc": 0.9514809590973202, "train_speed(iter/s)": 0.243434 }, { "epoch": 1.7646162055034682, "grad_norm": 1.0857036113739014, "learning_rate": 7.228943968026161e-05, "loss": 0.08872859477996826, "memory(GiB)": 122.96, "step": 23150, "token_acc": 0.952753108348135, "train_speed(iter/s)": 0.243449 }, { "epoch": 1.7649973321137282, "grad_norm": 0.8274242877960205, "learning_rate": 7.227872115303893e-05, "loss": 0.11091675758361816, "memory(GiB)": 122.96, "step": 23155, "token_acc": 0.954639786540172, "train_speed(iter/s)": 0.243453 }, { "epoch": 1.7653784587239882, "grad_norm": 0.8879777193069458, "learning_rate": 7.226800134823497e-05, "loss": 0.12477505207061768, "memory(GiB)": 122.96, "step": 23160, "token_acc": 0.9474760520934238, "train_speed(iter/s)": 0.24345 }, { "epoch": 1.765759585334248, "grad_norm": 0.6571097373962402, "learning_rate": 7.225728026646445e-05, "loss": 0.12279442548751832, "memory(GiB)": 122.96, "step": 23165, "token_acc": 0.9539078156312625, "train_speed(iter/s)": 0.24346 }, { "epoch": 1.7661407119445078, "grad_norm": 0.4126579165458679, "learning_rate": 7.224655790834223e-05, "loss": 0.15820497274398804, "memory(GiB)": 122.96, "step": 23170, "token_acc": 0.9555900621118012, "train_speed(iter/s)": 0.243467 }, { "epoch": 1.7665218385547679, "grad_norm": 0.7036758661270142, "learning_rate": 7.223583427448313e-05, "loss": 0.10529749393463135, "memory(GiB)": 122.96, "step": 23175, "token_acc": 0.9627859832030119, "train_speed(iter/s)": 0.24347 }, { "epoch": 1.7669029651650279, "grad_norm": 0.5270026922225952, "learning_rate": 7.222510936550211e-05, "loss": 0.08192024230957032, "memory(GiB)": 122.96, "step": 23180, "token_acc": 0.9645124716553288, "train_speed(iter/s)": 0.243474 }, { "epoch": 1.767284091775288, "grad_norm": 0.9647880792617798, "learning_rate": 7.221438318201422e-05, "loss": 0.11344774961471557, "memory(GiB)": 122.96, "step": 23185, "token_acc": 0.9497446922870196, "train_speed(iter/s)": 0.243488 }, { "epoch": 1.7676652183855477, "grad_norm": 0.6918342709541321, "learning_rate": 7.220365572463454e-05, "loss": 0.13512378931045532, "memory(GiB)": 122.96, "step": 23190, "token_acc": 0.9553140096618358, "train_speed(iter/s)": 0.243492 }, { "epoch": 1.7680463449958075, "grad_norm": 0.819002091884613, "learning_rate": 7.219292699397824e-05, "loss": 0.1122437596321106, "memory(GiB)": 122.96, "step": 23195, "token_acc": 0.9508689839572193, "train_speed(iter/s)": 0.2435 }, { "epoch": 1.7684274716060675, "grad_norm": 0.7933727502822876, "learning_rate": 7.218219699066058e-05, "loss": 0.08016595840454102, "memory(GiB)": 122.96, "step": 23200, "token_acc": 0.9663516939386956, "train_speed(iter/s)": 0.243512 }, { "epoch": 1.7684274716060675, "eval_loss": 0.09565918147563934, "eval_runtime": 219.1798, "eval_samples_per_second": 2.418, "eval_steps_per_second": 2.418, "eval_token_acc": 0.9568023010662008, "step": 23200 }, { "epoch": 1.7688085982163275, "grad_norm": 0.5952458381652832, "learning_rate": 7.217146571529684e-05, "loss": 0.08698221445083618, "memory(GiB)": 122.96, "step": 23205, "token_acc": 0.9569890885466653, "train_speed(iter/s)": 0.242969 }, { "epoch": 1.7691897248265875, "grad_norm": 0.40517449378967285, "learning_rate": 7.216073316850243e-05, "loss": 0.0924514353275299, "memory(GiB)": 122.96, "step": 23210, "token_acc": 0.9563352826510721, "train_speed(iter/s)": 0.242978 }, { "epoch": 1.7695708514368473, "grad_norm": 0.04934883862733841, "learning_rate": 7.21499993508928e-05, "loss": 0.08079149127006531, "memory(GiB)": 122.96, "step": 23215, "token_acc": 0.9659300184162063, "train_speed(iter/s)": 0.242985 }, { "epoch": 1.7699519780471071, "grad_norm": 0.7241352796554565, "learning_rate": 7.213926426308352e-05, "loss": 0.11921541690826416, "memory(GiB)": 122.96, "step": 23220, "token_acc": 0.9527539161192522, "train_speed(iter/s)": 0.242998 }, { "epoch": 1.7703331046573672, "grad_norm": 0.6519904136657715, "learning_rate": 7.212852790569017e-05, "loss": 0.11828763484954834, "memory(GiB)": 122.96, "step": 23225, "token_acc": 0.9577960140679953, "train_speed(iter/s)": 0.243004 }, { "epoch": 1.7707142312676272, "grad_norm": 1.0055242776870728, "learning_rate": 7.211779027932843e-05, "loss": 0.13674209117889405, "memory(GiB)": 122.96, "step": 23230, "token_acc": 0.9495738636363636, "train_speed(iter/s)": 0.243014 }, { "epoch": 1.7710953578778872, "grad_norm": 0.5549229383468628, "learning_rate": 7.210705138461406e-05, "loss": 0.15468508005142212, "memory(GiB)": 122.96, "step": 23235, "token_acc": 0.9463649046659433, "train_speed(iter/s)": 0.24302 }, { "epoch": 1.771476484488147, "grad_norm": 1.5740801095962524, "learning_rate": 7.209631122216288e-05, "loss": 0.13295700550079345, "memory(GiB)": 122.96, "step": 23240, "token_acc": 0.9537456008044244, "train_speed(iter/s)": 0.243026 }, { "epoch": 1.7718576110984068, "grad_norm": 1.0270901918411255, "learning_rate": 7.20855697925908e-05, "loss": 0.1277254819869995, "memory(GiB)": 122.96, "step": 23245, "token_acc": 0.9458879106721065, "train_speed(iter/s)": 0.243038 }, { "epoch": 1.7722387377086668, "grad_norm": 1.0498064756393433, "learning_rate": 7.207482709651376e-05, "loss": 0.09956609010696411, "memory(GiB)": 122.96, "step": 23250, "token_acc": 0.9543349626961667, "train_speed(iter/s)": 0.243044 }, { "epoch": 1.7726198643189268, "grad_norm": 0.7302316427230835, "learning_rate": 7.206408313454784e-05, "loss": 0.1446032404899597, "memory(GiB)": 122.96, "step": 23255, "token_acc": 0.9483130750880382, "train_speed(iter/s)": 0.243038 }, { "epoch": 1.7730009909291868, "grad_norm": 0.7447216510772705, "learning_rate": 7.205333790730913e-05, "loss": 0.13976681232452393, "memory(GiB)": 122.96, "step": 23260, "token_acc": 0.9527917189460476, "train_speed(iter/s)": 0.243043 }, { "epoch": 1.7733821175394466, "grad_norm": 0.5818156003952026, "learning_rate": 7.204259141541385e-05, "loss": 0.12363839149475098, "memory(GiB)": 122.96, "step": 23265, "token_acc": 0.9577739809616793, "train_speed(iter/s)": 0.243053 }, { "epoch": 1.7737632441497064, "grad_norm": 0.643611490726471, "learning_rate": 7.203184365947823e-05, "loss": 0.07917478084564208, "memory(GiB)": 122.96, "step": 23270, "token_acc": 0.9676091133681032, "train_speed(iter/s)": 0.243064 }, { "epoch": 1.7741443707599664, "grad_norm": 0.7010123133659363, "learning_rate": 7.202109464011861e-05, "loss": 0.10141603946685791, "memory(GiB)": 122.96, "step": 23275, "token_acc": 0.9605380168963984, "train_speed(iter/s)": 0.243061 }, { "epoch": 1.7745254973702265, "grad_norm": 0.8204020857810974, "learning_rate": 7.201034435795141e-05, "loss": 0.11140587329864501, "memory(GiB)": 122.96, "step": 23280, "token_acc": 0.9628893306825712, "train_speed(iter/s)": 0.243075 }, { "epoch": 1.7749066239804863, "grad_norm": 0.4217594563961029, "learning_rate": 7.19995928135931e-05, "loss": 0.09179417490959167, "memory(GiB)": 122.96, "step": 23285, "token_acc": 0.9626607319485658, "train_speed(iter/s)": 0.243083 }, { "epoch": 1.7752877505907463, "grad_norm": 1.4727509021759033, "learning_rate": 7.198884000766023e-05, "loss": 0.10547810792922974, "memory(GiB)": 122.96, "step": 23290, "token_acc": 0.9581646423751687, "train_speed(iter/s)": 0.24309 }, { "epoch": 1.775668877201006, "grad_norm": 0.7903043627738953, "learning_rate": 7.197808594076944e-05, "loss": 0.10643990039825439, "memory(GiB)": 122.96, "step": 23295, "token_acc": 0.9639119451461566, "train_speed(iter/s)": 0.243104 }, { "epoch": 1.776050003811266, "grad_norm": 0.5719894170761108, "learning_rate": 7.19673306135374e-05, "loss": 0.10060263872146606, "memory(GiB)": 122.96, "step": 23300, "token_acc": 0.9589833920483141, "train_speed(iter/s)": 0.243103 }, { "epoch": 1.7764311304215261, "grad_norm": 0.7787784337997437, "learning_rate": 7.19565740265809e-05, "loss": 0.09506222605705261, "memory(GiB)": 122.96, "step": 23305, "token_acc": 0.956232159847764, "train_speed(iter/s)": 0.243117 }, { "epoch": 1.776812257031786, "grad_norm": 2.13336443901062, "learning_rate": 7.194581618051677e-05, "loss": 0.09597482085227967, "memory(GiB)": 122.96, "step": 23310, "token_acc": 0.963871209284912, "train_speed(iter/s)": 0.243121 }, { "epoch": 1.777193383642046, "grad_norm": 0.5784302353858948, "learning_rate": 7.193505707596191e-05, "loss": 0.13365068435668945, "memory(GiB)": 122.96, "step": 23315, "token_acc": 0.9446043165467626, "train_speed(iter/s)": 0.243134 }, { "epoch": 1.7775745102523057, "grad_norm": 0.1726435273885727, "learning_rate": 7.192429671353333e-05, "loss": 0.12638185024261475, "memory(GiB)": 122.96, "step": 23320, "token_acc": 0.9634727368978295, "train_speed(iter/s)": 0.243144 }, { "epoch": 1.7779556368625657, "grad_norm": 0.6785112023353577, "learning_rate": 7.191353509384806e-05, "loss": 0.10031934976577758, "memory(GiB)": 122.96, "step": 23325, "token_acc": 0.9567063981367311, "train_speed(iter/s)": 0.243144 }, { "epoch": 1.7783367634728258, "grad_norm": 0.8125603795051575, "learning_rate": 7.190277221752326e-05, "loss": 0.1037831425666809, "memory(GiB)": 122.96, "step": 23330, "token_acc": 0.9607260726072607, "train_speed(iter/s)": 0.243156 }, { "epoch": 1.7787178900830856, "grad_norm": 0.5362970232963562, "learning_rate": 7.18920080851761e-05, "loss": 0.08221933841705323, "memory(GiB)": 122.96, "step": 23335, "token_acc": 0.9666725757844353, "train_speed(iter/s)": 0.243162 }, { "epoch": 1.7790990166933456, "grad_norm": 1.1072980165481567, "learning_rate": 7.188124269742388e-05, "loss": 0.08355308175086976, "memory(GiB)": 122.96, "step": 23340, "token_acc": 0.9683014354066986, "train_speed(iter/s)": 0.243169 }, { "epoch": 1.7794801433036054, "grad_norm": 0.11157568544149399, "learning_rate": 7.187047605488392e-05, "loss": 0.05584652423858642, "memory(GiB)": 122.96, "step": 23345, "token_acc": 0.9751499571550986, "train_speed(iter/s)": 0.24318 }, { "epoch": 1.7798612699138654, "grad_norm": 0.5501653552055359, "learning_rate": 7.185970815817367e-05, "loss": 0.09777764678001404, "memory(GiB)": 122.96, "step": 23350, "token_acc": 0.9572192513368984, "train_speed(iter/s)": 0.24319 }, { "epoch": 1.7802423965241254, "grad_norm": 1.5019243955612183, "learning_rate": 7.184893900791058e-05, "loss": 0.16942485570907592, "memory(GiB)": 122.96, "step": 23355, "token_acc": 0.92282489989079, "train_speed(iter/s)": 0.243203 }, { "epoch": 1.7806235231343852, "grad_norm": 1.0951775312423706, "learning_rate": 7.183816860471224e-05, "loss": 0.09031522870063782, "memory(GiB)": 122.96, "step": 23360, "token_acc": 0.9657065893079154, "train_speed(iter/s)": 0.243204 }, { "epoch": 1.781004649744645, "grad_norm": 1.0967986583709717, "learning_rate": 7.182739694919627e-05, "loss": 0.13735790252685548, "memory(GiB)": 122.96, "step": 23365, "token_acc": 0.9451677516274412, "train_speed(iter/s)": 0.243206 }, { "epoch": 1.781385776354905, "grad_norm": 0.7094708681106567, "learning_rate": 7.181662404198037e-05, "loss": 0.10652425289154052, "memory(GiB)": 122.96, "step": 23370, "token_acc": 0.9481865284974094, "train_speed(iter/s)": 0.243222 }, { "epoch": 1.781766902965165, "grad_norm": 1.0670284032821655, "learning_rate": 7.180584988368233e-05, "loss": 0.1577918767929077, "memory(GiB)": 122.96, "step": 23375, "token_acc": 0.9404404404404404, "train_speed(iter/s)": 0.24323 }, { "epoch": 1.782148029575425, "grad_norm": 0.5655043721199036, "learning_rate": 7.179507447491999e-05, "loss": 0.0859815776348114, "memory(GiB)": 122.96, "step": 23380, "token_acc": 0.9677858439201452, "train_speed(iter/s)": 0.24324 }, { "epoch": 1.7825291561856849, "grad_norm": 0.7330759763717651, "learning_rate": 7.178429781631126e-05, "loss": 0.12107913494110108, "memory(GiB)": 122.96, "step": 23385, "token_acc": 0.9538490999617005, "train_speed(iter/s)": 0.243248 }, { "epoch": 1.7829102827959447, "grad_norm": 0.9121013879776001, "learning_rate": 7.177351990847415e-05, "loss": 0.10400419235229492, "memory(GiB)": 122.96, "step": 23390, "token_acc": 0.9640005806357962, "train_speed(iter/s)": 0.24325 }, { "epoch": 1.7832914094062047, "grad_norm": 2.065025568008423, "learning_rate": 7.176274075202673e-05, "loss": 0.08179395794868469, "memory(GiB)": 122.96, "step": 23395, "token_acc": 0.9664981036662452, "train_speed(iter/s)": 0.243255 }, { "epoch": 1.7836725360164647, "grad_norm": 1.0746972560882568, "learning_rate": 7.17519603475871e-05, "loss": 0.12859307527542113, "memory(GiB)": 122.96, "step": 23400, "token_acc": 0.9399602385685885, "train_speed(iter/s)": 0.243266 }, { "epoch": 1.7836725360164647, "eval_loss": 0.09479209035634995, "eval_runtime": 217.8913, "eval_samples_per_second": 2.432, "eval_steps_per_second": 2.432, "eval_token_acc": 0.9573745557496537, "step": 23400 }, { "epoch": 1.7840536626267247, "grad_norm": 0.6006884574890137, "learning_rate": 7.174117869577349e-05, "loss": 0.10693862438201904, "memory(GiB)": 122.96, "step": 23405, "token_acc": 0.9575889166085513, "train_speed(iter/s)": 0.242724 }, { "epoch": 1.7844347892369845, "grad_norm": 1.5353587865829468, "learning_rate": 7.173039579720417e-05, "loss": 0.1768411159515381, "memory(GiB)": 122.96, "step": 23410, "token_acc": 0.9416859122401847, "train_speed(iter/s)": 0.242736 }, { "epoch": 1.7848159158472443, "grad_norm": 0.7948404550552368, "learning_rate": 7.171961165249749e-05, "loss": 0.07659238576889038, "memory(GiB)": 122.96, "step": 23415, "token_acc": 0.96303180503924, "train_speed(iter/s)": 0.242748 }, { "epoch": 1.7851970424575043, "grad_norm": 1.3815462589263916, "learning_rate": 7.170882626227187e-05, "loss": 0.09676730632781982, "memory(GiB)": 122.96, "step": 23420, "token_acc": 0.9710192119830674, "train_speed(iter/s)": 0.242762 }, { "epoch": 1.7855781690677643, "grad_norm": 0.7117595076560974, "learning_rate": 7.16980396271458e-05, "loss": 0.12441442012786866, "memory(GiB)": 122.96, "step": 23425, "token_acc": 0.9499553172475425, "train_speed(iter/s)": 0.24277 }, { "epoch": 1.7859592956780244, "grad_norm": 0.855291485786438, "learning_rate": 7.168725174773788e-05, "loss": 0.09868787527084351, "memory(GiB)": 122.96, "step": 23430, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.242787 }, { "epoch": 1.7863404222882842, "grad_norm": 0.6835452914237976, "learning_rate": 7.16764626246667e-05, "loss": 0.08391751050949096, "memory(GiB)": 122.96, "step": 23435, "token_acc": 0.9652442795299938, "train_speed(iter/s)": 0.242785 }, { "epoch": 1.786721548898544, "grad_norm": 0.8185908198356628, "learning_rate": 7.166567225855096e-05, "loss": 0.11715716123580933, "memory(GiB)": 122.96, "step": 23440, "token_acc": 0.9398474178403756, "train_speed(iter/s)": 0.242797 }, { "epoch": 1.787102675508804, "grad_norm": 0.7956842184066772, "learning_rate": 7.165488065000949e-05, "loss": 0.13731250762939454, "memory(GiB)": 122.96, "step": 23445, "token_acc": 0.9387893134934054, "train_speed(iter/s)": 0.242812 }, { "epoch": 1.787483802119064, "grad_norm": 0.7758172750473022, "learning_rate": 7.164408779966109e-05, "loss": 0.18375785350799562, "memory(GiB)": 122.96, "step": 23450, "token_acc": 0.9484726936130824, "train_speed(iter/s)": 0.242823 }, { "epoch": 1.787864928729324, "grad_norm": 0.6243519186973572, "learning_rate": 7.163329370812469e-05, "loss": 0.13761688470840455, "memory(GiB)": 122.96, "step": 23455, "token_acc": 0.9515151515151515, "train_speed(iter/s)": 0.242837 }, { "epoch": 1.7882460553395838, "grad_norm": 0.9324338436126709, "learning_rate": 7.162249837601929e-05, "loss": 0.08399779200553895, "memory(GiB)": 122.96, "step": 23460, "token_acc": 0.9648829431438127, "train_speed(iter/s)": 0.242846 }, { "epoch": 1.7886271819498436, "grad_norm": 0.6019797325134277, "learning_rate": 7.161170180396394e-05, "loss": 0.10786420106887817, "memory(GiB)": 122.96, "step": 23465, "token_acc": 0.9655797101449275, "train_speed(iter/s)": 0.24285 }, { "epoch": 1.7890083085601036, "grad_norm": 0.6338000297546387, "learning_rate": 7.160090399257778e-05, "loss": 0.07087835669517517, "memory(GiB)": 122.96, "step": 23470, "token_acc": 0.9666827619507484, "train_speed(iter/s)": 0.242861 }, { "epoch": 1.7893894351703636, "grad_norm": 0.8183435797691345, "learning_rate": 7.159010494248004e-05, "loss": 0.15308600664138794, "memory(GiB)": 122.96, "step": 23475, "token_acc": 0.9308789027251398, "train_speed(iter/s)": 0.242872 }, { "epoch": 1.7897705617806237, "grad_norm": 0.8098626136779785, "learning_rate": 7.157930465428994e-05, "loss": 0.13148143291473388, "memory(GiB)": 122.96, "step": 23480, "token_acc": 0.9511518771331058, "train_speed(iter/s)": 0.242883 }, { "epoch": 1.7901516883908835, "grad_norm": 0.4449446499347687, "learning_rate": 7.156850312862689e-05, "loss": 0.09728869199752807, "memory(GiB)": 122.96, "step": 23485, "token_acc": 0.9621671635653336, "train_speed(iter/s)": 0.242878 }, { "epoch": 1.7905328150011433, "grad_norm": 0.7426866292953491, "learning_rate": 7.155770036611026e-05, "loss": 0.09480289816856384, "memory(GiB)": 122.96, "step": 23490, "token_acc": 0.9573067119796091, "train_speed(iter/s)": 0.242889 }, { "epoch": 1.7909139416114033, "grad_norm": 0.7189920544624329, "learning_rate": 7.154689636735956e-05, "loss": 0.14140409231185913, "memory(GiB)": 122.96, "step": 23495, "token_acc": 0.9329954954954955, "train_speed(iter/s)": 0.242905 }, { "epoch": 1.7912950682216633, "grad_norm": 0.9056515097618103, "learning_rate": 7.153609113299434e-05, "loss": 0.12610199451446533, "memory(GiB)": 122.96, "step": 23500, "token_acc": 0.9439212328767124, "train_speed(iter/s)": 0.24292 }, { "epoch": 1.7916761948319233, "grad_norm": 0.8275067806243896, "learning_rate": 7.152528466363423e-05, "loss": 0.16044986248016357, "memory(GiB)": 122.96, "step": 23505, "token_acc": 0.9368327402135231, "train_speed(iter/s)": 0.242931 }, { "epoch": 1.792057321442183, "grad_norm": 0.7123541235923767, "learning_rate": 7.151447695989894e-05, "loss": 0.09610059857368469, "memory(GiB)": 122.96, "step": 23510, "token_acc": 0.9609440854172521, "train_speed(iter/s)": 0.242943 }, { "epoch": 1.792438448052443, "grad_norm": 1.041144847869873, "learning_rate": 7.150366802240823e-05, "loss": 0.12009236812591553, "memory(GiB)": 122.96, "step": 23515, "token_acc": 0.9565454545454546, "train_speed(iter/s)": 0.242948 }, { "epoch": 1.792819574662703, "grad_norm": 0.6407527923583984, "learning_rate": 7.149285785178196e-05, "loss": 0.12980775833129882, "memory(GiB)": 122.96, "step": 23520, "token_acc": 0.9606924643584521, "train_speed(iter/s)": 0.242955 }, { "epoch": 1.793200701272963, "grad_norm": 0.9046735167503357, "learning_rate": 7.148204644864001e-05, "loss": 0.10800182819366455, "memory(GiB)": 122.96, "step": 23525, "token_acc": 0.9553039332538736, "train_speed(iter/s)": 0.242964 }, { "epoch": 1.793581827883223, "grad_norm": 0.5577616095542908, "learning_rate": 7.14712338136024e-05, "loss": 0.10586364269256592, "memory(GiB)": 122.96, "step": 23530, "token_acc": 0.9621182837263239, "train_speed(iter/s)": 0.242972 }, { "epoch": 1.7939629544934828, "grad_norm": 0.677080512046814, "learning_rate": 7.146041994728917e-05, "loss": 0.08933534622192382, "memory(GiB)": 122.96, "step": 23535, "token_acc": 0.9465596330275229, "train_speed(iter/s)": 0.242985 }, { "epoch": 1.7943440811037425, "grad_norm": 1.2392351627349854, "learning_rate": 7.144960485032044e-05, "loss": 0.11652226448059082, "memory(GiB)": 122.96, "step": 23540, "token_acc": 0.9635312430509229, "train_speed(iter/s)": 0.242996 }, { "epoch": 1.7947252077140026, "grad_norm": 0.6711097359657288, "learning_rate": 7.143878852331641e-05, "loss": 0.12025706768035889, "memory(GiB)": 122.96, "step": 23545, "token_acc": 0.9495007132667618, "train_speed(iter/s)": 0.24301 }, { "epoch": 1.7951063343242626, "grad_norm": 0.8127679228782654, "learning_rate": 7.142797096689734e-05, "loss": 0.08176945447921753, "memory(GiB)": 122.96, "step": 23550, "token_acc": 0.959565865077676, "train_speed(iter/s)": 0.243019 }, { "epoch": 1.7954874609345226, "grad_norm": 0.6724151968955994, "learning_rate": 7.14171521816836e-05, "loss": 0.09789879322052002, "memory(GiB)": 122.96, "step": 23555, "token_acc": 0.9615692554043235, "train_speed(iter/s)": 0.24303 }, { "epoch": 1.7958685875447824, "grad_norm": 0.8345849514007568, "learning_rate": 7.140633216829553e-05, "loss": 0.09813202619552612, "memory(GiB)": 122.96, "step": 23560, "token_acc": 0.9546563643757625, "train_speed(iter/s)": 0.243038 }, { "epoch": 1.7962497141550422, "grad_norm": 1.1370497941970825, "learning_rate": 7.139551092735366e-05, "loss": 0.12336745262145996, "memory(GiB)": 122.96, "step": 23565, "token_acc": 0.9434956395348837, "train_speed(iter/s)": 0.243048 }, { "epoch": 1.7966308407653022, "grad_norm": 0.7021245360374451, "learning_rate": 7.138468845947855e-05, "loss": 0.10761333703994751, "memory(GiB)": 122.96, "step": 23570, "token_acc": 0.952421959095802, "train_speed(iter/s)": 0.24306 }, { "epoch": 1.7970119673755622, "grad_norm": 0.9117397665977478, "learning_rate": 7.137386476529077e-05, "loss": 0.09179342389106751, "memory(GiB)": 122.96, "step": 23575, "token_acc": 0.9583689415211818, "train_speed(iter/s)": 0.243058 }, { "epoch": 1.7973930939858223, "grad_norm": 1.6145694255828857, "learning_rate": 7.136303984541104e-05, "loss": 0.12694710493087769, "memory(GiB)": 122.96, "step": 23580, "token_acc": 0.9558800190445961, "train_speed(iter/s)": 0.243065 }, { "epoch": 1.797774220596082, "grad_norm": 1.8263871669769287, "learning_rate": 7.135221370046012e-05, "loss": 0.17786080837249757, "memory(GiB)": 122.96, "step": 23585, "token_acc": 0.9389013452914798, "train_speed(iter/s)": 0.243076 }, { "epoch": 1.7981553472063418, "grad_norm": 1.1587390899658203, "learning_rate": 7.134138633105883e-05, "loss": 0.09682374000549317, "memory(GiB)": 122.96, "step": 23590, "token_acc": 0.9561304836895388, "train_speed(iter/s)": 0.243081 }, { "epoch": 1.7985364738166019, "grad_norm": 0.5998356938362122, "learning_rate": 7.133055773782805e-05, "loss": 0.12832412719726563, "memory(GiB)": 122.96, "step": 23595, "token_acc": 0.9567620286085826, "train_speed(iter/s)": 0.243094 }, { "epoch": 1.7989176004268619, "grad_norm": 0.9308950304985046, "learning_rate": 7.131972792138879e-05, "loss": 0.1097069263458252, "memory(GiB)": 122.96, "step": 23600, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.243105 }, { "epoch": 1.7989176004268619, "eval_loss": 0.09370029717683792, "eval_runtime": 219.2979, "eval_samples_per_second": 2.417, "eval_steps_per_second": 2.417, "eval_token_acc": 0.9574649117523041, "step": 23600 }, { "epoch": 1.7992987270371217, "grad_norm": 0.8236498832702637, "learning_rate": 7.130889688236207e-05, "loss": 0.11003422737121582, "memory(GiB)": 122.96, "step": 23605, "token_acc": 0.9572435897435897, "train_speed(iter/s)": 0.24256 }, { "epoch": 1.7996798536473817, "grad_norm": 0.6321241855621338, "learning_rate": 7.129806462136897e-05, "loss": 0.10837595462799073, "memory(GiB)": 122.96, "step": 23610, "token_acc": 0.9561605906783571, "train_speed(iter/s)": 0.24257 }, { "epoch": 1.8000609802576415, "grad_norm": 1.1868499517440796, "learning_rate": 7.128723113903072e-05, "loss": 0.10675997734069824, "memory(GiB)": 122.96, "step": 23615, "token_acc": 0.9541607898448519, "train_speed(iter/s)": 0.24258 }, { "epoch": 1.8004421068679015, "grad_norm": 0.38341644406318665, "learning_rate": 7.127639643596855e-05, "loss": 0.12343090772628784, "memory(GiB)": 122.96, "step": 23620, "token_acc": 0.9587826503714354, "train_speed(iter/s)": 0.24258 }, { "epoch": 1.8008232334781615, "grad_norm": 1.0323783159255981, "learning_rate": 7.126556051280379e-05, "loss": 0.10805461406707764, "memory(GiB)": 122.96, "step": 23625, "token_acc": 0.9582514734774067, "train_speed(iter/s)": 0.242584 }, { "epoch": 1.8012043600884213, "grad_norm": 1.2695955038070679, "learning_rate": 7.125472337015779e-05, "loss": 0.12279930114746093, "memory(GiB)": 122.96, "step": 23630, "token_acc": 0.9657626466628197, "train_speed(iter/s)": 0.24259 }, { "epoch": 1.8015854866986813, "grad_norm": 0.9386529922485352, "learning_rate": 7.124388500865207e-05, "loss": 0.08190087080001832, "memory(GiB)": 122.96, "step": 23635, "token_acc": 0.9680092059838895, "train_speed(iter/s)": 0.242602 }, { "epoch": 1.8019666133089411, "grad_norm": 0.2688941955566406, "learning_rate": 7.123304542890811e-05, "loss": 0.11005868911743164, "memory(GiB)": 122.96, "step": 23640, "token_acc": 0.955719557195572, "train_speed(iter/s)": 0.242612 }, { "epoch": 1.8023477399192012, "grad_norm": 1.3765912055969238, "learning_rate": 7.122220463154752e-05, "loss": 0.13060874938964845, "memory(GiB)": 122.96, "step": 23645, "token_acc": 0.9520663696420341, "train_speed(iter/s)": 0.242617 }, { "epoch": 1.8027288665294612, "grad_norm": 2.1273770332336426, "learning_rate": 7.1211362617192e-05, "loss": 0.1003786563873291, "memory(GiB)": 122.96, "step": 23650, "token_acc": 0.9614041892940264, "train_speed(iter/s)": 0.242624 }, { "epoch": 1.803109993139721, "grad_norm": 0.5986934900283813, "learning_rate": 7.120051938646326e-05, "loss": 0.09521732330322266, "memory(GiB)": 122.96, "step": 23655, "token_acc": 0.9502012072434608, "train_speed(iter/s)": 0.242633 }, { "epoch": 1.803491119749981, "grad_norm": 0.7441849112510681, "learning_rate": 7.118967493998309e-05, "loss": 0.1361548662185669, "memory(GiB)": 122.96, "step": 23660, "token_acc": 0.9389523434423002, "train_speed(iter/s)": 0.242645 }, { "epoch": 1.8038722463602408, "grad_norm": 0.6873990893363953, "learning_rate": 7.117882927837343e-05, "loss": 0.1288763999938965, "memory(GiB)": 122.96, "step": 23665, "token_acc": 0.9530432181845372, "train_speed(iter/s)": 0.24265 }, { "epoch": 1.8042533729705008, "grad_norm": 1.441353678703308, "learning_rate": 7.116798240225619e-05, "loss": 0.11310838460922241, "memory(GiB)": 122.96, "step": 23670, "token_acc": 0.9517745868328367, "train_speed(iter/s)": 0.242663 }, { "epoch": 1.8046344995807608, "grad_norm": 0.8142199516296387, "learning_rate": 7.115713431225337e-05, "loss": 0.10748989582061767, "memory(GiB)": 122.96, "step": 23675, "token_acc": 0.9569095857272453, "train_speed(iter/s)": 0.242671 }, { "epoch": 1.8050156261910206, "grad_norm": 0.6686017513275146, "learning_rate": 7.114628500898707e-05, "loss": 0.12438158988952637, "memory(GiB)": 122.96, "step": 23680, "token_acc": 0.9500420521446594, "train_speed(iter/s)": 0.242679 }, { "epoch": 1.8053967528012804, "grad_norm": 1.2830262184143066, "learning_rate": 7.113543449307948e-05, "loss": 0.08197990655899048, "memory(GiB)": 122.96, "step": 23685, "token_acc": 0.9554956169925826, "train_speed(iter/s)": 0.242695 }, { "epoch": 1.8057778794115404, "grad_norm": 0.7994800806045532, "learning_rate": 7.11245827651528e-05, "loss": 0.12264930009841919, "memory(GiB)": 122.96, "step": 23690, "token_acc": 0.943331503841932, "train_speed(iter/s)": 0.242703 }, { "epoch": 1.8061590060218005, "grad_norm": 1.1579331159591675, "learning_rate": 7.111372982582934e-05, "loss": 0.10996310710906983, "memory(GiB)": 122.96, "step": 23695, "token_acc": 0.9568047337278106, "train_speed(iter/s)": 0.24272 }, { "epoch": 1.8065401326320605, "grad_norm": 0.7025693655014038, "learning_rate": 7.110287567573141e-05, "loss": 0.07946839332580566, "memory(GiB)": 122.96, "step": 23700, "token_acc": 0.9675630718048239, "train_speed(iter/s)": 0.242734 }, { "epoch": 1.8069212592423203, "grad_norm": 0.6859346628189087, "learning_rate": 7.109202031548153e-05, "loss": 0.08604157567024232, "memory(GiB)": 122.96, "step": 23705, "token_acc": 0.9749492213947191, "train_speed(iter/s)": 0.242747 }, { "epoch": 1.80730238585258, "grad_norm": 0.866497278213501, "learning_rate": 7.108116374570216e-05, "loss": 0.08363773226737976, "memory(GiB)": 122.96, "step": 23710, "token_acc": 0.9735632183908046, "train_speed(iter/s)": 0.24276 }, { "epoch": 1.80768351246284, "grad_norm": 1.3545655012130737, "learning_rate": 7.107030596701585e-05, "loss": 0.13259265422821045, "memory(GiB)": 122.96, "step": 23715, "token_acc": 0.9557747916312298, "train_speed(iter/s)": 0.242765 }, { "epoch": 1.8080646390731, "grad_norm": 0.19057348370552063, "learning_rate": 7.10594469800453e-05, "loss": 0.09402597546577454, "memory(GiB)": 122.96, "step": 23720, "token_acc": 0.954191336865067, "train_speed(iter/s)": 0.242779 }, { "epoch": 1.8084457656833601, "grad_norm": 1.5017684698104858, "learning_rate": 7.104858678541319e-05, "loss": 0.1405564546585083, "memory(GiB)": 122.96, "step": 23725, "token_acc": 0.9482936918304034, "train_speed(iter/s)": 0.24279 }, { "epoch": 1.80882689229362, "grad_norm": 0.6256189346313477, "learning_rate": 7.10377253837423e-05, "loss": 0.11424330472946168, "memory(GiB)": 122.96, "step": 23730, "token_acc": 0.9588270142180095, "train_speed(iter/s)": 0.242795 }, { "epoch": 1.8092080189038797, "grad_norm": 0.42148345708847046, "learning_rate": 7.102686277565548e-05, "loss": 0.09732189774513245, "memory(GiB)": 122.96, "step": 23735, "token_acc": 0.963977210071678, "train_speed(iter/s)": 0.2428 }, { "epoch": 1.8095891455141397, "grad_norm": 0.8721915483474731, "learning_rate": 7.101599896177567e-05, "loss": 0.11191459894180297, "memory(GiB)": 122.96, "step": 23740, "token_acc": 0.9570011025358324, "train_speed(iter/s)": 0.242807 }, { "epoch": 1.8099702721243998, "grad_norm": 0.7764208316802979, "learning_rate": 7.100513394272582e-05, "loss": 0.09905914068222046, "memory(GiB)": 122.96, "step": 23745, "token_acc": 0.9538461538461539, "train_speed(iter/s)": 0.242821 }, { "epoch": 1.8103513987346598, "grad_norm": 0.310560941696167, "learning_rate": 7.099426771912903e-05, "loss": 0.0986211359500885, "memory(GiB)": 122.96, "step": 23750, "token_acc": 0.9544041450777202, "train_speed(iter/s)": 0.242835 }, { "epoch": 1.8107325253449196, "grad_norm": 0.7087815403938293, "learning_rate": 7.098340029160841e-05, "loss": 0.1399161696434021, "memory(GiB)": 122.96, "step": 23755, "token_acc": 0.9438828860230045, "train_speed(iter/s)": 0.242849 }, { "epoch": 1.8111136519551794, "grad_norm": 0.4428345561027527, "learning_rate": 7.097253166078717e-05, "loss": 0.10878216028213501, "memory(GiB)": 122.96, "step": 23760, "token_acc": 0.9613794604003482, "train_speed(iter/s)": 0.242852 }, { "epoch": 1.8114947785654394, "grad_norm": 1.9942346811294556, "learning_rate": 7.096166182728854e-05, "loss": 0.12076716423034668, "memory(GiB)": 122.96, "step": 23765, "token_acc": 0.9596942321056289, "train_speed(iter/s)": 0.242864 }, { "epoch": 1.8118759051756994, "grad_norm": 0.8889882564544678, "learning_rate": 7.09507907917359e-05, "loss": 0.11307593584060668, "memory(GiB)": 122.96, "step": 23770, "token_acc": 0.9572400388726919, "train_speed(iter/s)": 0.242874 }, { "epoch": 1.8122570317859594, "grad_norm": 0.5869020819664001, "learning_rate": 7.093991855475261e-05, "loss": 0.06535944938659669, "memory(GiB)": 122.96, "step": 23775, "token_acc": 0.9690836298932385, "train_speed(iter/s)": 0.242884 }, { "epoch": 1.8126381583962192, "grad_norm": 1.2015464305877686, "learning_rate": 7.09290451169622e-05, "loss": 0.0911247968673706, "memory(GiB)": 122.96, "step": 23780, "token_acc": 0.9613980177360459, "train_speed(iter/s)": 0.2429 }, { "epoch": 1.813019285006479, "grad_norm": 0.5732821226119995, "learning_rate": 7.091817047898815e-05, "loss": 0.11794075965881348, "memory(GiB)": 122.96, "step": 23785, "token_acc": 0.9534683326152521, "train_speed(iter/s)": 0.242916 }, { "epoch": 1.813400411616739, "grad_norm": 1.000557780265808, "learning_rate": 7.090729464145409e-05, "loss": 0.08853086233139038, "memory(GiB)": 122.96, "step": 23790, "token_acc": 0.9709936473690519, "train_speed(iter/s)": 0.242914 }, { "epoch": 1.813781538226999, "grad_norm": 1.9943794012069702, "learning_rate": 7.089641760498371e-05, "loss": 0.14691172838211058, "memory(GiB)": 122.96, "step": 23795, "token_acc": 0.9423778264040846, "train_speed(iter/s)": 0.242931 }, { "epoch": 1.814162664837259, "grad_norm": 0.6959720253944397, "learning_rate": 7.088553937020075e-05, "loss": 0.07711422443389893, "memory(GiB)": 122.96, "step": 23800, "token_acc": 0.9709830240821161, "train_speed(iter/s)": 0.242937 }, { "epoch": 1.814162664837259, "eval_loss": 0.09537702798843384, "eval_runtime": 219.7952, "eval_samples_per_second": 2.411, "eval_steps_per_second": 2.411, "eval_token_acc": 0.9570281910728269, "step": 23800 }, { "epoch": 1.8145437914475189, "grad_norm": 0.707782506942749, "learning_rate": 7.087465993772904e-05, "loss": 0.15348838567733764, "memory(GiB)": 122.96, "step": 23805, "token_acc": 0.9563481253136425, "train_speed(iter/s)": 0.242397 }, { "epoch": 1.8149249180577787, "grad_norm": 1.5386277437210083, "learning_rate": 7.086377930819244e-05, "loss": 0.15817936658859252, "memory(GiB)": 122.96, "step": 23810, "token_acc": 0.9357375271149675, "train_speed(iter/s)": 0.242409 }, { "epoch": 1.8153060446680387, "grad_norm": 0.6460850834846497, "learning_rate": 7.085289748221492e-05, "loss": 0.09366993308067321, "memory(GiB)": 122.96, "step": 23815, "token_acc": 0.9555773714566205, "train_speed(iter/s)": 0.242414 }, { "epoch": 1.8156871712782987, "grad_norm": 0.5196642279624939, "learning_rate": 7.084201446042049e-05, "loss": 0.056979238986968994, "memory(GiB)": 122.96, "step": 23820, "token_acc": 0.9802919708029197, "train_speed(iter/s)": 0.242419 }, { "epoch": 1.8160682978885587, "grad_norm": 0.9274725317955017, "learning_rate": 7.083113024343327e-05, "loss": 0.0965304434299469, "memory(GiB)": 122.96, "step": 23825, "token_acc": 0.9559777571825765, "train_speed(iter/s)": 0.242435 }, { "epoch": 1.8164494244988185, "grad_norm": 1.1522376537322998, "learning_rate": 7.082024483187739e-05, "loss": 0.10552045106887817, "memory(GiB)": 122.96, "step": 23830, "token_acc": 0.9603461637819685, "train_speed(iter/s)": 0.242436 }, { "epoch": 1.8168305511090783, "grad_norm": 1.2066177129745483, "learning_rate": 7.080935822637708e-05, "loss": 0.12028855085372925, "memory(GiB)": 122.96, "step": 23835, "token_acc": 0.9570224141476454, "train_speed(iter/s)": 0.242444 }, { "epoch": 1.8172116777193383, "grad_norm": 1.1347299814224243, "learning_rate": 7.079847042755665e-05, "loss": 0.07249341607093811, "memory(GiB)": 122.96, "step": 23840, "token_acc": 0.9672700223158939, "train_speed(iter/s)": 0.242452 }, { "epoch": 1.8175928043295984, "grad_norm": 0.4896186292171478, "learning_rate": 7.078758143604045e-05, "loss": 0.11605042219161987, "memory(GiB)": 122.96, "step": 23845, "token_acc": 0.9545135757940086, "train_speed(iter/s)": 0.24245 }, { "epoch": 1.8179739309398584, "grad_norm": 1.1050901412963867, "learning_rate": 7.077669125245292e-05, "loss": 0.11876695156097412, "memory(GiB)": 122.96, "step": 23850, "token_acc": 0.959188326493388, "train_speed(iter/s)": 0.242451 }, { "epoch": 1.8183550575501182, "grad_norm": 1.5957086086273193, "learning_rate": 7.076579987741858e-05, "loss": 0.10033571720123291, "memory(GiB)": 122.96, "step": 23855, "token_acc": 0.969391708640062, "train_speed(iter/s)": 0.242466 }, { "epoch": 1.818736184160378, "grad_norm": 0.7457512021064758, "learning_rate": 7.075490731156196e-05, "loss": 0.12908560037612915, "memory(GiB)": 122.96, "step": 23860, "token_acc": 0.9543798785776236, "train_speed(iter/s)": 0.242472 }, { "epoch": 1.819117310770638, "grad_norm": 0.7157314419746399, "learning_rate": 7.074401355550774e-05, "loss": 0.11617534160614014, "memory(GiB)": 122.96, "step": 23865, "token_acc": 0.9458003169572108, "train_speed(iter/s)": 0.242478 }, { "epoch": 1.819498437380898, "grad_norm": 0.9054464101791382, "learning_rate": 7.073311860988059e-05, "loss": 0.10282866954803467, "memory(GiB)": 122.96, "step": 23870, "token_acc": 0.9611670864819479, "train_speed(iter/s)": 0.242488 }, { "epoch": 1.819879563991158, "grad_norm": 0.6933221817016602, "learning_rate": 7.072222247530531e-05, "loss": 0.10350270271301269, "memory(GiB)": 122.96, "step": 23875, "token_acc": 0.95932944606414, "train_speed(iter/s)": 0.24249 }, { "epoch": 1.8202606906014178, "grad_norm": 0.361043781042099, "learning_rate": 7.071132515240674e-05, "loss": 0.09137698411941528, "memory(GiB)": 122.96, "step": 23880, "token_acc": 0.9640186915887851, "train_speed(iter/s)": 0.242505 }, { "epoch": 1.8206418172116776, "grad_norm": 1.3981356620788574, "learning_rate": 7.070042664180976e-05, "loss": 0.09409580230712891, "memory(GiB)": 122.96, "step": 23885, "token_acc": 0.9613114754098361, "train_speed(iter/s)": 0.24251 }, { "epoch": 1.8210229438219376, "grad_norm": 0.7994858622550964, "learning_rate": 7.068952694413939e-05, "loss": 0.10956627130508423, "memory(GiB)": 122.96, "step": 23890, "token_acc": 0.9545782263878875, "train_speed(iter/s)": 0.242525 }, { "epoch": 1.8214040704321977, "grad_norm": 1.5380841493606567, "learning_rate": 7.067862606002067e-05, "loss": 0.16230069398880004, "memory(GiB)": 122.96, "step": 23895, "token_acc": 0.9411940298507463, "train_speed(iter/s)": 0.242535 }, { "epoch": 1.8217851970424574, "grad_norm": 0.6299561858177185, "learning_rate": 7.066772399007871e-05, "loss": 0.07268702983856201, "memory(GiB)": 122.96, "step": 23900, "token_acc": 0.9689799451738565, "train_speed(iter/s)": 0.242536 }, { "epoch": 1.8221663236527175, "grad_norm": 0.6240633130073547, "learning_rate": 7.065682073493867e-05, "loss": 0.09860904812812805, "memory(GiB)": 122.96, "step": 23905, "token_acc": 0.9627795287408074, "train_speed(iter/s)": 0.24254 }, { "epoch": 1.8225474502629773, "grad_norm": 0.8685871958732605, "learning_rate": 7.064591629522582e-05, "loss": 0.0944695770740509, "memory(GiB)": 122.96, "step": 23910, "token_acc": 0.9620038722168441, "train_speed(iter/s)": 0.242551 }, { "epoch": 1.8229285768732373, "grad_norm": 1.223099708557129, "learning_rate": 7.06350106715655e-05, "loss": 0.11215211153030395, "memory(GiB)": 122.96, "step": 23915, "token_acc": 0.9416922402455064, "train_speed(iter/s)": 0.242567 }, { "epoch": 1.8233097034834973, "grad_norm": 0.6510603427886963, "learning_rate": 7.062410386458305e-05, "loss": 0.13774139881134034, "memory(GiB)": 122.96, "step": 23920, "token_acc": 0.9454495614035088, "train_speed(iter/s)": 0.242579 }, { "epoch": 1.823690830093757, "grad_norm": 1.0784436464309692, "learning_rate": 7.061319587490395e-05, "loss": 0.08675770163536071, "memory(GiB)": 122.96, "step": 23925, "token_acc": 0.9633489040603666, "train_speed(iter/s)": 0.242586 }, { "epoch": 1.8240719567040171, "grad_norm": 1.2463531494140625, "learning_rate": 7.060228670315376e-05, "loss": 0.11386843919754028, "memory(GiB)": 122.96, "step": 23930, "token_acc": 0.9595679012345679, "train_speed(iter/s)": 0.242602 }, { "epoch": 1.824453083314277, "grad_norm": 2.939951181411743, "learning_rate": 7.059137634995801e-05, "loss": 0.12461512088775635, "memory(GiB)": 122.96, "step": 23935, "token_acc": 0.9544996980068452, "train_speed(iter/s)": 0.242609 }, { "epoch": 1.824834209924537, "grad_norm": 0.7030375599861145, "learning_rate": 7.058046481594237e-05, "loss": 0.06819335222244263, "memory(GiB)": 122.96, "step": 23940, "token_acc": 0.9773874862788144, "train_speed(iter/s)": 0.242621 }, { "epoch": 1.825215336534797, "grad_norm": 0.8468825221061707, "learning_rate": 7.05695521017326e-05, "loss": 0.09020789861679077, "memory(GiB)": 122.96, "step": 23945, "token_acc": 0.9606903163950143, "train_speed(iter/s)": 0.242632 }, { "epoch": 1.8255964631450567, "grad_norm": 1.0145013332366943, "learning_rate": 7.055863820795446e-05, "loss": 0.12331933975219726, "memory(GiB)": 122.96, "step": 23950, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.242644 }, { "epoch": 1.8259775897553168, "grad_norm": 1.2795621156692505, "learning_rate": 7.054772313523381e-05, "loss": 0.08797367811203002, "memory(GiB)": 122.96, "step": 23955, "token_acc": 0.9647501711156742, "train_speed(iter/s)": 0.242653 }, { "epoch": 1.8263587163655766, "grad_norm": 0.8058647513389587, "learning_rate": 7.053680688419661e-05, "loss": 0.1426207661628723, "memory(GiB)": 122.96, "step": 23960, "token_acc": 0.9357228674409291, "train_speed(iter/s)": 0.242661 }, { "epoch": 1.8267398429758366, "grad_norm": 0.4705190360546112, "learning_rate": 7.052588945546884e-05, "loss": 0.12072993516921997, "memory(GiB)": 122.96, "step": 23965, "token_acc": 0.9653946108603474, "train_speed(iter/s)": 0.24266 }, { "epoch": 1.8271209695860966, "grad_norm": 0.15032418072223663, "learning_rate": 7.051497084967654e-05, "loss": 0.11095502376556396, "memory(GiB)": 122.96, "step": 23970, "token_acc": 0.9457364341085271, "train_speed(iter/s)": 0.242676 }, { "epoch": 1.8275020961963564, "grad_norm": 0.17093878984451294, "learning_rate": 7.050405106744589e-05, "loss": 0.056661355495452884, "memory(GiB)": 122.96, "step": 23975, "token_acc": 0.9677871148459384, "train_speed(iter/s)": 0.242693 }, { "epoch": 1.8278832228066162, "grad_norm": 0.6247076988220215, "learning_rate": 7.049313010940305e-05, "loss": 0.10714296102523804, "memory(GiB)": 122.96, "step": 23980, "token_acc": 0.9580857555805363, "train_speed(iter/s)": 0.242702 }, { "epoch": 1.8282643494168762, "grad_norm": 1.4720964431762695, "learning_rate": 7.04822079761743e-05, "loss": 0.13617091178894042, "memory(GiB)": 122.96, "step": 23985, "token_acc": 0.9504267892317794, "train_speed(iter/s)": 0.242709 }, { "epoch": 1.8286454760271362, "grad_norm": 0.6930082440376282, "learning_rate": 7.047128466838596e-05, "loss": 0.07928122282028198, "memory(GiB)": 122.96, "step": 23990, "token_acc": 0.9739524348810872, "train_speed(iter/s)": 0.242723 }, { "epoch": 1.8290266026373962, "grad_norm": 0.7904015779495239, "learning_rate": 7.046036018666446e-05, "loss": 0.11257362365722656, "memory(GiB)": 122.96, "step": 23995, "token_acc": 0.9516441005802708, "train_speed(iter/s)": 0.242734 }, { "epoch": 1.829407729247656, "grad_norm": 1.0489678382873535, "learning_rate": 7.044943453163623e-05, "loss": 0.12771780490875245, "memory(GiB)": 122.96, "step": 24000, "token_acc": 0.9569259962049336, "train_speed(iter/s)": 0.242744 }, { "epoch": 1.829407729247656, "eval_loss": 0.09477829933166504, "eval_runtime": 218.3206, "eval_samples_per_second": 2.428, "eval_steps_per_second": 2.428, "eval_token_acc": 0.9565839407264622, "step": 24000 }, { "epoch": 1.8297888558579158, "grad_norm": 0.7909244298934937, "learning_rate": 7.043850770392784e-05, "loss": 0.1340134859085083, "memory(GiB)": 122.96, "step": 24005, "token_acc": 0.9564754988017103, "train_speed(iter/s)": 0.242216 }, { "epoch": 1.8301699824681759, "grad_norm": 0.6778976917266846, "learning_rate": 7.042757970416589e-05, "loss": 0.0761795997619629, "memory(GiB)": 122.96, "step": 24010, "token_acc": 0.9716782163302199, "train_speed(iter/s)": 0.242229 }, { "epoch": 1.8305511090784359, "grad_norm": 0.7458018064498901, "learning_rate": 7.041665053297701e-05, "loss": 0.1874677300453186, "memory(GiB)": 122.96, "step": 24015, "token_acc": 0.9172932330827067, "train_speed(iter/s)": 0.242242 }, { "epoch": 1.830932235688696, "grad_norm": 1.54597806930542, "learning_rate": 7.040572019098798e-05, "loss": 0.11965630054473878, "memory(GiB)": 122.96, "step": 24020, "token_acc": 0.9555242503521836, "train_speed(iter/s)": 0.242251 }, { "epoch": 1.8313133622989557, "grad_norm": 0.7182013988494873, "learning_rate": 7.039478867882561e-05, "loss": 0.09050259590148926, "memory(GiB)": 122.96, "step": 24025, "token_acc": 0.9627983153954142, "train_speed(iter/s)": 0.242262 }, { "epoch": 1.8316944889092155, "grad_norm": 0.9259948134422302, "learning_rate": 7.038385599711674e-05, "loss": 0.09869426488876343, "memory(GiB)": 122.96, "step": 24030, "token_acc": 0.9557894736842105, "train_speed(iter/s)": 0.242278 }, { "epoch": 1.8320756155194755, "grad_norm": 0.5276364088058472, "learning_rate": 7.037292214648832e-05, "loss": 0.12362513542175294, "memory(GiB)": 122.96, "step": 24035, "token_acc": 0.963651270839027, "train_speed(iter/s)": 0.242281 }, { "epoch": 1.8324567421297355, "grad_norm": 0.5305531620979309, "learning_rate": 7.036198712756735e-05, "loss": 0.08715924620628357, "memory(GiB)": 122.96, "step": 24040, "token_acc": 0.9721961886910341, "train_speed(iter/s)": 0.242289 }, { "epoch": 1.8328378687399955, "grad_norm": 0.778765082359314, "learning_rate": 7.035105094098092e-05, "loss": 0.08246569633483887, "memory(GiB)": 122.96, "step": 24045, "token_acc": 0.9641731155058756, "train_speed(iter/s)": 0.242301 }, { "epoch": 1.8332189953502553, "grad_norm": 0.8682101964950562, "learning_rate": 7.034011358735616e-05, "loss": 0.11924052238464355, "memory(GiB)": 122.96, "step": 24050, "token_acc": 0.9488324175824175, "train_speed(iter/s)": 0.242314 }, { "epoch": 1.8336001219605151, "grad_norm": 1.1356277465820312, "learning_rate": 7.032917506732026e-05, "loss": 0.14194570779800414, "memory(GiB)": 122.96, "step": 24055, "token_acc": 0.9476426152643918, "train_speed(iter/s)": 0.242324 }, { "epoch": 1.8339812485707752, "grad_norm": 0.7413094639778137, "learning_rate": 7.031823538150052e-05, "loss": 0.11022782325744629, "memory(GiB)": 122.96, "step": 24060, "token_acc": 0.9567017422804899, "train_speed(iter/s)": 0.242334 }, { "epoch": 1.8343623751810352, "grad_norm": 1.0274509191513062, "learning_rate": 7.030729453052427e-05, "loss": 0.15316052436828614, "memory(GiB)": 122.96, "step": 24065, "token_acc": 0.9446556822982678, "train_speed(iter/s)": 0.242344 }, { "epoch": 1.8347435017912952, "grad_norm": 0.6163710951805115, "learning_rate": 7.029635251501893e-05, "loss": 0.11968309879302978, "memory(GiB)": 122.96, "step": 24070, "token_acc": 0.9555295494441194, "train_speed(iter/s)": 0.242351 }, { "epoch": 1.835124628401555, "grad_norm": 0.9568526148796082, "learning_rate": 7.028540933561192e-05, "loss": 0.0945904016494751, "memory(GiB)": 122.96, "step": 24075, "token_acc": 0.9625783348254252, "train_speed(iter/s)": 0.242355 }, { "epoch": 1.8355057550118148, "grad_norm": 0.615674614906311, "learning_rate": 7.027446499293085e-05, "loss": 0.16011400222778321, "memory(GiB)": 122.96, "step": 24080, "token_acc": 0.9579545454545455, "train_speed(iter/s)": 0.242366 }, { "epoch": 1.8358868816220748, "grad_norm": 1.0766220092773438, "learning_rate": 7.026351948760329e-05, "loss": 0.13497151136398317, "memory(GiB)": 122.96, "step": 24085, "token_acc": 0.9463026577472428, "train_speed(iter/s)": 0.242375 }, { "epoch": 1.8362680082323348, "grad_norm": 1.043893814086914, "learning_rate": 7.025257282025692e-05, "loss": 0.09734436869621277, "memory(GiB)": 122.96, "step": 24090, "token_acc": 0.9616579073100813, "train_speed(iter/s)": 0.242379 }, { "epoch": 1.8366491348425948, "grad_norm": 0.8727908730506897, "learning_rate": 7.02416249915195e-05, "loss": 0.08951289057731629, "memory(GiB)": 122.96, "step": 24095, "token_acc": 0.9679467327345928, "train_speed(iter/s)": 0.242386 }, { "epoch": 1.8370302614528546, "grad_norm": 0.5410177707672119, "learning_rate": 7.02306760020188e-05, "loss": 0.11960580348968505, "memory(GiB)": 122.96, "step": 24100, "token_acc": 0.960536253776435, "train_speed(iter/s)": 0.242393 }, { "epoch": 1.8374113880631144, "grad_norm": 0.7246606349945068, "learning_rate": 7.021972585238273e-05, "loss": 0.1214989185333252, "memory(GiB)": 122.96, "step": 24105, "token_acc": 0.9481750352893729, "train_speed(iter/s)": 0.242404 }, { "epoch": 1.8377925146733745, "grad_norm": 0.9364734292030334, "learning_rate": 7.020877454323919e-05, "loss": 0.10455282926559448, "memory(GiB)": 122.96, "step": 24110, "token_acc": 0.9602613917343695, "train_speed(iter/s)": 0.242413 }, { "epoch": 1.8381736412836345, "grad_norm": 0.6331307291984558, "learning_rate": 7.019782207521621e-05, "loss": 0.1043436050415039, "memory(GiB)": 122.96, "step": 24115, "token_acc": 0.957613351055974, "train_speed(iter/s)": 0.242419 }, { "epoch": 1.8385547678938945, "grad_norm": 1.458105206489563, "learning_rate": 7.018686844894189e-05, "loss": 0.12290433645248414, "memory(GiB)": 122.96, "step": 24120, "token_acc": 0.9510710259301015, "train_speed(iter/s)": 0.242426 }, { "epoch": 1.8389358945041543, "grad_norm": 0.45775434374809265, "learning_rate": 7.017591366504432e-05, "loss": 0.11749522686004639, "memory(GiB)": 122.96, "step": 24125, "token_acc": 0.966065464261857, "train_speed(iter/s)": 0.242428 }, { "epoch": 1.839317021114414, "grad_norm": 1.0961328744888306, "learning_rate": 7.016495772415174e-05, "loss": 0.13099316358566285, "memory(GiB)": 122.96, "step": 24130, "token_acc": 0.9570300637716368, "train_speed(iter/s)": 0.242431 }, { "epoch": 1.839698147724674, "grad_norm": 1.0575309991836548, "learning_rate": 7.015400062689241e-05, "loss": 0.11261796951293945, "memory(GiB)": 122.96, "step": 24135, "token_acc": 0.9536376604850214, "train_speed(iter/s)": 0.242445 }, { "epoch": 1.8400792743349341, "grad_norm": 2.0484399795532227, "learning_rate": 7.014304237389466e-05, "loss": 0.18221073150634765, "memory(GiB)": 122.96, "step": 24140, "token_acc": 0.9425828654229307, "train_speed(iter/s)": 0.242454 }, { "epoch": 1.8404604009451941, "grad_norm": 0.3093033730983734, "learning_rate": 7.013208296578692e-05, "loss": 0.0825664758682251, "memory(GiB)": 122.96, "step": 24145, "token_acc": 0.9602824360105914, "train_speed(iter/s)": 0.242464 }, { "epoch": 1.840841527555454, "grad_norm": 1.112473964691162, "learning_rate": 7.012112240319763e-05, "loss": 0.08140487670898437, "memory(GiB)": 122.96, "step": 24150, "token_acc": 0.9704706244232544, "train_speed(iter/s)": 0.242478 }, { "epoch": 1.8412226541657137, "grad_norm": 1.1052141189575195, "learning_rate": 7.011016068675536e-05, "loss": 0.12812352180480957, "memory(GiB)": 122.96, "step": 24155, "token_acc": 0.9564049586776859, "train_speed(iter/s)": 0.24249 }, { "epoch": 1.8416037807759738, "grad_norm": 0.7579441666603088, "learning_rate": 7.009919781708869e-05, "loss": 0.1045087456703186, "memory(GiB)": 122.96, "step": 24160, "token_acc": 0.9621230999252429, "train_speed(iter/s)": 0.2425 }, { "epoch": 1.8419849073862338, "grad_norm": 0.8986715078353882, "learning_rate": 7.00882337948263e-05, "loss": 0.12327699661254883, "memory(GiB)": 122.96, "step": 24165, "token_acc": 0.9470333587980647, "train_speed(iter/s)": 0.242511 }, { "epoch": 1.8423660339964938, "grad_norm": 0.6594895124435425, "learning_rate": 7.007726862059692e-05, "loss": 0.12309162616729737, "memory(GiB)": 122.96, "step": 24170, "token_acc": 0.9551032624962587, "train_speed(iter/s)": 0.242526 }, { "epoch": 1.8427471606067536, "grad_norm": 0.797662079334259, "learning_rate": 7.006630229502936e-05, "loss": 0.1209206223487854, "memory(GiB)": 122.96, "step": 24175, "token_acc": 0.9548894943990311, "train_speed(iter/s)": 0.24254 }, { "epoch": 1.8431282872170134, "grad_norm": 1.0522246360778809, "learning_rate": 7.005533481875245e-05, "loss": 0.10141688585281372, "memory(GiB)": 122.96, "step": 24180, "token_acc": 0.9504089979550102, "train_speed(iter/s)": 0.242557 }, { "epoch": 1.8435094138272734, "grad_norm": 0.6671926975250244, "learning_rate": 7.004436619239521e-05, "loss": 0.1348706841468811, "memory(GiB)": 122.96, "step": 24185, "token_acc": 0.9529567353107878, "train_speed(iter/s)": 0.242567 }, { "epoch": 1.8438905404375334, "grad_norm": 0.7114071846008301, "learning_rate": 7.003339641658655e-05, "loss": 0.08687095046043396, "memory(GiB)": 122.96, "step": 24190, "token_acc": 0.970337738619677, "train_speed(iter/s)": 0.242571 }, { "epoch": 1.8442716670477934, "grad_norm": 0.8366031050682068, "learning_rate": 7.002242549195558e-05, "loss": 0.13677235841751098, "memory(GiB)": 122.96, "step": 24195, "token_acc": 0.9511201629327902, "train_speed(iter/s)": 0.242588 }, { "epoch": 1.8446527936580532, "grad_norm": 1.518053412437439, "learning_rate": 7.001145341913143e-05, "loss": 0.1269094705581665, "memory(GiB)": 122.96, "step": 24200, "token_acc": 0.9357463524130191, "train_speed(iter/s)": 0.242601 }, { "epoch": 1.8446527936580532, "eval_loss": 0.09680986404418945, "eval_runtime": 220.0931, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "eval_token_acc": 0.9571863140774652, "step": 24200 }, { "epoch": 1.845033920268313, "grad_norm": 0.6637988686561584, "learning_rate": 7.000048019874328e-05, "loss": 0.12280230522155762, "memory(GiB)": 122.96, "step": 24205, "token_acc": 0.9571098623043427, "train_speed(iter/s)": 0.24208 }, { "epoch": 1.845415046878573, "grad_norm": 1.8672716617584229, "learning_rate": 6.998950583142041e-05, "loss": 0.09117831587791443, "memory(GiB)": 122.96, "step": 24210, "token_acc": 0.971304347826087, "train_speed(iter/s)": 0.24209 }, { "epoch": 1.845796173488833, "grad_norm": 0.8196402788162231, "learning_rate": 6.997853031779213e-05, "loss": 0.121373450756073, "memory(GiB)": 122.96, "step": 24215, "token_acc": 0.9560957910014514, "train_speed(iter/s)": 0.242102 }, { "epoch": 1.8461773000990929, "grad_norm": 1.0248056650161743, "learning_rate": 6.996755365848786e-05, "loss": 0.1312938451766968, "memory(GiB)": 122.96, "step": 24220, "token_acc": 0.95, "train_speed(iter/s)": 0.242111 }, { "epoch": 1.8465584267093529, "grad_norm": 0.7133308053016663, "learning_rate": 6.995657585413706e-05, "loss": 0.12555835247039795, "memory(GiB)": 122.96, "step": 24225, "token_acc": 0.9632583998066232, "train_speed(iter/s)": 0.242121 }, { "epoch": 1.8469395533196127, "grad_norm": 1.9526312351226807, "learning_rate": 6.994559690536923e-05, "loss": 0.14558084011077882, "memory(GiB)": 122.96, "step": 24230, "token_acc": 0.943301687763713, "train_speed(iter/s)": 0.242134 }, { "epoch": 1.8473206799298727, "grad_norm": 0.5833352208137512, "learning_rate": 6.993461681281399e-05, "loss": 0.07016668319702149, "memory(GiB)": 122.96, "step": 24235, "token_acc": 0.9701997422680413, "train_speed(iter/s)": 0.242143 }, { "epoch": 1.8477018065401327, "grad_norm": 1.196860671043396, "learning_rate": 6.992363557710096e-05, "loss": 0.10175871849060059, "memory(GiB)": 122.96, "step": 24240, "token_acc": 0.9639338166945529, "train_speed(iter/s)": 0.242151 }, { "epoch": 1.8480829331503925, "grad_norm": 0.756585955619812, "learning_rate": 6.99126531988599e-05, "loss": 0.1194640040397644, "memory(GiB)": 122.96, "step": 24245, "token_acc": 0.9515088086516658, "train_speed(iter/s)": 0.24216 }, { "epoch": 1.8484640597606525, "grad_norm": 1.6281567811965942, "learning_rate": 6.990166967872058e-05, "loss": 0.16689667701721192, "memory(GiB)": 122.96, "step": 24250, "token_acc": 0.9389600602863603, "train_speed(iter/s)": 0.242172 }, { "epoch": 1.8488451863709123, "grad_norm": 1.0117952823638916, "learning_rate": 6.989068501731288e-05, "loss": 0.1409121036529541, "memory(GiB)": 122.96, "step": 24255, "token_acc": 0.9567901234567902, "train_speed(iter/s)": 0.24218 }, { "epoch": 1.8492263129811723, "grad_norm": 1.3113939762115479, "learning_rate": 6.987969921526669e-05, "loss": 0.12149341106414795, "memory(GiB)": 122.96, "step": 24260, "token_acc": 0.9558565337346375, "train_speed(iter/s)": 0.242193 }, { "epoch": 1.8496074395914324, "grad_norm": 1.355720043182373, "learning_rate": 6.986871227321197e-05, "loss": 0.13618087768554688, "memory(GiB)": 122.96, "step": 24265, "token_acc": 0.9418734630002236, "train_speed(iter/s)": 0.242207 }, { "epoch": 1.8499885662016922, "grad_norm": 0.9270972013473511, "learning_rate": 6.985772419177886e-05, "loss": 0.09680225253105164, "memory(GiB)": 122.96, "step": 24270, "token_acc": 0.96225614927905, "train_speed(iter/s)": 0.242221 }, { "epoch": 1.8503696928119522, "grad_norm": 0.6866962313652039, "learning_rate": 6.984673497159737e-05, "loss": 0.10834848880767822, "memory(GiB)": 122.96, "step": 24275, "token_acc": 0.9593848580441641, "train_speed(iter/s)": 0.242229 }, { "epoch": 1.850750819422212, "grad_norm": 1.0840245485305786, "learning_rate": 6.983574461329775e-05, "loss": 0.11221251487731934, "memory(GiB)": 122.96, "step": 24280, "token_acc": 0.9609820254274442, "train_speed(iter/s)": 0.242241 }, { "epoch": 1.851131946032472, "grad_norm": 1.2692883014678955, "learning_rate": 6.982475311751021e-05, "loss": 0.1259017825126648, "memory(GiB)": 122.96, "step": 24285, "token_acc": 0.9515028581965702, "train_speed(iter/s)": 0.242249 }, { "epoch": 1.851513072642732, "grad_norm": 1.0128353834152222, "learning_rate": 6.98137604848651e-05, "loss": 0.1326538324356079, "memory(GiB)": 122.96, "step": 24290, "token_acc": 0.9569569569569569, "train_speed(iter/s)": 0.242253 }, { "epoch": 1.8518941992529918, "grad_norm": 0.5504249930381775, "learning_rate": 6.980276671599276e-05, "loss": 0.12101655006408692, "memory(GiB)": 122.96, "step": 24295, "token_acc": 0.9506990434142752, "train_speed(iter/s)": 0.242258 }, { "epoch": 1.8522753258632516, "grad_norm": 0.6895307302474976, "learning_rate": 6.979177181152364e-05, "loss": 0.12048131227493286, "memory(GiB)": 122.96, "step": 24300, "token_acc": 0.9524988743809095, "train_speed(iter/s)": 0.242267 }, { "epoch": 1.8526564524735116, "grad_norm": 0.8315399289131165, "learning_rate": 6.978077577208826e-05, "loss": 0.08318830728530884, "memory(GiB)": 122.96, "step": 24305, "token_acc": 0.9632196162046909, "train_speed(iter/s)": 0.242283 }, { "epoch": 1.8530375790837716, "grad_norm": 1.3649569749832153, "learning_rate": 6.97697785983172e-05, "loss": 0.16275683641433716, "memory(GiB)": 122.96, "step": 24310, "token_acc": 0.9373211219232971, "train_speed(iter/s)": 0.242289 }, { "epoch": 1.8534187056940317, "grad_norm": 1.5859614610671997, "learning_rate": 6.975878029084105e-05, "loss": 0.16675660610198975, "memory(GiB)": 122.96, "step": 24315, "token_acc": 0.9522446520036155, "train_speed(iter/s)": 0.242292 }, { "epoch": 1.8537998323042915, "grad_norm": 0.9296463131904602, "learning_rate": 6.974778085029055e-05, "loss": 0.0774298369884491, "memory(GiB)": 122.96, "step": 24320, "token_acc": 0.9739368998628258, "train_speed(iter/s)": 0.242308 }, { "epoch": 1.8541809589145513, "grad_norm": 0.5307679772377014, "learning_rate": 6.973678027729648e-05, "loss": 0.1072800874710083, "memory(GiB)": 122.96, "step": 24325, "token_acc": 0.9538343558282208, "train_speed(iter/s)": 0.242314 }, { "epoch": 1.8545620855248113, "grad_norm": 0.8264315128326416, "learning_rate": 6.972577857248962e-05, "loss": 0.09193305373191833, "memory(GiB)": 122.96, "step": 24330, "token_acc": 0.960043342814574, "train_speed(iter/s)": 0.242319 }, { "epoch": 1.8549432121350713, "grad_norm": 0.6337208151817322, "learning_rate": 6.971477573650092e-05, "loss": 0.1127215027809143, "memory(GiB)": 122.96, "step": 24335, "token_acc": 0.9585358879039176, "train_speed(iter/s)": 0.242331 }, { "epoch": 1.8553243387453313, "grad_norm": 0.8458091020584106, "learning_rate": 6.970377176996131e-05, "loss": 0.11298857927322388, "memory(GiB)": 122.96, "step": 24340, "token_acc": 0.9602520601066408, "train_speed(iter/s)": 0.242347 }, { "epoch": 1.855705465355591, "grad_norm": 1.9747209548950195, "learning_rate": 6.969276667350185e-05, "loss": 0.12852178812026976, "memory(GiB)": 122.96, "step": 24345, "token_acc": 0.9522338403041825, "train_speed(iter/s)": 0.242357 }, { "epoch": 1.856086591965851, "grad_norm": 0.7211102247238159, "learning_rate": 6.968176044775355e-05, "loss": 0.10976569652557373, "memory(GiB)": 122.96, "step": 24350, "token_acc": 0.9589199614271938, "train_speed(iter/s)": 0.242364 }, { "epoch": 1.856467718576111, "grad_norm": 1.1108744144439697, "learning_rate": 6.967075309334767e-05, "loss": 0.10706478357315063, "memory(GiB)": 122.96, "step": 24355, "token_acc": 0.9507012324691883, "train_speed(iter/s)": 0.242381 }, { "epoch": 1.856848845186371, "grad_norm": 0.6466699838638306, "learning_rate": 6.965974461091537e-05, "loss": 0.09136489033699036, "memory(GiB)": 122.96, "step": 24360, "token_acc": 0.9712041884816754, "train_speed(iter/s)": 0.242395 }, { "epoch": 1.857229971796631, "grad_norm": 0.7743187546730042, "learning_rate": 6.964873500108794e-05, "loss": 0.07822231650352478, "memory(GiB)": 122.96, "step": 24365, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.242407 }, { "epoch": 1.8576110984068908, "grad_norm": 0.7046512365341187, "learning_rate": 6.963772426449675e-05, "loss": 0.1021127462387085, "memory(GiB)": 122.96, "step": 24370, "token_acc": 0.9662663369833981, "train_speed(iter/s)": 0.242414 }, { "epoch": 1.8579922250171506, "grad_norm": 0.7498273849487305, "learning_rate": 6.962671240177321e-05, "loss": 0.14555213451385499, "memory(GiB)": 122.96, "step": 24375, "token_acc": 0.9385035324341683, "train_speed(iter/s)": 0.242421 }, { "epoch": 1.8583733516274106, "grad_norm": 1.2141318321228027, "learning_rate": 6.96156994135488e-05, "loss": 0.14541279077529906, "memory(GiB)": 122.96, "step": 24380, "token_acc": 0.9452380952380952, "train_speed(iter/s)": 0.242431 }, { "epoch": 1.8587544782376706, "grad_norm": 0.4955367147922516, "learning_rate": 6.960468530045503e-05, "loss": 0.07278543710708618, "memory(GiB)": 122.96, "step": 24385, "token_acc": 0.9719804134929271, "train_speed(iter/s)": 0.242436 }, { "epoch": 1.8591356048479306, "grad_norm": 1.1922231912612915, "learning_rate": 6.959367006312355e-05, "loss": 0.10836031436920165, "memory(GiB)": 122.96, "step": 24390, "token_acc": 0.9523348991434097, "train_speed(iter/s)": 0.242438 }, { "epoch": 1.8595167314581904, "grad_norm": 0.3362831771373749, "learning_rate": 6.958265370218602e-05, "loss": 0.07103281021118164, "memory(GiB)": 122.96, "step": 24395, "token_acc": 0.9648484848484848, "train_speed(iter/s)": 0.242448 }, { "epoch": 1.8598978580684502, "grad_norm": 0.47790735960006714, "learning_rate": 6.957163621827416e-05, "loss": 0.14549343585968016, "memory(GiB)": 122.96, "step": 24400, "token_acc": 0.9292604501607717, "train_speed(iter/s)": 0.242462 }, { "epoch": 1.8598978580684502, "eval_loss": 0.09444306045770645, "eval_runtime": 218.0557, "eval_samples_per_second": 2.431, "eval_steps_per_second": 2.431, "eval_token_acc": 0.9576079754231672, "step": 24400 }, { "epoch": 1.8602789846787102, "grad_norm": 0.701878011226654, "learning_rate": 6.95606176120198e-05, "loss": 0.10079268217086793, "memory(GiB)": 122.96, "step": 24405, "token_acc": 0.9578115955833327, "train_speed(iter/s)": 0.24195 }, { "epoch": 1.8606601112889702, "grad_norm": 1.115817666053772, "learning_rate": 6.954959788405479e-05, "loss": 0.1157450556755066, "memory(GiB)": 122.96, "step": 24410, "token_acc": 0.9526327384886539, "train_speed(iter/s)": 0.24196 }, { "epoch": 1.8610412378992303, "grad_norm": 0.8656884431838989, "learning_rate": 6.953857703501105e-05, "loss": 0.13295495510101318, "memory(GiB)": 122.96, "step": 24415, "token_acc": 0.9453210010881393, "train_speed(iter/s)": 0.241963 }, { "epoch": 1.86142236450949, "grad_norm": 0.7063992619514465, "learning_rate": 6.95275550655206e-05, "loss": 0.07397289276123047, "memory(GiB)": 122.96, "step": 24420, "token_acc": 0.9691912708600771, "train_speed(iter/s)": 0.241971 }, { "epoch": 1.8618034911197499, "grad_norm": 0.7390208840370178, "learning_rate": 6.951653197621548e-05, "loss": 0.11191864013671875, "memory(GiB)": 122.96, "step": 24425, "token_acc": 0.9452707856598017, "train_speed(iter/s)": 0.24198 }, { "epoch": 1.8621846177300099, "grad_norm": 0.3493862450122833, "learning_rate": 6.950550776772783e-05, "loss": 0.11717027425765991, "memory(GiB)": 122.96, "step": 24430, "token_acc": 0.9637428531585553, "train_speed(iter/s)": 0.241987 }, { "epoch": 1.86256574434027, "grad_norm": 0.8690118193626404, "learning_rate": 6.949448244068981e-05, "loss": 0.10039944648742676, "memory(GiB)": 122.96, "step": 24435, "token_acc": 0.9508098380323935, "train_speed(iter/s)": 0.242 }, { "epoch": 1.86294687095053, "grad_norm": 0.7351366877555847, "learning_rate": 6.94834559957337e-05, "loss": 0.15723568201065063, "memory(GiB)": 122.96, "step": 24440, "token_acc": 0.9375, "train_speed(iter/s)": 0.24201 }, { "epoch": 1.8633279975607897, "grad_norm": 0.6039955615997314, "learning_rate": 6.94724284334918e-05, "loss": 0.08503319025039673, "memory(GiB)": 122.96, "step": 24445, "token_acc": 0.9669688124135812, "train_speed(iter/s)": 0.242017 }, { "epoch": 1.8637091241710495, "grad_norm": 0.8427472710609436, "learning_rate": 6.94613997545965e-05, "loss": 0.15095856189727783, "memory(GiB)": 122.96, "step": 24450, "token_acc": 0.9251844046364595, "train_speed(iter/s)": 0.242031 }, { "epoch": 1.8640902507813095, "grad_norm": 1.1250957250595093, "learning_rate": 6.945036995968022e-05, "loss": 0.1045557975769043, "memory(GiB)": 122.96, "step": 24455, "token_acc": 0.9584188911704312, "train_speed(iter/s)": 0.242044 }, { "epoch": 1.8644713773915695, "grad_norm": 0.4448085427284241, "learning_rate": 6.94393390493755e-05, "loss": 0.11666032075881957, "memory(GiB)": 122.96, "step": 24460, "token_acc": 0.9554851157662624, "train_speed(iter/s)": 0.242047 }, { "epoch": 1.8648525040018296, "grad_norm": 0.9819527268409729, "learning_rate": 6.94283070243149e-05, "loss": 0.10442521572113037, "memory(GiB)": 122.96, "step": 24465, "token_acc": 0.9582582582582583, "train_speed(iter/s)": 0.24206 }, { "epoch": 1.8652336306120894, "grad_norm": 1.3671238422393799, "learning_rate": 6.941727388513102e-05, "loss": 0.12271168231964111, "memory(GiB)": 122.96, "step": 24470, "token_acc": 0.9600043355733796, "train_speed(iter/s)": 0.24206 }, { "epoch": 1.8656147572223492, "grad_norm": 0.5527922511100769, "learning_rate": 6.940623963245661e-05, "loss": 0.11500542163848877, "memory(GiB)": 122.96, "step": 24475, "token_acc": 0.9560335497835498, "train_speed(iter/s)": 0.242067 }, { "epoch": 1.8659958838326092, "grad_norm": 0.7813662886619568, "learning_rate": 6.93952042669244e-05, "loss": 0.11093573570251465, "memory(GiB)": 122.96, "step": 24480, "token_acc": 0.9504761904761905, "train_speed(iter/s)": 0.24208 }, { "epoch": 1.8663770104428692, "grad_norm": 0.3858792185783386, "learning_rate": 6.938416778916723e-05, "loss": 0.12271552085876465, "memory(GiB)": 122.96, "step": 24485, "token_acc": 0.9490462503266266, "train_speed(iter/s)": 0.242091 }, { "epoch": 1.8667581370531292, "grad_norm": 0.5808374881744385, "learning_rate": 6.937313019981801e-05, "loss": 0.09541603326797485, "memory(GiB)": 122.96, "step": 24490, "token_acc": 0.9669555796316359, "train_speed(iter/s)": 0.242092 }, { "epoch": 1.867139263663389, "grad_norm": 0.5066109299659729, "learning_rate": 6.936209149950966e-05, "loss": 0.10509436130523682, "memory(GiB)": 122.96, "step": 24495, "token_acc": 0.9594175960346965, "train_speed(iter/s)": 0.242098 }, { "epoch": 1.8675203902736488, "grad_norm": 0.8123342394828796, "learning_rate": 6.935105168887522e-05, "loss": 0.1141016125679016, "memory(GiB)": 122.96, "step": 24500, "token_acc": 0.9469832202784719, "train_speed(iter/s)": 0.242106 }, { "epoch": 1.8679015168839088, "grad_norm": 1.08533775806427, "learning_rate": 6.934001076854775e-05, "loss": 0.1322989583015442, "memory(GiB)": 122.96, "step": 24505, "token_acc": 0.9497084548104956, "train_speed(iter/s)": 0.242111 }, { "epoch": 1.8682826434941688, "grad_norm": 0.6388190984725952, "learning_rate": 6.932896873916043e-05, "loss": 0.09298046827316284, "memory(GiB)": 122.96, "step": 24510, "token_acc": 0.9625850340136054, "train_speed(iter/s)": 0.242121 }, { "epoch": 1.8686637701044289, "grad_norm": 0.6390783190727234, "learning_rate": 6.931792560134646e-05, "loss": 0.16057366132736206, "memory(GiB)": 122.96, "step": 24515, "token_acc": 0.9495528935380463, "train_speed(iter/s)": 0.242124 }, { "epoch": 1.8690448967146887, "grad_norm": 0.8562953472137451, "learning_rate": 6.930688135573909e-05, "loss": 0.08233790397644043, "memory(GiB)": 122.96, "step": 24520, "token_acc": 0.9636251541307028, "train_speed(iter/s)": 0.242138 }, { "epoch": 1.8694260233249484, "grad_norm": 0.436522901058197, "learning_rate": 6.929583600297168e-05, "loss": 0.08832259774208069, "memory(GiB)": 122.96, "step": 24525, "token_acc": 0.9531641604010025, "train_speed(iter/s)": 0.242144 }, { "epoch": 1.8698071499352085, "grad_norm": 1.44728684425354, "learning_rate": 6.92847895436776e-05, "loss": 0.09574618935585022, "memory(GiB)": 122.96, "step": 24530, "token_acc": 0.9525144559313125, "train_speed(iter/s)": 0.242152 }, { "epoch": 1.8701882765454685, "grad_norm": 0.8362749218940735, "learning_rate": 6.927374197849033e-05, "loss": 0.11955996751785278, "memory(GiB)": 122.96, "step": 24535, "token_acc": 0.957973383142657, "train_speed(iter/s)": 0.242163 }, { "epoch": 1.8705694031557283, "grad_norm": 0.7034499645233154, "learning_rate": 6.92626933080434e-05, "loss": 0.17180652618408204, "memory(GiB)": 122.96, "step": 24540, "token_acc": 0.9392550143266476, "train_speed(iter/s)": 0.242171 }, { "epoch": 1.8709505297659883, "grad_norm": 1.5691636800765991, "learning_rate": 6.925164353297042e-05, "loss": 0.12489677667617798, "memory(GiB)": 122.96, "step": 24545, "token_acc": 0.953268822279915, "train_speed(iter/s)": 0.24218 }, { "epoch": 1.871331656376248, "grad_norm": 0.662467360496521, "learning_rate": 6.9240592653905e-05, "loss": 0.10648032426834106, "memory(GiB)": 122.96, "step": 24550, "token_acc": 0.9659213847818248, "train_speed(iter/s)": 0.242184 }, { "epoch": 1.8717127829865081, "grad_norm": 1.1288537979125977, "learning_rate": 6.922954067148089e-05, "loss": 0.1263742446899414, "memory(GiB)": 122.96, "step": 24555, "token_acc": 0.9439412484700123, "train_speed(iter/s)": 0.242196 }, { "epoch": 1.8720939095967681, "grad_norm": 0.8075565695762634, "learning_rate": 6.921848758633185e-05, "loss": 0.10396476984024047, "memory(GiB)": 122.96, "step": 24560, "token_acc": 0.962874908558888, "train_speed(iter/s)": 0.2422 }, { "epoch": 1.872475036207028, "grad_norm": 0.6415311098098755, "learning_rate": 6.920743339909174e-05, "loss": 0.138031005859375, "memory(GiB)": 122.96, "step": 24565, "token_acc": 0.9513286713286714, "train_speed(iter/s)": 0.242212 }, { "epoch": 1.872856162817288, "grad_norm": 2.1600167751312256, "learning_rate": 6.919637811039445e-05, "loss": 0.12752517461776733, "memory(GiB)": 122.96, "step": 24570, "token_acc": 0.9523961661341853, "train_speed(iter/s)": 0.242226 }, { "epoch": 1.8732372894275477, "grad_norm": 0.5511002540588379, "learning_rate": 6.918532172087396e-05, "loss": 0.13950117826461791, "memory(GiB)": 122.96, "step": 24575, "token_acc": 0.9495662699213234, "train_speed(iter/s)": 0.242236 }, { "epoch": 1.8736184160378078, "grad_norm": 1.1463775634765625, "learning_rate": 6.91742642311643e-05, "loss": 0.1483471155166626, "memory(GiB)": 122.96, "step": 24580, "token_acc": 0.9328477785870357, "train_speed(iter/s)": 0.242247 }, { "epoch": 1.8739995426480678, "grad_norm": 0.9431594610214233, "learning_rate": 6.916320564189957e-05, "loss": 0.1320252776145935, "memory(GiB)": 122.96, "step": 24585, "token_acc": 0.9338432122370937, "train_speed(iter/s)": 0.242262 }, { "epoch": 1.8743806692583276, "grad_norm": 1.3550835847854614, "learning_rate": 6.915214595371394e-05, "loss": 0.12095584869384765, "memory(GiB)": 122.96, "step": 24590, "token_acc": 0.9383931406795808, "train_speed(iter/s)": 0.242276 }, { "epoch": 1.8747617958685876, "grad_norm": 0.7263226509094238, "learning_rate": 6.914108516724158e-05, "loss": 0.07665133476257324, "memory(GiB)": 122.96, "step": 24595, "token_acc": 0.9725938713854122, "train_speed(iter/s)": 0.242283 }, { "epoch": 1.8751429224788474, "grad_norm": 0.9405462741851807, "learning_rate": 6.913002328311685e-05, "loss": 0.10699925422668458, "memory(GiB)": 122.96, "step": 24600, "token_acc": 0.953886876567289, "train_speed(iter/s)": 0.242288 }, { "epoch": 1.8751429224788474, "eval_loss": 0.09408386051654816, "eval_runtime": 220.1997, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "eval_token_acc": 0.9575703270887296, "step": 24600 }, { "epoch": 1.8755240490891074, "grad_norm": 0.2721266746520996, "learning_rate": 6.911896030197402e-05, "loss": 0.0838725745677948, "memory(GiB)": 122.96, "step": 24605, "token_acc": 0.9581177558747869, "train_speed(iter/s)": 0.241772 }, { "epoch": 1.8759051756993674, "grad_norm": 0.6907482147216797, "learning_rate": 6.910789622444756e-05, "loss": 0.1540897846221924, "memory(GiB)": 122.96, "step": 24610, "token_acc": 0.9404092071611253, "train_speed(iter/s)": 0.241784 }, { "epoch": 1.8762863023096272, "grad_norm": 0.9140029549598694, "learning_rate": 6.909683105117192e-05, "loss": 0.06822125911712647, "memory(GiB)": 122.96, "step": 24615, "token_acc": 0.9723831595210506, "train_speed(iter/s)": 0.241792 }, { "epoch": 1.876667428919887, "grad_norm": 1.5086561441421509, "learning_rate": 6.908576478278165e-05, "loss": 0.15809123516082763, "memory(GiB)": 122.96, "step": 24620, "token_acc": 0.9475705780679854, "train_speed(iter/s)": 0.241801 }, { "epoch": 1.877048555530147, "grad_norm": 0.6805033683776855, "learning_rate": 6.907469741991131e-05, "loss": 0.10386934280395507, "memory(GiB)": 122.96, "step": 24625, "token_acc": 0.9674743505784763, "train_speed(iter/s)": 0.241812 }, { "epoch": 1.877429682140407, "grad_norm": 1.126225471496582, "learning_rate": 6.906362896319563e-05, "loss": 0.06821250319480895, "memory(GiB)": 122.96, "step": 24630, "token_acc": 0.9744389027431422, "train_speed(iter/s)": 0.24183 }, { "epoch": 1.877810808750667, "grad_norm": 0.5992647409439087, "learning_rate": 6.905255941326926e-05, "loss": 0.08460434079170227, "memory(GiB)": 122.96, "step": 24635, "token_acc": 0.9681153090194365, "train_speed(iter/s)": 0.241838 }, { "epoch": 1.8781919353609269, "grad_norm": 1.0299367904663086, "learning_rate": 6.904148877076704e-05, "loss": 0.12335501909255982, "memory(GiB)": 122.96, "step": 24640, "token_acc": 0.9507853403141361, "train_speed(iter/s)": 0.241849 }, { "epoch": 1.8785730619711867, "grad_norm": 0.8165981769561768, "learning_rate": 6.90304170363238e-05, "loss": 0.09122146368026733, "memory(GiB)": 122.96, "step": 24645, "token_acc": 0.9588411588411588, "train_speed(iter/s)": 0.241858 }, { "epoch": 1.8789541885814467, "grad_norm": 0.8825969099998474, "learning_rate": 6.901934421057446e-05, "loss": 0.178433358669281, "memory(GiB)": 122.96, "step": 24650, "token_acc": 0.9343018213356461, "train_speed(iter/s)": 0.241868 }, { "epoch": 1.8793353151917067, "grad_norm": 0.7832249402999878, "learning_rate": 6.900827029415399e-05, "loss": 0.12123171091079712, "memory(GiB)": 122.96, "step": 24655, "token_acc": 0.9572010122125646, "train_speed(iter/s)": 0.241872 }, { "epoch": 1.8797164418019667, "grad_norm": 1.1911184787750244, "learning_rate": 6.899719528769741e-05, "loss": 0.14991663694381713, "memory(GiB)": 122.96, "step": 24660, "token_acc": 0.9382108822625269, "train_speed(iter/s)": 0.241884 }, { "epoch": 1.8800975684122265, "grad_norm": 0.6734333634376526, "learning_rate": 6.898611919183986e-05, "loss": 0.1197007179260254, "memory(GiB)": 122.96, "step": 24665, "token_acc": 0.9464701318851824, "train_speed(iter/s)": 0.241898 }, { "epoch": 1.8804786950224863, "grad_norm": 1.0047712326049805, "learning_rate": 6.897504200721647e-05, "loss": 0.09889943599700927, "memory(GiB)": 122.96, "step": 24670, "token_acc": 0.9615849969751966, "train_speed(iter/s)": 0.241907 }, { "epoch": 1.8808598216327463, "grad_norm": 1.0910698175430298, "learning_rate": 6.896396373446247e-05, "loss": 0.13568401336669922, "memory(GiB)": 122.96, "step": 24675, "token_acc": 0.9417913993822761, "train_speed(iter/s)": 0.241918 }, { "epoch": 1.8812409482430064, "grad_norm": 0.9333653450012207, "learning_rate": 6.895288437421317e-05, "loss": 0.09649399518966675, "memory(GiB)": 122.96, "step": 24680, "token_acc": 0.9587112918360964, "train_speed(iter/s)": 0.241932 }, { "epoch": 1.8816220748532664, "grad_norm": 0.7843413949012756, "learning_rate": 6.894180392710387e-05, "loss": 0.12601059675216675, "memory(GiB)": 122.96, "step": 24685, "token_acc": 0.9548709633833266, "train_speed(iter/s)": 0.241935 }, { "epoch": 1.8820032014635262, "grad_norm": 0.9800599217414856, "learning_rate": 6.893072239377005e-05, "loss": 0.11230251789093018, "memory(GiB)": 122.96, "step": 24690, "token_acc": 0.9546070460704607, "train_speed(iter/s)": 0.241949 }, { "epoch": 1.882384328073786, "grad_norm": 1.3636984825134277, "learning_rate": 6.891963977484714e-05, "loss": 0.10395592451095581, "memory(GiB)": 122.96, "step": 24695, "token_acc": 0.9620958751393534, "train_speed(iter/s)": 0.241956 }, { "epoch": 1.882765454684046, "grad_norm": 0.5814365148544312, "learning_rate": 6.890855607097068e-05, "loss": 0.09789491891860962, "memory(GiB)": 122.96, "step": 24700, "token_acc": 0.9615304202383441, "train_speed(iter/s)": 0.241963 }, { "epoch": 1.883146581294306, "grad_norm": 0.48000234365463257, "learning_rate": 6.889747128277629e-05, "loss": 0.1060525894165039, "memory(GiB)": 122.96, "step": 24705, "token_acc": 0.9583915107511868, "train_speed(iter/s)": 0.241966 }, { "epoch": 1.883527707904566, "grad_norm": 0.8437292575836182, "learning_rate": 6.88863854108996e-05, "loss": 0.10130752325057983, "memory(GiB)": 122.96, "step": 24710, "token_acc": 0.9546989866878601, "train_speed(iter/s)": 0.241976 }, { "epoch": 1.8839088345148258, "grad_norm": 0.8598158955574036, "learning_rate": 6.887529845597636e-05, "loss": 0.12806694507598876, "memory(GiB)": 122.96, "step": 24715, "token_acc": 0.9521946979574099, "train_speed(iter/s)": 0.241979 }, { "epoch": 1.8842899611250856, "grad_norm": 0.6728751063346863, "learning_rate": 6.886421041864235e-05, "loss": 0.15813401937484742, "memory(GiB)": 122.96, "step": 24720, "token_acc": 0.9534066963240458, "train_speed(iter/s)": 0.24199 }, { "epoch": 1.8846710877353456, "grad_norm": 0.5597208142280579, "learning_rate": 6.885312129953339e-05, "loss": 0.12329227924346924, "memory(GiB)": 122.96, "step": 24725, "token_acc": 0.9588000514999356, "train_speed(iter/s)": 0.241996 }, { "epoch": 1.8850522143456057, "grad_norm": 1.756227731704712, "learning_rate": 6.884203109928545e-05, "loss": 0.08641157150268555, "memory(GiB)": 122.96, "step": 24730, "token_acc": 0.9689991142604074, "train_speed(iter/s)": 0.242006 }, { "epoch": 1.8854333409558657, "grad_norm": 1.408076524734497, "learning_rate": 6.883093981853444e-05, "loss": 0.13248720169067382, "memory(GiB)": 122.96, "step": 24735, "token_acc": 0.9384615384615385, "train_speed(iter/s)": 0.242021 }, { "epoch": 1.8858144675661255, "grad_norm": 1.2376432418823242, "learning_rate": 6.881984745791642e-05, "loss": 0.11326239109039307, "memory(GiB)": 122.96, "step": 24740, "token_acc": 0.9534225019669551, "train_speed(iter/s)": 0.242026 }, { "epoch": 1.8861955941763853, "grad_norm": 0.371183305978775, "learning_rate": 6.880875401806748e-05, "loss": 0.1206861138343811, "memory(GiB)": 122.96, "step": 24745, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.242033 }, { "epoch": 1.8865767207866453, "grad_norm": 1.1030194759368896, "learning_rate": 6.879765949962377e-05, "loss": 0.11145108938217163, "memory(GiB)": 122.96, "step": 24750, "token_acc": 0.9740622505985634, "train_speed(iter/s)": 0.24204 }, { "epoch": 1.8869578473969053, "grad_norm": 0.768980860710144, "learning_rate": 6.878656390322155e-05, "loss": 0.10398710966110229, "memory(GiB)": 122.96, "step": 24755, "token_acc": 0.9563003818413237, "train_speed(iter/s)": 0.242048 }, { "epoch": 1.8873389740071653, "grad_norm": 0.6596015691757202, "learning_rate": 6.877546722949705e-05, "loss": 0.13592371940612794, "memory(GiB)": 122.96, "step": 24760, "token_acc": 0.9426096372495939, "train_speed(iter/s)": 0.242058 }, { "epoch": 1.8877201006174251, "grad_norm": 0.8336893916130066, "learning_rate": 6.876436947908664e-05, "loss": 0.12784690856933595, "memory(GiB)": 122.96, "step": 24765, "token_acc": 0.9497364699832883, "train_speed(iter/s)": 0.242058 }, { "epoch": 1.888101227227685, "grad_norm": 0.9624178409576416, "learning_rate": 6.87532706526267e-05, "loss": 0.14585951566696168, "memory(GiB)": 122.96, "step": 24770, "token_acc": 0.9446728971962617, "train_speed(iter/s)": 0.242073 }, { "epoch": 1.888482353837945, "grad_norm": 0.8079752922058105, "learning_rate": 6.874217075075376e-05, "loss": 0.11231428384780884, "memory(GiB)": 122.96, "step": 24775, "token_acc": 0.9497896213183731, "train_speed(iter/s)": 0.242083 }, { "epoch": 1.888863480448205, "grad_norm": 0.6329883337020874, "learning_rate": 6.873106977410425e-05, "loss": 0.1362109065055847, "memory(GiB)": 122.96, "step": 24780, "token_acc": 0.9462011089465008, "train_speed(iter/s)": 0.242086 }, { "epoch": 1.889244607058465, "grad_norm": 0.7140170335769653, "learning_rate": 6.871996772331484e-05, "loss": 0.12386872768402099, "memory(GiB)": 122.96, "step": 24785, "token_acc": 0.9420597067655891, "train_speed(iter/s)": 0.242093 }, { "epoch": 1.8896257336687248, "grad_norm": 0.943251371383667, "learning_rate": 6.870886459902214e-05, "loss": 0.1295098304748535, "memory(GiB)": 122.96, "step": 24790, "token_acc": 0.9559890375624698, "train_speed(iter/s)": 0.242097 }, { "epoch": 1.8900068602789846, "grad_norm": 0.3532976508140564, "learning_rate": 6.869776040186289e-05, "loss": 0.10732793807983398, "memory(GiB)": 122.96, "step": 24795, "token_acc": 0.9566929133858267, "train_speed(iter/s)": 0.242109 }, { "epoch": 1.8903879868892446, "grad_norm": 1.2943809032440186, "learning_rate": 6.868665513247384e-05, "loss": 0.132412326335907, "memory(GiB)": 122.96, "step": 24800, "token_acc": 0.9494697442295695, "train_speed(iter/s)": 0.24212 }, { "epoch": 1.8903879868892446, "eval_loss": 0.09312719851732254, "eval_runtime": 220.7128, "eval_samples_per_second": 2.401, "eval_steps_per_second": 2.401, "eval_token_acc": 0.9580371664357569, "step": 24800 }, { "epoch": 1.8907691134995046, "grad_norm": 1.6054743528366089, "learning_rate": 6.867554879149183e-05, "loss": 0.14004428386688234, "memory(GiB)": 122.96, "step": 24805, "token_acc": 0.9576726942352217, "train_speed(iter/s)": 0.241612 }, { "epoch": 1.8911502401097646, "grad_norm": 0.6145107746124268, "learning_rate": 6.866444137955376e-05, "loss": 0.09533853530883789, "memory(GiB)": 122.96, "step": 24810, "token_acc": 0.9646393210749646, "train_speed(iter/s)": 0.241618 }, { "epoch": 1.8915313667200244, "grad_norm": 0.8572534322738647, "learning_rate": 6.865333289729661e-05, "loss": 0.10994658470153809, "memory(GiB)": 122.96, "step": 24815, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.241624 }, { "epoch": 1.8919124933302842, "grad_norm": 0.4184836745262146, "learning_rate": 6.864222334535734e-05, "loss": 0.08382610082626343, "memory(GiB)": 122.96, "step": 24820, "token_acc": 0.9638249430812041, "train_speed(iter/s)": 0.241624 }, { "epoch": 1.8922936199405442, "grad_norm": 1.0743303298950195, "learning_rate": 6.863111272437312e-05, "loss": 0.11338602304458618, "memory(GiB)": 122.96, "step": 24825, "token_acc": 0.9501730103806229, "train_speed(iter/s)": 0.241637 }, { "epoch": 1.8926747465508043, "grad_norm": 1.1219459772109985, "learning_rate": 6.862000103498101e-05, "loss": 0.12385056018829346, "memory(GiB)": 122.96, "step": 24830, "token_acc": 0.9509090909090909, "train_speed(iter/s)": 0.241651 }, { "epoch": 1.893055873161064, "grad_norm": 1.0967707633972168, "learning_rate": 6.860888827781826e-05, "loss": 0.11414767503738403, "memory(GiB)": 122.96, "step": 24835, "token_acc": 0.951856946354883, "train_speed(iter/s)": 0.241661 }, { "epoch": 1.893436999771324, "grad_norm": 2.3000736236572266, "learning_rate": 6.859777445352214e-05, "loss": 0.12160284519195556, "memory(GiB)": 122.96, "step": 24840, "token_acc": 0.9536500579374276, "train_speed(iter/s)": 0.241675 }, { "epoch": 1.8938181263815839, "grad_norm": 0.7351021766662598, "learning_rate": 6.858665956272994e-05, "loss": 0.10412547588348389, "memory(GiB)": 122.96, "step": 24845, "token_acc": 0.957455268389662, "train_speed(iter/s)": 0.24169 }, { "epoch": 1.8941992529918439, "grad_norm": 0.7144935131072998, "learning_rate": 6.857554360607906e-05, "loss": 0.15285081863403321, "memory(GiB)": 122.96, "step": 24850, "token_acc": 0.9426458832933653, "train_speed(iter/s)": 0.241701 }, { "epoch": 1.894580379602104, "grad_norm": 0.9545551538467407, "learning_rate": 6.856442658420697e-05, "loss": 0.082651025056839, "memory(GiB)": 122.96, "step": 24855, "token_acc": 0.960097577975257, "train_speed(iter/s)": 0.241707 }, { "epoch": 1.8949615062123637, "grad_norm": 0.8078469634056091, "learning_rate": 6.855330849775115e-05, "loss": 0.08170257806777954, "memory(GiB)": 122.96, "step": 24860, "token_acc": 0.9574241617881852, "train_speed(iter/s)": 0.241716 }, { "epoch": 1.8953426328226237, "grad_norm": 0.8147615194320679, "learning_rate": 6.85421893473492e-05, "loss": 0.09920209050178527, "memory(GiB)": 122.96, "step": 24865, "token_acc": 0.9648760330578512, "train_speed(iter/s)": 0.241724 }, { "epoch": 1.8957237594328835, "grad_norm": 0.29824039340019226, "learning_rate": 6.853106913363874e-05, "loss": 0.10400916337966919, "memory(GiB)": 122.96, "step": 24870, "token_acc": 0.9456272518833934, "train_speed(iter/s)": 0.241739 }, { "epoch": 1.8961048860431435, "grad_norm": 2.3162291049957275, "learning_rate": 6.851994785725744e-05, "loss": 0.14934438467025757, "memory(GiB)": 122.96, "step": 24875, "token_acc": 0.9358161648177497, "train_speed(iter/s)": 0.241748 }, { "epoch": 1.8964860126534036, "grad_norm": 0.8843148350715637, "learning_rate": 6.850882551884309e-05, "loss": 0.06940490603446961, "memory(GiB)": 122.96, "step": 24880, "token_acc": 0.9787191327042762, "train_speed(iter/s)": 0.241758 }, { "epoch": 1.8968671392636633, "grad_norm": 0.9068572521209717, "learning_rate": 6.849770211903348e-05, "loss": 0.12149347066879272, "memory(GiB)": 122.96, "step": 24885, "token_acc": 0.9564765042483093, "train_speed(iter/s)": 0.241764 }, { "epoch": 1.8972482658739234, "grad_norm": 0.6874720454216003, "learning_rate": 6.84865776584665e-05, "loss": 0.11975308656692504, "memory(GiB)": 122.96, "step": 24890, "token_acc": 0.9539988968560397, "train_speed(iter/s)": 0.241764 }, { "epoch": 1.8976293924841832, "grad_norm": 0.8322707414627075, "learning_rate": 6.847545213778008e-05, "loss": 0.09546294808387756, "memory(GiB)": 122.96, "step": 24895, "token_acc": 0.960668380462725, "train_speed(iter/s)": 0.241765 }, { "epoch": 1.8980105190944432, "grad_norm": 1.4184728860855103, "learning_rate": 6.84643255576122e-05, "loss": 0.13312849998474122, "memory(GiB)": 122.96, "step": 24900, "token_acc": 0.9412866958151156, "train_speed(iter/s)": 0.241781 }, { "epoch": 1.8983916457047032, "grad_norm": 1.1837490797042847, "learning_rate": 6.845319791860096e-05, "loss": 0.1460339307785034, "memory(GiB)": 122.96, "step": 24905, "token_acc": 0.9531208849091388, "train_speed(iter/s)": 0.241793 }, { "epoch": 1.898772772314963, "grad_norm": 0.7205832600593567, "learning_rate": 6.844206922138444e-05, "loss": 0.12729363441467284, "memory(GiB)": 122.96, "step": 24910, "token_acc": 0.9408050513022889, "train_speed(iter/s)": 0.241805 }, { "epoch": 1.8991538989252228, "grad_norm": 0.7902097105979919, "learning_rate": 6.843093946660082e-05, "loss": 0.0893536388874054, "memory(GiB)": 122.96, "step": 24915, "token_acc": 0.9662415603900976, "train_speed(iter/s)": 0.241814 }, { "epoch": 1.8995350255354828, "grad_norm": 0.47735798358917236, "learning_rate": 6.841980865488837e-05, "loss": 0.1004536509513855, "memory(GiB)": 122.96, "step": 24920, "token_acc": 0.9537428500373042, "train_speed(iter/s)": 0.241825 }, { "epoch": 1.8999161521457428, "grad_norm": 1.2733780145645142, "learning_rate": 6.840867678688536e-05, "loss": 0.13194305896759034, "memory(GiB)": 122.96, "step": 24925, "token_acc": 0.9340631364562119, "train_speed(iter/s)": 0.241837 }, { "epoch": 1.9002972787560029, "grad_norm": 1.5835341215133667, "learning_rate": 6.839754386323017e-05, "loss": 0.13195319175720216, "memory(GiB)": 122.96, "step": 24930, "token_acc": 0.9499501992031872, "train_speed(iter/s)": 0.241847 }, { "epoch": 1.9006784053662626, "grad_norm": 1.0948282480239868, "learning_rate": 6.838640988456122e-05, "loss": 0.11669927835464478, "memory(GiB)": 122.96, "step": 24935, "token_acc": 0.9551983584131327, "train_speed(iter/s)": 0.24186 }, { "epoch": 1.9010595319765224, "grad_norm": 0.6331787109375, "learning_rate": 6.837527485151697e-05, "loss": 0.10433108806610107, "memory(GiB)": 122.96, "step": 24940, "token_acc": 0.9613309352517986, "train_speed(iter/s)": 0.241877 }, { "epoch": 1.9014406585867825, "grad_norm": 1.570054531097412, "learning_rate": 6.8364138764736e-05, "loss": 0.1547531008720398, "memory(GiB)": 122.96, "step": 24945, "token_acc": 0.9385052034058656, "train_speed(iter/s)": 0.241891 }, { "epoch": 1.9018217851970425, "grad_norm": 0.9378734230995178, "learning_rate": 6.835300162485687e-05, "loss": 0.088306725025177, "memory(GiB)": 122.96, "step": 24950, "token_acc": 0.9632132132132132, "train_speed(iter/s)": 0.241904 }, { "epoch": 1.9022029118073025, "grad_norm": 0.7957130670547485, "learning_rate": 6.834186343251827e-05, "loss": 0.0877190113067627, "memory(GiB)": 122.96, "step": 24955, "token_acc": 0.9587178241864983, "train_speed(iter/s)": 0.241913 }, { "epoch": 1.9025840384175623, "grad_norm": 1.6933141946792603, "learning_rate": 6.833072418835893e-05, "loss": 0.09075245261192322, "memory(GiB)": 122.96, "step": 24960, "token_acc": 0.9686876745113682, "train_speed(iter/s)": 0.241921 }, { "epoch": 1.902965165027822, "grad_norm": 0.5800952315330505, "learning_rate": 6.831958389301762e-05, "loss": 0.08738378286361695, "memory(GiB)": 122.96, "step": 24965, "token_acc": 0.9659058075700665, "train_speed(iter/s)": 0.241924 }, { "epoch": 1.903346291638082, "grad_norm": 0.8349543809890747, "learning_rate": 6.830844254713317e-05, "loss": 0.11157078742980957, "memory(GiB)": 122.96, "step": 24970, "token_acc": 0.9595680562531391, "train_speed(iter/s)": 0.241933 }, { "epoch": 1.9037274182483421, "grad_norm": 1.2131571769714355, "learning_rate": 6.829730015134452e-05, "loss": 0.1251460075378418, "memory(GiB)": 122.96, "step": 24975, "token_acc": 0.9507479022254651, "train_speed(iter/s)": 0.241948 }, { "epoch": 1.9041085448586021, "grad_norm": 0.5984216332435608, "learning_rate": 6.828615670629062e-05, "loss": 0.07409882545471191, "memory(GiB)": 122.96, "step": 24980, "token_acc": 0.9588414634146342, "train_speed(iter/s)": 0.241959 }, { "epoch": 1.904489671468862, "grad_norm": 0.5764501690864563, "learning_rate": 6.827501221261047e-05, "loss": 0.14525492191314698, "memory(GiB)": 122.96, "step": 24985, "token_acc": 0.951113525258012, "train_speed(iter/s)": 0.241965 }, { "epoch": 1.9048707980791217, "grad_norm": 1.4766972064971924, "learning_rate": 6.82638666709432e-05, "loss": 0.1032108187675476, "memory(GiB)": 122.96, "step": 24990, "token_acc": 0.9564862659776993, "train_speed(iter/s)": 0.241977 }, { "epoch": 1.9052519246893818, "grad_norm": 1.6881240606307983, "learning_rate": 6.82527200819279e-05, "loss": 0.18155087232589723, "memory(GiB)": 122.96, "step": 24995, "token_acc": 0.9336823734729494, "train_speed(iter/s)": 0.241986 }, { "epoch": 1.9056330512996418, "grad_norm": 0.914800763130188, "learning_rate": 6.824157244620384e-05, "loss": 0.05370696783065796, "memory(GiB)": 122.96, "step": 25000, "token_acc": 0.9627742946708464, "train_speed(iter/s)": 0.242001 }, { "epoch": 1.9056330512996418, "eval_loss": 0.09334749728441238, "eval_runtime": 221.3733, "eval_samples_per_second": 2.394, "eval_steps_per_second": 2.394, "eval_token_acc": 0.9581199927715198, "step": 25000 }, { "epoch": 1.9060141779099018, "grad_norm": 1.0258666276931763, "learning_rate": 6.823042376441023e-05, "loss": 0.10644853115081787, "memory(GiB)": 122.96, "step": 25005, "token_acc": 0.9581188818246039, "train_speed(iter/s)": 0.241487 }, { "epoch": 1.9063953045201616, "grad_norm": 2.187659978866577, "learning_rate": 6.821927403718644e-05, "loss": 0.13958239555358887, "memory(GiB)": 122.96, "step": 25010, "token_acc": 0.9468108108108109, "train_speed(iter/s)": 0.241497 }, { "epoch": 1.9067764311304214, "grad_norm": 0.8859712481498718, "learning_rate": 6.820812326517181e-05, "loss": 0.14543284177780152, "memory(GiB)": 122.96, "step": 25015, "token_acc": 0.9469429004547751, "train_speed(iter/s)": 0.241507 }, { "epoch": 1.9071575577406814, "grad_norm": 1.8382761478424072, "learning_rate": 6.81969714490058e-05, "loss": 0.12560364007949829, "memory(GiB)": 122.96, "step": 25020, "token_acc": 0.9537591483699268, "train_speed(iter/s)": 0.241518 }, { "epoch": 1.9075386843509414, "grad_norm": 0.2636927366256714, "learning_rate": 6.818581858932793e-05, "loss": 0.0989556610584259, "memory(GiB)": 122.96, "step": 25025, "token_acc": 0.944760101010101, "train_speed(iter/s)": 0.241531 }, { "epoch": 1.9079198109612014, "grad_norm": 0.7116786241531372, "learning_rate": 6.817466468677776e-05, "loss": 0.1324480175971985, "memory(GiB)": 122.96, "step": 25030, "token_acc": 0.9510986066452305, "train_speed(iter/s)": 0.241539 }, { "epoch": 1.9083009375714612, "grad_norm": 0.6615849733352661, "learning_rate": 6.816350974199492e-05, "loss": 0.10451627969741821, "memory(GiB)": 122.96, "step": 25035, "token_acc": 0.9651771336553945, "train_speed(iter/s)": 0.241548 }, { "epoch": 1.908682064181721, "grad_norm": 1.1573327779769897, "learning_rate": 6.815235375561907e-05, "loss": 0.13472012281417847, "memory(GiB)": 122.96, "step": 25040, "token_acc": 0.9485230857470605, "train_speed(iter/s)": 0.241553 }, { "epoch": 1.909063190791981, "grad_norm": 0.918311595916748, "learning_rate": 6.814119672828997e-05, "loss": 0.164923095703125, "memory(GiB)": 122.96, "step": 25045, "token_acc": 0.9361958266452648, "train_speed(iter/s)": 0.241561 }, { "epoch": 1.909444317402241, "grad_norm": 1.2527796030044556, "learning_rate": 6.813003866064744e-05, "loss": 0.11071850061416626, "memory(GiB)": 122.96, "step": 25050, "token_acc": 0.956386292834891, "train_speed(iter/s)": 0.241578 }, { "epoch": 1.909825444012501, "grad_norm": 0.8984056115150452, "learning_rate": 6.811887955333132e-05, "loss": 0.14210659265518188, "memory(GiB)": 122.96, "step": 25055, "token_acc": 0.9611226132965264, "train_speed(iter/s)": 0.241587 }, { "epoch": 1.910206570622761, "grad_norm": 0.714992880821228, "learning_rate": 6.810771940698153e-05, "loss": 0.1107908844947815, "memory(GiB)": 122.96, "step": 25060, "token_acc": 0.9556765163297045, "train_speed(iter/s)": 0.241601 }, { "epoch": 1.9105876972330207, "grad_norm": 0.1833721250295639, "learning_rate": 6.809655822223807e-05, "loss": 0.09667259454727173, "memory(GiB)": 122.96, "step": 25065, "token_acc": 0.9577389681375816, "train_speed(iter/s)": 0.241606 }, { "epoch": 1.9109688238432807, "grad_norm": 0.49883928894996643, "learning_rate": 6.808539599974097e-05, "loss": 0.07939456105232238, "memory(GiB)": 122.96, "step": 25070, "token_acc": 0.9698827772768259, "train_speed(iter/s)": 0.241612 }, { "epoch": 1.9113499504535407, "grad_norm": 1.4164178371429443, "learning_rate": 6.807423274013034e-05, "loss": 0.10428711175918579, "memory(GiB)": 122.96, "step": 25075, "token_acc": 0.9589237668161436, "train_speed(iter/s)": 0.241617 }, { "epoch": 1.9117310770638007, "grad_norm": 0.48240795731544495, "learning_rate": 6.806306844404633e-05, "loss": 0.10776156187057495, "memory(GiB)": 122.96, "step": 25080, "token_acc": 0.9622484616663894, "train_speed(iter/s)": 0.241618 }, { "epoch": 1.9121122036740605, "grad_norm": 1.3209139108657837, "learning_rate": 6.805190311212916e-05, "loss": 0.11361923217773437, "memory(GiB)": 122.96, "step": 25085, "token_acc": 0.9580139572131335, "train_speed(iter/s)": 0.241623 }, { "epoch": 1.9124933302843203, "grad_norm": 0.8358122110366821, "learning_rate": 6.804073674501912e-05, "loss": 0.08617077469825744, "memory(GiB)": 122.96, "step": 25090, "token_acc": 0.9566084788029925, "train_speed(iter/s)": 0.241638 }, { "epoch": 1.9128744568945804, "grad_norm": 0.985278844833374, "learning_rate": 6.802956934335657e-05, "loss": 0.13708739280700682, "memory(GiB)": 122.96, "step": 25095, "token_acc": 0.9457053849577214, "train_speed(iter/s)": 0.241649 }, { "epoch": 1.9132555835048404, "grad_norm": 1.320091962814331, "learning_rate": 6.801840090778188e-05, "loss": 0.10410542488098144, "memory(GiB)": 122.96, "step": 25100, "token_acc": 0.964203055084092, "train_speed(iter/s)": 0.241655 }, { "epoch": 1.9136367101151004, "grad_norm": 1.1884346008300781, "learning_rate": 6.80072314389355e-05, "loss": 0.07836112976074219, "memory(GiB)": 122.96, "step": 25105, "token_acc": 0.9668610003068426, "train_speed(iter/s)": 0.241658 }, { "epoch": 1.9140178367253602, "grad_norm": 1.042007327079773, "learning_rate": 6.799606093745796e-05, "loss": 0.1633504867553711, "memory(GiB)": 122.96, "step": 25110, "token_acc": 0.941206985993429, "train_speed(iter/s)": 0.241667 }, { "epoch": 1.91439896333562, "grad_norm": 0.766101062297821, "learning_rate": 6.798488940398985e-05, "loss": 0.11769750118255615, "memory(GiB)": 122.96, "step": 25115, "token_acc": 0.9459336665152204, "train_speed(iter/s)": 0.241677 }, { "epoch": 1.91478008994588, "grad_norm": 1.8784570693969727, "learning_rate": 6.797371683917177e-05, "loss": 0.1532688856124878, "memory(GiB)": 122.96, "step": 25120, "token_acc": 0.9373134328358209, "train_speed(iter/s)": 0.241682 }, { "epoch": 1.91516121655614, "grad_norm": 0.43710601329803467, "learning_rate": 6.796254324364447e-05, "loss": 0.16622772216796874, "memory(GiB)": 122.96, "step": 25125, "token_acc": 0.9494047619047619, "train_speed(iter/s)": 0.241693 }, { "epoch": 1.9155423431664, "grad_norm": 1.303815245628357, "learning_rate": 6.795136861804866e-05, "loss": 0.11955150365829467, "memory(GiB)": 122.96, "step": 25130, "token_acc": 0.955078557202921, "train_speed(iter/s)": 0.241704 }, { "epoch": 1.9159234697766598, "grad_norm": 0.8038737773895264, "learning_rate": 6.794019296302516e-05, "loss": 0.0686455488204956, "memory(GiB)": 122.96, "step": 25135, "token_acc": 0.9683813119395942, "train_speed(iter/s)": 0.241705 }, { "epoch": 1.9163045963869196, "grad_norm": 0.9664490818977356, "learning_rate": 6.792901627921484e-05, "loss": 0.14914438724517823, "memory(GiB)": 122.96, "step": 25140, "token_acc": 0.9429564210221515, "train_speed(iter/s)": 0.241712 }, { "epoch": 1.9166857229971797, "grad_norm": 1.0169305801391602, "learning_rate": 6.791783856725864e-05, "loss": 0.152068829536438, "memory(GiB)": 122.96, "step": 25145, "token_acc": 0.9387010459742156, "train_speed(iter/s)": 0.241723 }, { "epoch": 1.9170668496074397, "grad_norm": 0.9945523738861084, "learning_rate": 6.790665982779755e-05, "loss": 0.09028018712997436, "memory(GiB)": 122.96, "step": 25150, "token_acc": 0.9723766816143498, "train_speed(iter/s)": 0.241731 }, { "epoch": 1.9174479762176995, "grad_norm": 1.4863717555999756, "learning_rate": 6.789548006147262e-05, "loss": 0.11729158163070678, "memory(GiB)": 122.96, "step": 25155, "token_acc": 0.9469874387907175, "train_speed(iter/s)": 0.241743 }, { "epoch": 1.9178291028279595, "grad_norm": 0.29543983936309814, "learning_rate": 6.788429926892494e-05, "loss": 0.08882752656936646, "memory(GiB)": 122.96, "step": 25160, "token_acc": 0.96491705194452, "train_speed(iter/s)": 0.241744 }, { "epoch": 1.9182102294382193, "grad_norm": 0.6096279621124268, "learning_rate": 6.78731174507957e-05, "loss": 0.0772451937198639, "memory(GiB)": 122.96, "step": 25165, "token_acc": 0.9603823030676738, "train_speed(iter/s)": 0.24175 }, { "epoch": 1.9185913560484793, "grad_norm": 0.880556046962738, "learning_rate": 6.786193460772612e-05, "loss": 0.13917295932769774, "memory(GiB)": 122.96, "step": 25170, "token_acc": 0.9521992002908033, "train_speed(iter/s)": 0.241754 }, { "epoch": 1.9189724826587393, "grad_norm": 0.5417191386222839, "learning_rate": 6.785075074035748e-05, "loss": 0.1459651231765747, "memory(GiB)": 122.96, "step": 25175, "token_acc": 0.9593980247687726, "train_speed(iter/s)": 0.241762 }, { "epoch": 1.9193536092689991, "grad_norm": 0.8969895243644714, "learning_rate": 6.783956584933111e-05, "loss": 0.12119768857955933, "memory(GiB)": 122.96, "step": 25180, "token_acc": 0.953452154351964, "train_speed(iter/s)": 0.241771 }, { "epoch": 1.9197347358792591, "grad_norm": 0.7606545686721802, "learning_rate": 6.782837993528843e-05, "loss": 0.111759614944458, "memory(GiB)": 122.96, "step": 25185, "token_acc": 0.954954954954955, "train_speed(iter/s)": 0.24178 }, { "epoch": 1.920115862489519, "grad_norm": 1.2079427242279053, "learning_rate": 6.781719299887089e-05, "loss": 0.07344452142715455, "memory(GiB)": 122.96, "step": 25190, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.241783 }, { "epoch": 1.920496989099779, "grad_norm": 1.076113224029541, "learning_rate": 6.780600504072003e-05, "loss": 0.16829919815063477, "memory(GiB)": 122.96, "step": 25195, "token_acc": 0.9376412961567445, "train_speed(iter/s)": 0.241791 }, { "epoch": 1.920878115710039, "grad_norm": 0.6737515926361084, "learning_rate": 6.77948160614774e-05, "loss": 0.10033812522888183, "memory(GiB)": 122.96, "step": 25200, "token_acc": 0.9628258293838863, "train_speed(iter/s)": 0.241798 }, { "epoch": 1.920878115710039, "eval_loss": 0.09176785498857498, "eval_runtime": 220.2928, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "eval_token_acc": 0.9593774471417384, "step": 25200 }, { "epoch": 1.9212592423202988, "grad_norm": 0.7664294838905334, "learning_rate": 6.778362606178465e-05, "loss": 0.0991840898990631, "memory(GiB)": 122.96, "step": 25205, "token_acc": 0.9593909364858599, "train_speed(iter/s)": 0.241297 }, { "epoch": 1.9216403689305588, "grad_norm": 1.2469252347946167, "learning_rate": 6.777243504228346e-05, "loss": 0.09370362758636475, "memory(GiB)": 122.96, "step": 25210, "token_acc": 0.966142824127571, "train_speed(iter/s)": 0.241298 }, { "epoch": 1.9220214955408186, "grad_norm": 0.8442803025245667, "learning_rate": 6.776124300361562e-05, "loss": 0.10333333015441895, "memory(GiB)": 122.96, "step": 25215, "token_acc": 0.9550077041602465, "train_speed(iter/s)": 0.241311 }, { "epoch": 1.9224026221510786, "grad_norm": 0.4351358413696289, "learning_rate": 6.775004994642289e-05, "loss": 0.15733399391174316, "memory(GiB)": 122.96, "step": 25220, "token_acc": 0.9337767495662233, "train_speed(iter/s)": 0.241324 }, { "epoch": 1.9227837487613386, "grad_norm": 1.0487085580825806, "learning_rate": 6.773885587134716e-05, "loss": 0.08440716862678528, "memory(GiB)": 122.96, "step": 25225, "token_acc": 0.9627067669172933, "train_speed(iter/s)": 0.241335 }, { "epoch": 1.9231648753715984, "grad_norm": 0.7106019258499146, "learning_rate": 6.77276607790304e-05, "loss": 0.10238280296325683, "memory(GiB)": 122.96, "step": 25230, "token_acc": 0.966940866056185, "train_speed(iter/s)": 0.241342 }, { "epoch": 1.9235460019818582, "grad_norm": 1.115917682647705, "learning_rate": 6.771646467011452e-05, "loss": 0.11631821393966675, "memory(GiB)": 122.96, "step": 25235, "token_acc": 0.9448776065276518, "train_speed(iter/s)": 0.24135 }, { "epoch": 1.9239271285921182, "grad_norm": 0.9220513701438904, "learning_rate": 6.770526754524163e-05, "loss": 0.14865119457244874, "memory(GiB)": 122.96, "step": 25240, "token_acc": 0.9490644490644491, "train_speed(iter/s)": 0.241363 }, { "epoch": 1.9243082552023782, "grad_norm": 0.0016941402573138475, "learning_rate": 6.76940694050538e-05, "loss": 0.08053070306777954, "memory(GiB)": 122.96, "step": 25245, "token_acc": 0.9638336347197106, "train_speed(iter/s)": 0.241373 }, { "epoch": 1.9246893818126383, "grad_norm": 2.3710010051727295, "learning_rate": 6.76828702501932e-05, "loss": 0.10468261241912842, "memory(GiB)": 122.96, "step": 25250, "token_acc": 0.9624174707973591, "train_speed(iter/s)": 0.241389 }, { "epoch": 1.925070508422898, "grad_norm": 0.13613176345825195, "learning_rate": 6.767167008130206e-05, "loss": 0.10870151519775391, "memory(GiB)": 122.96, "step": 25255, "token_acc": 0.9508819538670285, "train_speed(iter/s)": 0.241398 }, { "epoch": 1.9254516350331579, "grad_norm": 1.7110073566436768, "learning_rate": 6.766046889902265e-05, "loss": 0.12578210830688477, "memory(GiB)": 122.96, "step": 25260, "token_acc": 0.9539113104135526, "train_speed(iter/s)": 0.241409 }, { "epoch": 1.9258327616434179, "grad_norm": 0.9698331952095032, "learning_rate": 6.76492667039973e-05, "loss": 0.14221386909484862, "memory(GiB)": 122.96, "step": 25265, "token_acc": 0.9473981102806375, "train_speed(iter/s)": 0.241412 }, { "epoch": 1.926213888253678, "grad_norm": 1.0566977262496948, "learning_rate": 6.76380634968684e-05, "loss": 0.12062525749206543, "memory(GiB)": 122.96, "step": 25270, "token_acc": 0.956781914893617, "train_speed(iter/s)": 0.24142 }, { "epoch": 1.926595014863938, "grad_norm": 0.6626619100570679, "learning_rate": 6.762685927827839e-05, "loss": 0.09047983884811402, "memory(GiB)": 122.96, "step": 25275, "token_acc": 0.9670357970641257, "train_speed(iter/s)": 0.241431 }, { "epoch": 1.9269761414741977, "grad_norm": 0.7870355844497681, "learning_rate": 6.761565404886984e-05, "loss": 0.12996337413787842, "memory(GiB)": 122.96, "step": 25280, "token_acc": 0.9512141280353201, "train_speed(iter/s)": 0.24144 }, { "epoch": 1.9273572680844575, "grad_norm": 0.7580623626708984, "learning_rate": 6.760444780928524e-05, "loss": 0.09792162179946899, "memory(GiB)": 122.96, "step": 25285, "token_acc": 0.9669282511210763, "train_speed(iter/s)": 0.241455 }, { "epoch": 1.9277383946947175, "grad_norm": 1.9348212480545044, "learning_rate": 6.759324056016729e-05, "loss": 0.10272096395492554, "memory(GiB)": 122.96, "step": 25290, "token_acc": 0.952, "train_speed(iter/s)": 0.241465 }, { "epoch": 1.9281195213049775, "grad_norm": 0.08082933723926544, "learning_rate": 6.75820323021586e-05, "loss": 0.1022484302520752, "memory(GiB)": 122.96, "step": 25295, "token_acc": 0.955070281124498, "train_speed(iter/s)": 0.241476 }, { "epoch": 1.9285006479152376, "grad_norm": 0.6600849628448486, "learning_rate": 6.757082303590197e-05, "loss": 0.15305899381637572, "memory(GiB)": 122.96, "step": 25300, "token_acc": 0.9451572327044026, "train_speed(iter/s)": 0.24149 }, { "epoch": 1.9288817745254974, "grad_norm": 0.9358091950416565, "learning_rate": 6.755961276204017e-05, "loss": 0.11588840484619141, "memory(GiB)": 122.96, "step": 25305, "token_acc": 0.9525999292536258, "train_speed(iter/s)": 0.241503 }, { "epoch": 1.9292629011357572, "grad_norm": 0.948543131351471, "learning_rate": 6.754840148121607e-05, "loss": 0.15204390287399291, "memory(GiB)": 122.96, "step": 25310, "token_acc": 0.9394752534287418, "train_speed(iter/s)": 0.241515 }, { "epoch": 1.9296440277460172, "grad_norm": 0.43907269835472107, "learning_rate": 6.753718919407257e-05, "loss": 0.0999747633934021, "memory(GiB)": 122.96, "step": 25315, "token_acc": 0.9673343605546996, "train_speed(iter/s)": 0.241527 }, { "epoch": 1.9300251543562772, "grad_norm": 1.5358482599258423, "learning_rate": 6.752597590125266e-05, "loss": 0.12752728462219237, "memory(GiB)": 122.96, "step": 25320, "token_acc": 0.9518008474576272, "train_speed(iter/s)": 0.241538 }, { "epoch": 1.9304062809665372, "grad_norm": 1.0366036891937256, "learning_rate": 6.751476160339937e-05, "loss": 0.09251788854599, "memory(GiB)": 122.96, "step": 25325, "token_acc": 0.9594260267194458, "train_speed(iter/s)": 0.241548 }, { "epoch": 1.930787407576797, "grad_norm": 0.9986807703971863, "learning_rate": 6.750354630115577e-05, "loss": 0.08221243619918824, "memory(GiB)": 122.96, "step": 25330, "token_acc": 0.9695774647887324, "train_speed(iter/s)": 0.241556 }, { "epoch": 1.9311685341870568, "grad_norm": 0.9377936124801636, "learning_rate": 6.749232999516502e-05, "loss": 0.11665012836456298, "memory(GiB)": 122.96, "step": 25335, "token_acc": 0.9554554554554554, "train_speed(iter/s)": 0.241566 }, { "epoch": 1.9315496607973168, "grad_norm": 0.8646098971366882, "learning_rate": 6.748111268607031e-05, "loss": 0.12441972494125367, "memory(GiB)": 122.96, "step": 25340, "token_acc": 0.9633911368015414, "train_speed(iter/s)": 0.241576 }, { "epoch": 1.9319307874075768, "grad_norm": 0.7443369030952454, "learning_rate": 6.74698943745149e-05, "loss": 0.12351741790771484, "memory(GiB)": 122.96, "step": 25345, "token_acc": 0.9458937198067633, "train_speed(iter/s)": 0.241587 }, { "epoch": 1.9323119140178369, "grad_norm": 0.4886271059513092, "learning_rate": 6.745867506114213e-05, "loss": 0.052913129329681396, "memory(GiB)": 122.96, "step": 25350, "token_acc": 0.9730690106601833, "train_speed(iter/s)": 0.241593 }, { "epoch": 1.9326930406280967, "grad_norm": 0.635986328125, "learning_rate": 6.744745474659537e-05, "loss": 0.11649851799011231, "memory(GiB)": 122.96, "step": 25355, "token_acc": 0.9574227581009797, "train_speed(iter/s)": 0.241606 }, { "epoch": 1.9330741672383565, "grad_norm": 0.8671284914016724, "learning_rate": 6.743623343151806e-05, "loss": 0.10125173330307007, "memory(GiB)": 122.96, "step": 25360, "token_acc": 0.9596278743198174, "train_speed(iter/s)": 0.241612 }, { "epoch": 1.9334552938486165, "grad_norm": 0.6492798924446106, "learning_rate": 6.742501111655365e-05, "loss": 0.11291807889938354, "memory(GiB)": 122.96, "step": 25365, "token_acc": 0.9508196721311475, "train_speed(iter/s)": 0.241628 }, { "epoch": 1.9338364204588765, "grad_norm": 1.1481432914733887, "learning_rate": 6.741378780234572e-05, "loss": 0.10060656070709229, "memory(GiB)": 122.96, "step": 25370, "token_acc": 0.9592466309465822, "train_speed(iter/s)": 0.241632 }, { "epoch": 1.9342175470691365, "grad_norm": 0.5594195127487183, "learning_rate": 6.740256348953788e-05, "loss": 0.11030815839767456, "memory(GiB)": 122.96, "step": 25375, "token_acc": 0.9600771456123433, "train_speed(iter/s)": 0.241641 }, { "epoch": 1.9345986736793963, "grad_norm": 0.5683815479278564, "learning_rate": 6.739133817877377e-05, "loss": 0.09129924774169922, "memory(GiB)": 122.96, "step": 25380, "token_acc": 0.9623985423223456, "train_speed(iter/s)": 0.241649 }, { "epoch": 1.934979800289656, "grad_norm": 1.0507827997207642, "learning_rate": 6.738011187069712e-05, "loss": 0.11215239763259888, "memory(GiB)": 122.96, "step": 25385, "token_acc": 0.9563042906027902, "train_speed(iter/s)": 0.241662 }, { "epoch": 1.9353609268999161, "grad_norm": 1.2869359254837036, "learning_rate": 6.736888456595173e-05, "loss": 0.12267132997512817, "memory(GiB)": 122.96, "step": 25390, "token_acc": 0.9458577951728636, "train_speed(iter/s)": 0.241675 }, { "epoch": 1.9357420535101761, "grad_norm": 1.3183735609054565, "learning_rate": 6.735765626518138e-05, "loss": 0.110951828956604, "memory(GiB)": 122.96, "step": 25395, "token_acc": 0.9631914456097059, "train_speed(iter/s)": 0.241686 }, { "epoch": 1.9361231801204362, "grad_norm": 0.5601490139961243, "learning_rate": 6.734642696903001e-05, "loss": 0.04065674245357513, "memory(GiB)": 122.96, "step": 25400, "token_acc": 0.9770114942528736, "train_speed(iter/s)": 0.241693 }, { "epoch": 1.9361231801204362, "eval_loss": 0.09472565352916718, "eval_runtime": 220.1762, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "eval_token_acc": 0.9585115354496717, "step": 25400 }, { "epoch": 1.936504306730696, "grad_norm": 1.0650886297225952, "learning_rate": 6.733519667814156e-05, "loss": 0.07228307723999024, "memory(GiB)": 122.96, "step": 25405, "token_acc": 0.9590520692315531, "train_speed(iter/s)": 0.241196 }, { "epoch": 1.9368854333409558, "grad_norm": 0.7388157844543457, "learning_rate": 6.732396539316003e-05, "loss": 0.10173146724700928, "memory(GiB)": 122.96, "step": 25410, "token_acc": 0.9534435541129932, "train_speed(iter/s)": 0.241199 }, { "epoch": 1.9372665599512158, "grad_norm": 1.1310505867004395, "learning_rate": 6.731273311472945e-05, "loss": 0.11366972923278809, "memory(GiB)": 122.96, "step": 25415, "token_acc": 0.9575306479859895, "train_speed(iter/s)": 0.241209 }, { "epoch": 1.9376476865614758, "grad_norm": 0.7030280828475952, "learning_rate": 6.730149984349397e-05, "loss": 0.10917258262634277, "memory(GiB)": 122.96, "step": 25420, "token_acc": 0.9560412268719731, "train_speed(iter/s)": 0.241213 }, { "epoch": 1.9380288131717358, "grad_norm": 0.8771541118621826, "learning_rate": 6.729026558009778e-05, "loss": 0.08775395750999451, "memory(GiB)": 122.96, "step": 25425, "token_acc": 0.961982540129541, "train_speed(iter/s)": 0.241224 }, { "epoch": 1.9384099397819956, "grad_norm": 1.57017183303833, "learning_rate": 6.727903032518509e-05, "loss": 0.12893235683441162, "memory(GiB)": 122.96, "step": 25430, "token_acc": 0.9568782265411478, "train_speed(iter/s)": 0.241235 }, { "epoch": 1.9387910663922554, "grad_norm": 0.7722845077514648, "learning_rate": 6.726779407940018e-05, "loss": 0.12148727178573608, "memory(GiB)": 122.96, "step": 25435, "token_acc": 0.9488348530901722, "train_speed(iter/s)": 0.241244 }, { "epoch": 1.9391721930025154, "grad_norm": 1.3621958494186401, "learning_rate": 6.725655684338743e-05, "loss": 0.12846884727478028, "memory(GiB)": 122.96, "step": 25440, "token_acc": 0.9514268366727383, "train_speed(iter/s)": 0.241254 }, { "epoch": 1.9395533196127754, "grad_norm": 0.6706028580665588, "learning_rate": 6.724531861779123e-05, "loss": 0.1291172742843628, "memory(GiB)": 122.96, "step": 25445, "token_acc": 0.9546735556599343, "train_speed(iter/s)": 0.241261 }, { "epoch": 1.9399344462230355, "grad_norm": 0.6236817240715027, "learning_rate": 6.723407940325601e-05, "loss": 0.09779455661773681, "memory(GiB)": 122.96, "step": 25450, "token_acc": 0.9653812445223489, "train_speed(iter/s)": 0.241263 }, { "epoch": 1.9403155728332953, "grad_norm": 1.3929885625839233, "learning_rate": 6.722283920042634e-05, "loss": 0.13151650428771972, "memory(GiB)": 122.96, "step": 25455, "token_acc": 0.952457956015524, "train_speed(iter/s)": 0.241274 }, { "epoch": 1.940696699443555, "grad_norm": 1.0076351165771484, "learning_rate": 6.721159800994676e-05, "loss": 0.10368912220001221, "memory(GiB)": 122.96, "step": 25460, "token_acc": 0.9663187855787476, "train_speed(iter/s)": 0.241283 }, { "epoch": 1.941077826053815, "grad_norm": 1.31459641456604, "learning_rate": 6.720035583246189e-05, "loss": 0.08904297947883606, "memory(GiB)": 122.96, "step": 25465, "token_acc": 0.9611111111111111, "train_speed(iter/s)": 0.241293 }, { "epoch": 1.941458952664075, "grad_norm": 1.1427481174468994, "learning_rate": 6.718911266861644e-05, "loss": 0.0969819724559784, "memory(GiB)": 122.96, "step": 25470, "token_acc": 0.962409886714727, "train_speed(iter/s)": 0.241302 }, { "epoch": 1.9418400792743349, "grad_norm": 0.8767688870429993, "learning_rate": 6.717786851905515e-05, "loss": 0.1359075665473938, "memory(GiB)": 122.96, "step": 25475, "token_acc": 0.9422230051083319, "train_speed(iter/s)": 0.24131 }, { "epoch": 1.942221205884595, "grad_norm": 0.9665900468826294, "learning_rate": 6.716662338442282e-05, "loss": 0.09809097051620483, "memory(GiB)": 122.96, "step": 25480, "token_acc": 0.9605519724013799, "train_speed(iter/s)": 0.241317 }, { "epoch": 1.9426023324948547, "grad_norm": 1.3745023012161255, "learning_rate": 6.715537726536427e-05, "loss": 0.13839807510375976, "memory(GiB)": 122.96, "step": 25485, "token_acc": 0.9557544757033248, "train_speed(iter/s)": 0.241328 }, { "epoch": 1.9429834591051147, "grad_norm": 0.7256761193275452, "learning_rate": 6.714413016252448e-05, "loss": 0.11799775362014771, "memory(GiB)": 122.96, "step": 25490, "token_acc": 0.9610619469026549, "train_speed(iter/s)": 0.24134 }, { "epoch": 1.9433645857153747, "grad_norm": 0.8939529657363892, "learning_rate": 6.713288207654838e-05, "loss": 0.10995889902114868, "memory(GiB)": 122.96, "step": 25495, "token_acc": 0.9559132260321903, "train_speed(iter/s)": 0.241354 }, { "epoch": 1.9437457123256345, "grad_norm": 0.7190664410591125, "learning_rate": 6.712163300808098e-05, "loss": 0.10507405996322632, "memory(GiB)": 122.96, "step": 25500, "token_acc": 0.966094934184284, "train_speed(iter/s)": 0.241367 }, { "epoch": 1.9441268389358946, "grad_norm": 0.5132302045822144, "learning_rate": 6.71103829577674e-05, "loss": 0.06041609644889832, "memory(GiB)": 122.96, "step": 25505, "token_acc": 0.9787556904400607, "train_speed(iter/s)": 0.241366 }, { "epoch": 1.9445079655461543, "grad_norm": 0.6095417737960815, "learning_rate": 6.709913192625276e-05, "loss": 0.08416474461555482, "memory(GiB)": 122.96, "step": 25510, "token_acc": 0.9659543643607389, "train_speed(iter/s)": 0.241374 }, { "epoch": 1.9448890921564144, "grad_norm": 0.8764410614967346, "learning_rate": 6.708787991418222e-05, "loss": 0.11599817276000976, "memory(GiB)": 122.96, "step": 25515, "token_acc": 0.951212166842343, "train_speed(iter/s)": 0.241381 }, { "epoch": 1.9452702187666744, "grad_norm": 0.7054735422134399, "learning_rate": 6.70766269222011e-05, "loss": 0.09775006771087646, "memory(GiB)": 122.96, "step": 25520, "token_acc": 0.9633315872184389, "train_speed(iter/s)": 0.241388 }, { "epoch": 1.9456513453769342, "grad_norm": 1.1979063749313354, "learning_rate": 6.706537295095467e-05, "loss": 0.10299659967422485, "memory(GiB)": 122.96, "step": 25525, "token_acc": 0.9625212947189097, "train_speed(iter/s)": 0.241402 }, { "epoch": 1.9460324719871942, "grad_norm": 1.209309458732605, "learning_rate": 6.70541180010883e-05, "loss": 0.09154881238937378, "memory(GiB)": 122.96, "step": 25530, "token_acc": 0.9685662596110357, "train_speed(iter/s)": 0.241411 }, { "epoch": 1.946413598597454, "grad_norm": 0.8875465989112854, "learning_rate": 6.704286207324737e-05, "loss": 0.087744802236557, "memory(GiB)": 122.96, "step": 25535, "token_acc": 0.9697137580794091, "train_speed(iter/s)": 0.241421 }, { "epoch": 1.946794725207714, "grad_norm": 0.8628054261207581, "learning_rate": 6.703160516807742e-05, "loss": 0.1355154037475586, "memory(GiB)": 122.96, "step": 25540, "token_acc": 0.9499861840287372, "train_speed(iter/s)": 0.241432 }, { "epoch": 1.947175851817974, "grad_norm": 1.8349336385726929, "learning_rate": 6.702034728622393e-05, "loss": 0.11317617893218994, "memory(GiB)": 122.96, "step": 25545, "token_acc": 0.9651810584958217, "train_speed(iter/s)": 0.241439 }, { "epoch": 1.9475569784282338, "grad_norm": 0.5899932384490967, "learning_rate": 6.700908842833251e-05, "loss": 0.09008875489234924, "memory(GiB)": 122.96, "step": 25550, "token_acc": 0.9568801521876982, "train_speed(iter/s)": 0.241451 }, { "epoch": 1.9479381050384936, "grad_norm": 1.1865785121917725, "learning_rate": 6.69978285950488e-05, "loss": 0.15021508932113647, "memory(GiB)": 122.96, "step": 25555, "token_acc": 0.9430409914204004, "train_speed(iter/s)": 0.241449 }, { "epoch": 1.9483192316487536, "grad_norm": 0.5801621079444885, "learning_rate": 6.69865677870185e-05, "loss": 0.061830770969390866, "memory(GiB)": 122.96, "step": 25560, "token_acc": 0.9678518518518519, "train_speed(iter/s)": 0.241452 }, { "epoch": 1.9487003582590137, "grad_norm": 0.8859827518463135, "learning_rate": 6.697530600488738e-05, "loss": 0.08466415405273438, "memory(GiB)": 122.96, "step": 25565, "token_acc": 0.9633275904504061, "train_speed(iter/s)": 0.241462 }, { "epoch": 1.9490814848692737, "grad_norm": 0.7899249792098999, "learning_rate": 6.696404324930123e-05, "loss": 0.11362833976745605, "memory(GiB)": 122.96, "step": 25570, "token_acc": 0.961352657004831, "train_speed(iter/s)": 0.241473 }, { "epoch": 1.9494626114795335, "grad_norm": 0.9784717559814453, "learning_rate": 6.69527795209059e-05, "loss": 0.09864280819892883, "memory(GiB)": 122.96, "step": 25575, "token_acc": 0.951063829787234, "train_speed(iter/s)": 0.241482 }, { "epoch": 1.9498437380897933, "grad_norm": 0.7010583877563477, "learning_rate": 6.694151482034736e-05, "loss": 0.11802574396133422, "memory(GiB)": 122.96, "step": 25580, "token_acc": 0.9591102387962054, "train_speed(iter/s)": 0.241484 }, { "epoch": 1.9502248647000533, "grad_norm": 0.6966609358787537, "learning_rate": 6.693024914827155e-05, "loss": 0.13576021194458007, "memory(GiB)": 122.96, "step": 25585, "token_acc": 0.9517901453385325, "train_speed(iter/s)": 0.24149 }, { "epoch": 1.9506059913103133, "grad_norm": 0.7725162506103516, "learning_rate": 6.691898250532453e-05, "loss": 0.08475621938705444, "memory(GiB)": 122.96, "step": 25590, "token_acc": 0.95744301994302, "train_speed(iter/s)": 0.241498 }, { "epoch": 1.9509871179205733, "grad_norm": 0.5484259128570557, "learning_rate": 6.690771489215237e-05, "loss": 0.09059439301490783, "memory(GiB)": 122.96, "step": 25595, "token_acc": 0.9613439306358381, "train_speed(iter/s)": 0.241511 }, { "epoch": 1.9513682445308331, "grad_norm": 1.1703639030456543, "learning_rate": 6.689644630940121e-05, "loss": 0.10038014650344848, "memory(GiB)": 122.96, "step": 25600, "token_acc": 0.961570362178601, "train_speed(iter/s)": 0.241521 }, { "epoch": 1.9513682445308331, "eval_loss": 0.09341057389974594, "eval_runtime": 221.9371, "eval_samples_per_second": 2.388, "eval_steps_per_second": 2.388, "eval_token_acc": 0.9590235527980242, "step": 25600 }, { "epoch": 1.951749371141093, "grad_norm": 0.7381249070167542, "learning_rate": 6.688517675771729e-05, "loss": 0.07046685814857483, "memory(GiB)": 122.96, "step": 25605, "token_acc": 0.9595257296048473, "train_speed(iter/s)": 0.241023 }, { "epoch": 1.952130497751353, "grad_norm": 1.02713143825531, "learning_rate": 6.687390623774683e-05, "loss": 0.1318049669265747, "memory(GiB)": 122.96, "step": 25610, "token_acc": 0.9346781940441883, "train_speed(iter/s)": 0.241036 }, { "epoch": 1.952511624361613, "grad_norm": 0.9179072976112366, "learning_rate": 6.686263475013616e-05, "loss": 0.07630914449691772, "memory(GiB)": 122.96, "step": 25615, "token_acc": 0.966417194396469, "train_speed(iter/s)": 0.241046 }, { "epoch": 1.952892750971873, "grad_norm": 0.9029240012168884, "learning_rate": 6.68513622955316e-05, "loss": 0.08883668184280395, "memory(GiB)": 122.96, "step": 25620, "token_acc": 0.9594714964370546, "train_speed(iter/s)": 0.24105 }, { "epoch": 1.9532738775821328, "grad_norm": 0.9136301279067993, "learning_rate": 6.684008887457964e-05, "loss": 0.12069019079208373, "memory(GiB)": 122.96, "step": 25625, "token_acc": 0.9536263143704502, "train_speed(iter/s)": 0.241063 }, { "epoch": 1.9536550041923926, "grad_norm": 1.1142699718475342, "learning_rate": 6.682881448792673e-05, "loss": 0.11324212551116944, "memory(GiB)": 122.96, "step": 25630, "token_acc": 0.9410589410589411, "train_speed(iter/s)": 0.241076 }, { "epoch": 1.9540361308026526, "grad_norm": 0.6013747453689575, "learning_rate": 6.681753913621941e-05, "loss": 0.1016167163848877, "memory(GiB)": 122.96, "step": 25635, "token_acc": 0.9546533087266016, "train_speed(iter/s)": 0.241082 }, { "epoch": 1.9544172574129126, "grad_norm": 0.6005774736404419, "learning_rate": 6.680626282010425e-05, "loss": 0.12658778429031373, "memory(GiB)": 122.96, "step": 25640, "token_acc": 0.9536118162935611, "train_speed(iter/s)": 0.241092 }, { "epoch": 1.9547983840231726, "grad_norm": 0.658902645111084, "learning_rate": 6.679498554022792e-05, "loss": 0.04936954379081726, "memory(GiB)": 122.96, "step": 25645, "token_acc": 0.9702393340270552, "train_speed(iter/s)": 0.241102 }, { "epoch": 1.9551795106334324, "grad_norm": 0.8696643710136414, "learning_rate": 6.678370729723711e-05, "loss": 0.12246394157409668, "memory(GiB)": 122.96, "step": 25650, "token_acc": 0.9489478499542543, "train_speed(iter/s)": 0.241108 }, { "epoch": 1.9555606372436922, "grad_norm": 1.0736279487609863, "learning_rate": 6.677242809177856e-05, "loss": 0.11863361597061158, "memory(GiB)": 122.96, "step": 25655, "token_acc": 0.9517839922854388, "train_speed(iter/s)": 0.241118 }, { "epoch": 1.9559417638539522, "grad_norm": 0.7502106428146362, "learning_rate": 6.67611479244991e-05, "loss": 0.12168625593185425, "memory(GiB)": 122.96, "step": 25660, "token_acc": 0.9590840840840841, "train_speed(iter/s)": 0.241132 }, { "epoch": 1.9563228904642123, "grad_norm": 0.9132732152938843, "learning_rate": 6.674986679604559e-05, "loss": 0.1333797335624695, "memory(GiB)": 122.96, "step": 25665, "token_acc": 0.9505381545531503, "train_speed(iter/s)": 0.241138 }, { "epoch": 1.9567040170744723, "grad_norm": 0.6566485166549683, "learning_rate": 6.673858470706493e-05, "loss": 0.12929890155792237, "memory(GiB)": 122.96, "step": 25670, "token_acc": 0.9515447154471545, "train_speed(iter/s)": 0.241149 }, { "epoch": 1.957085143684732, "grad_norm": 0.5940651297569275, "learning_rate": 6.672730165820414e-05, "loss": 0.07238308191299439, "memory(GiB)": 122.96, "step": 25675, "token_acc": 0.9696485623003195, "train_speed(iter/s)": 0.24116 }, { "epoch": 1.9574662702949919, "grad_norm": 1.4403315782546997, "learning_rate": 6.67160176501102e-05, "loss": 0.10333422422409058, "memory(GiB)": 122.96, "step": 25680, "token_acc": 0.9662047989185536, "train_speed(iter/s)": 0.241172 }, { "epoch": 1.957847396905252, "grad_norm": 0.1298375129699707, "learning_rate": 6.670473268343022e-05, "loss": 0.06644163131713868, "memory(GiB)": 122.96, "step": 25685, "token_acc": 0.9723175204069561, "train_speed(iter/s)": 0.241172 }, { "epoch": 1.958228523515512, "grad_norm": 1.3385024070739746, "learning_rate": 6.669344675881135e-05, "loss": 0.15387394428253173, "memory(GiB)": 122.96, "step": 25690, "token_acc": 0.9348692955250333, "train_speed(iter/s)": 0.241187 }, { "epoch": 1.958609650125772, "grad_norm": 1.4127888679504395, "learning_rate": 6.668215987690079e-05, "loss": 0.1421678900718689, "memory(GiB)": 122.96, "step": 25695, "token_acc": 0.9490995497748874, "train_speed(iter/s)": 0.241188 }, { "epoch": 1.9589907767360317, "grad_norm": 0.8296637535095215, "learning_rate": 6.667087203834576e-05, "loss": 0.09716130495071411, "memory(GiB)": 122.96, "step": 25700, "token_acc": 0.9679073614557485, "train_speed(iter/s)": 0.241194 }, { "epoch": 1.9593719033462915, "grad_norm": 0.5406894683837891, "learning_rate": 6.665958324379358e-05, "loss": 0.12091450691223145, "memory(GiB)": 122.96, "step": 25705, "token_acc": 0.9604925602873269, "train_speed(iter/s)": 0.241199 }, { "epoch": 1.9597530299565515, "grad_norm": 0.8709231019020081, "learning_rate": 6.664829349389161e-05, "loss": 0.09301092624664306, "memory(GiB)": 122.96, "step": 25710, "token_acc": 0.9612716763005781, "train_speed(iter/s)": 0.241207 }, { "epoch": 1.9601341565668116, "grad_norm": 2.548740863800049, "learning_rate": 6.663700278928728e-05, "loss": 0.14764015674591063, "memory(GiB)": 122.96, "step": 25715, "token_acc": 0.9407665505226481, "train_speed(iter/s)": 0.241221 }, { "epoch": 1.9605152831770716, "grad_norm": 1.3500055074691772, "learning_rate": 6.662571113062804e-05, "loss": 0.10981954336166382, "memory(GiB)": 122.96, "step": 25720, "token_acc": 0.9504634994206257, "train_speed(iter/s)": 0.241235 }, { "epoch": 1.9608964097873314, "grad_norm": 0.9150088429450989, "learning_rate": 6.661441851856141e-05, "loss": 0.10976629257202149, "memory(GiB)": 122.96, "step": 25725, "token_acc": 0.9597349643221202, "train_speed(iter/s)": 0.24125 }, { "epoch": 1.9612775363975912, "grad_norm": 0.8238550424575806, "learning_rate": 6.660312495373498e-05, "loss": 0.1517275094985962, "memory(GiB)": 122.96, "step": 25730, "token_acc": 0.937821993249245, "train_speed(iter/s)": 0.241261 }, { "epoch": 1.9616586630078512, "grad_norm": 0.8592469692230225, "learning_rate": 6.659183043679638e-05, "loss": 0.12827932834625244, "memory(GiB)": 122.96, "step": 25735, "token_acc": 0.9418859649122807, "train_speed(iter/s)": 0.241275 }, { "epoch": 1.9620397896181112, "grad_norm": 0.7015056014060974, "learning_rate": 6.65805349683933e-05, "loss": 0.1013903021812439, "memory(GiB)": 122.96, "step": 25740, "token_acc": 0.9646127580319727, "train_speed(iter/s)": 0.241282 }, { "epoch": 1.9624209162283712, "grad_norm": 0.6849764585494995, "learning_rate": 6.656923854917349e-05, "loss": 0.13102164268493652, "memory(GiB)": 122.96, "step": 25745, "token_acc": 0.9521410579345088, "train_speed(iter/s)": 0.241289 }, { "epoch": 1.962802042838631, "grad_norm": 1.4717473983764648, "learning_rate": 6.655794117978475e-05, "loss": 0.1342444658279419, "memory(GiB)": 122.96, "step": 25750, "token_acc": 0.9534383954154728, "train_speed(iter/s)": 0.241299 }, { "epoch": 1.9631831694488908, "grad_norm": 0.5564463138580322, "learning_rate": 6.65466428608749e-05, "loss": 0.06685240268707275, "memory(GiB)": 122.96, "step": 25755, "token_acc": 0.9690801339860861, "train_speed(iter/s)": 0.241309 }, { "epoch": 1.9635642960591508, "grad_norm": 1.2880631685256958, "learning_rate": 6.653534359309186e-05, "loss": 0.10643432140350342, "memory(GiB)": 122.96, "step": 25760, "token_acc": 0.9591044776119403, "train_speed(iter/s)": 0.241319 }, { "epoch": 1.9639454226694109, "grad_norm": 0.5333672165870667, "learning_rate": 6.652404337708362e-05, "loss": 0.10490972995758056, "memory(GiB)": 122.96, "step": 25765, "token_acc": 0.9683351991044299, "train_speed(iter/s)": 0.241325 }, { "epoch": 1.9643265492796707, "grad_norm": 0.7675747871398926, "learning_rate": 6.651274221349817e-05, "loss": 0.10806642770767212, "memory(GiB)": 122.96, "step": 25770, "token_acc": 0.9562964242528934, "train_speed(iter/s)": 0.241332 }, { "epoch": 1.9647076758899307, "grad_norm": 0.8232045769691467, "learning_rate": 6.650144010298358e-05, "loss": 0.09376006126403809, "memory(GiB)": 122.96, "step": 25775, "token_acc": 0.9619714786089567, "train_speed(iter/s)": 0.241341 }, { "epoch": 1.9650888025001905, "grad_norm": 0.804635226726532, "learning_rate": 6.649013704618798e-05, "loss": 0.06518564820289612, "memory(GiB)": 122.96, "step": 25780, "token_acc": 0.9721529509559435, "train_speed(iter/s)": 0.241343 }, { "epoch": 1.9654699291104505, "grad_norm": 0.9937106370925903, "learning_rate": 6.647883304375954e-05, "loss": 0.09170768857002258, "memory(GiB)": 122.96, "step": 25785, "token_acc": 0.9618353344768439, "train_speed(iter/s)": 0.241356 }, { "epoch": 1.9658510557207105, "grad_norm": 1.1889737844467163, "learning_rate": 6.646752809634649e-05, "loss": 0.16496696472167968, "memory(GiB)": 122.96, "step": 25790, "token_acc": 0.9310958555809814, "train_speed(iter/s)": 0.241366 }, { "epoch": 1.9662321823309703, "grad_norm": 1.7084089517593384, "learning_rate": 6.645622220459713e-05, "loss": 0.1551036834716797, "memory(GiB)": 122.96, "step": 25795, "token_acc": 0.9443495240419819, "train_speed(iter/s)": 0.241376 }, { "epoch": 1.9666133089412303, "grad_norm": 1.657727837562561, "learning_rate": 6.644491536915979e-05, "loss": 0.11817775964736939, "memory(GiB)": 122.96, "step": 25800, "token_acc": 0.9529004789781799, "train_speed(iter/s)": 0.241387 }, { "epoch": 1.9666133089412303, "eval_loss": 0.09299156814813614, "eval_runtime": 219.9905, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9594452141437263, "step": 25800 }, { "epoch": 1.9669944355514901, "grad_norm": 1.294771671295166, "learning_rate": 6.643360759068286e-05, "loss": 0.08172635436058044, "memory(GiB)": 122.96, "step": 25805, "token_acc": 0.9595520191200688, "train_speed(iter/s)": 0.240897 }, { "epoch": 1.9673755621617501, "grad_norm": 0.6386441588401794, "learning_rate": 6.642229886981481e-05, "loss": 0.11432948112487792, "memory(GiB)": 122.96, "step": 25810, "token_acc": 0.9651987110633727, "train_speed(iter/s)": 0.240905 }, { "epoch": 1.9677566887720102, "grad_norm": 0.7928968667984009, "learning_rate": 6.641098920720413e-05, "loss": 0.13426809310913085, "memory(GiB)": 122.96, "step": 25815, "token_acc": 0.9544205222171324, "train_speed(iter/s)": 0.240906 }, { "epoch": 1.96813781538227, "grad_norm": 0.9315042495727539, "learning_rate": 6.639967860349936e-05, "loss": 0.07573255896568298, "memory(GiB)": 122.96, "step": 25820, "token_acc": 0.964801049409707, "train_speed(iter/s)": 0.240916 }, { "epoch": 1.96851894199253, "grad_norm": 0.5195799469947815, "learning_rate": 6.638836705934913e-05, "loss": 0.09939044713973999, "memory(GiB)": 122.96, "step": 25825, "token_acc": 0.9623291476830759, "train_speed(iter/s)": 0.240914 }, { "epoch": 1.9689000686027898, "grad_norm": 0.3561801612377167, "learning_rate": 6.63770545754021e-05, "loss": 0.09861472845077515, "memory(GiB)": 122.96, "step": 25830, "token_acc": 0.9544769369912571, "train_speed(iter/s)": 0.240924 }, { "epoch": 1.9692811952130498, "grad_norm": 0.6425355076789856, "learning_rate": 6.6365741152307e-05, "loss": 0.11386866569519043, "memory(GiB)": 122.96, "step": 25835, "token_acc": 0.9542144748455428, "train_speed(iter/s)": 0.240926 }, { "epoch": 1.9696623218233098, "grad_norm": 0.6313372850418091, "learning_rate": 6.635442679071259e-05, "loss": 0.09547186493873597, "memory(GiB)": 122.96, "step": 25840, "token_acc": 0.9513212795549374, "train_speed(iter/s)": 0.240936 }, { "epoch": 1.9700434484335696, "grad_norm": 1.4487853050231934, "learning_rate": 6.634311149126769e-05, "loss": 0.1002464771270752, "memory(GiB)": 122.96, "step": 25845, "token_acc": 0.9710769230769231, "train_speed(iter/s)": 0.240943 }, { "epoch": 1.9704245750438294, "grad_norm": 2.4945571422576904, "learning_rate": 6.633179525462119e-05, "loss": 0.13840703964233397, "memory(GiB)": 122.96, "step": 25850, "token_acc": 0.9464346639372228, "train_speed(iter/s)": 0.240954 }, { "epoch": 1.9708057016540894, "grad_norm": 1.7437186241149902, "learning_rate": 6.632047808142203e-05, "loss": 0.12849892377853395, "memory(GiB)": 122.96, "step": 25855, "token_acc": 0.9591386910817227, "train_speed(iter/s)": 0.240964 }, { "epoch": 1.9711868282643494, "grad_norm": 0.8548665046691895, "learning_rate": 6.630915997231916e-05, "loss": 0.09771297574043274, "memory(GiB)": 122.96, "step": 25860, "token_acc": 0.9513023782559457, "train_speed(iter/s)": 0.240968 }, { "epoch": 1.9715679548746095, "grad_norm": 0.6223412752151489, "learning_rate": 6.629784092796167e-05, "loss": 0.11660987138748169, "memory(GiB)": 122.96, "step": 25865, "token_acc": 0.9631732168850072, "train_speed(iter/s)": 0.240971 }, { "epoch": 1.9719490814848692, "grad_norm": 1.3483121395111084, "learning_rate": 6.628652094899863e-05, "loss": 0.13441104888916017, "memory(GiB)": 122.96, "step": 25870, "token_acc": 0.9424682957005877, "train_speed(iter/s)": 0.240984 }, { "epoch": 1.972330208095129, "grad_norm": 0.8816883563995361, "learning_rate": 6.627520003607918e-05, "loss": 0.1043013334274292, "memory(GiB)": 122.96, "step": 25875, "token_acc": 0.9655812384005399, "train_speed(iter/s)": 0.240991 }, { "epoch": 1.972711334705389, "grad_norm": 0.9880886673927307, "learning_rate": 6.626387818985257e-05, "loss": 0.09282463788986206, "memory(GiB)": 122.96, "step": 25880, "token_acc": 0.964837829645347, "train_speed(iter/s)": 0.241004 }, { "epoch": 1.973092461315649, "grad_norm": 0.6471860408782959, "learning_rate": 6.625255541096799e-05, "loss": 0.07190305590629578, "memory(GiB)": 122.96, "step": 25885, "token_acc": 0.9673681000304971, "train_speed(iter/s)": 0.241016 }, { "epoch": 1.973473587925909, "grad_norm": 1.3265693187713623, "learning_rate": 6.62412317000748e-05, "loss": 0.17652335166931152, "memory(GiB)": 122.96, "step": 25890, "token_acc": 0.9248677248677248, "train_speed(iter/s)": 0.241031 }, { "epoch": 1.973854714536169, "grad_norm": 1.5288797616958618, "learning_rate": 6.62299070578223e-05, "loss": 0.1100031852722168, "memory(GiB)": 122.96, "step": 25895, "token_acc": 0.9494511105437835, "train_speed(iter/s)": 0.241041 }, { "epoch": 1.9742358411464287, "grad_norm": 0.4328297972679138, "learning_rate": 6.621858148485997e-05, "loss": 0.1015278458595276, "memory(GiB)": 122.96, "step": 25900, "token_acc": 0.9672769081100203, "train_speed(iter/s)": 0.241036 }, { "epoch": 1.9746169677566887, "grad_norm": 0.6530630588531494, "learning_rate": 6.620725498183728e-05, "loss": 0.10842350721359253, "memory(GiB)": 122.96, "step": 25905, "token_acc": 0.9621857469353834, "train_speed(iter/s)": 0.241045 }, { "epoch": 1.9749980943669487, "grad_norm": 1.2312203645706177, "learning_rate": 6.61959275494037e-05, "loss": 0.11299914121627808, "memory(GiB)": 122.96, "step": 25910, "token_acc": 0.953551912568306, "train_speed(iter/s)": 0.241056 }, { "epoch": 1.9753792209772087, "grad_norm": 0.6641731858253479, "learning_rate": 6.618459918820883e-05, "loss": 0.08720279932022094, "memory(GiB)": 122.96, "step": 25915, "token_acc": 0.9589142963376093, "train_speed(iter/s)": 0.241064 }, { "epoch": 1.9757603475874685, "grad_norm": 0.5112627744674683, "learning_rate": 6.617326989890232e-05, "loss": 0.12663037776947023, "memory(GiB)": 122.96, "step": 25920, "token_acc": 0.9549192364170338, "train_speed(iter/s)": 0.241068 }, { "epoch": 1.9761414741977283, "grad_norm": 0.6182435750961304, "learning_rate": 6.616193968213383e-05, "loss": 0.09054120779037475, "memory(GiB)": 122.96, "step": 25925, "token_acc": 0.9663461538461539, "train_speed(iter/s)": 0.241082 }, { "epoch": 1.9765226008079884, "grad_norm": 0.8282514214515686, "learning_rate": 6.615060853855308e-05, "loss": 0.07046051025390625, "memory(GiB)": 122.96, "step": 25930, "token_acc": 0.9675625579240037, "train_speed(iter/s)": 0.241092 }, { "epoch": 1.9769037274182484, "grad_norm": 0.8812187910079956, "learning_rate": 6.613927646880991e-05, "loss": 0.09891223907470703, "memory(GiB)": 122.96, "step": 25935, "token_acc": 0.960220740400092, "train_speed(iter/s)": 0.241103 }, { "epoch": 1.9772848540285084, "grad_norm": 0.9234439730644226, "learning_rate": 6.612794347355411e-05, "loss": 0.12211040258407593, "memory(GiB)": 122.96, "step": 25940, "token_acc": 0.963098016336056, "train_speed(iter/s)": 0.241104 }, { "epoch": 1.9776659806387682, "grad_norm": 1.0267410278320312, "learning_rate": 6.611660955343559e-05, "loss": 0.09554874897003174, "memory(GiB)": 122.96, "step": 25945, "token_acc": 0.9566082082806002, "train_speed(iter/s)": 0.241111 }, { "epoch": 1.978047107249028, "grad_norm": 0.9504725933074951, "learning_rate": 6.610527470910432e-05, "loss": 0.09236326217651367, "memory(GiB)": 122.96, "step": 25950, "token_acc": 0.9621596677434241, "train_speed(iter/s)": 0.241122 }, { "epoch": 1.978428233859288, "grad_norm": 1.0098053216934204, "learning_rate": 6.609393894121027e-05, "loss": 0.11468203067779541, "memory(GiB)": 122.96, "step": 25955, "token_acc": 0.9433714607162947, "train_speed(iter/s)": 0.24113 }, { "epoch": 1.978809360469548, "grad_norm": 0.9678493738174438, "learning_rate": 6.60826022504035e-05, "loss": 0.12111028432846069, "memory(GiB)": 122.96, "step": 25960, "token_acc": 0.9518468670554808, "train_speed(iter/s)": 0.241133 }, { "epoch": 1.979190487079808, "grad_norm": 0.8734918832778931, "learning_rate": 6.607126463733413e-05, "loss": 0.08237897753715515, "memory(GiB)": 122.96, "step": 25965, "token_acc": 0.9657481328869431, "train_speed(iter/s)": 0.241137 }, { "epoch": 1.9795716136900678, "grad_norm": 0.5150777697563171, "learning_rate": 6.605992610265233e-05, "loss": 0.09220286011695862, "memory(GiB)": 122.96, "step": 25970, "token_acc": 0.9566717791411042, "train_speed(iter/s)": 0.241146 }, { "epoch": 1.9799527403003276, "grad_norm": 0.8618993759155273, "learning_rate": 6.604858664700828e-05, "loss": 0.13282779455184937, "memory(GiB)": 122.96, "step": 25975, "token_acc": 0.9450122649223222, "train_speed(iter/s)": 0.241153 }, { "epoch": 1.9803338669105877, "grad_norm": 1.9674850702285767, "learning_rate": 6.603724627105226e-05, "loss": 0.1259993553161621, "memory(GiB)": 122.96, "step": 25980, "token_acc": 0.9544296631757713, "train_speed(iter/s)": 0.241165 }, { "epoch": 1.9807149935208477, "grad_norm": 0.8697695136070251, "learning_rate": 6.602590497543459e-05, "loss": 0.11042227745056152, "memory(GiB)": 122.96, "step": 25985, "token_acc": 0.9574126155082362, "train_speed(iter/s)": 0.241172 }, { "epoch": 1.9810961201311077, "grad_norm": 0.6823428869247437, "learning_rate": 6.601456276080564e-05, "loss": 0.13682951927185058, "memory(GiB)": 122.96, "step": 25990, "token_acc": 0.9424623633738916, "train_speed(iter/s)": 0.241181 }, { "epoch": 1.9814772467413675, "grad_norm": 0.7488774657249451, "learning_rate": 6.600321962781584e-05, "loss": 0.09563738703727723, "memory(GiB)": 122.96, "step": 25995, "token_acc": 0.9650494027429583, "train_speed(iter/s)": 0.241187 }, { "epoch": 1.9818583733516273, "grad_norm": 0.9882155656814575, "learning_rate": 6.599187557711564e-05, "loss": 0.14637744426727295, "memory(GiB)": 122.96, "step": 26000, "token_acc": 0.9258278145695364, "train_speed(iter/s)": 0.241203 }, { "epoch": 1.9818583733516273, "eval_loss": 0.08902326971292496, "eval_runtime": 219.0777, "eval_samples_per_second": 2.419, "eval_steps_per_second": 2.419, "eval_token_acc": 0.9598593458225408, "step": 26000 }, { "epoch": 1.9822394999618873, "grad_norm": 1.566598892211914, "learning_rate": 6.59805306093556e-05, "loss": 0.1422951936721802, "memory(GiB)": 122.96, "step": 26005, "token_acc": 0.9597559203835537, "train_speed(iter/s)": 0.240717 }, { "epoch": 1.9826206265721473, "grad_norm": 0.5387257933616638, "learning_rate": 6.596918472518628e-05, "loss": 0.09192507863044738, "memory(GiB)": 122.96, "step": 26010, "token_acc": 0.9663359319631467, "train_speed(iter/s)": 0.240723 }, { "epoch": 1.9830017531824073, "grad_norm": 1.585392713546753, "learning_rate": 6.595783792525833e-05, "loss": 0.12191234827041626, "memory(GiB)": 122.96, "step": 26015, "token_acc": 0.9467140319715808, "train_speed(iter/s)": 0.240735 }, { "epoch": 1.9833828797926671, "grad_norm": 0.936375081539154, "learning_rate": 6.594649021022241e-05, "loss": 0.157107150554657, "memory(GiB)": 122.96, "step": 26020, "token_acc": 0.9419546882751778, "train_speed(iter/s)": 0.240739 }, { "epoch": 1.983764006402927, "grad_norm": 0.6655643582344055, "learning_rate": 6.59351415807293e-05, "loss": 0.10039744377136231, "memory(GiB)": 122.96, "step": 26025, "token_acc": 0.9620946538124452, "train_speed(iter/s)": 0.240749 }, { "epoch": 1.984145133013187, "grad_norm": 0.5425387620925903, "learning_rate": 6.592379203742977e-05, "loss": 0.11313588619232177, "memory(GiB)": 122.96, "step": 26030, "token_acc": 0.9530814100938372, "train_speed(iter/s)": 0.240761 }, { "epoch": 1.984526259623447, "grad_norm": 0.5904073715209961, "learning_rate": 6.591244158097464e-05, "loss": 0.11707713603973388, "memory(GiB)": 122.96, "step": 26035, "token_acc": 0.9548362815205118, "train_speed(iter/s)": 0.240768 }, { "epoch": 1.984907386233707, "grad_norm": 0.454168438911438, "learning_rate": 6.590109021201485e-05, "loss": 0.08734502196311951, "memory(GiB)": 122.96, "step": 26040, "token_acc": 0.9564564564564565, "train_speed(iter/s)": 0.24078 }, { "epoch": 1.9852885128439668, "grad_norm": 0.8447994589805603, "learning_rate": 6.588973793120133e-05, "loss": 0.08911625146865845, "memory(GiB)": 122.96, "step": 26045, "token_acc": 0.9649869884078542, "train_speed(iter/s)": 0.240789 }, { "epoch": 1.9856696394542266, "grad_norm": 0.8310016393661499, "learning_rate": 6.587838473918505e-05, "loss": 0.09370272755622863, "memory(GiB)": 122.96, "step": 26050, "token_acc": 0.9470899470899471, "train_speed(iter/s)": 0.240803 }, { "epoch": 1.9860507660644866, "grad_norm": 0.6698933243751526, "learning_rate": 6.58670306366171e-05, "loss": 0.11950793266296386, "memory(GiB)": 122.96, "step": 26055, "token_acc": 0.9564121945074326, "train_speed(iter/s)": 0.240814 }, { "epoch": 1.9864318926747466, "grad_norm": 1.802700400352478, "learning_rate": 6.585567562414859e-05, "loss": 0.09391063451766968, "memory(GiB)": 122.96, "step": 26060, "token_acc": 0.9730492813141683, "train_speed(iter/s)": 0.240825 }, { "epoch": 1.9868130192850066, "grad_norm": 0.5994386672973633, "learning_rate": 6.584431970243064e-05, "loss": 0.06451411843299866, "memory(GiB)": 122.96, "step": 26065, "token_acc": 0.9611989702096359, "train_speed(iter/s)": 0.240832 }, { "epoch": 1.9871941458952664, "grad_norm": 1.9238004684448242, "learning_rate": 6.58329628721145e-05, "loss": 0.0855342447757721, "memory(GiB)": 122.96, "step": 26070, "token_acc": 0.9619191049913941, "train_speed(iter/s)": 0.240841 }, { "epoch": 1.9875752725055262, "grad_norm": 1.2280157804489136, "learning_rate": 6.58216051338514e-05, "loss": 0.0618017852306366, "memory(GiB)": 122.96, "step": 26075, "token_acc": 0.9713603818615751, "train_speed(iter/s)": 0.240857 }, { "epoch": 1.9879563991157863, "grad_norm": 0.5866227746009827, "learning_rate": 6.581024648829268e-05, "loss": 0.10722067356109619, "memory(GiB)": 122.96, "step": 26080, "token_acc": 0.9492920015308075, "train_speed(iter/s)": 0.240866 }, { "epoch": 1.9883375257260463, "grad_norm": 0.4604701101779938, "learning_rate": 6.579888693608967e-05, "loss": 0.0966400682926178, "memory(GiB)": 122.96, "step": 26085, "token_acc": 0.9712843168191858, "train_speed(iter/s)": 0.240873 }, { "epoch": 1.988718652336306, "grad_norm": 0.7597588300704956, "learning_rate": 6.578752647789383e-05, "loss": 0.11330108642578125, "memory(GiB)": 122.96, "step": 26090, "token_acc": 0.9551412758520245, "train_speed(iter/s)": 0.240883 }, { "epoch": 1.989099778946566, "grad_norm": 0.5999710559844971, "learning_rate": 6.577616511435661e-05, "loss": 0.1200719952583313, "memory(GiB)": 122.96, "step": 26095, "token_acc": 0.9482529854046882, "train_speed(iter/s)": 0.240894 }, { "epoch": 1.9894809055568259, "grad_norm": 1.4099926948547363, "learning_rate": 6.576480284612952e-05, "loss": 0.1576218843460083, "memory(GiB)": 122.96, "step": 26100, "token_acc": 0.9396526346776568, "train_speed(iter/s)": 0.240906 }, { "epoch": 1.989862032167086, "grad_norm": 0.7893127799034119, "learning_rate": 6.575343967386416e-05, "loss": 0.10392158031463623, "memory(GiB)": 122.96, "step": 26105, "token_acc": 0.9620497014062801, "train_speed(iter/s)": 0.240917 }, { "epoch": 1.990243158777346, "grad_norm": 0.7216599583625793, "learning_rate": 6.574207559821213e-05, "loss": 0.08679304122924805, "memory(GiB)": 122.96, "step": 26110, "token_acc": 0.9550190176947246, "train_speed(iter/s)": 0.24092 }, { "epoch": 1.9906242853876057, "grad_norm": 0.6808780431747437, "learning_rate": 6.573071061982512e-05, "loss": 0.08989113569259644, "memory(GiB)": 122.96, "step": 26115, "token_acc": 0.9634535588892436, "train_speed(iter/s)": 0.240927 }, { "epoch": 1.9910054119978657, "grad_norm": 1.2944449186325073, "learning_rate": 6.571934473935485e-05, "loss": 0.09862396121025085, "memory(GiB)": 122.96, "step": 26120, "token_acc": 0.9633688405241781, "train_speed(iter/s)": 0.240921 }, { "epoch": 1.9913865386081255, "grad_norm": 0.5983178019523621, "learning_rate": 6.570797795745311e-05, "loss": 0.08003859519958496, "memory(GiB)": 122.96, "step": 26125, "token_acc": 0.9634733893557423, "train_speed(iter/s)": 0.240922 }, { "epoch": 1.9917676652183856, "grad_norm": 1.7874500751495361, "learning_rate": 6.569661027477173e-05, "loss": 0.13768817186355592, "memory(GiB)": 122.96, "step": 26130, "token_acc": 0.9386892177589852, "train_speed(iter/s)": 0.240939 }, { "epoch": 1.9921487918286456, "grad_norm": 0.4989190697669983, "learning_rate": 6.568524169196258e-05, "loss": 0.06939210891723632, "memory(GiB)": 122.96, "step": 26135, "token_acc": 0.9737721779377732, "train_speed(iter/s)": 0.24095 }, { "epoch": 1.9925299184389054, "grad_norm": 0.812203586101532, "learning_rate": 6.567387220967762e-05, "loss": 0.11524865627288819, "memory(GiB)": 122.96, "step": 26140, "token_acc": 0.9537423069287274, "train_speed(iter/s)": 0.240958 }, { "epoch": 1.9929110450491654, "grad_norm": 1.0384266376495361, "learning_rate": 6.566250182856882e-05, "loss": 0.13886805772781372, "memory(GiB)": 122.96, "step": 26145, "token_acc": 0.960679945054945, "train_speed(iter/s)": 0.240963 }, { "epoch": 1.9932921716594252, "grad_norm": 0.8786221146583557, "learning_rate": 6.565113054928822e-05, "loss": 0.11722745895385742, "memory(GiB)": 122.96, "step": 26150, "token_acc": 0.9544459045116075, "train_speed(iter/s)": 0.240972 }, { "epoch": 1.9936732982696852, "grad_norm": 0.9618441462516785, "learning_rate": 6.563975837248791e-05, "loss": 0.1277235269546509, "memory(GiB)": 122.96, "step": 26155, "token_acc": 0.9556200145032633, "train_speed(iter/s)": 0.240975 }, { "epoch": 1.9940544248799452, "grad_norm": 0.8579375743865967, "learning_rate": 6.562838529882005e-05, "loss": 0.0825173556804657, "memory(GiB)": 122.96, "step": 26160, "token_acc": 0.9635173058933583, "train_speed(iter/s)": 0.240988 }, { "epoch": 1.994435551490205, "grad_norm": 0.7828971743583679, "learning_rate": 6.56170113289368e-05, "loss": 0.17754406929016114, "memory(GiB)": 122.96, "step": 26165, "token_acc": 0.9329363024339721, "train_speed(iter/s)": 0.240998 }, { "epoch": 1.9948166781004648, "grad_norm": 0.9978771209716797, "learning_rate": 6.560563646349042e-05, "loss": 0.09512693285942078, "memory(GiB)": 122.96, "step": 26170, "token_acc": 0.9595338297183554, "train_speed(iter/s)": 0.241011 }, { "epoch": 1.9951978047107248, "grad_norm": 1.3242008686065674, "learning_rate": 6.559426070313323e-05, "loss": 0.15674927234649658, "memory(GiB)": 122.96, "step": 26175, "token_acc": 0.9314040728831725, "train_speed(iter/s)": 0.241017 }, { "epoch": 1.9955789313209849, "grad_norm": 1.0694735050201416, "learning_rate": 6.558288404851755e-05, "loss": 0.079948091506958, "memory(GiB)": 122.96, "step": 26180, "token_acc": 0.9686374231828777, "train_speed(iter/s)": 0.241027 }, { "epoch": 1.9959600579312449, "grad_norm": 1.0680488348007202, "learning_rate": 6.557150650029577e-05, "loss": 0.09720447063446044, "memory(GiB)": 122.96, "step": 26185, "token_acc": 0.9587852494577006, "train_speed(iter/s)": 0.24104 }, { "epoch": 1.9963411845415047, "grad_norm": 1.4117043018341064, "learning_rate": 6.556012805912036e-05, "loss": 0.11324896812438964, "memory(GiB)": 122.96, "step": 26190, "token_acc": 0.95856, "train_speed(iter/s)": 0.241044 }, { "epoch": 1.9967223111517645, "grad_norm": 1.948655128479004, "learning_rate": 6.554874872564381e-05, "loss": 0.12777948379516602, "memory(GiB)": 122.96, "step": 26195, "token_acc": 0.9458333333333333, "train_speed(iter/s)": 0.241055 }, { "epoch": 1.9971034377620245, "grad_norm": 0.8801734447479248, "learning_rate": 6.55373685005187e-05, "loss": 0.11586674451828002, "memory(GiB)": 122.96, "step": 26200, "token_acc": 0.9554933875890133, "train_speed(iter/s)": 0.241066 }, { "epoch": 1.9971034377620245, "eval_loss": 0.09197056293487549, "eval_runtime": 219.9195, "eval_samples_per_second": 2.41, "eval_steps_per_second": 2.41, "eval_token_acc": 0.9595129811457141, "step": 26200 }, { "epoch": 1.9974845643722845, "grad_norm": 0.8584057092666626, "learning_rate": 6.552598738439757e-05, "loss": 0.10808560848236085, "memory(GiB)": 122.96, "step": 26205, "token_acc": 0.9593632204414008, "train_speed(iter/s)": 0.240586 }, { "epoch": 1.9978656909825445, "grad_norm": 0.10203911364078522, "learning_rate": 6.551460537793314e-05, "loss": 0.09256476759910584, "memory(GiB)": 122.96, "step": 26210, "token_acc": 0.962152209492635, "train_speed(iter/s)": 0.240594 }, { "epoch": 1.9982468175928043, "grad_norm": 0.8576064109802246, "learning_rate": 6.550322248177808e-05, "loss": 0.09724140167236328, "memory(GiB)": 122.96, "step": 26215, "token_acc": 0.9585737840065952, "train_speed(iter/s)": 0.240606 }, { "epoch": 1.998627944203064, "grad_norm": 0.7598915100097656, "learning_rate": 6.549183869658514e-05, "loss": 0.1032977819442749, "memory(GiB)": 122.96, "step": 26220, "token_acc": 0.9618016672887894, "train_speed(iter/s)": 0.240605 }, { "epoch": 1.9990090708133241, "grad_norm": 1.3811780214309692, "learning_rate": 6.548045402300715e-05, "loss": 0.15722169876098632, "memory(GiB)": 122.96, "step": 26225, "token_acc": 0.9392702832453192, "train_speed(iter/s)": 0.240613 }, { "epoch": 1.9993901974235841, "grad_norm": 0.49171149730682373, "learning_rate": 6.546906846169697e-05, "loss": 0.06413478851318359, "memory(GiB)": 122.96, "step": 26230, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.240619 }, { "epoch": 1.9997713240338442, "grad_norm": 0.7431855797767639, "learning_rate": 6.54576820133075e-05, "loss": 0.08560182452201844, "memory(GiB)": 122.96, "step": 26235, "token_acc": 0.9617239300783604, "train_speed(iter/s)": 0.240631 }, { "epoch": 2.000152450644104, "grad_norm": 0.6061593890190125, "learning_rate": 6.544629467849169e-05, "loss": 0.10114694833755493, "memory(GiB)": 122.96, "step": 26240, "token_acc": 0.9576547231270358, "train_speed(iter/s)": 0.240641 }, { "epoch": 2.0005335772543638, "grad_norm": 0.859828531742096, "learning_rate": 6.543490645790255e-05, "loss": 0.08493696451187134, "memory(GiB)": 122.96, "step": 26245, "token_acc": 0.9672575364118776, "train_speed(iter/s)": 0.240641 }, { "epoch": 2.000914703864624, "grad_norm": 0.417591392993927, "learning_rate": 6.542351735219318e-05, "loss": 0.0856185495853424, "memory(GiB)": 122.96, "step": 26250, "token_acc": 0.9670757258305896, "train_speed(iter/s)": 0.240647 }, { "epoch": 2.001295830474884, "grad_norm": 1.1556613445281982, "learning_rate": 6.541212736201663e-05, "loss": 0.07369548678398133, "memory(GiB)": 122.96, "step": 26255, "token_acc": 0.9720713731574864, "train_speed(iter/s)": 0.240654 }, { "epoch": 2.001676957085144, "grad_norm": 0.9057174324989319, "learning_rate": 6.540073648802611e-05, "loss": 0.0809739112854004, "memory(GiB)": 122.96, "step": 26260, "token_acc": 0.9630365659777425, "train_speed(iter/s)": 0.240669 }, { "epoch": 2.002058083695404, "grad_norm": 0.8596274256706238, "learning_rate": 6.538934473087483e-05, "loss": 0.10142668485641479, "memory(GiB)": 122.96, "step": 26265, "token_acc": 0.9570707070707071, "train_speed(iter/s)": 0.240673 }, { "epoch": 2.0024392103056634, "grad_norm": 1.5172368288040161, "learning_rate": 6.537795209121604e-05, "loss": 0.12128397226333618, "memory(GiB)": 122.96, "step": 26270, "token_acc": 0.9560963678610738, "train_speed(iter/s)": 0.240681 }, { "epoch": 2.0028203369159234, "grad_norm": 1.2023491859436035, "learning_rate": 6.536655856970306e-05, "loss": 0.10887273550033569, "memory(GiB)": 122.96, "step": 26275, "token_acc": 0.9625498007968127, "train_speed(iter/s)": 0.240687 }, { "epoch": 2.0032014635261834, "grad_norm": 1.5285547971725464, "learning_rate": 6.535516416698926e-05, "loss": 0.10067343711853027, "memory(GiB)": 122.96, "step": 26280, "token_acc": 0.9634017347154643, "train_speed(iter/s)": 0.240695 }, { "epoch": 2.0035825901364435, "grad_norm": 0.9714431762695312, "learning_rate": 6.534376888372804e-05, "loss": 0.06601279973983765, "memory(GiB)": 122.96, "step": 26285, "token_acc": 0.968647942521228, "train_speed(iter/s)": 0.240697 }, { "epoch": 2.003963716746703, "grad_norm": 0.9681770205497742, "learning_rate": 6.533237272057289e-05, "loss": 0.1127902626991272, "memory(GiB)": 122.96, "step": 26290, "token_acc": 0.9527083333333334, "train_speed(iter/s)": 0.240705 }, { "epoch": 2.004344843356963, "grad_norm": 0.8474037051200867, "learning_rate": 6.53209756781773e-05, "loss": 0.10433038473129272, "memory(GiB)": 122.96, "step": 26295, "token_acc": 0.9587301587301588, "train_speed(iter/s)": 0.240713 }, { "epoch": 2.004725969967223, "grad_norm": 0.7044438123703003, "learning_rate": 6.530957775719488e-05, "loss": 0.10840727090835571, "memory(GiB)": 122.96, "step": 26300, "token_acc": 0.9573978123200921, "train_speed(iter/s)": 0.240724 }, { "epoch": 2.005107096577483, "grad_norm": 1.090207815170288, "learning_rate": 6.52981789582792e-05, "loss": 0.12008180618286132, "memory(GiB)": 122.96, "step": 26305, "token_acc": 0.9544346521657604, "train_speed(iter/s)": 0.240733 }, { "epoch": 2.005488223187743, "grad_norm": 0.7394430041313171, "learning_rate": 6.528677928208394e-05, "loss": 0.10662018060684204, "memory(GiB)": 122.96, "step": 26310, "token_acc": 0.9586020698965052, "train_speed(iter/s)": 0.240736 }, { "epoch": 2.0058693497980027, "grad_norm": 1.054826259613037, "learning_rate": 6.527537872926285e-05, "loss": 0.08400206565856934, "memory(GiB)": 122.96, "step": 26315, "token_acc": 0.9657100260901975, "train_speed(iter/s)": 0.240751 }, { "epoch": 2.0062504764082627, "grad_norm": 1.4686473608016968, "learning_rate": 6.526397730046967e-05, "loss": 0.09502209424972534, "memory(GiB)": 122.96, "step": 26320, "token_acc": 0.9682893383613301, "train_speed(iter/s)": 0.240757 }, { "epoch": 2.0066316030185227, "grad_norm": 0.9168114066123962, "learning_rate": 6.525257499635822e-05, "loss": 0.09492986798286437, "memory(GiB)": 122.96, "step": 26325, "token_acc": 0.9676515537442689, "train_speed(iter/s)": 0.240767 }, { "epoch": 2.0070127296287827, "grad_norm": 0.891483724117279, "learning_rate": 6.524117181758239e-05, "loss": 0.05466879606246948, "memory(GiB)": 122.96, "step": 26330, "token_acc": 0.9671386588298156, "train_speed(iter/s)": 0.240779 }, { "epoch": 2.0073938562390428, "grad_norm": 0.7309017181396484, "learning_rate": 6.522976776479606e-05, "loss": 0.15330554246902467, "memory(GiB)": 122.96, "step": 26335, "token_acc": 0.9464317494216052, "train_speed(iter/s)": 0.240786 }, { "epoch": 2.0077749828493023, "grad_norm": 0.517546534538269, "learning_rate": 6.521836283865322e-05, "loss": 0.08996121883392334, "memory(GiB)": 122.96, "step": 26340, "token_acc": 0.9641456582633053, "train_speed(iter/s)": 0.240796 }, { "epoch": 2.0081561094595624, "grad_norm": 0.7283541560173035, "learning_rate": 6.520695703980792e-05, "loss": 0.12563036680221557, "memory(GiB)": 122.96, "step": 26345, "token_acc": 0.959615673322324, "train_speed(iter/s)": 0.240798 }, { "epoch": 2.0085372360698224, "grad_norm": 1.0979572534561157, "learning_rate": 6.51955503689142e-05, "loss": 0.16642476320266725, "memory(GiB)": 122.96, "step": 26350, "token_acc": 0.9328660436137072, "train_speed(iter/s)": 0.240803 }, { "epoch": 2.0089183626800824, "grad_norm": 1.3895010948181152, "learning_rate": 6.518414282662615e-05, "loss": 0.10846613645553589, "memory(GiB)": 122.96, "step": 26355, "token_acc": 0.9562657695542472, "train_speed(iter/s)": 0.240815 }, { "epoch": 2.0092994892903424, "grad_norm": 1.0041382312774658, "learning_rate": 6.517273441359799e-05, "loss": 0.10181916952133178, "memory(GiB)": 122.96, "step": 26360, "token_acc": 0.9633947939262473, "train_speed(iter/s)": 0.240824 }, { "epoch": 2.009680615900602, "grad_norm": 0.9571253657341003, "learning_rate": 6.516132513048393e-05, "loss": 0.063021719455719, "memory(GiB)": 122.96, "step": 26365, "token_acc": 0.9642528100239544, "train_speed(iter/s)": 0.240829 }, { "epoch": 2.010061742510862, "grad_norm": 0.8564225435256958, "learning_rate": 6.514991497793823e-05, "loss": 0.093866628408432, "memory(GiB)": 122.96, "step": 26370, "token_acc": 0.972568578553616, "train_speed(iter/s)": 0.240844 }, { "epoch": 2.010442869121122, "grad_norm": 1.4974019527435303, "learning_rate": 6.513850395661521e-05, "loss": 0.10186458826065063, "memory(GiB)": 122.96, "step": 26375, "token_acc": 0.9605358435916003, "train_speed(iter/s)": 0.240858 }, { "epoch": 2.010823995731382, "grad_norm": 1.0393095016479492, "learning_rate": 6.512709206716922e-05, "loss": 0.10906683206558228, "memory(GiB)": 122.96, "step": 26380, "token_acc": 0.965463515969878, "train_speed(iter/s)": 0.24087 }, { "epoch": 2.011205122341642, "grad_norm": 0.968572199344635, "learning_rate": 6.511567931025472e-05, "loss": 0.09424675703048706, "memory(GiB)": 122.96, "step": 26385, "token_acc": 0.9572129206832403, "train_speed(iter/s)": 0.240879 }, { "epoch": 2.0115862489519016, "grad_norm": 0.7902241945266724, "learning_rate": 6.510426568652617e-05, "loss": 0.12020928859710693, "memory(GiB)": 122.96, "step": 26390, "token_acc": 0.9500682128240109, "train_speed(iter/s)": 0.24089 }, { "epoch": 2.0119673755621617, "grad_norm": 1.1019465923309326, "learning_rate": 6.509285119663804e-05, "loss": 0.11630077362060547, "memory(GiB)": 122.96, "step": 26395, "token_acc": 0.9477015825169556, "train_speed(iter/s)": 0.240896 }, { "epoch": 2.0123485021724217, "grad_norm": 1.9062634706497192, "learning_rate": 6.508143584124495e-05, "loss": 0.09044396877288818, "memory(GiB)": 122.96, "step": 26400, "token_acc": 0.95907279971025, "train_speed(iter/s)": 0.240907 }, { "epoch": 2.0123485021724217, "eval_loss": 0.09217917919158936, "eval_runtime": 218.1394, "eval_samples_per_second": 2.43, "eval_steps_per_second": 2.43, "eval_token_acc": 0.9599497018251912, "step": 26400 }, { "epoch": 2.0127296287826817, "grad_norm": 0.7773627042770386, "learning_rate": 6.50700196210015e-05, "loss": 0.11915383338928223, "memory(GiB)": 122.96, "step": 26405, "token_acc": 0.9595161816617721, "train_speed(iter/s)": 0.240441 }, { "epoch": 2.0131107553929417, "grad_norm": 1.280651330947876, "learning_rate": 6.505860253656236e-05, "loss": 0.15290580987930297, "memory(GiB)": 122.96, "step": 26410, "token_acc": 0.9423173803526448, "train_speed(iter/s)": 0.240453 }, { "epoch": 2.0134918820032013, "grad_norm": 0.7859970927238464, "learning_rate": 6.504718458858224e-05, "loss": 0.09577568173408509, "memory(GiB)": 122.96, "step": 26415, "token_acc": 0.9645332246229107, "train_speed(iter/s)": 0.240462 }, { "epoch": 2.0138730086134613, "grad_norm": 0.9667031764984131, "learning_rate": 6.503576577771591e-05, "loss": 0.06313449740409852, "memory(GiB)": 122.96, "step": 26420, "token_acc": 0.9713908450704225, "train_speed(iter/s)": 0.240472 }, { "epoch": 2.0142541352237213, "grad_norm": 1.0064960718154907, "learning_rate": 6.502434610461821e-05, "loss": 0.09952600002288818, "memory(GiB)": 122.96, "step": 26425, "token_acc": 0.9602177554438861, "train_speed(iter/s)": 0.240486 }, { "epoch": 2.0146352618339813, "grad_norm": 0.8050191402435303, "learning_rate": 6.501292556994395e-05, "loss": 0.08019761443138122, "memory(GiB)": 122.96, "step": 26430, "token_acc": 0.962893864013267, "train_speed(iter/s)": 0.240494 }, { "epoch": 2.0150163884442414, "grad_norm": 1.15009343624115, "learning_rate": 6.500150417434809e-05, "loss": 0.12226300239562989, "memory(GiB)": 122.96, "step": 26435, "token_acc": 0.9541838134430727, "train_speed(iter/s)": 0.240506 }, { "epoch": 2.015397515054501, "grad_norm": 1.0684654712677002, "learning_rate": 6.499008191848558e-05, "loss": 0.05846806764602661, "memory(GiB)": 122.96, "step": 26440, "token_acc": 0.9788321167883212, "train_speed(iter/s)": 0.240523 }, { "epoch": 2.015778641664761, "grad_norm": 0.699732780456543, "learning_rate": 6.497865880301144e-05, "loss": 0.06882889866828919, "memory(GiB)": 122.96, "step": 26445, "token_acc": 0.9778022381214456, "train_speed(iter/s)": 0.240528 }, { "epoch": 2.016159768275021, "grad_norm": 1.2265650033950806, "learning_rate": 6.496723482858072e-05, "loss": 0.1207455039024353, "memory(GiB)": 122.96, "step": 26450, "token_acc": 0.9563246806757314, "train_speed(iter/s)": 0.240535 }, { "epoch": 2.016540894885281, "grad_norm": 0.7658952474594116, "learning_rate": 6.495580999584854e-05, "loss": 0.0905352234840393, "memory(GiB)": 122.96, "step": 26455, "token_acc": 0.9652173913043478, "train_speed(iter/s)": 0.240541 }, { "epoch": 2.016922021495541, "grad_norm": 0.7466670870780945, "learning_rate": 6.494438430547008e-05, "loss": 0.1024355411529541, "memory(GiB)": 122.96, "step": 26460, "token_acc": 0.9656799656799657, "train_speed(iter/s)": 0.240555 }, { "epoch": 2.0173031481058006, "grad_norm": 1.9980459213256836, "learning_rate": 6.493295775810051e-05, "loss": 0.1346266508102417, "memory(GiB)": 122.96, "step": 26465, "token_acc": 0.9475839475839476, "train_speed(iter/s)": 0.240566 }, { "epoch": 2.0176842747160606, "grad_norm": 1.0913875102996826, "learning_rate": 6.492153035439511e-05, "loss": 0.17365705966949463, "memory(GiB)": 122.96, "step": 26470, "token_acc": 0.9275263695134399, "train_speed(iter/s)": 0.240579 }, { "epoch": 2.0180654013263206, "grad_norm": 0.5431331396102905, "learning_rate": 6.491010209500919e-05, "loss": 0.071640545129776, "memory(GiB)": 122.96, "step": 26475, "token_acc": 0.9728427507665353, "train_speed(iter/s)": 0.240587 }, { "epoch": 2.0184465279365806, "grad_norm": 1.8722729682922363, "learning_rate": 6.48986729805981e-05, "loss": 0.12376170158386231, "memory(GiB)": 122.96, "step": 26480, "token_acc": 0.9529025191675794, "train_speed(iter/s)": 0.240597 }, { "epoch": 2.0188276545468407, "grad_norm": 0.6465417742729187, "learning_rate": 6.488724301181728e-05, "loss": 0.06191643476486206, "memory(GiB)": 122.96, "step": 26485, "token_acc": 0.974366529169122, "train_speed(iter/s)": 0.240608 }, { "epoch": 2.0192087811571002, "grad_norm": 0.9489759802818298, "learning_rate": 6.487581218932212e-05, "loss": 0.13101584911346437, "memory(GiB)": 122.96, "step": 26490, "token_acc": 0.9401453612654981, "train_speed(iter/s)": 0.240624 }, { "epoch": 2.0195899077673602, "grad_norm": 0.41995611786842346, "learning_rate": 6.48643805137682e-05, "loss": 0.1478276491165161, "memory(GiB)": 122.96, "step": 26495, "token_acc": 0.9368605670856475, "train_speed(iter/s)": 0.240637 }, { "epoch": 2.0199710343776203, "grad_norm": 0.3118656575679779, "learning_rate": 6.485294798581101e-05, "loss": 0.05752885341644287, "memory(GiB)": 122.96, "step": 26500, "token_acc": 0.9731543624161074, "train_speed(iter/s)": 0.240652 }, { "epoch": 2.0203521609878803, "grad_norm": 0.7779620885848999, "learning_rate": 6.484151460610619e-05, "loss": 0.11755051612854003, "memory(GiB)": 122.96, "step": 26505, "token_acc": 0.9537484885126964, "train_speed(iter/s)": 0.240658 }, { "epoch": 2.0207332875981403, "grad_norm": 1.614286184310913, "learning_rate": 6.483008037530938e-05, "loss": 0.1173932671546936, "memory(GiB)": 122.96, "step": 26510, "token_acc": 0.9476014760147602, "train_speed(iter/s)": 0.240674 }, { "epoch": 2.0211144142084, "grad_norm": 0.8454567193984985, "learning_rate": 6.481864529407627e-05, "loss": 0.09817641377449035, "memory(GiB)": 122.96, "step": 26515, "token_acc": 0.9633187772925764, "train_speed(iter/s)": 0.240682 }, { "epoch": 2.02149554081866, "grad_norm": 0.8441240191459656, "learning_rate": 6.480720936306263e-05, "loss": 0.12766919136047364, "memory(GiB)": 122.96, "step": 26520, "token_acc": 0.9565780946208684, "train_speed(iter/s)": 0.240691 }, { "epoch": 2.02187666742892, "grad_norm": 0.765451967716217, "learning_rate": 6.479577258292425e-05, "loss": 0.1428079128265381, "memory(GiB)": 122.96, "step": 26525, "token_acc": 0.9489534387014097, "train_speed(iter/s)": 0.240703 }, { "epoch": 2.02225779403918, "grad_norm": 0.6839261054992676, "learning_rate": 6.478433495431698e-05, "loss": 0.0710952877998352, "memory(GiB)": 122.96, "step": 26530, "token_acc": 0.9676398170946183, "train_speed(iter/s)": 0.240716 }, { "epoch": 2.02263892064944, "grad_norm": 0.548771858215332, "learning_rate": 6.477289647789669e-05, "loss": 0.11097638607025147, "memory(GiB)": 122.96, "step": 26535, "token_acc": 0.9593008087659797, "train_speed(iter/s)": 0.240727 }, { "epoch": 2.0230200472596995, "grad_norm": 0.10899920761585236, "learning_rate": 6.476145715431935e-05, "loss": 0.08356081247329712, "memory(GiB)": 122.96, "step": 26540, "token_acc": 0.9548272807794509, "train_speed(iter/s)": 0.240737 }, { "epoch": 2.0234011738699595, "grad_norm": 0.6064369082450867, "learning_rate": 6.475001698424093e-05, "loss": 0.10546542406082153, "memory(GiB)": 122.96, "step": 26545, "token_acc": 0.9615566037735849, "train_speed(iter/s)": 0.240747 }, { "epoch": 2.0237823004802196, "grad_norm": 1.3736778497695923, "learning_rate": 6.47385759683175e-05, "loss": 0.11532256603240967, "memory(GiB)": 122.96, "step": 26550, "token_acc": 0.9510113780025284, "train_speed(iter/s)": 0.240758 }, { "epoch": 2.0241634270904796, "grad_norm": 0.8541922569274902, "learning_rate": 6.472713410720512e-05, "loss": 0.09139158725738525, "memory(GiB)": 122.96, "step": 26555, "token_acc": 0.9579333709768493, "train_speed(iter/s)": 0.24077 }, { "epoch": 2.0245445537007396, "grad_norm": 1.2088003158569336, "learning_rate": 6.471569140155997e-05, "loss": 0.10490204095840454, "memory(GiB)": 122.96, "step": 26560, "token_acc": 0.953232462173315, "train_speed(iter/s)": 0.240782 }, { "epoch": 2.024925680310999, "grad_norm": 0.8693092465400696, "learning_rate": 6.470424785203816e-05, "loss": 0.12608013153076172, "memory(GiB)": 122.96, "step": 26565, "token_acc": 0.9515115234959592, "train_speed(iter/s)": 0.240793 }, { "epoch": 2.025306806921259, "grad_norm": 0.7688348889350891, "learning_rate": 6.469280345929603e-05, "loss": 0.11140514612197876, "memory(GiB)": 122.96, "step": 26570, "token_acc": 0.954375, "train_speed(iter/s)": 0.240805 }, { "epoch": 2.025687933531519, "grad_norm": 0.3812229335308075, "learning_rate": 6.468135822398978e-05, "loss": 0.08680573701858521, "memory(GiB)": 122.96, "step": 26575, "token_acc": 0.9621092516577202, "train_speed(iter/s)": 0.240817 }, { "epoch": 2.0260690601417792, "grad_norm": 0.8090770244598389, "learning_rate": 6.466991214677575e-05, "loss": 0.08493500351905822, "memory(GiB)": 122.96, "step": 26580, "token_acc": 0.9668328636462172, "train_speed(iter/s)": 0.240828 }, { "epoch": 2.0264501867520393, "grad_norm": 0.7277560234069824, "learning_rate": 6.465846522831033e-05, "loss": 0.16696085929870605, "memory(GiB)": 122.96, "step": 26585, "token_acc": 0.9172113289760349, "train_speed(iter/s)": 0.240842 }, { "epoch": 2.026831313362299, "grad_norm": 0.575595498085022, "learning_rate": 6.464701746924998e-05, "loss": 0.11890441179275513, "memory(GiB)": 122.96, "step": 26590, "token_acc": 0.9565395095367848, "train_speed(iter/s)": 0.240843 }, { "epoch": 2.027212439972559, "grad_norm": 1.0139473676681519, "learning_rate": 6.463556887025114e-05, "loss": 0.07395694851875305, "memory(GiB)": 122.96, "step": 26595, "token_acc": 0.9650053022269353, "train_speed(iter/s)": 0.240858 }, { "epoch": 2.027593566582819, "grad_norm": 1.3475227355957031, "learning_rate": 6.462411943197033e-05, "loss": 0.1238883376121521, "memory(GiB)": 122.96, "step": 26600, "token_acc": 0.9517386091127098, "train_speed(iter/s)": 0.24087 }, { "epoch": 2.027593566582819, "eval_loss": 0.09551186859607697, "eval_runtime": 218.2412, "eval_samples_per_second": 2.429, "eval_steps_per_second": 2.429, "eval_token_acc": 0.9591666164688875, "step": 26600 }, { "epoch": 2.027974693193079, "grad_norm": 0.8470348119735718, "learning_rate": 6.461266915506415e-05, "loss": 0.09251853227615356, "memory(GiB)": 122.96, "step": 26605, "token_acc": 0.9593675027262814, "train_speed(iter/s)": 0.240404 }, { "epoch": 2.0283558198033385, "grad_norm": 1.3194552659988403, "learning_rate": 6.46012180401892e-05, "loss": 0.0934902310371399, "memory(GiB)": 122.96, "step": 26610, "token_acc": 0.9553462940461726, "train_speed(iter/s)": 0.240417 }, { "epoch": 2.0287369464135985, "grad_norm": 0.5419029593467712, "learning_rate": 6.458976608800216e-05, "loss": 0.059352487325668335, "memory(GiB)": 122.96, "step": 26615, "token_acc": 0.9711462450592885, "train_speed(iter/s)": 0.240421 }, { "epoch": 2.0291180730238585, "grad_norm": 1.4419090747833252, "learning_rate": 6.457831329915972e-05, "loss": 0.17204283475875853, "memory(GiB)": 122.96, "step": 26620, "token_acc": 0.9486997635933806, "train_speed(iter/s)": 0.240432 }, { "epoch": 2.0294991996341185, "grad_norm": 1.0486336946487427, "learning_rate": 6.456685967431868e-05, "loss": 0.11818337440490723, "memory(GiB)": 122.96, "step": 26625, "token_acc": 0.957736516357206, "train_speed(iter/s)": 0.240437 }, { "epoch": 2.0298803262443785, "grad_norm": 0.1766224503517151, "learning_rate": 6.455540521413583e-05, "loss": 0.09560262560844421, "memory(GiB)": 122.96, "step": 26630, "token_acc": 0.9665610700457585, "train_speed(iter/s)": 0.240448 }, { "epoch": 2.030261452854638, "grad_norm": 1.2283374071121216, "learning_rate": 6.454394991926804e-05, "loss": 0.13426157236099243, "memory(GiB)": 122.96, "step": 26635, "token_acc": 0.945353594389246, "train_speed(iter/s)": 0.240461 }, { "epoch": 2.030642579464898, "grad_norm": 0.4415243864059448, "learning_rate": 6.453249379037222e-05, "loss": 0.07588891386985779, "memory(GiB)": 122.96, "step": 26640, "token_acc": 0.9622550205661747, "train_speed(iter/s)": 0.240475 }, { "epoch": 2.031023706075158, "grad_norm": 0.931455135345459, "learning_rate": 6.45210368281053e-05, "loss": 0.1065969467163086, "memory(GiB)": 122.96, "step": 26645, "token_acc": 0.9520043632397055, "train_speed(iter/s)": 0.240485 }, { "epoch": 2.031404832685418, "grad_norm": 1.7391352653503418, "learning_rate": 6.450957903312432e-05, "loss": 0.10227539539337158, "memory(GiB)": 122.96, "step": 26650, "token_acc": 0.9614532565352237, "train_speed(iter/s)": 0.2405 }, { "epoch": 2.031785959295678, "grad_norm": 0.8141867518424988, "learning_rate": 6.449812040608631e-05, "loss": 0.08699576854705811, "memory(GiB)": 122.96, "step": 26655, "token_acc": 0.9656850192061459, "train_speed(iter/s)": 0.240513 }, { "epoch": 2.0321670859059378, "grad_norm": 1.1230524778366089, "learning_rate": 6.44866609476484e-05, "loss": 0.09306571483612061, "memory(GiB)": 122.96, "step": 26660, "token_acc": 0.9675590551181102, "train_speed(iter/s)": 0.240517 }, { "epoch": 2.0325482125161978, "grad_norm": 0.6255595088005066, "learning_rate": 6.447520065846766e-05, "loss": 0.07012152075767517, "memory(GiB)": 122.96, "step": 26665, "token_acc": 0.9644924739482825, "train_speed(iter/s)": 0.24053 }, { "epoch": 2.032929339126458, "grad_norm": 0.5268899202346802, "learning_rate": 6.446373953920137e-05, "loss": 0.06670078039169311, "memory(GiB)": 122.96, "step": 26670, "token_acc": 0.9745623223103397, "train_speed(iter/s)": 0.240534 }, { "epoch": 2.033310465736718, "grad_norm": 0.9293346405029297, "learning_rate": 6.445227759050673e-05, "loss": 0.0800336480140686, "memory(GiB)": 122.96, "step": 26675, "token_acc": 0.9632555356074207, "train_speed(iter/s)": 0.240536 }, { "epoch": 2.033691592346978, "grad_norm": 0.9993466734886169, "learning_rate": 6.444081481304105e-05, "loss": 0.09748769998550415, "memory(GiB)": 122.96, "step": 26680, "token_acc": 0.9612949468402819, "train_speed(iter/s)": 0.240535 }, { "epoch": 2.0340727189572374, "grad_norm": 0.5400479435920715, "learning_rate": 6.442935120746163e-05, "loss": 0.10227404832839966, "memory(GiB)": 122.96, "step": 26685, "token_acc": 0.9624517962248833, "train_speed(iter/s)": 0.240541 }, { "epoch": 2.0344538455674974, "grad_norm": 0.6307332515716553, "learning_rate": 6.441788677442588e-05, "loss": 0.10041751861572265, "memory(GiB)": 122.96, "step": 26690, "token_acc": 0.9628764884426804, "train_speed(iter/s)": 0.240544 }, { "epoch": 2.0348349721777574, "grad_norm": 1.122648000717163, "learning_rate": 6.440642151459124e-05, "loss": 0.0903249740600586, "memory(GiB)": 122.96, "step": 26695, "token_acc": 0.965105767492316, "train_speed(iter/s)": 0.240551 }, { "epoch": 2.0352160987880175, "grad_norm": 0.7174046635627747, "learning_rate": 6.439495542861519e-05, "loss": 0.09585032463073731, "memory(GiB)": 122.96, "step": 26700, "token_acc": 0.9623843782117163, "train_speed(iter/s)": 0.240558 }, { "epoch": 2.0355972253982775, "grad_norm": 0.6395531296730042, "learning_rate": 6.438348851715523e-05, "loss": 0.10657339096069336, "memory(GiB)": 122.96, "step": 26705, "token_acc": 0.9571806945261919, "train_speed(iter/s)": 0.240562 }, { "epoch": 2.035978352008537, "grad_norm": 0.6430047154426575, "learning_rate": 6.437202078086897e-05, "loss": 0.06949906349182129, "memory(GiB)": 122.96, "step": 26710, "token_acc": 0.9725059906671711, "train_speed(iter/s)": 0.240564 }, { "epoch": 2.036359478618797, "grad_norm": 0.5911123156547546, "learning_rate": 6.4360552220414e-05, "loss": 0.07076040506362916, "memory(GiB)": 122.96, "step": 26715, "token_acc": 0.9693414672297825, "train_speed(iter/s)": 0.24057 }, { "epoch": 2.036740605229057, "grad_norm": 1.0295257568359375, "learning_rate": 6.434908283644799e-05, "loss": 0.09599714279174805, "memory(GiB)": 122.96, "step": 26720, "token_acc": 0.9566029900332226, "train_speed(iter/s)": 0.240578 }, { "epoch": 2.037121731839317, "grad_norm": 1.0713828802108765, "learning_rate": 6.433761262962869e-05, "loss": 0.09896995425224304, "memory(GiB)": 122.96, "step": 26725, "token_acc": 0.9642932157109851, "train_speed(iter/s)": 0.240588 }, { "epoch": 2.037502858449577, "grad_norm": 2.3962037563323975, "learning_rate": 6.432614160061384e-05, "loss": 0.15164027214050294, "memory(GiB)": 122.96, "step": 26730, "token_acc": 0.9519738496354035, "train_speed(iter/s)": 0.240597 }, { "epoch": 2.0378839850598367, "grad_norm": 0.5605326890945435, "learning_rate": 6.431466975006122e-05, "loss": 0.10237768888473511, "memory(GiB)": 122.96, "step": 26735, "token_acc": 0.9584382871536524, "train_speed(iter/s)": 0.2406 }, { "epoch": 2.0382651116700967, "grad_norm": 0.7966943979263306, "learning_rate": 6.430319707862875e-05, "loss": 0.13025960922241211, "memory(GiB)": 122.96, "step": 26740, "token_acc": 0.9482884195193008, "train_speed(iter/s)": 0.240604 }, { "epoch": 2.0386462382803567, "grad_norm": 0.8392658233642578, "learning_rate": 6.429172358697429e-05, "loss": 0.09468330144882202, "memory(GiB)": 122.96, "step": 26745, "token_acc": 0.9657551750575006, "train_speed(iter/s)": 0.240613 }, { "epoch": 2.0390273648906168, "grad_norm": 0.9187453389167786, "learning_rate": 6.42802492757558e-05, "loss": 0.1464400053024292, "memory(GiB)": 122.96, "step": 26750, "token_acc": 0.9419715264011533, "train_speed(iter/s)": 0.240623 }, { "epoch": 2.0394084915008768, "grad_norm": 0.7360518574714661, "learning_rate": 6.426877414563128e-05, "loss": 0.07116318941116333, "memory(GiB)": 122.96, "step": 26755, "token_acc": 0.9657621100684758, "train_speed(iter/s)": 0.240635 }, { "epoch": 2.0397896181111363, "grad_norm": 0.6470309495925903, "learning_rate": 6.425729819725879e-05, "loss": 0.06796538829803467, "memory(GiB)": 122.96, "step": 26760, "token_acc": 0.9778891509433962, "train_speed(iter/s)": 0.240646 }, { "epoch": 2.0401707447213964, "grad_norm": 0.20103265345096588, "learning_rate": 6.42458214312964e-05, "loss": 0.06214249134063721, "memory(GiB)": 122.96, "step": 26765, "token_acc": 0.9719827586206896, "train_speed(iter/s)": 0.240659 }, { "epoch": 2.0405518713316564, "grad_norm": 0.6246945858001709, "learning_rate": 6.423434384840226e-05, "loss": 0.0559209942817688, "memory(GiB)": 122.96, "step": 26770, "token_acc": 0.9727615965480043, "train_speed(iter/s)": 0.240671 }, { "epoch": 2.0409329979419164, "grad_norm": 0.9861221313476562, "learning_rate": 6.422286544923457e-05, "loss": 0.09110467433929444, "memory(GiB)": 122.96, "step": 26775, "token_acc": 0.9594175220816424, "train_speed(iter/s)": 0.240681 }, { "epoch": 2.0413141245521764, "grad_norm": 0.8526818752288818, "learning_rate": 6.421138623445154e-05, "loss": 0.09530458450317383, "memory(GiB)": 122.96, "step": 26780, "token_acc": 0.9712283290298783, "train_speed(iter/s)": 0.240694 }, { "epoch": 2.041695251162436, "grad_norm": 1.2367308139801025, "learning_rate": 6.419990620471146e-05, "loss": 0.15079550743103026, "memory(GiB)": 122.96, "step": 26785, "token_acc": 0.9443694301917365, "train_speed(iter/s)": 0.240704 }, { "epoch": 2.042076377772696, "grad_norm": 1.4641131162643433, "learning_rate": 6.418842536067264e-05, "loss": 0.0863089382648468, "memory(GiB)": 122.96, "step": 26790, "token_acc": 0.9688404184286669, "train_speed(iter/s)": 0.240713 }, { "epoch": 2.042457504382956, "grad_norm": 1.4458788633346558, "learning_rate": 6.41769437029935e-05, "loss": 0.12274694442749023, "memory(GiB)": 122.96, "step": 26795, "token_acc": 0.9565119928267205, "train_speed(iter/s)": 0.240723 }, { "epoch": 2.042838630993216, "grad_norm": 1.3332107067108154, "learning_rate": 6.41654612323324e-05, "loss": 0.1410604476928711, "memory(GiB)": 122.96, "step": 26800, "token_acc": 0.9466759972008397, "train_speed(iter/s)": 0.240728 }, { "epoch": 2.042838630993216, "eval_loss": 0.091831736266613, "eval_runtime": 217.8685, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "eval_token_acc": 0.9597238118185651, "step": 26800 }, { "epoch": 2.043219757603476, "grad_norm": 1.4090169668197632, "learning_rate": 6.415397794934784e-05, "loss": 0.1401495337486267, "memory(GiB)": 122.96, "step": 26805, "token_acc": 0.9593794366673778, "train_speed(iter/s)": 0.240267 }, { "epoch": 2.0436008842137356, "grad_norm": 0.4587613642215729, "learning_rate": 6.414249385469834e-05, "loss": 0.08828035593032837, "memory(GiB)": 122.96, "step": 26810, "token_acc": 0.96494708994709, "train_speed(iter/s)": 0.240272 }, { "epoch": 2.0439820108239957, "grad_norm": 1.096441626548767, "learning_rate": 6.413100894904243e-05, "loss": 0.10843360424041748, "memory(GiB)": 122.96, "step": 26815, "token_acc": 0.9617067833698031, "train_speed(iter/s)": 0.240287 }, { "epoch": 2.0443631374342557, "grad_norm": 0.821088433265686, "learning_rate": 6.411952323303874e-05, "loss": 0.10005060434341431, "memory(GiB)": 122.96, "step": 26820, "token_acc": 0.9543285616905249, "train_speed(iter/s)": 0.240301 }, { "epoch": 2.0447442640445157, "grad_norm": 0.18083836138248444, "learning_rate": 6.41080367073459e-05, "loss": 0.0933132827281952, "memory(GiB)": 122.96, "step": 26825, "token_acc": 0.9620047041794826, "train_speed(iter/s)": 0.240306 }, { "epoch": 2.0451253906547757, "grad_norm": 0.6764624118804932, "learning_rate": 6.409654937262263e-05, "loss": 0.0989052951335907, "memory(GiB)": 122.96, "step": 26830, "token_acc": 0.9603598538093899, "train_speed(iter/s)": 0.240309 }, { "epoch": 2.0455065172650353, "grad_norm": 0.09515105932950974, "learning_rate": 6.40850612295277e-05, "loss": 0.07678887844085694, "memory(GiB)": 122.96, "step": 26835, "token_acc": 0.9644475426978041, "train_speed(iter/s)": 0.240322 }, { "epoch": 2.0458876438752953, "grad_norm": 0.4114013910293579, "learning_rate": 6.407357227871984e-05, "loss": 0.12432831525802612, "memory(GiB)": 122.96, "step": 26840, "token_acc": 0.9516339869281045, "train_speed(iter/s)": 0.240333 }, { "epoch": 2.0462687704855553, "grad_norm": 0.6978557705879211, "learning_rate": 6.406208252085793e-05, "loss": 0.105396568775177, "memory(GiB)": 122.96, "step": 26845, "token_acc": 0.9655130978130257, "train_speed(iter/s)": 0.240335 }, { "epoch": 2.0466498970958154, "grad_norm": 0.6079176068305969, "learning_rate": 6.405059195660084e-05, "loss": 0.1144120454788208, "memory(GiB)": 122.96, "step": 26850, "token_acc": 0.9491778774289985, "train_speed(iter/s)": 0.240343 }, { "epoch": 2.0470310237060754, "grad_norm": 0.8246468901634216, "learning_rate": 6.40391005866075e-05, "loss": 0.1418940544128418, "memory(GiB)": 122.96, "step": 26855, "token_acc": 0.950720512364348, "train_speed(iter/s)": 0.240345 }, { "epoch": 2.047412150316335, "grad_norm": 1.000343680381775, "learning_rate": 6.40276084115369e-05, "loss": 0.09540516138076782, "memory(GiB)": 122.96, "step": 26860, "token_acc": 0.9582139446036294, "train_speed(iter/s)": 0.240354 }, { "epoch": 2.047793276926595, "grad_norm": 1.1081000566482544, "learning_rate": 6.401611543204807e-05, "loss": 0.10979394912719727, "memory(GiB)": 122.96, "step": 26865, "token_acc": 0.9617897727272727, "train_speed(iter/s)": 0.240355 }, { "epoch": 2.048174403536855, "grad_norm": 0.5214206576347351, "learning_rate": 6.400462164880003e-05, "loss": 0.07766851782798767, "memory(GiB)": 122.96, "step": 26870, "token_acc": 0.9659739201303994, "train_speed(iter/s)": 0.240362 }, { "epoch": 2.048555530147115, "grad_norm": 0.31989967823028564, "learning_rate": 6.399312706245193e-05, "loss": 0.09318844079971314, "memory(GiB)": 122.96, "step": 26875, "token_acc": 0.9607002500893176, "train_speed(iter/s)": 0.240366 }, { "epoch": 2.048936656757375, "grad_norm": 1.0048671960830688, "learning_rate": 6.398163167366294e-05, "loss": 0.143338143825531, "memory(GiB)": 122.96, "step": 26880, "token_acc": 0.9484435797665369, "train_speed(iter/s)": 0.240376 }, { "epoch": 2.0493177833676346, "grad_norm": 0.7947468757629395, "learning_rate": 6.397013548309226e-05, "loss": 0.14447057247161865, "memory(GiB)": 122.96, "step": 26885, "token_acc": 0.9602076124567474, "train_speed(iter/s)": 0.240376 }, { "epoch": 2.0496989099778946, "grad_norm": 0.6386239528656006, "learning_rate": 6.395863849139914e-05, "loss": 0.1317639946937561, "memory(GiB)": 122.96, "step": 26890, "token_acc": 0.952633504023409, "train_speed(iter/s)": 0.240383 }, { "epoch": 2.0500800365881546, "grad_norm": 0.5396363735198975, "learning_rate": 6.394714069924285e-05, "loss": 0.09763221740722657, "memory(GiB)": 122.96, "step": 26895, "token_acc": 0.96215360253365, "train_speed(iter/s)": 0.240389 }, { "epoch": 2.0504611631984146, "grad_norm": 0.8052300810813904, "learning_rate": 6.39356421072828e-05, "loss": 0.06803131699562073, "memory(GiB)": 122.96, "step": 26900, "token_acc": 0.9681453515809344, "train_speed(iter/s)": 0.240401 }, { "epoch": 2.0508422898086742, "grad_norm": 0.641417384147644, "learning_rate": 6.392414271617833e-05, "loss": 0.10935251712799073, "memory(GiB)": 122.96, "step": 26905, "token_acc": 0.9626834381551362, "train_speed(iter/s)": 0.240408 }, { "epoch": 2.0512234164189342, "grad_norm": 0.7287918925285339, "learning_rate": 6.39126425265889e-05, "loss": 0.105903959274292, "memory(GiB)": 122.96, "step": 26910, "token_acc": 0.961764705882353, "train_speed(iter/s)": 0.240412 }, { "epoch": 2.0516045430291943, "grad_norm": 0.5422208905220032, "learning_rate": 6.390114153917397e-05, "loss": 0.07489492297172547, "memory(GiB)": 122.96, "step": 26915, "token_acc": 0.9693815064298836, "train_speed(iter/s)": 0.240418 }, { "epoch": 2.0519856696394543, "grad_norm": 1.1631685495376587, "learning_rate": 6.38896397545931e-05, "loss": 0.09598802924156188, "memory(GiB)": 122.96, "step": 26920, "token_acc": 0.9579655317360235, "train_speed(iter/s)": 0.240432 }, { "epoch": 2.0523667962497143, "grad_norm": 1.1039069890975952, "learning_rate": 6.387813717350582e-05, "loss": 0.12486727237701416, "memory(GiB)": 122.96, "step": 26925, "token_acc": 0.9461053487899126, "train_speed(iter/s)": 0.240439 }, { "epoch": 2.052747922859974, "grad_norm": 1.448319911956787, "learning_rate": 6.38666337965718e-05, "loss": 0.1017630934715271, "memory(GiB)": 122.96, "step": 26930, "token_acc": 0.9601761056633981, "train_speed(iter/s)": 0.240446 }, { "epoch": 2.053129049470234, "grad_norm": 1.5498502254486084, "learning_rate": 6.385512962445068e-05, "loss": 0.1384517192840576, "memory(GiB)": 122.96, "step": 26935, "token_acc": 0.9599285349668198, "train_speed(iter/s)": 0.240456 }, { "epoch": 2.053510176080494, "grad_norm": 0.8316618800163269, "learning_rate": 6.384362465780213e-05, "loss": 0.12466484308242798, "memory(GiB)": 122.96, "step": 26940, "token_acc": 0.9524995211645279, "train_speed(iter/s)": 0.240463 }, { "epoch": 2.053891302690754, "grad_norm": 1.3200207948684692, "learning_rate": 6.3832118897286e-05, "loss": 0.09037742018699646, "memory(GiB)": 122.96, "step": 26945, "token_acc": 0.9413377192982456, "train_speed(iter/s)": 0.240478 }, { "epoch": 2.054272429301014, "grad_norm": 0.7724085450172424, "learning_rate": 6.382061234356203e-05, "loss": 0.07299980521202087, "memory(GiB)": 122.96, "step": 26950, "token_acc": 0.9688231850117096, "train_speed(iter/s)": 0.240485 }, { "epoch": 2.0546535559112735, "grad_norm": 1.1012468338012695, "learning_rate": 6.380910499729005e-05, "loss": 0.08051310181617737, "memory(GiB)": 122.96, "step": 26955, "token_acc": 0.9683787561146052, "train_speed(iter/s)": 0.240491 }, { "epoch": 2.0550346825215335, "grad_norm": 0.7243775129318237, "learning_rate": 6.379759685912999e-05, "loss": 0.12897799015045167, "memory(GiB)": 122.96, "step": 26960, "token_acc": 0.9543147208121827, "train_speed(iter/s)": 0.240503 }, { "epoch": 2.0554158091317936, "grad_norm": 1.0771678686141968, "learning_rate": 6.378608792974179e-05, "loss": 0.1282886743545532, "memory(GiB)": 122.96, "step": 26965, "token_acc": 0.9476011496794163, "train_speed(iter/s)": 0.240511 }, { "epoch": 2.0557969357420536, "grad_norm": 0.9604676961898804, "learning_rate": 6.377457820978543e-05, "loss": 0.11191043853759766, "memory(GiB)": 122.96, "step": 26970, "token_acc": 0.9593466424682395, "train_speed(iter/s)": 0.240518 }, { "epoch": 2.0561780623523136, "grad_norm": 0.46029436588287354, "learning_rate": 6.376306769992092e-05, "loss": 0.074744713306427, "memory(GiB)": 122.96, "step": 26975, "token_acc": 0.9601711652402897, "train_speed(iter/s)": 0.24053 }, { "epoch": 2.056559188962573, "grad_norm": 0.8221893906593323, "learning_rate": 6.375155640080834e-05, "loss": 0.09392922520637512, "memory(GiB)": 122.96, "step": 26980, "token_acc": 0.9672648720505151, "train_speed(iter/s)": 0.240537 }, { "epoch": 2.056940315572833, "grad_norm": 0.8226714134216309, "learning_rate": 6.374004431310783e-05, "loss": 0.09526990652084351, "memory(GiB)": 122.96, "step": 26985, "token_acc": 0.9666409266409266, "train_speed(iter/s)": 0.240547 }, { "epoch": 2.057321442183093, "grad_norm": 0.08478694409132004, "learning_rate": 6.372853143747954e-05, "loss": 0.0838483989238739, "memory(GiB)": 122.96, "step": 26990, "token_acc": 0.9648351648351648, "train_speed(iter/s)": 0.240546 }, { "epoch": 2.0577025687933532, "grad_norm": 0.7727572321891785, "learning_rate": 6.371701777458366e-05, "loss": 0.08975453972816468, "memory(GiB)": 122.96, "step": 26995, "token_acc": 0.9608038201352964, "train_speed(iter/s)": 0.240558 }, { "epoch": 2.0580836954036132, "grad_norm": 1.1627360582351685, "learning_rate": 6.370550332508047e-05, "loss": 0.12549465894699097, "memory(GiB)": 122.96, "step": 27000, "token_acc": 0.9558773997979118, "train_speed(iter/s)": 0.240562 }, { "epoch": 2.0580836954036132, "eval_loss": 0.08914484083652496, "eval_runtime": 215.6991, "eval_samples_per_second": 2.457, "eval_steps_per_second": 2.457, "eval_token_acc": 0.9601530028311548, "step": 27000 }, { "epoch": 2.058464822013873, "grad_norm": 1.056495189666748, "learning_rate": 6.369398808963029e-05, "loss": 0.1255308508872986, "memory(GiB)": 122.96, "step": 27005, "token_acc": 0.9596201391965472, "train_speed(iter/s)": 0.240105 }, { "epoch": 2.058845948624133, "grad_norm": 0.8481756448745728, "learning_rate": 6.368247206889342e-05, "loss": 0.1052697777748108, "memory(GiB)": 122.96, "step": 27010, "token_acc": 0.9679703846772896, "train_speed(iter/s)": 0.24011 }, { "epoch": 2.059227075234393, "grad_norm": 1.715609073638916, "learning_rate": 6.367095526353027e-05, "loss": 0.14658159017562866, "memory(GiB)": 122.96, "step": 27015, "token_acc": 0.9544602196624699, "train_speed(iter/s)": 0.240121 }, { "epoch": 2.059608201844653, "grad_norm": 2.168832778930664, "learning_rate": 6.365943767420128e-05, "loss": 0.16356544494628905, "memory(GiB)": 122.96, "step": 27020, "token_acc": 0.940893470790378, "train_speed(iter/s)": 0.240131 }, { "epoch": 2.059989328454913, "grad_norm": 0.5267851948738098, "learning_rate": 6.364791930156693e-05, "loss": 0.09318562746047973, "memory(GiB)": 122.96, "step": 27025, "token_acc": 0.9558852235018543, "train_speed(iter/s)": 0.240139 }, { "epoch": 2.0603704550651725, "grad_norm": 0.28233256936073303, "learning_rate": 6.363640014628774e-05, "loss": 0.0920604407787323, "memory(GiB)": 122.96, "step": 27030, "token_acc": 0.9663536776212832, "train_speed(iter/s)": 0.24015 }, { "epoch": 2.0607515816754325, "grad_norm": 1.2045133113861084, "learning_rate": 6.362488020902428e-05, "loss": 0.08805954456329346, "memory(GiB)": 122.96, "step": 27035, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.240163 }, { "epoch": 2.0611327082856925, "grad_norm": 1.2009985446929932, "learning_rate": 6.361335949043719e-05, "loss": 0.14710952043533326, "memory(GiB)": 122.96, "step": 27040, "token_acc": 0.9341434731477852, "train_speed(iter/s)": 0.240175 }, { "epoch": 2.0615138348959525, "grad_norm": 0.9667435884475708, "learning_rate": 6.360183799118708e-05, "loss": 0.07160587310791015, "memory(GiB)": 122.96, "step": 27045, "token_acc": 0.971614301191766, "train_speed(iter/s)": 0.240183 }, { "epoch": 2.0618949615062125, "grad_norm": 0.5232541561126709, "learning_rate": 6.359031571193468e-05, "loss": 0.07269712686538696, "memory(GiB)": 122.96, "step": 27050, "token_acc": 0.9700318098108154, "train_speed(iter/s)": 0.240187 }, { "epoch": 2.062276088116472, "grad_norm": 1.229220986366272, "learning_rate": 6.357879265334076e-05, "loss": 0.09179596900939942, "memory(GiB)": 122.96, "step": 27055, "token_acc": 0.9638723254998246, "train_speed(iter/s)": 0.240201 }, { "epoch": 2.062657214726732, "grad_norm": 0.7136285305023193, "learning_rate": 6.356726881606608e-05, "loss": 0.06252117156982422, "memory(GiB)": 122.96, "step": 27060, "token_acc": 0.9740110835085037, "train_speed(iter/s)": 0.240206 }, { "epoch": 2.063038341336992, "grad_norm": 1.1598740816116333, "learning_rate": 6.35557442007715e-05, "loss": 0.09664551615715027, "memory(GiB)": 122.96, "step": 27065, "token_acc": 0.9623574666849842, "train_speed(iter/s)": 0.240209 }, { "epoch": 2.063419467947252, "grad_norm": 1.3786028623580933, "learning_rate": 6.354421880811789e-05, "loss": 0.12123782634735107, "memory(GiB)": 122.96, "step": 27070, "token_acc": 0.94740545294635, "train_speed(iter/s)": 0.240216 }, { "epoch": 2.063800594557512, "grad_norm": 1.0676100254058838, "learning_rate": 6.35326926387662e-05, "loss": 0.13816661834716798, "memory(GiB)": 122.96, "step": 27075, "token_acc": 0.9375494071146245, "train_speed(iter/s)": 0.240228 }, { "epoch": 2.0641817211677718, "grad_norm": 0.7866661548614502, "learning_rate": 6.352116569337736e-05, "loss": 0.1314984679222107, "memory(GiB)": 122.96, "step": 27080, "token_acc": 0.95086891225059, "train_speed(iter/s)": 0.240238 }, { "epoch": 2.064562847778032, "grad_norm": 2.376122236251831, "learning_rate": 6.350963797261243e-05, "loss": 0.10533781051635742, "memory(GiB)": 122.96, "step": 27085, "token_acc": 0.96310755416748, "train_speed(iter/s)": 0.240244 }, { "epoch": 2.064943974388292, "grad_norm": 0.44144678115844727, "learning_rate": 6.349810947713245e-05, "loss": 0.07771116495132446, "memory(GiB)": 122.96, "step": 27090, "token_acc": 0.9700395703787451, "train_speed(iter/s)": 0.240255 }, { "epoch": 2.065325100998552, "grad_norm": 3.0964107513427734, "learning_rate": 6.348658020759854e-05, "loss": 0.0897703766822815, "memory(GiB)": 122.96, "step": 27095, "token_acc": 0.9645048203330412, "train_speed(iter/s)": 0.240266 }, { "epoch": 2.065706227608812, "grad_norm": 1.9238094091415405, "learning_rate": 6.347505016467184e-05, "loss": 0.10292158126831055, "memory(GiB)": 122.96, "step": 27100, "token_acc": 0.9528710725893824, "train_speed(iter/s)": 0.240282 }, { "epoch": 2.0660873542190714, "grad_norm": 1.2437328100204468, "learning_rate": 6.346351934901353e-05, "loss": 0.08587864637374878, "memory(GiB)": 122.96, "step": 27105, "token_acc": 0.9590243902439024, "train_speed(iter/s)": 0.240291 }, { "epoch": 2.0664684808293314, "grad_norm": 0.4304482638835907, "learning_rate": 6.345198776128487e-05, "loss": 0.08620719909667969, "memory(GiB)": 122.96, "step": 27110, "token_acc": 0.9599455040871935, "train_speed(iter/s)": 0.240302 }, { "epoch": 2.0668496074395915, "grad_norm": 0.5446736216545105, "learning_rate": 6.344045540214713e-05, "loss": 0.0955781638622284, "memory(GiB)": 122.96, "step": 27115, "token_acc": 0.9643086129514854, "train_speed(iter/s)": 0.240312 }, { "epoch": 2.0672307340498515, "grad_norm": 1.7266288995742798, "learning_rate": 6.342892227226167e-05, "loss": 0.10722672939300537, "memory(GiB)": 122.96, "step": 27120, "token_acc": 0.958930018913807, "train_speed(iter/s)": 0.240318 }, { "epoch": 2.0676118606601115, "grad_norm": 1.0265812873840332, "learning_rate": 6.341738837228982e-05, "loss": 0.09324288368225098, "memory(GiB)": 122.96, "step": 27125, "token_acc": 0.9645071295722256, "train_speed(iter/s)": 0.240325 }, { "epoch": 2.067992987270371, "grad_norm": 1.67649245262146, "learning_rate": 6.3405853702893e-05, "loss": 0.13621947765350342, "memory(GiB)": 122.96, "step": 27130, "token_acc": 0.9497619047619048, "train_speed(iter/s)": 0.240335 }, { "epoch": 2.068374113880631, "grad_norm": 0.7165817022323608, "learning_rate": 6.33943182647327e-05, "loss": 0.07382301092147828, "memory(GiB)": 122.96, "step": 27135, "token_acc": 0.9729617304492513, "train_speed(iter/s)": 0.240347 }, { "epoch": 2.068755240490891, "grad_norm": 0.8800249099731445, "learning_rate": 6.338278205847039e-05, "loss": 0.11724995374679566, "memory(GiB)": 122.96, "step": 27140, "token_acc": 0.9550634272199527, "train_speed(iter/s)": 0.240358 }, { "epoch": 2.069136367101151, "grad_norm": 0.5025755167007446, "learning_rate": 6.337124508476765e-05, "loss": 0.10026086568832397, "memory(GiB)": 122.96, "step": 27145, "token_acc": 0.9688301440126258, "train_speed(iter/s)": 0.240365 }, { "epoch": 2.069517493711411, "grad_norm": 0.7522721290588379, "learning_rate": 6.335970734428604e-05, "loss": 0.10567986965179443, "memory(GiB)": 122.96, "step": 27150, "token_acc": 0.9611919611919612, "train_speed(iter/s)": 0.240372 }, { "epoch": 2.0698986203216707, "grad_norm": 1.5401307344436646, "learning_rate": 6.334816883768719e-05, "loss": 0.10995858907699585, "memory(GiB)": 122.96, "step": 27155, "token_acc": 0.9524534043362495, "train_speed(iter/s)": 0.240385 }, { "epoch": 2.0702797469319307, "grad_norm": 1.2714885473251343, "learning_rate": 6.333662956563283e-05, "loss": 0.08846145868301392, "memory(GiB)": 122.96, "step": 27160, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.2404 }, { "epoch": 2.0706608735421907, "grad_norm": 0.9218497276306152, "learning_rate": 6.332508952878465e-05, "loss": 0.0684882640838623, "memory(GiB)": 122.96, "step": 27165, "token_acc": 0.9694179546201908, "train_speed(iter/s)": 0.240413 }, { "epoch": 2.0710420001524508, "grad_norm": 1.6380723714828491, "learning_rate": 6.331354872780441e-05, "loss": 0.10499304533004761, "memory(GiB)": 122.96, "step": 27170, "token_acc": 0.9615198451113263, "train_speed(iter/s)": 0.240423 }, { "epoch": 2.071423126762711, "grad_norm": 0.6559543609619141, "learning_rate": 6.330200716335395e-05, "loss": 0.08890421390533447, "memory(GiB)": 122.96, "step": 27175, "token_acc": 0.9607745159275453, "train_speed(iter/s)": 0.240429 }, { "epoch": 2.0718042533729704, "grad_norm": 0.8537452816963196, "learning_rate": 6.329046483609511e-05, "loss": 0.09030271768569946, "memory(GiB)": 122.96, "step": 27180, "token_acc": 0.9644581804133668, "train_speed(iter/s)": 0.240436 }, { "epoch": 2.0721853799832304, "grad_norm": 1.826904535293579, "learning_rate": 6.327892174668977e-05, "loss": 0.12254018783569336, "memory(GiB)": 122.96, "step": 27185, "token_acc": 0.9507042253521126, "train_speed(iter/s)": 0.240445 }, { "epoch": 2.0725665065934904, "grad_norm": 0.756435751914978, "learning_rate": 6.32673778957999e-05, "loss": 0.08373212814331055, "memory(GiB)": 122.96, "step": 27190, "token_acc": 0.9644287396937573, "train_speed(iter/s)": 0.240449 }, { "epoch": 2.0729476332037504, "grad_norm": 0.6416155695915222, "learning_rate": 6.325583328408747e-05, "loss": 0.08263529539108276, "memory(GiB)": 122.96, "step": 27195, "token_acc": 0.9644729178800233, "train_speed(iter/s)": 0.240458 }, { "epoch": 2.07332875981401, "grad_norm": 0.8863335251808167, "learning_rate": 6.324428791221452e-05, "loss": 0.10952297449111939, "memory(GiB)": 122.96, "step": 27200, "token_acc": 0.9580573951434879, "train_speed(iter/s)": 0.240463 }, { "epoch": 2.07332875981401, "eval_loss": 0.09116149693727493, "eval_runtime": 217.8586, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "eval_token_acc": 0.9597313414854527, "step": 27200 }, { "epoch": 2.07370988642427, "grad_norm": 0.7835207581520081, "learning_rate": 6.323274178084312e-05, "loss": 0.07261168956756592, "memory(GiB)": 122.96, "step": 27205, "token_acc": 0.9600676720240352, "train_speed(iter/s)": 0.240011 }, { "epoch": 2.07409101303453, "grad_norm": 0.5070050954818726, "learning_rate": 6.322119489063538e-05, "loss": 0.06022197604179382, "memory(GiB)": 122.96, "step": 27210, "token_acc": 0.9727520435967303, "train_speed(iter/s)": 0.240013 }, { "epoch": 2.07447213964479, "grad_norm": 1.064069390296936, "learning_rate": 6.320964724225347e-05, "loss": 0.12444254159927368, "memory(GiB)": 122.96, "step": 27215, "token_acc": 0.955026455026455, "train_speed(iter/s)": 0.240023 }, { "epoch": 2.07485326625505, "grad_norm": 0.7624698281288147, "learning_rate": 6.319809883635957e-05, "loss": 0.08999449014663696, "memory(GiB)": 122.96, "step": 27220, "token_acc": 0.9729924378826071, "train_speed(iter/s)": 0.240028 }, { "epoch": 2.07523439286531, "grad_norm": 0.592252790927887, "learning_rate": 6.318654967361598e-05, "loss": 0.078788423538208, "memory(GiB)": 122.96, "step": 27225, "token_acc": 0.9678225894118926, "train_speed(iter/s)": 0.24003 }, { "epoch": 2.0756155194755697, "grad_norm": 0.4671790599822998, "learning_rate": 6.317499975468495e-05, "loss": 0.10422115325927735, "memory(GiB)": 122.96, "step": 27230, "token_acc": 0.9691252144082333, "train_speed(iter/s)": 0.240034 }, { "epoch": 2.0759966460858297, "grad_norm": 0.7090429663658142, "learning_rate": 6.316344908022882e-05, "loss": 0.07802949547767639, "memory(GiB)": 122.96, "step": 27235, "token_acc": 0.9633587786259542, "train_speed(iter/s)": 0.240048 }, { "epoch": 2.0763777726960897, "grad_norm": 0.8349044919013977, "learning_rate": 6.315189765090998e-05, "loss": 0.07221020460128784, "memory(GiB)": 122.96, "step": 27240, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.240061 }, { "epoch": 2.0767588993063497, "grad_norm": 0.9693659543991089, "learning_rate": 6.314034546739084e-05, "loss": 0.10632894039154053, "memory(GiB)": 122.96, "step": 27245, "token_acc": 0.9578189300411523, "train_speed(iter/s)": 0.24007 }, { "epoch": 2.0771400259166093, "grad_norm": 0.43815377354621887, "learning_rate": 6.312879253033386e-05, "loss": 0.07895511984825135, "memory(GiB)": 122.96, "step": 27250, "token_acc": 0.9700400627068455, "train_speed(iter/s)": 0.240077 }, { "epoch": 2.0775211525268693, "grad_norm": 0.4931529462337494, "learning_rate": 6.311723884040154e-05, "loss": 0.10346498489379882, "memory(GiB)": 122.96, "step": 27255, "token_acc": 0.9574819888980749, "train_speed(iter/s)": 0.240079 }, { "epoch": 2.0779022791371293, "grad_norm": 1.4831833839416504, "learning_rate": 6.310568439825646e-05, "loss": 0.13021314144134521, "memory(GiB)": 122.96, "step": 27260, "token_acc": 0.9375491996851221, "train_speed(iter/s)": 0.240088 }, { "epoch": 2.0782834057473893, "grad_norm": 0.6185627579689026, "learning_rate": 6.309412920456121e-05, "loss": 0.0798985481262207, "memory(GiB)": 122.96, "step": 27265, "token_acc": 0.9624587458745875, "train_speed(iter/s)": 0.240096 }, { "epoch": 2.0786645323576494, "grad_norm": 1.0083574056625366, "learning_rate": 6.308257325997839e-05, "loss": 0.1391900062561035, "memory(GiB)": 122.96, "step": 27270, "token_acc": 0.9560909705021392, "train_speed(iter/s)": 0.240104 }, { "epoch": 2.079045658967909, "grad_norm": 0.6064718961715698, "learning_rate": 6.307101656517072e-05, "loss": 0.10200719833374024, "memory(GiB)": 122.96, "step": 27275, "token_acc": 0.9595287858635759, "train_speed(iter/s)": 0.24011 }, { "epoch": 2.079426785578169, "grad_norm": 1.1932419538497925, "learning_rate": 6.305945912080091e-05, "loss": 0.12151408195495605, "memory(GiB)": 122.96, "step": 27280, "token_acc": 0.9559289790741915, "train_speed(iter/s)": 0.240121 }, { "epoch": 2.079807912188429, "grad_norm": 1.6018142700195312, "learning_rate": 6.304790092753171e-05, "loss": 0.12491326332092285, "memory(GiB)": 122.96, "step": 27285, "token_acc": 0.9430801987224983, "train_speed(iter/s)": 0.24013 }, { "epoch": 2.080189038798689, "grad_norm": 0.958804190158844, "learning_rate": 6.303634198602593e-05, "loss": 0.08824538588523864, "memory(GiB)": 122.96, "step": 27290, "token_acc": 0.9634561783834005, "train_speed(iter/s)": 0.240141 }, { "epoch": 2.080570165408949, "grad_norm": 0.8639811873435974, "learning_rate": 6.302478229694646e-05, "loss": 0.10456247329711914, "memory(GiB)": 122.96, "step": 27295, "token_acc": 0.9691150267273807, "train_speed(iter/s)": 0.240149 }, { "epoch": 2.0809512920192086, "grad_norm": 0.9370761513710022, "learning_rate": 6.301322186095616e-05, "loss": 0.07610681056976318, "memory(GiB)": 122.96, "step": 27300, "token_acc": 0.9726669269816478, "train_speed(iter/s)": 0.240156 }, { "epoch": 2.0813324186294686, "grad_norm": 0.630244255065918, "learning_rate": 6.300166067871797e-05, "loss": 0.09241738319396972, "memory(GiB)": 122.96, "step": 27305, "token_acc": 0.9549526490913745, "train_speed(iter/s)": 0.240166 }, { "epoch": 2.0817135452397286, "grad_norm": 2.707411289215088, "learning_rate": 6.299009875089488e-05, "loss": 0.08493274450302124, "memory(GiB)": 122.96, "step": 27310, "token_acc": 0.9620670698185816, "train_speed(iter/s)": 0.240181 }, { "epoch": 2.0820946718499886, "grad_norm": 0.7390384674072266, "learning_rate": 6.29785360781499e-05, "loss": 0.1043481707572937, "memory(GiB)": 122.96, "step": 27315, "token_acc": 0.9567438692098093, "train_speed(iter/s)": 0.240193 }, { "epoch": 2.0824757984602487, "grad_norm": 0.817465603351593, "learning_rate": 6.29669726611461e-05, "loss": 0.1131706714630127, "memory(GiB)": 122.96, "step": 27320, "token_acc": 0.9595739530380053, "train_speed(iter/s)": 0.240205 }, { "epoch": 2.0828569250705082, "grad_norm": 0.768190324306488, "learning_rate": 6.29554085005466e-05, "loss": 0.08308249115943908, "memory(GiB)": 122.96, "step": 27325, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.240208 }, { "epoch": 2.0832380516807683, "grad_norm": 0.6892905831336975, "learning_rate": 6.294384359701455e-05, "loss": 0.07333185672760009, "memory(GiB)": 122.96, "step": 27330, "token_acc": 0.9719646323053699, "train_speed(iter/s)": 0.240218 }, { "epoch": 2.0836191782910283, "grad_norm": 0.9774745106697083, "learning_rate": 6.293227795121313e-05, "loss": 0.10476322174072265, "memory(GiB)": 122.96, "step": 27335, "token_acc": 0.9513415188722146, "train_speed(iter/s)": 0.240227 }, { "epoch": 2.0840003049012883, "grad_norm": 0.59339839220047, "learning_rate": 6.292071156380559e-05, "loss": 0.08289227485656739, "memory(GiB)": 122.96, "step": 27340, "token_acc": 0.9675798346571568, "train_speed(iter/s)": 0.240233 }, { "epoch": 2.0843814315115483, "grad_norm": 0.4842331111431122, "learning_rate": 6.290914443545519e-05, "loss": 0.08499487042427063, "memory(GiB)": 122.96, "step": 27345, "token_acc": 0.9682638456751711, "train_speed(iter/s)": 0.240237 }, { "epoch": 2.084762558121808, "grad_norm": 1.64462149143219, "learning_rate": 6.289757656682527e-05, "loss": 0.1094969630241394, "memory(GiB)": 122.96, "step": 27350, "token_acc": 0.968336483931947, "train_speed(iter/s)": 0.240251 }, { "epoch": 2.085143684732068, "grad_norm": 1.0304597616195679, "learning_rate": 6.288600795857917e-05, "loss": 0.05425162315368652, "memory(GiB)": 122.96, "step": 27355, "token_acc": 0.9778531073446328, "train_speed(iter/s)": 0.240259 }, { "epoch": 2.085524811342328, "grad_norm": 0.822722315788269, "learning_rate": 6.287443861138032e-05, "loss": 0.10501564741134643, "memory(GiB)": 122.96, "step": 27360, "token_acc": 0.957290390337826, "train_speed(iter/s)": 0.240266 }, { "epoch": 2.085905937952588, "grad_norm": 0.880167543888092, "learning_rate": 6.286286852589217e-05, "loss": 0.11050130128860473, "memory(GiB)": 122.96, "step": 27365, "token_acc": 0.9613102302397342, "train_speed(iter/s)": 0.240275 }, { "epoch": 2.086287064562848, "grad_norm": 0.7569501399993896, "learning_rate": 6.28512977027782e-05, "loss": 0.09736736416816712, "memory(GiB)": 122.96, "step": 27370, "token_acc": 0.9607686148919136, "train_speed(iter/s)": 0.240289 }, { "epoch": 2.0866681911731075, "grad_norm": 0.849359929561615, "learning_rate": 6.283972614270194e-05, "loss": 0.10230519771575927, "memory(GiB)": 122.96, "step": 27375, "token_acc": 0.964114010989011, "train_speed(iter/s)": 0.240295 }, { "epoch": 2.0870493177833676, "grad_norm": 1.004315972328186, "learning_rate": 6.282815384632697e-05, "loss": 0.09945677518844605, "memory(GiB)": 122.96, "step": 27380, "token_acc": 0.9713420316868593, "train_speed(iter/s)": 0.240304 }, { "epoch": 2.0874304443936276, "grad_norm": 0.41830363869667053, "learning_rate": 6.28165808143169e-05, "loss": 0.07099489569664001, "memory(GiB)": 122.96, "step": 27385, "token_acc": 0.9658671586715867, "train_speed(iter/s)": 0.240311 }, { "epoch": 2.0878115710038876, "grad_norm": 0.6833613514900208, "learning_rate": 6.28050070473354e-05, "loss": 0.08008714914321899, "memory(GiB)": 122.96, "step": 27390, "token_acc": 0.9661843876177658, "train_speed(iter/s)": 0.240314 }, { "epoch": 2.0881926976141476, "grad_norm": 0.5011914968490601, "learning_rate": 6.279343254604617e-05, "loss": 0.09524340629577636, "memory(GiB)": 122.96, "step": 27395, "token_acc": 0.9627354627354627, "train_speed(iter/s)": 0.240323 }, { "epoch": 2.088573824224407, "grad_norm": 1.5527230501174927, "learning_rate": 6.278185731111296e-05, "loss": 0.07801447510719299, "memory(GiB)": 122.96, "step": 27400, "token_acc": 0.9705453484981044, "train_speed(iter/s)": 0.240329 }, { "epoch": 2.088573824224407, "eval_loss": 0.09103234112262726, "eval_runtime": 219.3587, "eval_samples_per_second": 2.416, "eval_steps_per_second": 2.416, "eval_token_acc": 0.9604391301728812, "step": 27400 }, { "epoch": 2.088954950834667, "grad_norm": 0.7897855639457703, "learning_rate": 6.277028134319953e-05, "loss": 0.10184295177459717, "memory(GiB)": 122.96, "step": 27405, "token_acc": 0.9603171149215192, "train_speed(iter/s)": 0.239874 }, { "epoch": 2.089336077444927, "grad_norm": 1.2169387340545654, "learning_rate": 6.275870464296974e-05, "loss": 0.09539178609848023, "memory(GiB)": 122.96, "step": 27410, "token_acc": 0.9606019151846785, "train_speed(iter/s)": 0.239885 }, { "epoch": 2.0897172040551872, "grad_norm": 0.8620839715003967, "learning_rate": 6.274712721108745e-05, "loss": 0.12181757688522339, "memory(GiB)": 122.96, "step": 27415, "token_acc": 0.953654860587792, "train_speed(iter/s)": 0.239894 }, { "epoch": 2.0900983306654473, "grad_norm": 2.1008286476135254, "learning_rate": 6.273554904821656e-05, "loss": 0.09877347350120544, "memory(GiB)": 122.96, "step": 27420, "token_acc": 0.9653044591919897, "train_speed(iter/s)": 0.239894 }, { "epoch": 2.090479457275707, "grad_norm": 0.6257491707801819, "learning_rate": 6.272397015502103e-05, "loss": 0.15242698192596435, "memory(GiB)": 122.96, "step": 27425, "token_acc": 0.9547489983502239, "train_speed(iter/s)": 0.239905 }, { "epoch": 2.090860583885967, "grad_norm": 1.0099023580551147, "learning_rate": 6.271239053216487e-05, "loss": 0.112270188331604, "memory(GiB)": 122.96, "step": 27430, "token_acc": 0.9612736660929432, "train_speed(iter/s)": 0.239911 }, { "epoch": 2.091241710496227, "grad_norm": 3.634157419204712, "learning_rate": 6.27008101803121e-05, "loss": 0.1270209550857544, "memory(GiB)": 122.96, "step": 27435, "token_acc": 0.9424019607843137, "train_speed(iter/s)": 0.239922 }, { "epoch": 2.091622837106487, "grad_norm": 0.1678226888179779, "learning_rate": 6.268922910012679e-05, "loss": 0.07883673310279846, "memory(GiB)": 122.96, "step": 27440, "token_acc": 0.9672848510106272, "train_speed(iter/s)": 0.23993 }, { "epoch": 2.092003963716747, "grad_norm": 0.7551071047782898, "learning_rate": 6.26776472922731e-05, "loss": 0.06967081427574158, "memory(GiB)": 122.96, "step": 27445, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.23994 }, { "epoch": 2.0923850903270065, "grad_norm": 0.7371070981025696, "learning_rate": 6.266606475741515e-05, "loss": 0.08049039840698242, "memory(GiB)": 122.96, "step": 27450, "token_acc": 0.9674756480105663, "train_speed(iter/s)": 0.239946 }, { "epoch": 2.0927662169372665, "grad_norm": 1.295127511024475, "learning_rate": 6.265448149621718e-05, "loss": 0.12148127555847169, "memory(GiB)": 122.96, "step": 27455, "token_acc": 0.9453366275478691, "train_speed(iter/s)": 0.239959 }, { "epoch": 2.0931473435475265, "grad_norm": 1.0731515884399414, "learning_rate": 6.264289750934342e-05, "loss": 0.14644325971603395, "memory(GiB)": 122.96, "step": 27460, "token_acc": 0.9450113378684807, "train_speed(iter/s)": 0.239969 }, { "epoch": 2.0935284701577865, "grad_norm": 0.9959650635719299, "learning_rate": 6.263131279745815e-05, "loss": 0.09337406754493713, "memory(GiB)": 122.96, "step": 27465, "token_acc": 0.9709816368170483, "train_speed(iter/s)": 0.23997 }, { "epoch": 2.0939095967680466, "grad_norm": 1.279561161994934, "learning_rate": 6.26197273612257e-05, "loss": 0.10333985090255737, "memory(GiB)": 122.96, "step": 27470, "token_acc": 0.9732586068855084, "train_speed(iter/s)": 0.239973 }, { "epoch": 2.094290723378306, "grad_norm": 0.9768779277801514, "learning_rate": 6.260814120131046e-05, "loss": 0.0790955364704132, "memory(GiB)": 122.96, "step": 27475, "token_acc": 0.9670027497708524, "train_speed(iter/s)": 0.239985 }, { "epoch": 2.094671849988566, "grad_norm": 1.4307076930999756, "learning_rate": 6.259655431837683e-05, "loss": 0.11093891859054565, "memory(GiB)": 122.96, "step": 27480, "token_acc": 0.9598630989421282, "train_speed(iter/s)": 0.239997 }, { "epoch": 2.095052976598826, "grad_norm": 0.9147241711616516, "learning_rate": 6.258496671308927e-05, "loss": 0.10938501358032227, "memory(GiB)": 122.96, "step": 27485, "token_acc": 0.9598757442402278, "train_speed(iter/s)": 0.240006 }, { "epoch": 2.095434103209086, "grad_norm": 0.7322588562965393, "learning_rate": 6.257337838611225e-05, "loss": 0.11372298002243042, "memory(GiB)": 122.96, "step": 27490, "token_acc": 0.9567706842255941, "train_speed(iter/s)": 0.240015 }, { "epoch": 2.0958152298193458, "grad_norm": 0.7225372791290283, "learning_rate": 6.256178933811034e-05, "loss": 0.0990553617477417, "memory(GiB)": 122.96, "step": 27495, "token_acc": 0.9633507853403142, "train_speed(iter/s)": 0.240025 }, { "epoch": 2.096196356429606, "grad_norm": 0.8304914236068726, "learning_rate": 6.25501995697481e-05, "loss": 0.08449924588203431, "memory(GiB)": 122.96, "step": 27500, "token_acc": 0.9657206044968669, "train_speed(iter/s)": 0.240039 }, { "epoch": 2.096577483039866, "grad_norm": 0.6901969313621521, "learning_rate": 6.253860908169017e-05, "loss": 0.10392621755599976, "memory(GiB)": 122.96, "step": 27505, "token_acc": 0.9551316021904257, "train_speed(iter/s)": 0.240046 }, { "epoch": 2.096958609650126, "grad_norm": 1.3439838886260986, "learning_rate": 6.252701787460118e-05, "loss": 0.0905199646949768, "memory(GiB)": 122.96, "step": 27510, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.240058 }, { "epoch": 2.097339736260386, "grad_norm": 0.6792594790458679, "learning_rate": 6.251542594914586e-05, "loss": 0.10548669099807739, "memory(GiB)": 122.96, "step": 27515, "token_acc": 0.9658787255909558, "train_speed(iter/s)": 0.240066 }, { "epoch": 2.097720862870646, "grad_norm": 0.8755074143409729, "learning_rate": 6.250383330598892e-05, "loss": 0.12976969480514527, "memory(GiB)": 122.96, "step": 27520, "token_acc": 0.9550075708414449, "train_speed(iter/s)": 0.240074 }, { "epoch": 2.0981019894809054, "grad_norm": 0.985792875289917, "learning_rate": 6.249223994579518e-05, "loss": 0.13325886726379393, "memory(GiB)": 122.96, "step": 27525, "token_acc": 0.951270207852194, "train_speed(iter/s)": 0.240082 }, { "epoch": 2.0984831160911654, "grad_norm": 0.5580151081085205, "learning_rate": 6.248064586922945e-05, "loss": 0.11278994083404541, "memory(GiB)": 122.96, "step": 27530, "token_acc": 0.9641451068616423, "train_speed(iter/s)": 0.240088 }, { "epoch": 2.0988642427014255, "grad_norm": 1.3276885747909546, "learning_rate": 6.24690510769566e-05, "loss": 0.0966147541999817, "memory(GiB)": 122.96, "step": 27535, "token_acc": 0.9632919879316125, "train_speed(iter/s)": 0.240095 }, { "epoch": 2.0992453693116855, "grad_norm": 1.6260159015655518, "learning_rate": 6.245745556964153e-05, "loss": 0.07067039012908935, "memory(GiB)": 122.96, "step": 27540, "token_acc": 0.9602996254681648, "train_speed(iter/s)": 0.240112 }, { "epoch": 2.099626495921945, "grad_norm": 0.4616506099700928, "learning_rate": 6.244585934794918e-05, "loss": 0.11402645111083984, "memory(GiB)": 122.96, "step": 27545, "token_acc": 0.9685479643543596, "train_speed(iter/s)": 0.240116 }, { "epoch": 2.100007622532205, "grad_norm": 0.6388801336288452, "learning_rate": 6.243426241254458e-05, "loss": 0.08127616047859192, "memory(GiB)": 122.96, "step": 27550, "token_acc": 0.9678391959798995, "train_speed(iter/s)": 0.240122 }, { "epoch": 2.100388749142465, "grad_norm": 1.0668185949325562, "learning_rate": 6.242266476409271e-05, "loss": 0.1456449031829834, "memory(GiB)": 122.96, "step": 27555, "token_acc": 0.9439480097481722, "train_speed(iter/s)": 0.240133 }, { "epoch": 2.100769875752725, "grad_norm": 0.7531003355979919, "learning_rate": 6.241106640325867e-05, "loss": 0.0777139961719513, "memory(GiB)": 122.96, "step": 27560, "token_acc": 0.9666790077748982, "train_speed(iter/s)": 0.240134 }, { "epoch": 2.101151002362985, "grad_norm": 0.7044401168823242, "learning_rate": 6.239946733070756e-05, "loss": 0.08413103222846985, "memory(GiB)": 122.96, "step": 27565, "token_acc": 0.9678877259752616, "train_speed(iter/s)": 0.240141 }, { "epoch": 2.1015321289732447, "grad_norm": 0.825570821762085, "learning_rate": 6.238786754710455e-05, "loss": 0.11540061235427856, "memory(GiB)": 122.96, "step": 27570, "token_acc": 0.9610956728860659, "train_speed(iter/s)": 0.240153 }, { "epoch": 2.1019132555835047, "grad_norm": 0.7807924747467041, "learning_rate": 6.237626705311482e-05, "loss": 0.08759520649909973, "memory(GiB)": 122.96, "step": 27575, "token_acc": 0.9558021806853583, "train_speed(iter/s)": 0.240162 }, { "epoch": 2.1022943821937647, "grad_norm": 1.221156358718872, "learning_rate": 6.23646658494036e-05, "loss": 0.1410202145576477, "memory(GiB)": 122.96, "step": 27580, "token_acc": 0.9436220472440945, "train_speed(iter/s)": 0.240173 }, { "epoch": 2.1026755088040248, "grad_norm": 0.6920754313468933, "learning_rate": 6.23530639366362e-05, "loss": 0.07761870622634888, "memory(GiB)": 122.96, "step": 27585, "token_acc": 0.9656181419166057, "train_speed(iter/s)": 0.240183 }, { "epoch": 2.103056635414285, "grad_norm": 0.12092328816652298, "learning_rate": 6.234146131547787e-05, "loss": 0.08397802114486694, "memory(GiB)": 122.96, "step": 27590, "token_acc": 0.9658661465317566, "train_speed(iter/s)": 0.240184 }, { "epoch": 2.1034377620245444, "grad_norm": 0.9724830985069275, "learning_rate": 6.232985798659404e-05, "loss": 0.11129393577575683, "memory(GiB)": 122.96, "step": 27595, "token_acc": 0.957084631982738, "train_speed(iter/s)": 0.240194 }, { "epoch": 2.1038188886348044, "grad_norm": 1.2841081619262695, "learning_rate": 6.231825395065004e-05, "loss": 0.15161778926849365, "memory(GiB)": 122.96, "step": 27600, "token_acc": 0.9542524175551699, "train_speed(iter/s)": 0.2402 }, { "epoch": 2.1038188886348044, "eval_loss": 0.08985377103090286, "eval_runtime": 218.075, "eval_samples_per_second": 2.43, "eval_steps_per_second": 2.43, "eval_token_acc": 0.9604014818384434, "step": 27600 }, { "epoch": 2.1042000152450644, "grad_norm": 0.9209967851638794, "learning_rate": 6.230664920831136e-05, "loss": 0.09173312783241272, "memory(GiB)": 122.96, "step": 27605, "token_acc": 0.9606718528995757, "train_speed(iter/s)": 0.239749 }, { "epoch": 2.1045811418553244, "grad_norm": 1.8660341501235962, "learning_rate": 6.229504376024345e-05, "loss": 0.14252665042877197, "memory(GiB)": 122.96, "step": 27610, "token_acc": 0.9441602728047741, "train_speed(iter/s)": 0.239758 }, { "epoch": 2.1049622684655844, "grad_norm": 0.6952311992645264, "learning_rate": 6.228343760711184e-05, "loss": 0.07365024685859681, "memory(GiB)": 122.96, "step": 27615, "token_acc": 0.9716729021913415, "train_speed(iter/s)": 0.239769 }, { "epoch": 2.105343395075844, "grad_norm": 1.340843915939331, "learning_rate": 6.227183074958208e-05, "loss": 0.06616742014884949, "memory(GiB)": 122.96, "step": 27620, "token_acc": 0.9753784056508578, "train_speed(iter/s)": 0.239778 }, { "epoch": 2.105724521686104, "grad_norm": 2.5461747646331787, "learning_rate": 6.226022318831977e-05, "loss": 0.1364797830581665, "memory(GiB)": 122.96, "step": 27625, "token_acc": 0.942865264354747, "train_speed(iter/s)": 0.239789 }, { "epoch": 2.106105648296364, "grad_norm": 0.8338993191719055, "learning_rate": 6.224861492399057e-05, "loss": 0.09014132022857665, "memory(GiB)": 122.96, "step": 27630, "token_acc": 0.9645844223732357, "train_speed(iter/s)": 0.239788 }, { "epoch": 2.106486774906624, "grad_norm": 0.7258082628250122, "learning_rate": 6.223700595726014e-05, "loss": 0.05745429992675781, "memory(GiB)": 122.96, "step": 27635, "token_acc": 0.978103448275862, "train_speed(iter/s)": 0.239792 }, { "epoch": 2.106867901516884, "grad_norm": 0.6892473101615906, "learning_rate": 6.22253962887942e-05, "loss": 0.13936772346496581, "memory(GiB)": 122.96, "step": 27640, "token_acc": 0.9491623339110341, "train_speed(iter/s)": 0.239804 }, { "epoch": 2.1072490281271437, "grad_norm": 1.0793795585632324, "learning_rate": 6.221378591925853e-05, "loss": 0.09958188533782959, "memory(GiB)": 122.96, "step": 27645, "token_acc": 0.9662375516152538, "train_speed(iter/s)": 0.239814 }, { "epoch": 2.1076301547374037, "grad_norm": 0.9540955424308777, "learning_rate": 6.22021748493189e-05, "loss": 0.10529236793518067, "memory(GiB)": 122.96, "step": 27650, "token_acc": 0.957370159111378, "train_speed(iter/s)": 0.239821 }, { "epoch": 2.1080112813476637, "grad_norm": 0.7717848420143127, "learning_rate": 6.219056307964117e-05, "loss": 0.1130135178565979, "memory(GiB)": 122.96, "step": 27655, "token_acc": 0.9507251153592617, "train_speed(iter/s)": 0.239829 }, { "epoch": 2.1083924079579237, "grad_norm": 1.7596319913864136, "learning_rate": 6.217895061089122e-05, "loss": 0.13061641454696654, "memory(GiB)": 122.96, "step": 27660, "token_acc": 0.9492208490059107, "train_speed(iter/s)": 0.239838 }, { "epoch": 2.1087735345681837, "grad_norm": 0.8522200584411621, "learning_rate": 6.216733744373496e-05, "loss": 0.0719691812992096, "memory(GiB)": 122.96, "step": 27665, "token_acc": 0.9617996604414262, "train_speed(iter/s)": 0.239852 }, { "epoch": 2.1091546611784433, "grad_norm": 0.758590817451477, "learning_rate": 6.215572357883837e-05, "loss": 0.11448359489440918, "memory(GiB)": 122.96, "step": 27670, "token_acc": 0.9539303196084077, "train_speed(iter/s)": 0.23986 }, { "epoch": 2.1095357877887033, "grad_norm": 0.8414782881736755, "learning_rate": 6.214410901686745e-05, "loss": 0.11950762271881103, "memory(GiB)": 122.96, "step": 27675, "token_acc": 0.9595808383233533, "train_speed(iter/s)": 0.239871 }, { "epoch": 2.1099169143989633, "grad_norm": 0.4685438871383667, "learning_rate": 6.213249375848823e-05, "loss": 0.1349416732788086, "memory(GiB)": 122.96, "step": 27680, "token_acc": 0.9593900481540931, "train_speed(iter/s)": 0.239877 }, { "epoch": 2.1102980410092234, "grad_norm": 0.35739293694496155, "learning_rate": 6.21208778043668e-05, "loss": 0.11778390407562256, "memory(GiB)": 122.96, "step": 27685, "token_acc": 0.9463151207115629, "train_speed(iter/s)": 0.239888 }, { "epoch": 2.1106791676194834, "grad_norm": 0.7673335671424866, "learning_rate": 6.210926115516925e-05, "loss": 0.11495099067687989, "memory(GiB)": 122.96, "step": 27690, "token_acc": 0.9627128596594245, "train_speed(iter/s)": 0.239898 }, { "epoch": 2.111060294229743, "grad_norm": 0.7692159414291382, "learning_rate": 6.209764381156179e-05, "loss": 0.10626416206359864, "memory(GiB)": 122.96, "step": 27695, "token_acc": 0.9637305699481865, "train_speed(iter/s)": 0.239908 }, { "epoch": 2.111441420840003, "grad_norm": 1.2469465732574463, "learning_rate": 6.20860257742106e-05, "loss": 0.10212780237197876, "memory(GiB)": 122.96, "step": 27700, "token_acc": 0.9548319327731093, "train_speed(iter/s)": 0.23992 }, { "epoch": 2.111822547450263, "grad_norm": 1.2303427457809448, "learning_rate": 6.207440704378189e-05, "loss": 0.11788485050201417, "memory(GiB)": 122.96, "step": 27705, "token_acc": 0.9539808592954592, "train_speed(iter/s)": 0.239927 }, { "epoch": 2.112203674060523, "grad_norm": 0.7697901129722595, "learning_rate": 6.2062787620942e-05, "loss": 0.07162842750549317, "memory(GiB)": 122.96, "step": 27710, "token_acc": 0.9697063369397217, "train_speed(iter/s)": 0.239939 }, { "epoch": 2.112584800670783, "grad_norm": 0.6067183613777161, "learning_rate": 6.205116750635719e-05, "loss": 0.10851017236709595, "memory(GiB)": 122.96, "step": 27715, "token_acc": 0.9597560975609756, "train_speed(iter/s)": 0.239954 }, { "epoch": 2.1129659272810426, "grad_norm": 1.2871036529541016, "learning_rate": 6.203954670069388e-05, "loss": 0.09070445299148559, "memory(GiB)": 122.96, "step": 27720, "token_acc": 0.9687636522498908, "train_speed(iter/s)": 0.239964 }, { "epoch": 2.1133470538913026, "grad_norm": 1.2937735319137573, "learning_rate": 6.202792520461842e-05, "loss": 0.08251258134841918, "memory(GiB)": 122.96, "step": 27725, "token_acc": 0.9701673538685424, "train_speed(iter/s)": 0.239974 }, { "epoch": 2.1137281805015626, "grad_norm": 0.8037604093551636, "learning_rate": 6.201630301879727e-05, "loss": 0.054020369052886964, "memory(GiB)": 122.96, "step": 27730, "token_acc": 0.9780508395084889, "train_speed(iter/s)": 0.23997 }, { "epoch": 2.1141093071118227, "grad_norm": 0.9780164361000061, "learning_rate": 6.200468014389689e-05, "loss": 0.10729857683181762, "memory(GiB)": 122.96, "step": 27735, "token_acc": 0.9623655913978495, "train_speed(iter/s)": 0.239979 }, { "epoch": 2.1144904337220827, "grad_norm": 1.5614261627197266, "learning_rate": 6.199305658058382e-05, "loss": 0.0917648732662201, "memory(GiB)": 122.96, "step": 27740, "token_acc": 0.964735516372796, "train_speed(iter/s)": 0.239989 }, { "epoch": 2.1148715603323422, "grad_norm": 0.5690711140632629, "learning_rate": 6.198143232952463e-05, "loss": 0.07042406797409058, "memory(GiB)": 122.96, "step": 27745, "token_acc": 0.9702241552358648, "train_speed(iter/s)": 0.240002 }, { "epoch": 2.1152526869426023, "grad_norm": 0.5193389058113098, "learning_rate": 6.196980739138586e-05, "loss": 0.08511825799942016, "memory(GiB)": 122.96, "step": 27750, "token_acc": 0.965046650884278, "train_speed(iter/s)": 0.240004 }, { "epoch": 2.1156338135528623, "grad_norm": 0.5709406137466431, "learning_rate": 6.195818176683419e-05, "loss": 0.08614201545715332, "memory(GiB)": 122.96, "step": 27755, "token_acc": 0.9622847772615468, "train_speed(iter/s)": 0.240007 }, { "epoch": 2.1160149401631223, "grad_norm": 0.921393871307373, "learning_rate": 6.194655545653631e-05, "loss": 0.08205643892288209, "memory(GiB)": 122.96, "step": 27760, "token_acc": 0.9615614717319773, "train_speed(iter/s)": 0.240009 }, { "epoch": 2.1163960667733823, "grad_norm": 0.5504015684127808, "learning_rate": 6.19349284611589e-05, "loss": 0.07086114883422852, "memory(GiB)": 122.96, "step": 27765, "token_acc": 0.9723988439306358, "train_speed(iter/s)": 0.240013 }, { "epoch": 2.116777193383642, "grad_norm": 0.7585349082946777, "learning_rate": 6.192330078136873e-05, "loss": 0.05852036476135254, "memory(GiB)": 122.96, "step": 27770, "token_acc": 0.9755529685681025, "train_speed(iter/s)": 0.240026 }, { "epoch": 2.117158319993902, "grad_norm": 0.7448979020118713, "learning_rate": 6.19116724178326e-05, "loss": 0.10204025506973266, "memory(GiB)": 122.96, "step": 27775, "token_acc": 0.965322373490881, "train_speed(iter/s)": 0.240035 }, { "epoch": 2.117539446604162, "grad_norm": 0.977942705154419, "learning_rate": 6.190004337121732e-05, "loss": 0.10226210355758666, "memory(GiB)": 122.96, "step": 27780, "token_acc": 0.9538820782253357, "train_speed(iter/s)": 0.240042 }, { "epoch": 2.117920573214422, "grad_norm": 1.2242116928100586, "learning_rate": 6.188841364218978e-05, "loss": 0.09320048093795777, "memory(GiB)": 122.96, "step": 27785, "token_acc": 0.966259573587249, "train_speed(iter/s)": 0.240051 }, { "epoch": 2.118301699824682, "grad_norm": 0.9573265910148621, "learning_rate": 6.187678323141689e-05, "loss": 0.0681601345539093, "memory(GiB)": 122.96, "step": 27790, "token_acc": 0.9759372507311885, "train_speed(iter/s)": 0.240055 }, { "epoch": 2.1186828264349415, "grad_norm": 0.8420156240463257, "learning_rate": 6.18651521395656e-05, "loss": 0.09692507982254028, "memory(GiB)": 122.96, "step": 27795, "token_acc": 0.9550521395181589, "train_speed(iter/s)": 0.240063 }, { "epoch": 2.1190639530452016, "grad_norm": 1.116483211517334, "learning_rate": 6.185352036730287e-05, "loss": 0.10330965518951415, "memory(GiB)": 122.96, "step": 27800, "token_acc": 0.9627118644067797, "train_speed(iter/s)": 0.240074 }, { "epoch": 2.1190639530452016, "eval_loss": 0.090018130838871, "eval_runtime": 219.1831, "eval_samples_per_second": 2.418, "eval_steps_per_second": 2.418, "eval_token_acc": 0.9606876091801698, "step": 27800 }, { "epoch": 2.1194450796554616, "grad_norm": 1.5741393566131592, "learning_rate": 6.184188791529579e-05, "loss": 0.11982957124710084, "memory(GiB)": 122.96, "step": 27805, "token_acc": 0.9604590024158022, "train_speed(iter/s)": 0.239625 }, { "epoch": 2.1198262062657216, "grad_norm": 0.9526782631874084, "learning_rate": 6.183025478421138e-05, "loss": 0.10716099739074707, "memory(GiB)": 122.96, "step": 27810, "token_acc": 0.9605813697423475, "train_speed(iter/s)": 0.239634 }, { "epoch": 2.1202073328759816, "grad_norm": 0.5382548570632935, "learning_rate": 6.181862097471674e-05, "loss": 0.14860497713088988, "memory(GiB)": 122.96, "step": 27815, "token_acc": 0.9493370551290998, "train_speed(iter/s)": 0.239638 }, { "epoch": 2.120588459486241, "grad_norm": 0.6813355684280396, "learning_rate": 6.180698648747906e-05, "loss": 0.11101962327957153, "memory(GiB)": 122.96, "step": 27820, "token_acc": 0.9605784128077122, "train_speed(iter/s)": 0.239642 }, { "epoch": 2.120969586096501, "grad_norm": 0.94278484582901, "learning_rate": 6.179535132316547e-05, "loss": 0.11362979412078858, "memory(GiB)": 122.96, "step": 27825, "token_acc": 0.9569032258064516, "train_speed(iter/s)": 0.239654 }, { "epoch": 2.1213507127067612, "grad_norm": 0.8431946635246277, "learning_rate": 6.178371548244323e-05, "loss": 0.10990517139434815, "memory(GiB)": 122.96, "step": 27830, "token_acc": 0.958128078817734, "train_speed(iter/s)": 0.239665 }, { "epoch": 2.1217318393170213, "grad_norm": 0.8008997440338135, "learning_rate": 6.177207896597958e-05, "loss": 0.1387685179710388, "memory(GiB)": 122.96, "step": 27835, "token_acc": 0.9392430278884463, "train_speed(iter/s)": 0.239677 }, { "epoch": 2.122112965927281, "grad_norm": 0.919292151927948, "learning_rate": 6.176044177444185e-05, "loss": 0.12360445261001587, "memory(GiB)": 122.96, "step": 27840, "token_acc": 0.9514647252556769, "train_speed(iter/s)": 0.239683 }, { "epoch": 2.122494092537541, "grad_norm": 1.302345633506775, "learning_rate": 6.174880390849735e-05, "loss": 0.10691941976547241, "memory(GiB)": 122.96, "step": 27845, "token_acc": 0.9613793103448276, "train_speed(iter/s)": 0.239689 }, { "epoch": 2.122875219147801, "grad_norm": 0.5413615703582764, "learning_rate": 6.173716536881346e-05, "loss": 0.132038414478302, "memory(GiB)": 122.96, "step": 27850, "token_acc": 0.946985446985447, "train_speed(iter/s)": 0.239697 }, { "epoch": 2.123256345758061, "grad_norm": 0.5248980522155762, "learning_rate": 6.172552615605762e-05, "loss": 0.09266903400421142, "memory(GiB)": 122.96, "step": 27855, "token_acc": 0.9646970989761092, "train_speed(iter/s)": 0.239699 }, { "epoch": 2.123637472368321, "grad_norm": 1.5410182476043701, "learning_rate": 6.171388627089726e-05, "loss": 0.11054692268371583, "memory(GiB)": 122.96, "step": 27860, "token_acc": 0.9563485741596304, "train_speed(iter/s)": 0.239707 }, { "epoch": 2.1240185989785805, "grad_norm": 0.8714772462844849, "learning_rate": 6.170224571399987e-05, "loss": 0.06086079478263855, "memory(GiB)": 122.96, "step": 27865, "token_acc": 0.9655694286795309, "train_speed(iter/s)": 0.23972 }, { "epoch": 2.1243997255888405, "grad_norm": 1.0499274730682373, "learning_rate": 6.1690604486033e-05, "loss": 0.07207356691360474, "memory(GiB)": 122.96, "step": 27870, "token_acc": 0.973729863692689, "train_speed(iter/s)": 0.239729 }, { "epoch": 2.1247808521991005, "grad_norm": 0.9508426785469055, "learning_rate": 6.167896258766423e-05, "loss": 0.07693119645118714, "memory(GiB)": 122.96, "step": 27875, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.239742 }, { "epoch": 2.1251619788093605, "grad_norm": 0.7007280588150024, "learning_rate": 6.166732001956113e-05, "loss": 0.09342229962348939, "memory(GiB)": 122.96, "step": 27880, "token_acc": 0.9742436631234669, "train_speed(iter/s)": 0.239747 }, { "epoch": 2.1255431054196205, "grad_norm": 0.7621156573295593, "learning_rate": 6.165567678239138e-05, "loss": 0.0761534571647644, "memory(GiB)": 122.96, "step": 27885, "token_acc": 0.9630515683147262, "train_speed(iter/s)": 0.239758 }, { "epoch": 2.12592423202988, "grad_norm": 0.4037191569805145, "learning_rate": 6.164403287682264e-05, "loss": 0.09237680435180665, "memory(GiB)": 122.96, "step": 27890, "token_acc": 0.9710169491525423, "train_speed(iter/s)": 0.239762 }, { "epoch": 2.12630535864014, "grad_norm": 0.5879103541374207, "learning_rate": 6.163238830352267e-05, "loss": 0.06853762865066529, "memory(GiB)": 122.96, "step": 27895, "token_acc": 0.9667338709677419, "train_speed(iter/s)": 0.239769 }, { "epoch": 2.1266864852504, "grad_norm": 1.0007086992263794, "learning_rate": 6.162074306315922e-05, "loss": 0.0879755973815918, "memory(GiB)": 122.96, "step": 27900, "token_acc": 0.9675449871465296, "train_speed(iter/s)": 0.23977 }, { "epoch": 2.12706761186066, "grad_norm": 1.7321882247924805, "learning_rate": 6.160909715640006e-05, "loss": 0.09466269016265869, "memory(GiB)": 122.96, "step": 27905, "token_acc": 0.9624190064794816, "train_speed(iter/s)": 0.239779 }, { "epoch": 2.12744873847092, "grad_norm": 0.5722376704216003, "learning_rate": 6.159745058391305e-05, "loss": 0.063826984167099, "memory(GiB)": 122.96, "step": 27910, "token_acc": 0.9723082699709003, "train_speed(iter/s)": 0.239778 }, { "epoch": 2.1278298650811798, "grad_norm": 0.5586673617362976, "learning_rate": 6.158580334636607e-05, "loss": 0.0876001238822937, "memory(GiB)": 122.96, "step": 27915, "token_acc": 0.9628647214854111, "train_speed(iter/s)": 0.239792 }, { "epoch": 2.12821099169144, "grad_norm": 0.6319655776023865, "learning_rate": 6.157415544442704e-05, "loss": 0.14738489389419557, "memory(GiB)": 122.96, "step": 27920, "token_acc": 0.9507658643326039, "train_speed(iter/s)": 0.239802 }, { "epoch": 2.1285921183017, "grad_norm": 0.6159265637397766, "learning_rate": 6.156250687876391e-05, "loss": 0.10440703630447387, "memory(GiB)": 122.96, "step": 27925, "token_acc": 0.959578804347826, "train_speed(iter/s)": 0.239814 }, { "epoch": 2.12897324491196, "grad_norm": 1.4795209169387817, "learning_rate": 6.155085765004467e-05, "loss": 0.09945797324180602, "memory(GiB)": 122.96, "step": 27930, "token_acc": 0.9610825318720644, "train_speed(iter/s)": 0.239822 }, { "epoch": 2.12935437152222, "grad_norm": 1.104069471359253, "learning_rate": 6.153920775893734e-05, "loss": 0.11859880685806275, "memory(GiB)": 122.96, "step": 27935, "token_acc": 0.9399612653324726, "train_speed(iter/s)": 0.239834 }, { "epoch": 2.1297354981324794, "grad_norm": 1.0906697511672974, "learning_rate": 6.152755720610998e-05, "loss": 0.06528820991516113, "memory(GiB)": 122.96, "step": 27940, "token_acc": 0.9682586333578251, "train_speed(iter/s)": 0.239838 }, { "epoch": 2.1301166247427394, "grad_norm": 0.5203943848609924, "learning_rate": 6.151590599223072e-05, "loss": 0.08457621335983276, "memory(GiB)": 122.96, "step": 27945, "token_acc": 0.9703953904232069, "train_speed(iter/s)": 0.239842 }, { "epoch": 2.1304977513529995, "grad_norm": 0.6331333518028259, "learning_rate": 6.15042541179677e-05, "loss": 0.08070627450942994, "memory(GiB)": 122.96, "step": 27950, "token_acc": 0.9677194612169067, "train_speed(iter/s)": 0.239853 }, { "epoch": 2.1308788779632595, "grad_norm": 1.3844211101531982, "learning_rate": 6.149260158398909e-05, "loss": 0.1089299201965332, "memory(GiB)": 122.96, "step": 27955, "token_acc": 0.9562091503267974, "train_speed(iter/s)": 0.239869 }, { "epoch": 2.1312600045735195, "grad_norm": 0.4578879773616791, "learning_rate": 6.14809483909631e-05, "loss": 0.1007508635520935, "memory(GiB)": 122.96, "step": 27960, "token_acc": 0.9643351431917385, "train_speed(iter/s)": 0.23987 }, { "epoch": 2.131641131183779, "grad_norm": 0.6534649729728699, "learning_rate": 6.1469294539558e-05, "loss": 0.10212352275848388, "memory(GiB)": 122.96, "step": 27965, "token_acc": 0.9605435066631827, "train_speed(iter/s)": 0.239881 }, { "epoch": 2.132022257794039, "grad_norm": 0.91864413022995, "learning_rate": 6.145764003044209e-05, "loss": 0.1396666646003723, "memory(GiB)": 122.96, "step": 27970, "token_acc": 0.9275808936825886, "train_speed(iter/s)": 0.23989 }, { "epoch": 2.132403384404299, "grad_norm": 0.7016887664794922, "learning_rate": 6.144598486428368e-05, "loss": 0.10935758352279663, "memory(GiB)": 122.96, "step": 27975, "token_acc": 0.9674556213017751, "train_speed(iter/s)": 0.239903 }, { "epoch": 2.132784511014559, "grad_norm": 0.5596987009048462, "learning_rate": 6.143432904175117e-05, "loss": 0.05607488751411438, "memory(GiB)": 122.96, "step": 27980, "token_acc": 0.9766339315949881, "train_speed(iter/s)": 0.239903 }, { "epoch": 2.133165637624819, "grad_norm": 0.7366549372673035, "learning_rate": 6.142267256351295e-05, "loss": 0.10587786436080933, "memory(GiB)": 122.96, "step": 27985, "token_acc": 0.963056255247691, "train_speed(iter/s)": 0.239908 }, { "epoch": 2.1335467642350787, "grad_norm": 1.6402859687805176, "learning_rate": 6.141101543023745e-05, "loss": 0.13984909057617187, "memory(GiB)": 122.96, "step": 27990, "token_acc": 0.9538989700833742, "train_speed(iter/s)": 0.239921 }, { "epoch": 2.1339278908453387, "grad_norm": 0.6540932655334473, "learning_rate": 6.139935764259319e-05, "loss": 0.0892623484134674, "memory(GiB)": 122.96, "step": 27995, "token_acc": 0.9606974813174647, "train_speed(iter/s)": 0.239933 }, { "epoch": 2.1343090174555988, "grad_norm": 0.521432638168335, "learning_rate": 6.138769920124866e-05, "loss": 0.12113819122314454, "memory(GiB)": 122.96, "step": 28000, "token_acc": 0.9532779590625927, "train_speed(iter/s)": 0.23994 }, { "epoch": 2.1343090174555988, "eval_loss": 0.0899355337023735, "eval_runtime": 222.3948, "eval_samples_per_second": 2.383, "eval_steps_per_second": 2.383, "eval_token_acc": 0.9603186555026806, "step": 28000 }, { "epoch": 2.1346901440658588, "grad_norm": 0.7031410336494446, "learning_rate": 6.137604010687243e-05, "loss": 0.07744794487953185, "memory(GiB)": 122.96, "step": 28005, "token_acc": 0.9604096154262497, "train_speed(iter/s)": 0.239489 }, { "epoch": 2.135071270676119, "grad_norm": 0.9439501166343689, "learning_rate": 6.136438036013308e-05, "loss": 0.09381205439567566, "memory(GiB)": 122.96, "step": 28010, "token_acc": 0.9694444444444444, "train_speed(iter/s)": 0.239492 }, { "epoch": 2.1354523972863784, "grad_norm": 1.019777536392212, "learning_rate": 6.135271996169927e-05, "loss": 0.08376663327217101, "memory(GiB)": 122.96, "step": 28015, "token_acc": 0.9668793267272974, "train_speed(iter/s)": 0.239494 }, { "epoch": 2.1358335238966384, "grad_norm": 0.6609342694282532, "learning_rate": 6.134105891223964e-05, "loss": 0.12731715440750122, "memory(GiB)": 122.96, "step": 28020, "token_acc": 0.9578108865614398, "train_speed(iter/s)": 0.239494 }, { "epoch": 2.1362146505068984, "grad_norm": 1.1665223836898804, "learning_rate": 6.132939721242289e-05, "loss": 0.11257238388061523, "memory(GiB)": 122.96, "step": 28025, "token_acc": 0.9591801584567688, "train_speed(iter/s)": 0.239501 }, { "epoch": 2.1365957771171584, "grad_norm": 1.1340506076812744, "learning_rate": 6.131773486291781e-05, "loss": 0.05358799695968628, "memory(GiB)": 122.96, "step": 28030, "token_acc": 0.9785532533624137, "train_speed(iter/s)": 0.239514 }, { "epoch": 2.1369769037274184, "grad_norm": 0.7952711582183838, "learning_rate": 6.130607186439317e-05, "loss": 0.13592766523361205, "memory(GiB)": 122.96, "step": 28035, "token_acc": 0.944318610796527, "train_speed(iter/s)": 0.239522 }, { "epoch": 2.137358030337678, "grad_norm": 0.918303370475769, "learning_rate": 6.129440821751774e-05, "loss": 0.10717002153396607, "memory(GiB)": 122.96, "step": 28040, "token_acc": 0.9604365620736699, "train_speed(iter/s)": 0.239527 }, { "epoch": 2.137739156947938, "grad_norm": 1.8507052659988403, "learning_rate": 6.128274392296044e-05, "loss": 0.08610897064208985, "memory(GiB)": 122.96, "step": 28045, "token_acc": 0.969054441260745, "train_speed(iter/s)": 0.239541 }, { "epoch": 2.138120283558198, "grad_norm": 1.4840810298919678, "learning_rate": 6.127107898139013e-05, "loss": 0.09622470736503601, "memory(GiB)": 122.96, "step": 28050, "token_acc": 0.967457627118644, "train_speed(iter/s)": 0.239547 }, { "epoch": 2.138501410168458, "grad_norm": 1.106105089187622, "learning_rate": 6.125941339347575e-05, "loss": 0.10969299077987671, "memory(GiB)": 122.96, "step": 28055, "token_acc": 0.9708591840571535, "train_speed(iter/s)": 0.239553 }, { "epoch": 2.138882536778718, "grad_norm": 1.0433018207550049, "learning_rate": 6.124774715988624e-05, "loss": 0.11918030977249146, "memory(GiB)": 122.96, "step": 28060, "token_acc": 0.9485937791022537, "train_speed(iter/s)": 0.239562 }, { "epoch": 2.1392636633889777, "grad_norm": 0.5080896019935608, "learning_rate": 6.123608028129064e-05, "loss": 0.12598009109497071, "memory(GiB)": 122.96, "step": 28065, "token_acc": 0.9645372233400402, "train_speed(iter/s)": 0.239573 }, { "epoch": 2.1396447899992377, "grad_norm": 0.7710150480270386, "learning_rate": 6.122441275835798e-05, "loss": 0.09826524257659912, "memory(GiB)": 122.96, "step": 28070, "token_acc": 0.9653330167398789, "train_speed(iter/s)": 0.239572 }, { "epoch": 2.1400259166094977, "grad_norm": 0.6538243889808655, "learning_rate": 6.121274459175732e-05, "loss": 0.10576131343841552, "memory(GiB)": 122.96, "step": 28075, "token_acc": 0.9566574839302112, "train_speed(iter/s)": 0.23958 }, { "epoch": 2.1404070432197577, "grad_norm": 0.6489638090133667, "learning_rate": 6.120107578215781e-05, "loss": 0.09150153398513794, "memory(GiB)": 122.96, "step": 28080, "token_acc": 0.9596899224806201, "train_speed(iter/s)": 0.239592 }, { "epoch": 2.1407881698300177, "grad_norm": 0.7096555829048157, "learning_rate": 6.118940633022858e-05, "loss": 0.09272821545600891, "memory(GiB)": 122.96, "step": 28085, "token_acc": 0.9601979345955249, "train_speed(iter/s)": 0.239599 }, { "epoch": 2.1411692964402773, "grad_norm": 0.28069791197776794, "learning_rate": 6.117773623663881e-05, "loss": 0.07790007591247558, "memory(GiB)": 122.96, "step": 28090, "token_acc": 0.9672862453531599, "train_speed(iter/s)": 0.239612 }, { "epoch": 2.1415504230505373, "grad_norm": 1.622114658355713, "learning_rate": 6.116606550205773e-05, "loss": 0.1252113699913025, "memory(GiB)": 122.96, "step": 28095, "token_acc": 0.9525252525252526, "train_speed(iter/s)": 0.239621 }, { "epoch": 2.1419315496607974, "grad_norm": 0.6619305610656738, "learning_rate": 6.115439412715464e-05, "loss": 0.09359397888183593, "memory(GiB)": 122.96, "step": 28100, "token_acc": 0.9655937846836848, "train_speed(iter/s)": 0.239625 }, { "epoch": 2.1423126762710574, "grad_norm": 0.102820485830307, "learning_rate": 6.11427221125988e-05, "loss": 0.08729241490364074, "memory(GiB)": 122.96, "step": 28105, "token_acc": 0.955233291298865, "train_speed(iter/s)": 0.239635 }, { "epoch": 2.1426938028813174, "grad_norm": 1.069226622581482, "learning_rate": 6.113104945905954e-05, "loss": 0.07913691401481629, "memory(GiB)": 122.96, "step": 28110, "token_acc": 0.9713910761154856, "train_speed(iter/s)": 0.239644 }, { "epoch": 2.143074929491577, "grad_norm": 0.8148454427719116, "learning_rate": 6.111937616720627e-05, "loss": 0.09348658323287964, "memory(GiB)": 122.96, "step": 28115, "token_acc": 0.960327868852459, "train_speed(iter/s)": 0.239649 }, { "epoch": 2.143456056101837, "grad_norm": 0.547702431678772, "learning_rate": 6.110770223770837e-05, "loss": 0.10335943698883057, "memory(GiB)": 122.96, "step": 28120, "token_acc": 0.9539473684210527, "train_speed(iter/s)": 0.239656 }, { "epoch": 2.143837182712097, "grad_norm": 0.8776295781135559, "learning_rate": 6.109602767123528e-05, "loss": 0.07049182653427125, "memory(GiB)": 122.96, "step": 28125, "token_acc": 0.9719002458728486, "train_speed(iter/s)": 0.239666 }, { "epoch": 2.144218309322357, "grad_norm": 1.3733737468719482, "learning_rate": 6.108435246845651e-05, "loss": 0.119430410861969, "memory(GiB)": 122.96, "step": 28130, "token_acc": 0.9456437933693138, "train_speed(iter/s)": 0.239679 }, { "epoch": 2.1445994359326166, "grad_norm": 0.6764029264450073, "learning_rate": 6.107267663004155e-05, "loss": 0.11741838455200196, "memory(GiB)": 122.96, "step": 28135, "token_acc": 0.9546365914786967, "train_speed(iter/s)": 0.239689 }, { "epoch": 2.1449805625428766, "grad_norm": 0.8878328800201416, "learning_rate": 6.106100015665998e-05, "loss": 0.08556505441665649, "memory(GiB)": 122.96, "step": 28140, "token_acc": 0.9608540925266904, "train_speed(iter/s)": 0.239702 }, { "epoch": 2.1453616891531366, "grad_norm": 1.1151158809661865, "learning_rate": 6.104932304898138e-05, "loss": 0.08544302582740784, "memory(GiB)": 122.96, "step": 28145, "token_acc": 0.9658950344583849, "train_speed(iter/s)": 0.239708 }, { "epoch": 2.1457428157633966, "grad_norm": 0.7038438320159912, "learning_rate": 6.103764530767537e-05, "loss": 0.07581700086593628, "memory(GiB)": 122.96, "step": 28150, "token_acc": 0.9694513715710723, "train_speed(iter/s)": 0.239711 }, { "epoch": 2.1461239423736567, "grad_norm": 0.7154973149299622, "learning_rate": 6.102596693341163e-05, "loss": 0.12285703420639038, "memory(GiB)": 122.96, "step": 28155, "token_acc": 0.9494546979865772, "train_speed(iter/s)": 0.239717 }, { "epoch": 2.1465050689839167, "grad_norm": 0.8372565507888794, "learning_rate": 6.101428792685985e-05, "loss": 0.08536692857742309, "memory(GiB)": 122.96, "step": 28160, "token_acc": 0.9613793103448276, "train_speed(iter/s)": 0.239727 }, { "epoch": 2.1468861955941763, "grad_norm": 1.0679185390472412, "learning_rate": 6.100260828868975e-05, "loss": 0.07424387931823731, "memory(GiB)": 122.96, "step": 28165, "token_acc": 0.9730046948356808, "train_speed(iter/s)": 0.239733 }, { "epoch": 2.1472673222044363, "grad_norm": 0.42938724160194397, "learning_rate": 6.099092801957116e-05, "loss": 0.11934046745300293, "memory(GiB)": 122.96, "step": 28170, "token_acc": 0.9632641615255187, "train_speed(iter/s)": 0.239739 }, { "epoch": 2.1476484488146963, "grad_norm": 0.9847074151039124, "learning_rate": 6.097924712017383e-05, "loss": 0.11800200939178467, "memory(GiB)": 122.96, "step": 28175, "token_acc": 0.9551630434782609, "train_speed(iter/s)": 0.239748 }, { "epoch": 2.1480295754249563, "grad_norm": 0.6662660241127014, "learning_rate": 6.096756559116763e-05, "loss": 0.07804281711578369, "memory(GiB)": 122.96, "step": 28180, "token_acc": 0.9671772428884027, "train_speed(iter/s)": 0.239759 }, { "epoch": 2.148410702035216, "grad_norm": 0.8077803254127502, "learning_rate": 6.095588343322245e-05, "loss": 0.11857231855392455, "memory(GiB)": 122.96, "step": 28185, "token_acc": 0.9505391467416784, "train_speed(iter/s)": 0.239769 }, { "epoch": 2.148791828645476, "grad_norm": 0.637674868106842, "learning_rate": 6.094420064700819e-05, "loss": 0.09026944637298584, "memory(GiB)": 122.96, "step": 28190, "token_acc": 0.9685825621042377, "train_speed(iter/s)": 0.23978 }, { "epoch": 2.149172955255736, "grad_norm": 1.4687750339508057, "learning_rate": 6.09325172331948e-05, "loss": 0.11450526714324952, "memory(GiB)": 122.96, "step": 28195, "token_acc": 0.9489432935760619, "train_speed(iter/s)": 0.239788 }, { "epoch": 2.149554081865996, "grad_norm": 1.1059716939926147, "learning_rate": 6.092083319245229e-05, "loss": 0.09341561198234558, "memory(GiB)": 122.96, "step": 28200, "token_acc": 0.9552789303826649, "train_speed(iter/s)": 0.239798 }, { "epoch": 2.149554081865996, "eval_loss": 0.09128241240978241, "eval_runtime": 221.699, "eval_samples_per_second": 2.391, "eval_steps_per_second": 2.391, "eval_token_acc": 0.9608909101861334, "step": 28200 }, { "epoch": 2.149935208476256, "grad_norm": 0.7577376961708069, "learning_rate": 6.0909148525450676e-05, "loss": 0.06801459193229675, "memory(GiB)": 122.96, "step": 28205, "token_acc": 0.9616201359346037, "train_speed(iter/s)": 0.239347 }, { "epoch": 2.1503163350865155, "grad_norm": 1.174790859222412, "learning_rate": 6.089746323286002e-05, "loss": 0.10775550603866577, "memory(GiB)": 122.96, "step": 28210, "token_acc": 0.9597292724196278, "train_speed(iter/s)": 0.239349 }, { "epoch": 2.1506974616967756, "grad_norm": 1.023391604423523, "learning_rate": 6.088577731535041e-05, "loss": 0.08467478156089783, "memory(GiB)": 122.96, "step": 28215, "token_acc": 0.9626818469323213, "train_speed(iter/s)": 0.239357 }, { "epoch": 2.1510785883070356, "grad_norm": 1.0269309282302856, "learning_rate": 6.0874090773592e-05, "loss": 0.07792733907699585, "memory(GiB)": 122.96, "step": 28220, "token_acc": 0.9640866873065016, "train_speed(iter/s)": 0.23937 }, { "epoch": 2.1514597149172956, "grad_norm": 1.719300627708435, "learning_rate": 6.0862403608254925e-05, "loss": 0.15043263435363768, "memory(GiB)": 122.96, "step": 28225, "token_acc": 0.9445879988515647, "train_speed(iter/s)": 0.239383 }, { "epoch": 2.1518408415275556, "grad_norm": 1.3176071643829346, "learning_rate": 6.085071582000943e-05, "loss": 0.07715824842453003, "memory(GiB)": 122.96, "step": 28230, "token_acc": 0.9722331626624655, "train_speed(iter/s)": 0.239389 }, { "epoch": 2.152221968137815, "grad_norm": 1.2917969226837158, "learning_rate": 6.083902740952572e-05, "loss": 0.1180142879486084, "memory(GiB)": 122.96, "step": 28235, "token_acc": 0.954831424423294, "train_speed(iter/s)": 0.239396 }, { "epoch": 2.152603094748075, "grad_norm": 1.271726369857788, "learning_rate": 6.08273383774741e-05, "loss": 0.1064296841621399, "memory(GiB)": 122.96, "step": 28240, "token_acc": 0.9577284372331341, "train_speed(iter/s)": 0.239408 }, { "epoch": 2.1529842213583352, "grad_norm": 0.49032965302467346, "learning_rate": 6.081564872452488e-05, "loss": 0.06001255512237549, "memory(GiB)": 122.96, "step": 28245, "token_acc": 0.9656276686592656, "train_speed(iter/s)": 0.239418 }, { "epoch": 2.1533653479685952, "grad_norm": 0.9699283242225647, "learning_rate": 6.0803958451348365e-05, "loss": 0.11291947364807128, "memory(GiB)": 122.96, "step": 28250, "token_acc": 0.9640387275242047, "train_speed(iter/s)": 0.239427 }, { "epoch": 2.1537464745788553, "grad_norm": 0.8026388883590698, "learning_rate": 6.079226755861499e-05, "loss": 0.09559043049812317, "memory(GiB)": 122.96, "step": 28255, "token_acc": 0.9642934429777105, "train_speed(iter/s)": 0.239434 }, { "epoch": 2.154127601189115, "grad_norm": 0.06248077377676964, "learning_rate": 6.078057604699515e-05, "loss": 0.08909776210784912, "memory(GiB)": 122.96, "step": 28260, "token_acc": 0.9715205547300644, "train_speed(iter/s)": 0.239444 }, { "epoch": 2.154508727799375, "grad_norm": 1.3593966960906982, "learning_rate": 6.07688839171593e-05, "loss": 0.10180673599243165, "memory(GiB)": 122.96, "step": 28265, "token_acc": 0.9606625258799172, "train_speed(iter/s)": 0.239455 }, { "epoch": 2.154889854409635, "grad_norm": 1.2693363428115845, "learning_rate": 6.075719116977794e-05, "loss": 0.09159737825393677, "memory(GiB)": 122.96, "step": 28270, "token_acc": 0.9675655976676385, "train_speed(iter/s)": 0.23946 }, { "epoch": 2.155270981019895, "grad_norm": 0.4697171151638031, "learning_rate": 6.074549780552158e-05, "loss": 0.10791506767272949, "memory(GiB)": 122.96, "step": 28275, "token_acc": 0.9639169655472232, "train_speed(iter/s)": 0.239465 }, { "epoch": 2.155652107630155, "grad_norm": 1.8569612503051758, "learning_rate": 6.0733803825060776e-05, "loss": 0.08581749200820923, "memory(GiB)": 122.96, "step": 28280, "token_acc": 0.9605776736924277, "train_speed(iter/s)": 0.239477 }, { "epoch": 2.1560332342404145, "grad_norm": 0.019115058705210686, "learning_rate": 6.0722109229066146e-05, "loss": 0.07598314285278321, "memory(GiB)": 122.96, "step": 28285, "token_acc": 0.964828897338403, "train_speed(iter/s)": 0.239489 }, { "epoch": 2.1564143608506745, "grad_norm": 0.8687453269958496, "learning_rate": 6.0710414018208294e-05, "loss": 0.09754187464714051, "memory(GiB)": 122.96, "step": 28290, "token_acc": 0.9624656698199573, "train_speed(iter/s)": 0.2395 }, { "epoch": 2.1567954874609345, "grad_norm": 0.8459134101867676, "learning_rate": 6.069871819315791e-05, "loss": 0.09486402869224549, "memory(GiB)": 122.96, "step": 28295, "token_acc": 0.9631821170282708, "train_speed(iter/s)": 0.239508 }, { "epoch": 2.1571766140711945, "grad_norm": 0.839917778968811, "learning_rate": 6.0687021754585685e-05, "loss": 0.10098495483398437, "memory(GiB)": 122.96, "step": 28300, "token_acc": 0.958051689860835, "train_speed(iter/s)": 0.239516 }, { "epoch": 2.1575577406814546, "grad_norm": 1.873209834098816, "learning_rate": 6.067532470316236e-05, "loss": 0.0949479341506958, "memory(GiB)": 122.96, "step": 28305, "token_acc": 0.96672, "train_speed(iter/s)": 0.239516 }, { "epoch": 2.157938867291714, "grad_norm": 0.45355430245399475, "learning_rate": 6.06636270395587e-05, "loss": 0.08942803740501404, "memory(GiB)": 122.96, "step": 28310, "token_acc": 0.9641571450931288, "train_speed(iter/s)": 0.239524 }, { "epoch": 2.158319993901974, "grad_norm": 0.8409352898597717, "learning_rate": 6.065192876444551e-05, "loss": 0.10198326110839843, "memory(GiB)": 122.96, "step": 28315, "token_acc": 0.96062871967038, "train_speed(iter/s)": 0.239531 }, { "epoch": 2.158701120512234, "grad_norm": 0.8375471234321594, "learning_rate": 6.064022987849365e-05, "loss": 0.1107219099998474, "memory(GiB)": 122.96, "step": 28320, "token_acc": 0.9553646860039425, "train_speed(iter/s)": 0.239538 }, { "epoch": 2.159082247122494, "grad_norm": 1.2194017171859741, "learning_rate": 6.062853038237397e-05, "loss": 0.1135896921157837, "memory(GiB)": 122.96, "step": 28325, "token_acc": 0.9537387836490528, "train_speed(iter/s)": 0.239546 }, { "epoch": 2.159463373732754, "grad_norm": 1.4612364768981934, "learning_rate": 6.061683027675741e-05, "loss": 0.10336819887161255, "memory(GiB)": 122.96, "step": 28330, "token_acc": 0.95377969762419, "train_speed(iter/s)": 0.239556 }, { "epoch": 2.159844500343014, "grad_norm": 1.1058543920516968, "learning_rate": 6.060512956231489e-05, "loss": 0.07791873812675476, "memory(GiB)": 122.96, "step": 28335, "token_acc": 0.9678374947101143, "train_speed(iter/s)": 0.239568 }, { "epoch": 2.160225626953274, "grad_norm": 0.4513099789619446, "learning_rate": 6.05934282397174e-05, "loss": 0.08736817240715027, "memory(GiB)": 122.96, "step": 28340, "token_acc": 0.9653267873580853, "train_speed(iter/s)": 0.239578 }, { "epoch": 2.160606753563534, "grad_norm": 1.246484637260437, "learning_rate": 6.0581726309635964e-05, "loss": 0.089145427942276, "memory(GiB)": 122.96, "step": 28345, "token_acc": 0.9589622641509434, "train_speed(iter/s)": 0.239589 }, { "epoch": 2.160987880173794, "grad_norm": 0.7784377336502075, "learning_rate": 6.057002377274163e-05, "loss": 0.09838126301765442, "memory(GiB)": 122.96, "step": 28350, "token_acc": 0.958687040181098, "train_speed(iter/s)": 0.239597 }, { "epoch": 2.161369006784054, "grad_norm": 0.6538731455802917, "learning_rate": 6.055832062970549e-05, "loss": 0.08941572308540344, "memory(GiB)": 122.96, "step": 28355, "token_acc": 0.966081718177056, "train_speed(iter/s)": 0.239599 }, { "epoch": 2.1617501333943134, "grad_norm": 0.9794332981109619, "learning_rate": 6.0546616881198636e-05, "loss": 0.08159719109535217, "memory(GiB)": 122.96, "step": 28360, "token_acc": 0.9687953555878084, "train_speed(iter/s)": 0.239601 }, { "epoch": 2.1621312600045735, "grad_norm": 0.5321868062019348, "learning_rate": 6.053491252789227e-05, "loss": 0.07450066804885865, "memory(GiB)": 122.96, "step": 28365, "token_acc": 0.9704845814977974, "train_speed(iter/s)": 0.23961 }, { "epoch": 2.1625123866148335, "grad_norm": 1.2130212783813477, "learning_rate": 6.052320757045754e-05, "loss": 0.10071868896484375, "memory(GiB)": 122.96, "step": 28370, "token_acc": 0.9629233511586452, "train_speed(iter/s)": 0.239624 }, { "epoch": 2.1628935132250935, "grad_norm": 1.260616421699524, "learning_rate": 6.051150200956571e-05, "loss": 0.08311924934387208, "memory(GiB)": 122.96, "step": 28375, "token_acc": 0.9671018276762402, "train_speed(iter/s)": 0.239633 }, { "epoch": 2.1632746398353535, "grad_norm": 0.6260547041893005, "learning_rate": 6.049979584588801e-05, "loss": 0.08966317772865295, "memory(GiB)": 122.96, "step": 28380, "token_acc": 0.9686459975477316, "train_speed(iter/s)": 0.239638 }, { "epoch": 2.163655766445613, "grad_norm": 1.0439443588256836, "learning_rate": 6.048808908009573e-05, "loss": 0.11280975341796876, "memory(GiB)": 122.96, "step": 28385, "token_acc": 0.9566756574511126, "train_speed(iter/s)": 0.239644 }, { "epoch": 2.164036893055873, "grad_norm": 1.173966646194458, "learning_rate": 6.047638171286023e-05, "loss": 0.1509438157081604, "memory(GiB)": 122.96, "step": 28390, "token_acc": 0.9469785575048733, "train_speed(iter/s)": 0.239652 }, { "epoch": 2.164418019666133, "grad_norm": 1.4136862754821777, "learning_rate": 6.046467374485286e-05, "loss": 0.06052640080451965, "memory(GiB)": 122.96, "step": 28395, "token_acc": 0.9754768392370572, "train_speed(iter/s)": 0.239666 }, { "epoch": 2.164799146276393, "grad_norm": 0.6411550045013428, "learning_rate": 6.045296517674499e-05, "loss": 0.1616116762161255, "memory(GiB)": 122.96, "step": 28400, "token_acc": 0.95257854179016, "train_speed(iter/s)": 0.239671 }, { "epoch": 2.164799146276393, "eval_loss": 0.08840744942426682, "eval_runtime": 221.1479, "eval_samples_per_second": 2.397, "eval_steps_per_second": 2.397, "eval_token_acc": 0.9615008132040238, "step": 28400 }, { "epoch": 2.165180272886653, "grad_norm": 0.7650735974311829, "learning_rate": 6.044125600920809e-05, "loss": 0.13469887971878053, "memory(GiB)": 122.96, "step": 28405, "token_acc": 0.9610543537538629, "train_speed(iter/s)": 0.239231 }, { "epoch": 2.1655613994969127, "grad_norm": 0.6748127341270447, "learning_rate": 6.0429546242913613e-05, "loss": 0.07637932300567626, "memory(GiB)": 122.96, "step": 28410, "token_acc": 0.9765560165975103, "train_speed(iter/s)": 0.239237 }, { "epoch": 2.1659425261071727, "grad_norm": 0.6356326937675476, "learning_rate": 6.041783587853306e-05, "loss": 0.08393945693969726, "memory(GiB)": 122.96, "step": 28415, "token_acc": 0.9694072657743786, "train_speed(iter/s)": 0.239249 }, { "epoch": 2.1663236527174328, "grad_norm": 0.976385772228241, "learning_rate": 6.040612491673795e-05, "loss": 0.09353143572807313, "memory(GiB)": 122.96, "step": 28420, "token_acc": 0.9582958295829583, "train_speed(iter/s)": 0.239258 }, { "epoch": 2.166704779327693, "grad_norm": 1.268261432647705, "learning_rate": 6.039441335819987e-05, "loss": 0.08541704416275024, "memory(GiB)": 122.96, "step": 28425, "token_acc": 0.9567836563645888, "train_speed(iter/s)": 0.239268 }, { "epoch": 2.1670859059379524, "grad_norm": 1.0949665307998657, "learning_rate": 6.0382701203590416e-05, "loss": 0.04326063394546509, "memory(GiB)": 122.96, "step": 28430, "token_acc": 0.974191063174114, "train_speed(iter/s)": 0.239279 }, { "epoch": 2.1674670325482124, "grad_norm": 0.6081709861755371, "learning_rate": 6.0370988453581246e-05, "loss": 0.06323828101158142, "memory(GiB)": 122.96, "step": 28435, "token_acc": 0.981300539083558, "train_speed(iter/s)": 0.239284 }, { "epoch": 2.1678481591584724, "grad_norm": 1.1607609987258911, "learning_rate": 6.0359275108844006e-05, "loss": 0.10040632486343384, "memory(GiB)": 122.96, "step": 28440, "token_acc": 0.9493405275779376, "train_speed(iter/s)": 0.239296 }, { "epoch": 2.1682292857687324, "grad_norm": 1.8013986349105835, "learning_rate": 6.0347561170050414e-05, "loss": 0.08583697080612182, "memory(GiB)": 122.96, "step": 28445, "token_acc": 0.9606613454960091, "train_speed(iter/s)": 0.239307 }, { "epoch": 2.1686104123789924, "grad_norm": 1.5438437461853027, "learning_rate": 6.0335846637872206e-05, "loss": 0.13214806318283082, "memory(GiB)": 122.96, "step": 28450, "token_acc": 0.9500078112794876, "train_speed(iter/s)": 0.239313 }, { "epoch": 2.1689915389892525, "grad_norm": 1.00194251537323, "learning_rate": 6.032413151298115e-05, "loss": 0.11759432554244995, "memory(GiB)": 122.96, "step": 28455, "token_acc": 0.9587020648967551, "train_speed(iter/s)": 0.239321 }, { "epoch": 2.169372665599512, "grad_norm": 1.572457194328308, "learning_rate": 6.031241579604907e-05, "loss": 0.09750629663467407, "memory(GiB)": 122.96, "step": 28460, "token_acc": 0.953340402969247, "train_speed(iter/s)": 0.239335 }, { "epoch": 2.169753792209772, "grad_norm": 0.7564902901649475, "learning_rate": 6.03006994877478e-05, "loss": 0.09577387571334839, "memory(GiB)": 122.96, "step": 28465, "token_acc": 0.9552934722984092, "train_speed(iter/s)": 0.239346 }, { "epoch": 2.170134918820032, "grad_norm": 0.9626314043998718, "learning_rate": 6.028898258874921e-05, "loss": 0.10857858657836914, "memory(GiB)": 122.96, "step": 28470, "token_acc": 0.9659006671608599, "train_speed(iter/s)": 0.239353 }, { "epoch": 2.170516045430292, "grad_norm": 0.5888820886611938, "learning_rate": 6.0277265099725225e-05, "loss": 0.072148597240448, "memory(GiB)": 122.96, "step": 28475, "token_acc": 0.9686809137803979, "train_speed(iter/s)": 0.239353 }, { "epoch": 2.1708971720405517, "grad_norm": 0.7854037880897522, "learning_rate": 6.026554702134777e-05, "loss": 0.06444537043571472, "memory(GiB)": 122.96, "step": 28480, "token_acc": 0.9755855498213577, "train_speed(iter/s)": 0.23936 }, { "epoch": 2.1712782986508117, "grad_norm": 0.6096973419189453, "learning_rate": 6.0253828354288836e-05, "loss": 0.10829472541809082, "memory(GiB)": 122.96, "step": 28485, "token_acc": 0.9637496007665283, "train_speed(iter/s)": 0.239365 }, { "epoch": 2.1716594252610717, "grad_norm": 0.6725614666938782, "learning_rate": 6.024210909922041e-05, "loss": 0.10484235286712647, "memory(GiB)": 122.96, "step": 28490, "token_acc": 0.9591280653950953, "train_speed(iter/s)": 0.239374 }, { "epoch": 2.1720405518713317, "grad_norm": 0.6311729550361633, "learning_rate": 6.023038925681458e-05, "loss": 0.0723418951034546, "memory(GiB)": 122.96, "step": 28495, "token_acc": 0.9707078925956062, "train_speed(iter/s)": 0.239384 }, { "epoch": 2.1724216784815917, "grad_norm": 0.9527662992477417, "learning_rate": 6.02186688277434e-05, "loss": 0.10140331983566284, "memory(GiB)": 122.96, "step": 28500, "token_acc": 0.9564990883042459, "train_speed(iter/s)": 0.239393 }, { "epoch": 2.1728028050918513, "grad_norm": 0.76530522108078, "learning_rate": 6.020694781267897e-05, "loss": 0.11537402868270874, "memory(GiB)": 122.96, "step": 28505, "token_acc": 0.9521008403361344, "train_speed(iter/s)": 0.2394 }, { "epoch": 2.1731839317021113, "grad_norm": 1.814502239227295, "learning_rate": 6.0195226212293454e-05, "loss": 0.08594166040420533, "memory(GiB)": 122.96, "step": 28510, "token_acc": 0.9675586076249112, "train_speed(iter/s)": 0.239409 }, { "epoch": 2.1735650583123713, "grad_norm": 0.7927237153053284, "learning_rate": 6.018350402725904e-05, "loss": 0.09713976979255676, "memory(GiB)": 122.96, "step": 28515, "token_acc": 0.960446247464503, "train_speed(iter/s)": 0.239413 }, { "epoch": 2.1739461849226314, "grad_norm": 0.6296750903129578, "learning_rate": 6.017178125824792e-05, "loss": 0.12309565544128417, "memory(GiB)": 122.96, "step": 28520, "token_acc": 0.9586448034723897, "train_speed(iter/s)": 0.239416 }, { "epoch": 2.1743273115328914, "grad_norm": 0.6733710169792175, "learning_rate": 6.0160057905932335e-05, "loss": 0.09520531892776489, "memory(GiB)": 122.96, "step": 28525, "token_acc": 0.9543279686820356, "train_speed(iter/s)": 0.23943 }, { "epoch": 2.174708438143151, "grad_norm": 0.9160845279693604, "learning_rate": 6.0148333970984596e-05, "loss": 0.10420083999633789, "memory(GiB)": 122.96, "step": 28530, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 0.239441 }, { "epoch": 2.175089564753411, "grad_norm": 0.46705806255340576, "learning_rate": 6.013660945407699e-05, "loss": 0.09287576079368591, "memory(GiB)": 122.96, "step": 28535, "token_acc": 0.9552364864864865, "train_speed(iter/s)": 0.239453 }, { "epoch": 2.175470691363671, "grad_norm": 1.444498896598816, "learning_rate": 6.012488435588186e-05, "loss": 0.10871402025222779, "memory(GiB)": 122.96, "step": 28540, "token_acc": 0.9584382871536524, "train_speed(iter/s)": 0.239467 }, { "epoch": 2.175851817973931, "grad_norm": 1.0508748292922974, "learning_rate": 6.011315867707161e-05, "loss": 0.064911949634552, "memory(GiB)": 122.96, "step": 28545, "token_acc": 0.9676627097830536, "train_speed(iter/s)": 0.239473 }, { "epoch": 2.176232944584191, "grad_norm": 1.6996991634368896, "learning_rate": 6.010143241831864e-05, "loss": 0.10519801378250122, "memory(GiB)": 122.96, "step": 28550, "token_acc": 0.9586519399603511, "train_speed(iter/s)": 0.239485 }, { "epoch": 2.1766140711944506, "grad_norm": 1.941916584968567, "learning_rate": 6.008970558029538e-05, "loss": 0.1063997745513916, "memory(GiB)": 122.96, "step": 28555, "token_acc": 0.9574119574119574, "train_speed(iter/s)": 0.239489 }, { "epoch": 2.1769951978047106, "grad_norm": 1.1384410858154297, "learning_rate": 6.007797816367434e-05, "loss": 0.09455177783966065, "memory(GiB)": 122.96, "step": 28560, "token_acc": 0.963597907157965, "train_speed(iter/s)": 0.239492 }, { "epoch": 2.1773763244149706, "grad_norm": 0.7243781685829163, "learning_rate": 6.0066250169128025e-05, "loss": 0.09158454537391662, "memory(GiB)": 122.96, "step": 28565, "token_acc": 0.9671586715867159, "train_speed(iter/s)": 0.239497 }, { "epoch": 2.1777574510252307, "grad_norm": 1.3454619646072388, "learning_rate": 6.0054521597328986e-05, "loss": 0.1299346685409546, "memory(GiB)": 122.96, "step": 28570, "token_acc": 0.951751866743251, "train_speed(iter/s)": 0.239509 }, { "epoch": 2.1781385776354907, "grad_norm": 1.0876895189285278, "learning_rate": 6.004279244894977e-05, "loss": 0.09224478006362916, "memory(GiB)": 122.96, "step": 28575, "token_acc": 0.9574116870254209, "train_speed(iter/s)": 0.23952 }, { "epoch": 2.1785197042457503, "grad_norm": 0.535778284072876, "learning_rate": 6.0031062724663024e-05, "loss": 0.08370343446731568, "memory(GiB)": 122.96, "step": 28580, "token_acc": 0.9671232876712329, "train_speed(iter/s)": 0.239525 }, { "epoch": 2.1789008308560103, "grad_norm": 0.5618976354598999, "learning_rate": 6.001933242514137e-05, "loss": 0.08695634603500366, "memory(GiB)": 122.96, "step": 28585, "token_acc": 0.9686783804430863, "train_speed(iter/s)": 0.239531 }, { "epoch": 2.1792819574662703, "grad_norm": 0.6672285199165344, "learning_rate": 6.0007601551057505e-05, "loss": 0.07080357074737549, "memory(GiB)": 122.96, "step": 28590, "token_acc": 0.9536455818445195, "train_speed(iter/s)": 0.23954 }, { "epoch": 2.1796630840765303, "grad_norm": 0.6590083837509155, "learning_rate": 5.999587010308413e-05, "loss": 0.08805898427963257, "memory(GiB)": 122.96, "step": 28595, "token_acc": 0.9664179104477612, "train_speed(iter/s)": 0.239548 }, { "epoch": 2.1800442106867903, "grad_norm": 0.08052240312099457, "learning_rate": 5.998413808189399e-05, "loss": 0.060230147838592527, "memory(GiB)": 122.96, "step": 28600, "token_acc": 0.9641008515611955, "train_speed(iter/s)": 0.239558 }, { "epoch": 2.1800442106867903, "eval_loss": 0.08790554106235504, "eval_runtime": 219.6024, "eval_samples_per_second": 2.413, "eval_steps_per_second": 2.413, "eval_token_acc": 0.9606951388470574, "step": 28600 }, { "epoch": 2.18042533729705, "grad_norm": 0.7962251901626587, "learning_rate": 5.9972405488159876e-05, "loss": 0.10768903493881225, "memory(GiB)": 122.96, "step": 28605, "token_acc": 0.9603134645272858, "train_speed(iter/s)": 0.239127 }, { "epoch": 2.18080646390731, "grad_norm": 0.16098840534687042, "learning_rate": 5.9960672322554565e-05, "loss": 0.05665057897567749, "memory(GiB)": 122.96, "step": 28610, "token_acc": 0.9772972972972973, "train_speed(iter/s)": 0.239139 }, { "epoch": 2.18118759051757, "grad_norm": 1.2436665296554565, "learning_rate": 5.9948938585750925e-05, "loss": 0.09734983444213867, "memory(GiB)": 122.96, "step": 28615, "token_acc": 0.9554670528602462, "train_speed(iter/s)": 0.239153 }, { "epoch": 2.18156871712783, "grad_norm": 1.5661016702651978, "learning_rate": 5.993720427842184e-05, "loss": 0.10554989576339721, "memory(GiB)": 122.96, "step": 28620, "token_acc": 0.9520414776409591, "train_speed(iter/s)": 0.239165 }, { "epoch": 2.18194984373809, "grad_norm": 0.8689875602722168, "learning_rate": 5.992546940124019e-05, "loss": 0.09545673131942749, "memory(GiB)": 122.96, "step": 28625, "token_acc": 0.9605305726302168, "train_speed(iter/s)": 0.239175 }, { "epoch": 2.1823309703483496, "grad_norm": 0.7754881978034973, "learning_rate": 5.9913733954878916e-05, "loss": 0.11277850866317748, "memory(GiB)": 122.96, "step": 28630, "token_acc": 0.9605488850771869, "train_speed(iter/s)": 0.239178 }, { "epoch": 2.1827120969586096, "grad_norm": 1.1624606847763062, "learning_rate": 5.9901997940011024e-05, "loss": 0.0861231803894043, "memory(GiB)": 122.96, "step": 28635, "token_acc": 0.9639261744966443, "train_speed(iter/s)": 0.239186 }, { "epoch": 2.1830932235688696, "grad_norm": 2.2545154094696045, "learning_rate": 5.989026135730951e-05, "loss": 0.0957340955734253, "memory(GiB)": 122.96, "step": 28640, "token_acc": 0.962843295638126, "train_speed(iter/s)": 0.239199 }, { "epoch": 2.1834743501791296, "grad_norm": 1.4343751668930054, "learning_rate": 5.9878524207447386e-05, "loss": 0.11221576929092407, "memory(GiB)": 122.96, "step": 28645, "token_acc": 0.9578001633542064, "train_speed(iter/s)": 0.239211 }, { "epoch": 2.1838554767893896, "grad_norm": 0.37737441062927246, "learning_rate": 5.9866786491097745e-05, "loss": 0.09704537391662597, "memory(GiB)": 122.96, "step": 28650, "token_acc": 0.9512676983865657, "train_speed(iter/s)": 0.239223 }, { "epoch": 2.184236603399649, "grad_norm": 1.5277860164642334, "learning_rate": 5.985504820893369e-05, "loss": 0.07541357278823853, "memory(GiB)": 122.96, "step": 28655, "token_acc": 0.9706870229007634, "train_speed(iter/s)": 0.239225 }, { "epoch": 2.184617730009909, "grad_norm": 0.8790802359580994, "learning_rate": 5.984330936162834e-05, "loss": 0.08938062191009521, "memory(GiB)": 122.96, "step": 28660, "token_acc": 0.9562545191612437, "train_speed(iter/s)": 0.239239 }, { "epoch": 2.1849988566201692, "grad_norm": 1.5419442653656006, "learning_rate": 5.9831569949854893e-05, "loss": 0.11099369525909424, "memory(GiB)": 122.96, "step": 28665, "token_acc": 0.9613817537643933, "train_speed(iter/s)": 0.239244 }, { "epoch": 2.1853799832304293, "grad_norm": 1.3076199293136597, "learning_rate": 5.9819829974286534e-05, "loss": 0.08465604782104492, "memory(GiB)": 122.96, "step": 28670, "token_acc": 0.9668310937234224, "train_speed(iter/s)": 0.23925 }, { "epoch": 2.1857611098406893, "grad_norm": 0.6575175523757935, "learning_rate": 5.980808943559648e-05, "loss": 0.09034868478775024, "memory(GiB)": 122.96, "step": 28675, "token_acc": 0.9593665969524948, "train_speed(iter/s)": 0.239255 }, { "epoch": 2.186142236450949, "grad_norm": 0.782276451587677, "learning_rate": 5.979634833445803e-05, "loss": 0.08056789636611938, "memory(GiB)": 122.96, "step": 28680, "token_acc": 0.9655859445752581, "train_speed(iter/s)": 0.239262 }, { "epoch": 2.186523363061209, "grad_norm": 0.22168083488941193, "learning_rate": 5.978460667154445e-05, "loss": 0.09123271107673644, "memory(GiB)": 122.96, "step": 28685, "token_acc": 0.9540559942569993, "train_speed(iter/s)": 0.239277 }, { "epoch": 2.186904489671469, "grad_norm": 0.7801160216331482, "learning_rate": 5.9772864447529085e-05, "loss": 0.10322872400283814, "memory(GiB)": 122.96, "step": 28690, "token_acc": 0.960863425458947, "train_speed(iter/s)": 0.239284 }, { "epoch": 2.187285616281729, "grad_norm": 0.8688017725944519, "learning_rate": 5.976112166308529e-05, "loss": 0.10132591724395752, "memory(GiB)": 122.96, "step": 28695, "token_acc": 0.9524026072457178, "train_speed(iter/s)": 0.239289 }, { "epoch": 2.187666742891989, "grad_norm": 1.526847243309021, "learning_rate": 5.9749378318886486e-05, "loss": 0.08846145868301392, "memory(GiB)": 122.96, "step": 28700, "token_acc": 0.9617850098619329, "train_speed(iter/s)": 0.239299 }, { "epoch": 2.1880478695022485, "grad_norm": 0.688266396522522, "learning_rate": 5.973763441560607e-05, "loss": 0.09201809167861938, "memory(GiB)": 122.96, "step": 28705, "token_acc": 0.9595736861447997, "train_speed(iter/s)": 0.239302 }, { "epoch": 2.1884289961125085, "grad_norm": 0.7336198091506958, "learning_rate": 5.97258899539175e-05, "loss": 0.09969985485076904, "memory(GiB)": 122.96, "step": 28710, "token_acc": 0.9553208773354996, "train_speed(iter/s)": 0.23931 }, { "epoch": 2.1888101227227685, "grad_norm": 0.608311116695404, "learning_rate": 5.9714144934494275e-05, "loss": 0.12818397283554078, "memory(GiB)": 122.96, "step": 28715, "token_acc": 0.9570088587806149, "train_speed(iter/s)": 0.239318 }, { "epoch": 2.1891912493330286, "grad_norm": 0.3513854444026947, "learning_rate": 5.970239935800993e-05, "loss": 0.06647705435752868, "memory(GiB)": 122.96, "step": 28720, "token_acc": 0.9781453041937389, "train_speed(iter/s)": 0.239327 }, { "epoch": 2.189572375943288, "grad_norm": 0.586907148361206, "learning_rate": 5.9690653225137995e-05, "loss": 0.07961885333061218, "memory(GiB)": 122.96, "step": 28725, "token_acc": 0.9724491734752042, "train_speed(iter/s)": 0.239335 }, { "epoch": 2.189953502553548, "grad_norm": 0.7322615385055542, "learning_rate": 5.967890653655207e-05, "loss": 0.07993540763854981, "memory(GiB)": 122.96, "step": 28730, "token_acc": 0.960809928151535, "train_speed(iter/s)": 0.239346 }, { "epoch": 2.190334629163808, "grad_norm": 0.5982035398483276, "learning_rate": 5.966715929292577e-05, "loss": 0.12689443826675414, "memory(GiB)": 122.96, "step": 28735, "token_acc": 0.9593606068816039, "train_speed(iter/s)": 0.239355 }, { "epoch": 2.190715755774068, "grad_norm": 1.250866174697876, "learning_rate": 5.965541149493276e-05, "loss": 0.08027002811431885, "memory(GiB)": 122.96, "step": 28740, "token_acc": 0.9619068350021768, "train_speed(iter/s)": 0.239365 }, { "epoch": 2.191096882384328, "grad_norm": 2.6117444038391113, "learning_rate": 5.964366314324667e-05, "loss": 0.1620384931564331, "memory(GiB)": 122.96, "step": 28745, "token_acc": 0.9372188139059304, "train_speed(iter/s)": 0.239376 }, { "epoch": 2.1914780089945882, "grad_norm": 0.7443350553512573, "learning_rate": 5.963191423854129e-05, "loss": 0.07109590768814086, "memory(GiB)": 122.96, "step": 28750, "token_acc": 0.9684850018984938, "train_speed(iter/s)": 0.239378 }, { "epoch": 2.191859135604848, "grad_norm": 1.21170973777771, "learning_rate": 5.962016478149031e-05, "loss": 0.10468604564666747, "memory(GiB)": 122.96, "step": 28755, "token_acc": 0.9562951082598236, "train_speed(iter/s)": 0.239391 }, { "epoch": 2.192240262215108, "grad_norm": 2.0057857036590576, "learning_rate": 5.960841477276752e-05, "loss": 0.16838459968566893, "memory(GiB)": 122.96, "step": 28760, "token_acc": 0.9501067535678166, "train_speed(iter/s)": 0.239391 }, { "epoch": 2.192621388825368, "grad_norm": 1.6185537576675415, "learning_rate": 5.959666421304673e-05, "loss": 0.1333222508430481, "memory(GiB)": 122.96, "step": 28765, "token_acc": 0.9585311016737447, "train_speed(iter/s)": 0.239393 }, { "epoch": 2.193002515435628, "grad_norm": 0.7884822487831116, "learning_rate": 5.958491310300178e-05, "loss": 0.07670620083808899, "memory(GiB)": 122.96, "step": 28770, "token_acc": 0.9659090909090909, "train_speed(iter/s)": 0.239403 }, { "epoch": 2.1933836420458874, "grad_norm": 0.8232865929603577, "learning_rate": 5.957316144330656e-05, "loss": 0.0726938545703888, "memory(GiB)": 122.96, "step": 28775, "token_acc": 0.9582595497090817, "train_speed(iter/s)": 0.239414 }, { "epoch": 2.1937647686561474, "grad_norm": 0.09501532465219498, "learning_rate": 5.9561409234634936e-05, "loss": 0.07519101500511169, "memory(GiB)": 122.96, "step": 28780, "token_acc": 0.9756171898811338, "train_speed(iter/s)": 0.239422 }, { "epoch": 2.1941458952664075, "grad_norm": 0.9810920357704163, "learning_rate": 5.9549656477660876e-05, "loss": 0.07845911383628845, "memory(GiB)": 122.96, "step": 28785, "token_acc": 0.9672173799921946, "train_speed(iter/s)": 0.239423 }, { "epoch": 2.1945270218766675, "grad_norm": 1.9384510517120361, "learning_rate": 5.953790317305834e-05, "loss": 0.1578362464904785, "memory(GiB)": 122.96, "step": 28790, "token_acc": 0.944672131147541, "train_speed(iter/s)": 0.239436 }, { "epoch": 2.1949081484869275, "grad_norm": 1.311988115310669, "learning_rate": 5.95261493215013e-05, "loss": 0.1248600959777832, "memory(GiB)": 122.96, "step": 28795, "token_acc": 0.9531353135313532, "train_speed(iter/s)": 0.239445 }, { "epoch": 2.1952892750971875, "grad_norm": 1.1928349733352661, "learning_rate": 5.9514394923663805e-05, "loss": 0.09211788773536682, "memory(GiB)": 122.96, "step": 28800, "token_acc": 0.9707927677329624, "train_speed(iter/s)": 0.239451 }, { "epoch": 2.1952892750971875, "eval_loss": 0.08703914284706116, "eval_runtime": 217.1105, "eval_samples_per_second": 2.441, "eval_steps_per_second": 2.441, "eval_token_acc": 0.9611920968616349, "step": 28800 }, { "epoch": 2.195670401707447, "grad_norm": 0.9304694533348083, "learning_rate": 5.950263998021992e-05, "loss": 0.0922511339187622, "memory(GiB)": 122.96, "step": 28805, "token_acc": 0.961218547780566, "train_speed(iter/s)": 0.239034 }, { "epoch": 2.196051528317707, "grad_norm": 0.8685370683670044, "learning_rate": 5.9490884491843734e-05, "loss": 0.09206722378730774, "memory(GiB)": 122.96, "step": 28810, "token_acc": 0.9637423114276464, "train_speed(iter/s)": 0.239045 }, { "epoch": 2.196432654927967, "grad_norm": 0.904920220375061, "learning_rate": 5.947912845920935e-05, "loss": 0.08852599859237671, "memory(GiB)": 122.96, "step": 28815, "token_acc": 0.9669728356458862, "train_speed(iter/s)": 0.239051 }, { "epoch": 2.196813781538227, "grad_norm": 1.7651609182357788, "learning_rate": 5.9467371882990966e-05, "loss": 0.09556234478950501, "memory(GiB)": 122.96, "step": 28820, "token_acc": 0.9678571428571429, "train_speed(iter/s)": 0.239066 }, { "epoch": 2.1971949081484867, "grad_norm": 1.6504875421524048, "learning_rate": 5.94556147638627e-05, "loss": 0.13407727479934692, "memory(GiB)": 122.96, "step": 28825, "token_acc": 0.9517058041648205, "train_speed(iter/s)": 0.239073 }, { "epoch": 2.1975760347587467, "grad_norm": 1.3088833093643188, "learning_rate": 5.944385710249884e-05, "loss": 0.10677444934844971, "memory(GiB)": 122.96, "step": 28830, "token_acc": 0.9680209698558322, "train_speed(iter/s)": 0.239081 }, { "epoch": 2.1979571613690068, "grad_norm": 0.928633451461792, "learning_rate": 5.943209889957357e-05, "loss": 0.13191263675689696, "memory(GiB)": 122.96, "step": 28835, "token_acc": 0.9454478437154441, "train_speed(iter/s)": 0.239088 }, { "epoch": 2.198338287979267, "grad_norm": 0.6400498151779175, "learning_rate": 5.942034015576122e-05, "loss": 0.09703855514526367, "memory(GiB)": 122.96, "step": 28840, "token_acc": 0.9489944622559021, "train_speed(iter/s)": 0.2391 }, { "epoch": 2.198719414589527, "grad_norm": 0.8739282488822937, "learning_rate": 5.9408580871736085e-05, "loss": 0.11858887672424316, "memory(GiB)": 122.96, "step": 28845, "token_acc": 0.9597855227882037, "train_speed(iter/s)": 0.239104 }, { "epoch": 2.1991005411997864, "grad_norm": 0.85493004322052, "learning_rate": 5.939682104817248e-05, "loss": 0.09487816095352172, "memory(GiB)": 122.96, "step": 28850, "token_acc": 0.9676258992805755, "train_speed(iter/s)": 0.239112 }, { "epoch": 2.1994816678100464, "grad_norm": 0.5722710490226746, "learning_rate": 5.9385060685744795e-05, "loss": 0.0870305359363556, "memory(GiB)": 122.96, "step": 28855, "token_acc": 0.9614604462474645, "train_speed(iter/s)": 0.239118 }, { "epoch": 2.1998627944203064, "grad_norm": 1.2472933530807495, "learning_rate": 5.937329978512744e-05, "loss": 0.11022617816925048, "memory(GiB)": 122.96, "step": 28860, "token_acc": 0.9604963805584281, "train_speed(iter/s)": 0.239124 }, { "epoch": 2.2002439210305664, "grad_norm": 0.8768842220306396, "learning_rate": 5.936153834699484e-05, "loss": 0.10049896240234375, "memory(GiB)": 122.96, "step": 28865, "token_acc": 0.960668633235005, "train_speed(iter/s)": 0.239136 }, { "epoch": 2.2006250476408264, "grad_norm": 1.1748061180114746, "learning_rate": 5.934977637202145e-05, "loss": 0.10433646440505981, "memory(GiB)": 122.96, "step": 28870, "token_acc": 0.9622199062011464, "train_speed(iter/s)": 0.239147 }, { "epoch": 2.201006174251086, "grad_norm": 2.416335105895996, "learning_rate": 5.933801386088178e-05, "loss": 0.14414005279541015, "memory(GiB)": 122.96, "step": 28875, "token_acc": 0.9518972786508241, "train_speed(iter/s)": 0.239153 }, { "epoch": 2.201387300861346, "grad_norm": 0.973145067691803, "learning_rate": 5.9326250814250365e-05, "loss": 0.07905175685882568, "memory(GiB)": 122.96, "step": 28880, "token_acc": 0.9729431253451132, "train_speed(iter/s)": 0.239162 }, { "epoch": 2.201768427471606, "grad_norm": 0.48518306016921997, "learning_rate": 5.931448723280171e-05, "loss": 0.08148943781852722, "memory(GiB)": 122.96, "step": 28885, "token_acc": 0.965491030708422, "train_speed(iter/s)": 0.239149 }, { "epoch": 2.202149554081866, "grad_norm": 0.6291018724441528, "learning_rate": 5.930272311721045e-05, "loss": 0.11322218179702759, "memory(GiB)": 122.96, "step": 28890, "token_acc": 0.9562638991845812, "train_speed(iter/s)": 0.239157 }, { "epoch": 2.202530680692126, "grad_norm": 1.0008366107940674, "learning_rate": 5.929095846815119e-05, "loss": 0.10402286052703857, "memory(GiB)": 122.96, "step": 28895, "token_acc": 0.9523690147906744, "train_speed(iter/s)": 0.239167 }, { "epoch": 2.2029118073023857, "grad_norm": 0.4491863548755646, "learning_rate": 5.927919328629859e-05, "loss": 0.08090718388557434, "memory(GiB)": 122.96, "step": 28900, "token_acc": 0.96474891918856, "train_speed(iter/s)": 0.23918 }, { "epoch": 2.2032929339126457, "grad_norm": 1.2019726037979126, "learning_rate": 5.92674275723273e-05, "loss": 0.09680279493331909, "memory(GiB)": 122.96, "step": 28905, "token_acc": 0.9624957381520627, "train_speed(iter/s)": 0.239193 }, { "epoch": 2.2036740605229057, "grad_norm": 1.344130277633667, "learning_rate": 5.925566132691205e-05, "loss": 0.1089707374572754, "memory(GiB)": 122.96, "step": 28910, "token_acc": 0.9671232876712329, "train_speed(iter/s)": 0.239203 }, { "epoch": 2.2040551871331657, "grad_norm": 0.7479249835014343, "learning_rate": 5.924389455072759e-05, "loss": 0.06621025800704956, "memory(GiB)": 122.96, "step": 28915, "token_acc": 0.9733582216188699, "train_speed(iter/s)": 0.239209 }, { "epoch": 2.2044363137434257, "grad_norm": 0.5978522896766663, "learning_rate": 5.9232127244448645e-05, "loss": 0.07215047478675843, "memory(GiB)": 122.96, "step": 28920, "token_acc": 0.9726632836798167, "train_speed(iter/s)": 0.239214 }, { "epoch": 2.2048174403536853, "grad_norm": 1.9057236909866333, "learning_rate": 5.922035940875006e-05, "loss": 0.10894174575805664, "memory(GiB)": 122.96, "step": 28925, "token_acc": 0.9701058201058201, "train_speed(iter/s)": 0.239222 }, { "epoch": 2.2051985669639453, "grad_norm": 0.7570027709007263, "learning_rate": 5.9208591044306663e-05, "loss": 0.07306674718856812, "memory(GiB)": 122.96, "step": 28930, "token_acc": 0.9726716222125055, "train_speed(iter/s)": 0.239231 }, { "epoch": 2.2055796935742054, "grad_norm": 2.1561875343322754, "learning_rate": 5.9196822151793273e-05, "loss": 0.12193996906280517, "memory(GiB)": 122.96, "step": 28935, "token_acc": 0.9548654244306418, "train_speed(iter/s)": 0.239244 }, { "epoch": 2.2059608201844654, "grad_norm": 0.7276331782341003, "learning_rate": 5.918505273188484e-05, "loss": 0.099605792760849, "memory(GiB)": 122.96, "step": 28940, "token_acc": 0.9523115368596418, "train_speed(iter/s)": 0.239254 }, { "epoch": 2.2063419467947254, "grad_norm": 0.6190464496612549, "learning_rate": 5.917328278525625e-05, "loss": 0.08943166732788085, "memory(GiB)": 122.96, "step": 28945, "token_acc": 0.969390402075227, "train_speed(iter/s)": 0.239265 }, { "epoch": 2.206723073404985, "grad_norm": 1.7998408079147339, "learning_rate": 5.9161512312582444e-05, "loss": 0.10294674634933472, "memory(GiB)": 122.96, "step": 28950, "token_acc": 0.9537839399191219, "train_speed(iter/s)": 0.239275 }, { "epoch": 2.207104200015245, "grad_norm": 0.6933124661445618, "learning_rate": 5.914974131453844e-05, "loss": 0.15494229793548583, "memory(GiB)": 122.96, "step": 28955, "token_acc": 0.9455933037912359, "train_speed(iter/s)": 0.239287 }, { "epoch": 2.207485326625505, "grad_norm": 0.6874995827674866, "learning_rate": 5.913796979179922e-05, "loss": 0.0863486647605896, "memory(GiB)": 122.96, "step": 28960, "token_acc": 0.9669353172143005, "train_speed(iter/s)": 0.239295 }, { "epoch": 2.207866453235765, "grad_norm": 1.1348522901535034, "learning_rate": 5.912619774503985e-05, "loss": 0.1489327669143677, "memory(GiB)": 122.96, "step": 28965, "token_acc": 0.9477343562694798, "train_speed(iter/s)": 0.239307 }, { "epoch": 2.208247579846025, "grad_norm": 0.8211100697517395, "learning_rate": 5.911442517493537e-05, "loss": 0.07432869076728821, "memory(GiB)": 122.96, "step": 28970, "token_acc": 0.9664138678223185, "train_speed(iter/s)": 0.239311 }, { "epoch": 2.2086287064562846, "grad_norm": 0.624995231628418, "learning_rate": 5.9102652082160916e-05, "loss": 0.13534436225891114, "memory(GiB)": 122.96, "step": 28975, "token_acc": 0.9496958737465067, "train_speed(iter/s)": 0.239317 }, { "epoch": 2.2090098330665446, "grad_norm": 1.7683682441711426, "learning_rate": 5.909087846739161e-05, "loss": 0.10982645750045776, "memory(GiB)": 122.96, "step": 28980, "token_acc": 0.960603371783496, "train_speed(iter/s)": 0.239321 }, { "epoch": 2.2093909596768047, "grad_norm": 0.6065993309020996, "learning_rate": 5.9079104331302605e-05, "loss": 0.06256929039955139, "memory(GiB)": 122.96, "step": 28985, "token_acc": 0.9672645739910314, "train_speed(iter/s)": 0.239332 }, { "epoch": 2.2097720862870647, "grad_norm": 0.6746577620506287, "learning_rate": 5.9067329674569106e-05, "loss": 0.06942579746246338, "memory(GiB)": 122.96, "step": 28990, "token_acc": 0.9718334809565987, "train_speed(iter/s)": 0.239337 }, { "epoch": 2.2101532128973247, "grad_norm": 0.5669054388999939, "learning_rate": 5.9055554497866314e-05, "loss": 0.10228986740112304, "memory(GiB)": 122.96, "step": 28995, "token_acc": 0.9576202118989405, "train_speed(iter/s)": 0.239345 }, { "epoch": 2.2105343395075843, "grad_norm": 1.609857201576233, "learning_rate": 5.904377880186951e-05, "loss": 0.13702352046966554, "memory(GiB)": 122.96, "step": 29000, "token_acc": 0.9426079902656662, "train_speed(iter/s)": 0.239354 }, { "epoch": 2.2105343395075843, "eval_loss": 0.08696001023054123, "eval_runtime": 218.7979, "eval_samples_per_second": 2.422, "eval_steps_per_second": 2.422, "eval_token_acc": 0.9605746641768568, "step": 29000 }, { "epoch": 2.2109154661178443, "grad_norm": 0.7371930480003357, "learning_rate": 5.903200258725395e-05, "loss": 0.12809984683990477, "memory(GiB)": 122.96, "step": 29005, "token_acc": 0.9603543516809083, "train_speed(iter/s)": 0.23893 }, { "epoch": 2.2112965927281043, "grad_norm": 1.5786106586456299, "learning_rate": 5.9020225854694966e-05, "loss": 0.104678213596344, "memory(GiB)": 122.96, "step": 29010, "token_acc": 0.9629032258064516, "train_speed(iter/s)": 0.238938 }, { "epoch": 2.2116777193383643, "grad_norm": 1.012172818183899, "learning_rate": 5.900844860486788e-05, "loss": 0.07319798469543456, "memory(GiB)": 122.96, "step": 29015, "token_acc": 0.9644985747603005, "train_speed(iter/s)": 0.238947 }, { "epoch": 2.212058845948624, "grad_norm": 1.2177917957305908, "learning_rate": 5.899667083844807e-05, "loss": 0.08102163672447205, "memory(GiB)": 122.96, "step": 29020, "token_acc": 0.9735276259866423, "train_speed(iter/s)": 0.238949 }, { "epoch": 2.212439972558884, "grad_norm": 0.9177025556564331, "learning_rate": 5.8984892556110926e-05, "loss": 0.176019024848938, "memory(GiB)": 122.96, "step": 29025, "token_acc": 0.9409992467988954, "train_speed(iter/s)": 0.238958 }, { "epoch": 2.212821099169144, "grad_norm": 0.3995053172111511, "learning_rate": 5.8973113758531904e-05, "loss": 0.0647564947605133, "memory(GiB)": 122.96, "step": 29030, "token_acc": 0.9712281963675597, "train_speed(iter/s)": 0.238962 }, { "epoch": 2.213202225779404, "grad_norm": 1.8051702976226807, "learning_rate": 5.896133444638645e-05, "loss": 0.10192911624908448, "memory(GiB)": 122.96, "step": 29035, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.238974 }, { "epoch": 2.213583352389664, "grad_norm": 1.2736706733703613, "learning_rate": 5.8949554620350034e-05, "loss": 0.09819488525390625, "memory(GiB)": 122.96, "step": 29040, "token_acc": 0.9492521367521367, "train_speed(iter/s)": 0.238984 }, { "epoch": 2.213964478999924, "grad_norm": 1.540177822113037, "learning_rate": 5.893777428109821e-05, "loss": 0.11246720552444459, "memory(GiB)": 122.96, "step": 29045, "token_acc": 0.9624829467939973, "train_speed(iter/s)": 0.238991 }, { "epoch": 2.2143456056101836, "grad_norm": 1.0434714555740356, "learning_rate": 5.8925993429306505e-05, "loss": 0.10113387107849121, "memory(GiB)": 122.96, "step": 29050, "token_acc": 0.9563877330867622, "train_speed(iter/s)": 0.238996 }, { "epoch": 2.2147267322204436, "grad_norm": 0.7075568437576294, "learning_rate": 5.8914212065650487e-05, "loss": 0.08273571729660034, "memory(GiB)": 122.96, "step": 29055, "token_acc": 0.9725103734439834, "train_speed(iter/s)": 0.239004 }, { "epoch": 2.2151078588307036, "grad_norm": 0.6343828439712524, "learning_rate": 5.890243019080579e-05, "loss": 0.12945072650909423, "memory(GiB)": 122.96, "step": 29060, "token_acc": 0.9515753530963836, "train_speed(iter/s)": 0.239009 }, { "epoch": 2.2154889854409636, "grad_norm": 0.7775021195411682, "learning_rate": 5.889064780544803e-05, "loss": 0.09086321592330933, "memory(GiB)": 122.96, "step": 29065, "token_acc": 0.9641818181818181, "train_speed(iter/s)": 0.239017 }, { "epoch": 2.215870112051223, "grad_norm": 0.7064670324325562, "learning_rate": 5.887886491025286e-05, "loss": 0.09379048943519593, "memory(GiB)": 122.96, "step": 29070, "token_acc": 0.9577564785232516, "train_speed(iter/s)": 0.239022 }, { "epoch": 2.216251238661483, "grad_norm": 0.5275242328643799, "learning_rate": 5.8867081505896004e-05, "loss": 0.0943835735321045, "memory(GiB)": 122.96, "step": 29075, "token_acc": 0.9699609492340042, "train_speed(iter/s)": 0.239033 }, { "epoch": 2.2166323652717432, "grad_norm": 0.5650860667228699, "learning_rate": 5.885529759305317e-05, "loss": 0.09944668412208557, "memory(GiB)": 122.96, "step": 29080, "token_acc": 0.9637033886289522, "train_speed(iter/s)": 0.239037 }, { "epoch": 2.2170134918820033, "grad_norm": 0.8123016953468323, "learning_rate": 5.884351317240012e-05, "loss": 0.10056686401367188, "memory(GiB)": 122.96, "step": 29085, "token_acc": 0.9569313593539704, "train_speed(iter/s)": 0.239044 }, { "epoch": 2.2173946184922633, "grad_norm": 0.7849647998809814, "learning_rate": 5.8831728244612616e-05, "loss": 0.09735267162322998, "memory(GiB)": 122.96, "step": 29090, "token_acc": 0.9625462392108508, "train_speed(iter/s)": 0.239047 }, { "epoch": 2.2177757451025233, "grad_norm": 0.5093734264373779, "learning_rate": 5.881994281036648e-05, "loss": 0.08406122922897338, "memory(GiB)": 122.96, "step": 29095, "token_acc": 0.9726290516206483, "train_speed(iter/s)": 0.23905 }, { "epoch": 2.218156871712783, "grad_norm": 0.5452955961227417, "learning_rate": 5.880815687033756e-05, "loss": 0.08209398984909058, "memory(GiB)": 122.96, "step": 29100, "token_acc": 0.9652703186537773, "train_speed(iter/s)": 0.239061 }, { "epoch": 2.218537998323043, "grad_norm": 0.9549239873886108, "learning_rate": 5.879637042520172e-05, "loss": 0.125529146194458, "memory(GiB)": 122.96, "step": 29105, "token_acc": 0.9460925039872409, "train_speed(iter/s)": 0.239069 }, { "epoch": 2.218919124933303, "grad_norm": 1.3914787769317627, "learning_rate": 5.8784583475634844e-05, "loss": 0.08892745971679687, "memory(GiB)": 122.96, "step": 29110, "token_acc": 0.9661182375906302, "train_speed(iter/s)": 0.239075 }, { "epoch": 2.219300251543563, "grad_norm": 0.7880571484565735, "learning_rate": 5.877279602231289e-05, "loss": 0.12273286581039429, "memory(GiB)": 122.96, "step": 29115, "token_acc": 0.9468332980300783, "train_speed(iter/s)": 0.239083 }, { "epoch": 2.2196813781538225, "grad_norm": 1.160551905632019, "learning_rate": 5.876100806591179e-05, "loss": 0.0924648642539978, "memory(GiB)": 122.96, "step": 29120, "token_acc": 0.9660527266161069, "train_speed(iter/s)": 0.239093 }, { "epoch": 2.2200625047640825, "grad_norm": 0.9735986590385437, "learning_rate": 5.874921960710753e-05, "loss": 0.08272572755813598, "memory(GiB)": 122.96, "step": 29125, "token_acc": 0.969381652912331, "train_speed(iter/s)": 0.239094 }, { "epoch": 2.2204436313743425, "grad_norm": 0.46535831689834595, "learning_rate": 5.873743064657613e-05, "loss": 0.09919875860214233, "memory(GiB)": 122.96, "step": 29130, "token_acc": 0.9563742123121668, "train_speed(iter/s)": 0.239105 }, { "epoch": 2.2208247579846025, "grad_norm": 0.9705584645271301, "learning_rate": 5.8725641184993627e-05, "loss": 0.08847461342811584, "memory(GiB)": 122.96, "step": 29135, "token_acc": 0.9638646847770375, "train_speed(iter/s)": 0.239113 }, { "epoch": 2.2212058845948626, "grad_norm": 0.8367186188697815, "learning_rate": 5.871385122303609e-05, "loss": 0.09746550917625427, "memory(GiB)": 122.96, "step": 29140, "token_acc": 0.9596662030598053, "train_speed(iter/s)": 0.239125 }, { "epoch": 2.221587011205122, "grad_norm": 1.1570125818252563, "learning_rate": 5.8702060761379626e-05, "loss": 0.10395164489746093, "memory(GiB)": 122.96, "step": 29145, "token_acc": 0.959551325628824, "train_speed(iter/s)": 0.239137 }, { "epoch": 2.221968137815382, "grad_norm": 0.8230816125869751, "learning_rate": 5.869026980070036e-05, "loss": 0.10961235761642456, "memory(GiB)": 122.96, "step": 29150, "token_acc": 0.9565070802427512, "train_speed(iter/s)": 0.239148 }, { "epoch": 2.222349264425642, "grad_norm": 1.1831822395324707, "learning_rate": 5.867847834167444e-05, "loss": 0.08497329950332641, "memory(GiB)": 122.96, "step": 29155, "token_acc": 0.96991123815255, "train_speed(iter/s)": 0.239151 }, { "epoch": 2.222730391035902, "grad_norm": 1.070433259010315, "learning_rate": 5.866668638497807e-05, "loss": 0.10299686193466187, "memory(GiB)": 122.96, "step": 29160, "token_acc": 0.9530430818326875, "train_speed(iter/s)": 0.239159 }, { "epoch": 2.223111517646162, "grad_norm": 1.0181251764297485, "learning_rate": 5.8654893931287444e-05, "loss": 0.06437152624130249, "memory(GiB)": 122.96, "step": 29165, "token_acc": 0.9677320691237192, "train_speed(iter/s)": 0.239162 }, { "epoch": 2.223492644256422, "grad_norm": 1.321587085723877, "learning_rate": 5.8643100981278834e-05, "loss": 0.13958818912506105, "memory(GiB)": 122.96, "step": 29170, "token_acc": 0.9440364649278298, "train_speed(iter/s)": 0.239171 }, { "epoch": 2.223873770866682, "grad_norm": 0.9863234162330627, "learning_rate": 5.863130753562847e-05, "loss": 0.10321824550628662, "memory(GiB)": 122.96, "step": 29175, "token_acc": 0.9586347908147216, "train_speed(iter/s)": 0.239173 }, { "epoch": 2.224254897476942, "grad_norm": 0.7619325518608093, "learning_rate": 5.8619513595012686e-05, "loss": 0.07746029496192933, "memory(GiB)": 122.96, "step": 29180, "token_acc": 0.9579344357412243, "train_speed(iter/s)": 0.239182 }, { "epoch": 2.224636024087202, "grad_norm": 0.7200804948806763, "learning_rate": 5.86077191601078e-05, "loss": 0.10979831218719482, "memory(GiB)": 122.96, "step": 29185, "token_acc": 0.9593628088426528, "train_speed(iter/s)": 0.239189 }, { "epoch": 2.225017150697462, "grad_norm": 1.650307536125183, "learning_rate": 5.859592423159016e-05, "loss": 0.09587050676345825, "memory(GiB)": 122.96, "step": 29190, "token_acc": 0.964327714621717, "train_speed(iter/s)": 0.239191 }, { "epoch": 2.2253982773077214, "grad_norm": 1.3075900077819824, "learning_rate": 5.8584128810136155e-05, "loss": 0.11553184986114502, "memory(GiB)": 122.96, "step": 29195, "token_acc": 0.951162336393827, "train_speed(iter/s)": 0.239198 }, { "epoch": 2.2257794039179815, "grad_norm": 0.6939677000045776, "learning_rate": 5.857233289642219e-05, "loss": 0.08010249137878418, "memory(GiB)": 122.96, "step": 29200, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.239198 }, { "epoch": 2.2257794039179815, "eval_loss": 0.08711579442024231, "eval_runtime": 218.3058, "eval_samples_per_second": 2.428, "eval_steps_per_second": 2.428, "eval_token_acc": 0.9612372748629601, "step": 29200 }, { "epoch": 2.2261605305282415, "grad_norm": 1.1321016550064087, "learning_rate": 5.856053649112473e-05, "loss": 0.12472519874572754, "memory(GiB)": 122.96, "step": 29205, "token_acc": 0.9606788079470199, "train_speed(iter/s)": 0.238782 }, { "epoch": 2.2265416571385015, "grad_norm": 0.8853023052215576, "learning_rate": 5.854873959492021e-05, "loss": 0.0946509599685669, "memory(GiB)": 122.96, "step": 29210, "token_acc": 0.9630091438071487, "train_speed(iter/s)": 0.238788 }, { "epoch": 2.2269227837487615, "grad_norm": 0.4898860454559326, "learning_rate": 5.853694220848516e-05, "loss": 0.041424742341041564, "memory(GiB)": 122.96, "step": 29215, "token_acc": 0.9836878507112097, "train_speed(iter/s)": 0.238789 }, { "epoch": 2.227303910359021, "grad_norm": 0.9387263059616089, "learning_rate": 5.852514433249608e-05, "loss": 0.06566023230552673, "memory(GiB)": 122.96, "step": 29220, "token_acc": 0.9725897920604915, "train_speed(iter/s)": 0.238797 }, { "epoch": 2.227685036969281, "grad_norm": 1.6788650751113892, "learning_rate": 5.851334596762954e-05, "loss": 0.09921015501022339, "memory(GiB)": 122.96, "step": 29225, "token_acc": 0.9685201529861724, "train_speed(iter/s)": 0.238808 }, { "epoch": 2.228066163579541, "grad_norm": 1.2840332984924316, "learning_rate": 5.8501547114562105e-05, "loss": 0.07443768382072449, "memory(GiB)": 122.96, "step": 29230, "token_acc": 0.9713125491223474, "train_speed(iter/s)": 0.238809 }, { "epoch": 2.228447290189801, "grad_norm": 1.2149189710617065, "learning_rate": 5.8489747773970405e-05, "loss": 0.08804042339324951, "memory(GiB)": 122.96, "step": 29235, "token_acc": 0.9596330275229358, "train_speed(iter/s)": 0.238818 }, { "epoch": 2.228828416800061, "grad_norm": 1.1432230472564697, "learning_rate": 5.8477947946531075e-05, "loss": 0.07277494668960571, "memory(GiB)": 122.96, "step": 29240, "token_acc": 0.9690038123653241, "train_speed(iter/s)": 0.238825 }, { "epoch": 2.2292095434103207, "grad_norm": 0.5275107622146606, "learning_rate": 5.846614763292075e-05, "loss": 0.11627265214920043, "memory(GiB)": 122.96, "step": 29245, "token_acc": 0.9636697247706422, "train_speed(iter/s)": 0.238831 }, { "epoch": 2.2295906700205808, "grad_norm": 0.6106123924255371, "learning_rate": 5.845434683381618e-05, "loss": 0.06221815347671509, "memory(GiB)": 122.96, "step": 29250, "token_acc": 0.9716252868766951, "train_speed(iter/s)": 0.23884 }, { "epoch": 2.2299717966308408, "grad_norm": 0.7409005761146545, "learning_rate": 5.844254554989403e-05, "loss": 0.058583295345306395, "memory(GiB)": 122.96, "step": 29255, "token_acc": 0.9596882558452029, "train_speed(iter/s)": 0.238851 }, { "epoch": 2.230352923241101, "grad_norm": 2.711928367614746, "learning_rate": 5.843074378183107e-05, "loss": 0.11682937145233155, "memory(GiB)": 122.96, "step": 29260, "token_acc": 0.9497163486333161, "train_speed(iter/s)": 0.238861 }, { "epoch": 2.230734049851361, "grad_norm": 1.1495715379714966, "learning_rate": 5.841894153030408e-05, "loss": 0.10385620594024658, "memory(GiB)": 122.96, "step": 29265, "token_acc": 0.960735171261487, "train_speed(iter/s)": 0.23887 }, { "epoch": 2.2311151764616204, "grad_norm": 1.0255138874053955, "learning_rate": 5.840713879598987e-05, "loss": 0.09732089638710022, "memory(GiB)": 122.96, "step": 29270, "token_acc": 0.9618222649983958, "train_speed(iter/s)": 0.238877 }, { "epoch": 2.2314963030718804, "grad_norm": 1.3367868661880493, "learning_rate": 5.839533557956526e-05, "loss": 0.10010727643966674, "memory(GiB)": 122.96, "step": 29275, "token_acc": 0.9639446501656597, "train_speed(iter/s)": 0.238885 }, { "epoch": 2.2318774296821404, "grad_norm": 1.109761357307434, "learning_rate": 5.83835318817071e-05, "loss": 0.12616634368896484, "memory(GiB)": 122.96, "step": 29280, "token_acc": 0.9523992322456813, "train_speed(iter/s)": 0.23889 }, { "epoch": 2.2322585562924004, "grad_norm": 0.5499297976493835, "learning_rate": 5.8371727703092304e-05, "loss": 0.05514953136444092, "memory(GiB)": 122.96, "step": 29285, "token_acc": 0.980497646267653, "train_speed(iter/s)": 0.238893 }, { "epoch": 2.2326396829026605, "grad_norm": 1.902769684791565, "learning_rate": 5.835992304439778e-05, "loss": 0.14690487384796141, "memory(GiB)": 122.96, "step": 29290, "token_acc": 0.9639696586599241, "train_speed(iter/s)": 0.238901 }, { "epoch": 2.23302080951292, "grad_norm": 0.7917807102203369, "learning_rate": 5.834811790630045e-05, "loss": 0.08196125626564026, "memory(GiB)": 122.96, "step": 29295, "token_acc": 0.9725760527498876, "train_speed(iter/s)": 0.238903 }, { "epoch": 2.23340193612318, "grad_norm": 0.5552849769592285, "learning_rate": 5.833631228947729e-05, "loss": 0.11354115009307861, "memory(GiB)": 122.96, "step": 29300, "token_acc": 0.9648280482358196, "train_speed(iter/s)": 0.238901 }, { "epoch": 2.23378306273344, "grad_norm": 1.1155108213424683, "learning_rate": 5.832450619460532e-05, "loss": 0.08670984506607056, "memory(GiB)": 122.96, "step": 29305, "token_acc": 0.9670286278381046, "train_speed(iter/s)": 0.238909 }, { "epoch": 2.2341641893437, "grad_norm": 1.1885536909103394, "learning_rate": 5.8312699622361556e-05, "loss": 0.09489326477050782, "memory(GiB)": 122.96, "step": 29310, "token_acc": 0.959040506902014, "train_speed(iter/s)": 0.238916 }, { "epoch": 2.23454531595396, "grad_norm": 1.3075968027114868, "learning_rate": 5.8300892573423024e-05, "loss": 0.11890120506286621, "memory(GiB)": 122.96, "step": 29315, "token_acc": 0.953173546382601, "train_speed(iter/s)": 0.238924 }, { "epoch": 2.2349264425642197, "grad_norm": 1.1169874668121338, "learning_rate": 5.828908504846685e-05, "loss": 0.06777180433273315, "memory(GiB)": 122.96, "step": 29320, "token_acc": 0.9621374865735768, "train_speed(iter/s)": 0.238934 }, { "epoch": 2.2353075691744797, "grad_norm": 0.7007449865341187, "learning_rate": 5.8277277048170095e-05, "loss": 0.05845197439193726, "memory(GiB)": 122.96, "step": 29325, "token_acc": 0.9740661112604139, "train_speed(iter/s)": 0.238939 }, { "epoch": 2.2356886957847397, "grad_norm": 0.6309834122657776, "learning_rate": 5.826546857320992e-05, "loss": 0.08815938830375672, "memory(GiB)": 122.96, "step": 29330, "token_acc": 0.9709500130855797, "train_speed(iter/s)": 0.23895 }, { "epoch": 2.2360698223949997, "grad_norm": 0.5346353054046631, "learning_rate": 5.825365962426348e-05, "loss": 0.09333096742630005, "memory(GiB)": 122.96, "step": 29335, "token_acc": 0.9690069576217584, "train_speed(iter/s)": 0.238959 }, { "epoch": 2.2364509490052598, "grad_norm": 1.0738356113433838, "learning_rate": 5.8241850202007965e-05, "loss": 0.07084048986434936, "memory(GiB)": 122.96, "step": 29340, "token_acc": 0.9810350939383198, "train_speed(iter/s)": 0.238964 }, { "epoch": 2.2368320756155193, "grad_norm": 0.9912185072898865, "learning_rate": 5.823004030712058e-05, "loss": 0.07830387353897095, "memory(GiB)": 122.96, "step": 29345, "token_acc": 0.9700365408038977, "train_speed(iter/s)": 0.238973 }, { "epoch": 2.2372132022257794, "grad_norm": 0.8207936882972717, "learning_rate": 5.821822994027858e-05, "loss": 0.11806774139404297, "memory(GiB)": 122.96, "step": 29350, "token_acc": 0.9526288391462779, "train_speed(iter/s)": 0.238982 }, { "epoch": 2.2375943288360394, "grad_norm": 0.9739744663238525, "learning_rate": 5.8206419102159225e-05, "loss": 0.08510139584541321, "memory(GiB)": 122.96, "step": 29355, "token_acc": 0.9590604026845637, "train_speed(iter/s)": 0.238993 }, { "epoch": 2.2379754554462994, "grad_norm": 0.8472847938537598, "learning_rate": 5.819460779343982e-05, "loss": 0.1151192307472229, "memory(GiB)": 122.96, "step": 29360, "token_acc": 0.9558885605740819, "train_speed(iter/s)": 0.239 }, { "epoch": 2.238356582056559, "grad_norm": 1.8394434452056885, "learning_rate": 5.818279601479768e-05, "loss": 0.08861685395240784, "memory(GiB)": 122.96, "step": 29365, "token_acc": 0.9653821032005225, "train_speed(iter/s)": 0.23901 }, { "epoch": 2.238737708666819, "grad_norm": 0.6740123629570007, "learning_rate": 5.817098376691017e-05, "loss": 0.05714611411094665, "memory(GiB)": 122.96, "step": 29370, "token_acc": 0.9738950633238563, "train_speed(iter/s)": 0.239019 }, { "epoch": 2.239118835277079, "grad_norm": 0.8111326694488525, "learning_rate": 5.815917105045467e-05, "loss": 0.06929715275764466, "memory(GiB)": 122.96, "step": 29375, "token_acc": 0.96736, "train_speed(iter/s)": 0.23903 }, { "epoch": 2.239499961887339, "grad_norm": 0.6377689242362976, "learning_rate": 5.814735786610856e-05, "loss": 0.08702963590621948, "memory(GiB)": 122.96, "step": 29380, "token_acc": 0.9604722792607803, "train_speed(iter/s)": 0.239042 }, { "epoch": 2.239881088497599, "grad_norm": 0.3633939027786255, "learning_rate": 5.813554421454928e-05, "loss": 0.11089174747467041, "memory(GiB)": 122.96, "step": 29385, "token_acc": 0.9424054206662902, "train_speed(iter/s)": 0.239053 }, { "epoch": 2.240262215107859, "grad_norm": 2.0164549350738525, "learning_rate": 5.812373009645429e-05, "loss": 0.1079336404800415, "memory(GiB)": 122.96, "step": 29390, "token_acc": 0.950812274368231, "train_speed(iter/s)": 0.239063 }, { "epoch": 2.2406433417181186, "grad_norm": 1.0745865106582642, "learning_rate": 5.811191551250109e-05, "loss": 0.07274160385131836, "memory(GiB)": 122.96, "step": 29395, "token_acc": 0.9693823684673588, "train_speed(iter/s)": 0.239069 }, { "epoch": 2.2410244683283786, "grad_norm": 0.8852519989013672, "learning_rate": 5.810010046336717e-05, "loss": 0.12792248725891114, "memory(GiB)": 122.96, "step": 29400, "token_acc": 0.9565452578434908, "train_speed(iter/s)": 0.239072 }, { "epoch": 2.2410244683283786, "eval_loss": 0.0905846357345581, "eval_runtime": 218.5654, "eval_samples_per_second": 2.425, "eval_steps_per_second": 2.425, "eval_token_acc": 0.9607252575146076, "step": 29400 }, { "epoch": 2.2414055949386387, "grad_norm": 6.267818927764893, "learning_rate": 5.808828494973009e-05, "loss": 0.07672276496887206, "memory(GiB)": 122.96, "step": 29405, "token_acc": 0.9610281201457544, "train_speed(iter/s)": 0.238657 }, { "epoch": 2.2417867215488987, "grad_norm": 0.8856683373451233, "learning_rate": 5.8076468972267396e-05, "loss": 0.10768457651138305, "memory(GiB)": 122.96, "step": 29410, "token_acc": 0.955945252352438, "train_speed(iter/s)": 0.238663 }, { "epoch": 2.2421678481591583, "grad_norm": 1.1531215906143188, "learning_rate": 5.806465253165667e-05, "loss": 0.05633485913276672, "memory(GiB)": 122.96, "step": 29415, "token_acc": 0.9771812080536912, "train_speed(iter/s)": 0.23867 }, { "epoch": 2.2425489747694183, "grad_norm": 1.021605134010315, "learning_rate": 5.8052835628575564e-05, "loss": 0.0828878939151764, "memory(GiB)": 122.96, "step": 29420, "token_acc": 0.9675456389452333, "train_speed(iter/s)": 0.238675 }, { "epoch": 2.2429301013796783, "grad_norm": 0.598179817199707, "learning_rate": 5.80410182637017e-05, "loss": 0.12422515153884887, "memory(GiB)": 122.96, "step": 29425, "token_acc": 0.9581785500299581, "train_speed(iter/s)": 0.238676 }, { "epoch": 2.2433112279899383, "grad_norm": 1.0756993293762207, "learning_rate": 5.802920043771276e-05, "loss": 0.07833443880081177, "memory(GiB)": 122.96, "step": 29430, "token_acc": 0.9656593406593407, "train_speed(iter/s)": 0.238687 }, { "epoch": 2.2436923546001983, "grad_norm": 0.6223176717758179, "learning_rate": 5.8017382151286415e-05, "loss": 0.08666513562202453, "memory(GiB)": 122.96, "step": 29435, "token_acc": 0.9661204198121893, "train_speed(iter/s)": 0.238697 }, { "epoch": 2.244073481210458, "grad_norm": 0.9155974388122559, "learning_rate": 5.800556340510043e-05, "loss": 0.10642174482345582, "memory(GiB)": 122.96, "step": 29440, "token_acc": 0.9560327198364008, "train_speed(iter/s)": 0.238707 }, { "epoch": 2.244454607820718, "grad_norm": 0.5949214696884155, "learning_rate": 5.799374419983252e-05, "loss": 0.06528844833374023, "memory(GiB)": 122.96, "step": 29445, "token_acc": 0.9736197987489802, "train_speed(iter/s)": 0.238717 }, { "epoch": 2.244835734430978, "grad_norm": 0.9791076183319092, "learning_rate": 5.798192453616048e-05, "loss": 0.0936412751674652, "memory(GiB)": 122.96, "step": 29450, "token_acc": 0.956371082060167, "train_speed(iter/s)": 0.238724 }, { "epoch": 2.245216861041238, "grad_norm": 0.6201654076576233, "learning_rate": 5.797010441476212e-05, "loss": 0.09089666604995728, "memory(GiB)": 122.96, "step": 29455, "token_acc": 0.9673013245033113, "train_speed(iter/s)": 0.23873 }, { "epoch": 2.245597987651498, "grad_norm": 0.7913779020309448, "learning_rate": 5.7958283836315254e-05, "loss": 0.06674984693527222, "memory(GiB)": 122.96, "step": 29460, "token_acc": 0.972989017512615, "train_speed(iter/s)": 0.238739 }, { "epoch": 2.2459791142617576, "grad_norm": 1.0334267616271973, "learning_rate": 5.7946462801497724e-05, "loss": 0.10688748359680175, "memory(GiB)": 122.96, "step": 29465, "token_acc": 0.9595448798988622, "train_speed(iter/s)": 0.238752 }, { "epoch": 2.2463602408720176, "grad_norm": 1.190184473991394, "learning_rate": 5.793464131098745e-05, "loss": 0.0941691517829895, "memory(GiB)": 122.96, "step": 29470, "token_acc": 0.9620676831535887, "train_speed(iter/s)": 0.238764 }, { "epoch": 2.2467413674822776, "grad_norm": 0.9701222777366638, "learning_rate": 5.792281936546231e-05, "loss": 0.1198868989944458, "memory(GiB)": 122.96, "step": 29475, "token_acc": 0.9347536617842876, "train_speed(iter/s)": 0.238776 }, { "epoch": 2.2471224940925376, "grad_norm": 1.4663405418395996, "learning_rate": 5.791099696560024e-05, "loss": 0.11845667362213134, "memory(GiB)": 122.96, "step": 29480, "token_acc": 0.9386120996441281, "train_speed(iter/s)": 0.238789 }, { "epoch": 2.2475036207027976, "grad_norm": 0.7954038381576538, "learning_rate": 5.78991741120792e-05, "loss": 0.07939584851264954, "memory(GiB)": 122.96, "step": 29485, "token_acc": 0.9709754637941352, "train_speed(iter/s)": 0.2388 }, { "epoch": 2.247884747313057, "grad_norm": 1.0619233846664429, "learning_rate": 5.7887350805577204e-05, "loss": 0.09561090469360352, "memory(GiB)": 122.96, "step": 29490, "token_acc": 0.9528907922912205, "train_speed(iter/s)": 0.238813 }, { "epoch": 2.2482658739233172, "grad_norm": 1.2254269123077393, "learning_rate": 5.7875527046772216e-05, "loss": 0.12045505046844482, "memory(GiB)": 122.96, "step": 29495, "token_acc": 0.9457547169811321, "train_speed(iter/s)": 0.238821 }, { "epoch": 2.2486470005335772, "grad_norm": 0.6814715266227722, "learning_rate": 5.786370283634231e-05, "loss": 0.09624773263931274, "memory(GiB)": 122.96, "step": 29500, "token_acc": 0.9566701680672269, "train_speed(iter/s)": 0.23883 }, { "epoch": 2.2490281271438373, "grad_norm": 1.7416231632232666, "learning_rate": 5.785187817496553e-05, "loss": 0.11033997535705567, "memory(GiB)": 122.96, "step": 29505, "token_acc": 0.9623346751006325, "train_speed(iter/s)": 0.238838 }, { "epoch": 2.2494092537540973, "grad_norm": 1.3084601163864136, "learning_rate": 5.784005306331999e-05, "loss": 0.12139544486999512, "memory(GiB)": 122.96, "step": 29510, "token_acc": 0.9569555859909998, "train_speed(iter/s)": 0.238847 }, { "epoch": 2.249790380364357, "grad_norm": 0.9811158776283264, "learning_rate": 5.7828227502083784e-05, "loss": 0.1024258017539978, "memory(GiB)": 122.96, "step": 29515, "token_acc": 0.960800161648818, "train_speed(iter/s)": 0.238857 }, { "epoch": 2.250171506974617, "grad_norm": 0.8672311305999756, "learning_rate": 5.781640149193506e-05, "loss": 0.11518096923828125, "memory(GiB)": 122.96, "step": 29520, "token_acc": 0.9597002262443439, "train_speed(iter/s)": 0.238859 }, { "epoch": 2.250552633584877, "grad_norm": 0.9036468863487244, "learning_rate": 5.7804575033551975e-05, "loss": 0.08559463024139405, "memory(GiB)": 122.96, "step": 29525, "token_acc": 0.9634175123908426, "train_speed(iter/s)": 0.238869 }, { "epoch": 2.250933760195137, "grad_norm": 1.728669285774231, "learning_rate": 5.779274812761274e-05, "loss": 0.1039236068725586, "memory(GiB)": 122.96, "step": 29530, "token_acc": 0.9579950582421461, "train_speed(iter/s)": 0.238881 }, { "epoch": 2.251314886805397, "grad_norm": 0.8602311015129089, "learning_rate": 5.778092077479555e-05, "loss": 0.10353094339370728, "memory(GiB)": 122.96, "step": 29535, "token_acc": 0.9572141668647492, "train_speed(iter/s)": 0.238891 }, { "epoch": 2.2516960134156565, "grad_norm": 0.6858974099159241, "learning_rate": 5.776909297577867e-05, "loss": 0.08912047743797302, "memory(GiB)": 122.96, "step": 29540, "token_acc": 0.9643507676294562, "train_speed(iter/s)": 0.2389 }, { "epoch": 2.2520771400259165, "grad_norm": 1.405182123184204, "learning_rate": 5.775726473124036e-05, "loss": 0.09641697406768798, "memory(GiB)": 122.96, "step": 29545, "token_acc": 0.9584289722009691, "train_speed(iter/s)": 0.238912 }, { "epoch": 2.2524582666361765, "grad_norm": 0.7709251046180725, "learning_rate": 5.77454360418589e-05, "loss": 0.09534600973129273, "memory(GiB)": 122.96, "step": 29550, "token_acc": 0.9538131041890441, "train_speed(iter/s)": 0.238923 }, { "epoch": 2.2528393932464366, "grad_norm": 0.5804107785224915, "learning_rate": 5.773360690831265e-05, "loss": 0.10576581954956055, "memory(GiB)": 122.96, "step": 29555, "token_acc": 0.9558335460811845, "train_speed(iter/s)": 0.238933 }, { "epoch": 2.2532205198566966, "grad_norm": 1.3757576942443848, "learning_rate": 5.7721777331279916e-05, "loss": 0.15266804695129393, "memory(GiB)": 122.96, "step": 29560, "token_acc": 0.9458128078817734, "train_speed(iter/s)": 0.238942 }, { "epoch": 2.253601646466956, "grad_norm": 1.2293621301651, "learning_rate": 5.7709947311439097e-05, "loss": 0.10273809432983398, "memory(GiB)": 122.96, "step": 29565, "token_acc": 0.9582438558816512, "train_speed(iter/s)": 0.238952 }, { "epoch": 2.253982773077216, "grad_norm": 1.1848936080932617, "learning_rate": 5.7698116849468566e-05, "loss": 0.14323937892913818, "memory(GiB)": 122.96, "step": 29570, "token_acc": 0.9504021447721179, "train_speed(iter/s)": 0.238965 }, { "epoch": 2.254363899687476, "grad_norm": 0.7263049483299255, "learning_rate": 5.768628594604677e-05, "loss": 0.1147235631942749, "memory(GiB)": 122.96, "step": 29575, "token_acc": 0.9605045672031318, "train_speed(iter/s)": 0.238963 }, { "epoch": 2.254745026297736, "grad_norm": 1.0968860387802124, "learning_rate": 5.767445460185214e-05, "loss": 0.09442520141601562, "memory(GiB)": 122.96, "step": 29580, "token_acc": 0.9677053824362606, "train_speed(iter/s)": 0.238972 }, { "epoch": 2.2551261529079962, "grad_norm": 1.0992894172668457, "learning_rate": 5.766262281756315e-05, "loss": 0.082880038022995, "memory(GiB)": 122.96, "step": 29585, "token_acc": 0.9719905922599957, "train_speed(iter/s)": 0.238978 }, { "epoch": 2.255507279518256, "grad_norm": 0.8327937126159668, "learning_rate": 5.7650790593858296e-05, "loss": 0.11388685703277587, "memory(GiB)": 122.96, "step": 29590, "token_acc": 0.9459773453383677, "train_speed(iter/s)": 0.23899 }, { "epoch": 2.255888406128516, "grad_norm": 0.8314562439918518, "learning_rate": 5.7638957931416115e-05, "loss": 0.1194993495941162, "memory(GiB)": 122.96, "step": 29595, "token_acc": 0.9546534653465346, "train_speed(iter/s)": 0.238997 }, { "epoch": 2.256269532738776, "grad_norm": 0.7081538438796997, "learning_rate": 5.7627124830915145e-05, "loss": 0.1468212366104126, "memory(GiB)": 122.96, "step": 29600, "token_acc": 0.9488324175824175, "train_speed(iter/s)": 0.239009 }, { "epoch": 2.256269532738776, "eval_loss": 0.08791312575340271, "eval_runtime": 218.8732, "eval_samples_per_second": 2.421, "eval_steps_per_second": 2.421, "eval_token_acc": 0.9613050418649479, "step": 29600 }, { "epoch": 2.256650659349036, "grad_norm": 1.1298879384994507, "learning_rate": 5.761529129303395e-05, "loss": 0.08747146725654602, "memory(GiB)": 122.96, "step": 29605, "token_acc": 0.9612556746555129, "train_speed(iter/s)": 0.238598 }, { "epoch": 2.2570317859592954, "grad_norm": 0.4600697159767151, "learning_rate": 5.760345731845115e-05, "loss": 0.0877134621143341, "memory(GiB)": 122.96, "step": 29610, "token_acc": 0.9614877727339288, "train_speed(iter/s)": 0.2386 }, { "epoch": 2.2574129125695555, "grad_norm": 1.1674805879592896, "learning_rate": 5.759162290784535e-05, "loss": 0.08997402787208557, "memory(GiB)": 122.96, "step": 29615, "token_acc": 0.9654300168634065, "train_speed(iter/s)": 0.238608 }, { "epoch": 2.2577940391798155, "grad_norm": 0.7436054348945618, "learning_rate": 5.75797880618952e-05, "loss": 0.08923695683479309, "memory(GiB)": 122.96, "step": 29620, "token_acc": 0.963265306122449, "train_speed(iter/s)": 0.238615 }, { "epoch": 2.2581751657900755, "grad_norm": 2.73691463470459, "learning_rate": 5.7567952781279376e-05, "loss": 0.15548186302185057, "memory(GiB)": 122.96, "step": 29625, "token_acc": 0.9431716082659479, "train_speed(iter/s)": 0.238624 }, { "epoch": 2.2585562924003355, "grad_norm": 1.335565209388733, "learning_rate": 5.755611706667659e-05, "loss": 0.0911983847618103, "memory(GiB)": 122.96, "step": 29630, "token_acc": 0.9662837535431896, "train_speed(iter/s)": 0.238628 }, { "epoch": 2.2589374190105955, "grad_norm": 1.3673577308654785, "learning_rate": 5.754428091876557e-05, "loss": 0.0821040391921997, "memory(GiB)": 122.96, "step": 29635, "token_acc": 0.9707638144803307, "train_speed(iter/s)": 0.238633 }, { "epoch": 2.259318545620855, "grad_norm": 0.929169774055481, "learning_rate": 5.7532444338225025e-05, "loss": 0.08508868217468261, "memory(GiB)": 122.96, "step": 29640, "token_acc": 0.9622030237580994, "train_speed(iter/s)": 0.238644 }, { "epoch": 2.259699672231115, "grad_norm": 0.7419663071632385, "learning_rate": 5.752060732573377e-05, "loss": 0.1555894136428833, "memory(GiB)": 122.96, "step": 29645, "token_acc": 0.9452252997106242, "train_speed(iter/s)": 0.238654 }, { "epoch": 2.260080798841375, "grad_norm": 0.9477188587188721, "learning_rate": 5.750876988197057e-05, "loss": 0.09023836255073547, "memory(GiB)": 122.96, "step": 29650, "token_acc": 0.9625, "train_speed(iter/s)": 0.238664 }, { "epoch": 2.260461925451635, "grad_norm": 0.9495424032211304, "learning_rate": 5.7496932007614266e-05, "loss": 0.07207100987434387, "memory(GiB)": 122.96, "step": 29655, "token_acc": 0.9747774480712166, "train_speed(iter/s)": 0.238672 }, { "epoch": 2.2608430520618947, "grad_norm": 1.3611626625061035, "learning_rate": 5.7485093703343705e-05, "loss": 0.11563383340835572, "memory(GiB)": 122.96, "step": 29660, "token_acc": 0.947381009327912, "train_speed(iter/s)": 0.238683 }, { "epoch": 2.2612241786721547, "grad_norm": 1.336162805557251, "learning_rate": 5.7473254969837754e-05, "loss": 0.09074771404266357, "memory(GiB)": 122.96, "step": 29665, "token_acc": 0.9699101211410708, "train_speed(iter/s)": 0.238689 }, { "epoch": 2.2616053052824148, "grad_norm": 1.2417963743209839, "learning_rate": 5.746141580777532e-05, "loss": 0.13604289293289185, "memory(GiB)": 122.96, "step": 29670, "token_acc": 0.9582966226138032, "train_speed(iter/s)": 0.238697 }, { "epoch": 2.261986431892675, "grad_norm": 0.5591177940368652, "learning_rate": 5.74495762178353e-05, "loss": 0.07404351234436035, "memory(GiB)": 122.96, "step": 29675, "token_acc": 0.968276102492592, "train_speed(iter/s)": 0.238702 }, { "epoch": 2.262367558502935, "grad_norm": 1.3885908126831055, "learning_rate": 5.7437736200696656e-05, "loss": 0.08617292642593384, "memory(GiB)": 122.96, "step": 29680, "token_acc": 0.9660493827160493, "train_speed(iter/s)": 0.23871 }, { "epoch": 2.262748685113195, "grad_norm": 1.2833900451660156, "learning_rate": 5.742589575703836e-05, "loss": 0.07827628254890442, "memory(GiB)": 122.96, "step": 29685, "token_acc": 0.964578313253012, "train_speed(iter/s)": 0.23872 }, { "epoch": 2.2631298117234544, "grad_norm": 0.403937429189682, "learning_rate": 5.74140548875394e-05, "loss": 0.10113543272018433, "memory(GiB)": 122.96, "step": 29690, "token_acc": 0.9608144448713023, "train_speed(iter/s)": 0.238724 }, { "epoch": 2.2635109383337144, "grad_norm": 0.9561710357666016, "learning_rate": 5.740221359287879e-05, "loss": 0.09870997667312623, "memory(GiB)": 122.96, "step": 29695, "token_acc": 0.9611197511664075, "train_speed(iter/s)": 0.238735 }, { "epoch": 2.2638920649439744, "grad_norm": 1.167987585067749, "learning_rate": 5.739037187373559e-05, "loss": 0.10019593238830567, "memory(GiB)": 122.96, "step": 29700, "token_acc": 0.9648356978295137, "train_speed(iter/s)": 0.23874 }, { "epoch": 2.2642731915542345, "grad_norm": 1.2988662719726562, "learning_rate": 5.7378529730788875e-05, "loss": 0.09572029113769531, "memory(GiB)": 122.96, "step": 29705, "token_acc": 0.9673032009891469, "train_speed(iter/s)": 0.238746 }, { "epoch": 2.264654318164494, "grad_norm": 0.7734620571136475, "learning_rate": 5.736668716471769e-05, "loss": 0.11107032299041748, "memory(GiB)": 122.96, "step": 29710, "token_acc": 0.9695203944419543, "train_speed(iter/s)": 0.238754 }, { "epoch": 2.265035444774754, "grad_norm": 0.7956851124763489, "learning_rate": 5.7354844176201205e-05, "loss": 0.08022345900535584, "memory(GiB)": 122.96, "step": 29715, "token_acc": 0.9650302622730329, "train_speed(iter/s)": 0.238758 }, { "epoch": 2.265416571385014, "grad_norm": 0.8381355404853821, "learning_rate": 5.7343000765918534e-05, "loss": 0.12027335166931152, "memory(GiB)": 122.96, "step": 29720, "token_acc": 0.9521390374331551, "train_speed(iter/s)": 0.238767 }, { "epoch": 2.265797697995274, "grad_norm": 0.8090341091156006, "learning_rate": 5.733115693454882e-05, "loss": 0.1254490375518799, "memory(GiB)": 122.96, "step": 29725, "token_acc": 0.958295001533272, "train_speed(iter/s)": 0.238779 }, { "epoch": 2.266178824605534, "grad_norm": 1.2990212440490723, "learning_rate": 5.731931268277131e-05, "loss": 0.09208222031593323, "memory(GiB)": 122.96, "step": 29730, "token_acc": 0.966381015161503, "train_speed(iter/s)": 0.238784 }, { "epoch": 2.266559951215794, "grad_norm": 0.8209584951400757, "learning_rate": 5.730746801126518e-05, "loss": 0.08643304109573365, "memory(GiB)": 122.96, "step": 29735, "token_acc": 0.9575375486742073, "train_speed(iter/s)": 0.238791 }, { "epoch": 2.2669410778260537, "grad_norm": 0.7107672095298767, "learning_rate": 5.729562292070965e-05, "loss": 0.1150052309036255, "memory(GiB)": 122.96, "step": 29740, "token_acc": 0.958129007921539, "train_speed(iter/s)": 0.238797 }, { "epoch": 2.2673222044363137, "grad_norm": 0.574143648147583, "learning_rate": 5.728377741178401e-05, "loss": 0.07381318807601929, "memory(GiB)": 122.96, "step": 29745, "token_acc": 0.9711609110947832, "train_speed(iter/s)": 0.238804 }, { "epoch": 2.2677033310465737, "grad_norm": 1.244683027267456, "learning_rate": 5.727193148516754e-05, "loss": 0.0874578297138214, "memory(GiB)": 122.96, "step": 29750, "token_acc": 0.968429258152527, "train_speed(iter/s)": 0.238805 }, { "epoch": 2.2680844576568338, "grad_norm": 0.7323178052902222, "learning_rate": 5.726008514153954e-05, "loss": 0.15346014499664307, "memory(GiB)": 122.96, "step": 29755, "token_acc": 0.9545346346108069, "train_speed(iter/s)": 0.238812 }, { "epoch": 2.2684655842670933, "grad_norm": 1.6105327606201172, "learning_rate": 5.724823838157933e-05, "loss": 0.1242634892463684, "memory(GiB)": 122.96, "step": 29760, "token_acc": 0.9373349339735895, "train_speed(iter/s)": 0.238822 }, { "epoch": 2.2688467108773533, "grad_norm": 1.0090878009796143, "learning_rate": 5.723639120596631e-05, "loss": 0.09517564177513123, "memory(GiB)": 122.96, "step": 29765, "token_acc": 0.955503512880562, "train_speed(iter/s)": 0.238835 }, { "epoch": 2.2692278374876134, "grad_norm": 1.6345921754837036, "learning_rate": 5.722454361537984e-05, "loss": 0.09699450731277466, "memory(GiB)": 122.96, "step": 29770, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.238839 }, { "epoch": 2.2696089640978734, "grad_norm": 0.8633151650428772, "learning_rate": 5.721269561049931e-05, "loss": 0.12385165691375732, "memory(GiB)": 122.96, "step": 29775, "token_acc": 0.9517594369801663, "train_speed(iter/s)": 0.238842 }, { "epoch": 2.2699900907081334, "grad_norm": 1.075720191001892, "learning_rate": 5.720084719200416e-05, "loss": 0.08616209030151367, "memory(GiB)": 122.96, "step": 29780, "token_acc": 0.9656319290465631, "train_speed(iter/s)": 0.238852 }, { "epoch": 2.2703712173183934, "grad_norm": 0.6088447570800781, "learning_rate": 5.7188998360573833e-05, "loss": 0.1071089744567871, "memory(GiB)": 122.96, "step": 29785, "token_acc": 0.9367747098839536, "train_speed(iter/s)": 0.238865 }, { "epoch": 2.270752343928653, "grad_norm": 0.92210453748703, "learning_rate": 5.7177149116887815e-05, "loss": 0.09238345623016357, "memory(GiB)": 122.96, "step": 29790, "token_acc": 0.9639686684073107, "train_speed(iter/s)": 0.238873 }, { "epoch": 2.271133470538913, "grad_norm": 0.9962391257286072, "learning_rate": 5.71652994616256e-05, "loss": 0.08590492010116577, "memory(GiB)": 122.96, "step": 29795, "token_acc": 0.9695577254451465, "train_speed(iter/s)": 0.238881 }, { "epoch": 2.271514597149173, "grad_norm": 1.0882277488708496, "learning_rate": 5.715344939546672e-05, "loss": 0.11061415672302247, "memory(GiB)": 122.96, "step": 29800, "token_acc": 0.9589994842702424, "train_speed(iter/s)": 0.23889 }, { "epoch": 2.271514597149173, "eval_loss": 0.08697597682476044, "eval_runtime": 219.757, "eval_samples_per_second": 2.412, "eval_steps_per_second": 2.412, "eval_token_acc": 0.9615384615384616, "step": 29800 }, { "epoch": 2.271895723759433, "grad_norm": 0.6743706464767456, "learning_rate": 5.7141598919090714e-05, "loss": 0.06047337055206299, "memory(GiB)": 122.96, "step": 29805, "token_acc": 0.9619292670126538, "train_speed(iter/s)": 0.238479 }, { "epoch": 2.2722768503696926, "grad_norm": 1.2512203454971313, "learning_rate": 5.7129748033177136e-05, "loss": 0.12391581535339355, "memory(GiB)": 122.96, "step": 29810, "token_acc": 0.9612659423712802, "train_speed(iter/s)": 0.238489 }, { "epoch": 2.2726579769799526, "grad_norm": 0.7535390257835388, "learning_rate": 5.711789673840559e-05, "loss": 0.12657305002212524, "memory(GiB)": 122.96, "step": 29815, "token_acc": 0.9506637168141593, "train_speed(iter/s)": 0.238498 }, { "epoch": 2.2730391035902127, "grad_norm": 0.9032631516456604, "learning_rate": 5.710604503545572e-05, "loss": 0.1016353964805603, "memory(GiB)": 122.96, "step": 29820, "token_acc": 0.956767603064575, "train_speed(iter/s)": 0.238502 }, { "epoch": 2.2734202302004727, "grad_norm": 1.3334821462631226, "learning_rate": 5.7094192925007125e-05, "loss": 0.09974836707115173, "memory(GiB)": 122.96, "step": 29825, "token_acc": 0.9639019013502342, "train_speed(iter/s)": 0.238511 }, { "epoch": 2.2738013568107327, "grad_norm": 1.1291440725326538, "learning_rate": 5.70823404077395e-05, "loss": 0.11560415029525757, "memory(GiB)": 122.96, "step": 29830, "token_acc": 0.9580805295091009, "train_speed(iter/s)": 0.238525 }, { "epoch": 2.2741824834209923, "grad_norm": 0.575461208820343, "learning_rate": 5.707048748433251e-05, "loss": 0.06047348976135254, "memory(GiB)": 122.96, "step": 29835, "token_acc": 0.9587601078167116, "train_speed(iter/s)": 0.238535 }, { "epoch": 2.2745636100312523, "grad_norm": 0.7454870939254761, "learning_rate": 5.7058634155465887e-05, "loss": 0.10287492275238037, "memory(GiB)": 122.96, "step": 29840, "token_acc": 0.9611669623496945, "train_speed(iter/s)": 0.238543 }, { "epoch": 2.2749447366415123, "grad_norm": 2.078479290008545, "learning_rate": 5.704678042181932e-05, "loss": 0.08376376628875733, "memory(GiB)": 122.96, "step": 29845, "token_acc": 0.9693769799366421, "train_speed(iter/s)": 0.238549 }, { "epoch": 2.2753258632517723, "grad_norm": 1.6120598316192627, "learning_rate": 5.7034926284072634e-05, "loss": 0.12300379276275634, "memory(GiB)": 122.96, "step": 29850, "token_acc": 0.9570858283433133, "train_speed(iter/s)": 0.238555 }, { "epoch": 2.2757069898620323, "grad_norm": 1.0315546989440918, "learning_rate": 5.7023071742905554e-05, "loss": 0.087534499168396, "memory(GiB)": 122.96, "step": 29855, "token_acc": 0.9668508287292817, "train_speed(iter/s)": 0.23856 }, { "epoch": 2.276088116472292, "grad_norm": 1.2807683944702148, "learning_rate": 5.70112167989979e-05, "loss": 0.1032545804977417, "memory(GiB)": 122.96, "step": 29860, "token_acc": 0.96068669527897, "train_speed(iter/s)": 0.238568 }, { "epoch": 2.276469243082552, "grad_norm": 1.0422446727752686, "learning_rate": 5.6999361453029495e-05, "loss": 0.10515725612640381, "memory(GiB)": 122.96, "step": 29865, "token_acc": 0.9572827054286562, "train_speed(iter/s)": 0.238578 }, { "epoch": 2.276850369692812, "grad_norm": 0.9309831857681274, "learning_rate": 5.6987505705680214e-05, "loss": 0.1001811146736145, "memory(GiB)": 122.96, "step": 29870, "token_acc": 0.9560677966101695, "train_speed(iter/s)": 0.238579 }, { "epoch": 2.277231496303072, "grad_norm": 1.674796462059021, "learning_rate": 5.697564955762988e-05, "loss": 0.11318085193634034, "memory(GiB)": 122.96, "step": 29875, "token_acc": 0.9531653746770026, "train_speed(iter/s)": 0.238589 }, { "epoch": 2.277612622913332, "grad_norm": 0.7955397963523865, "learning_rate": 5.696379300955843e-05, "loss": 0.09756169319152833, "memory(GiB)": 122.96, "step": 29880, "token_acc": 0.952204748689485, "train_speed(iter/s)": 0.2386 }, { "epoch": 2.2779937495235916, "grad_norm": 0.5379127264022827, "learning_rate": 5.695193606214576e-05, "loss": 0.11191200017929077, "memory(GiB)": 122.96, "step": 29885, "token_acc": 0.9590004489001945, "train_speed(iter/s)": 0.238605 }, { "epoch": 2.2783748761338516, "grad_norm": 1.4161854982376099, "learning_rate": 5.694007871607182e-05, "loss": 0.07625535130500793, "memory(GiB)": 122.96, "step": 29890, "token_acc": 0.9713180059185067, "train_speed(iter/s)": 0.238613 }, { "epoch": 2.2787560027441116, "grad_norm": 0.7542302012443542, "learning_rate": 5.6928220972016565e-05, "loss": 0.06954213976860046, "memory(GiB)": 122.96, "step": 29895, "token_acc": 0.9757556675062973, "train_speed(iter/s)": 0.238615 }, { "epoch": 2.2791371293543716, "grad_norm": 1.1198467016220093, "learning_rate": 5.6916362830659996e-05, "loss": 0.1120072841644287, "memory(GiB)": 122.96, "step": 29900, "token_acc": 0.9632500717772036, "train_speed(iter/s)": 0.238616 }, { "epoch": 2.279518255964631, "grad_norm": 0.6305238008499146, "learning_rate": 5.690450429268211e-05, "loss": 0.0713611364364624, "memory(GiB)": 122.96, "step": 29905, "token_acc": 0.9792854998498949, "train_speed(iter/s)": 0.238618 }, { "epoch": 2.279899382574891, "grad_norm": 1.3654106855392456, "learning_rate": 5.6892645358762954e-05, "loss": 0.11996427774429322, "memory(GiB)": 122.96, "step": 29910, "token_acc": 0.9614035087719298, "train_speed(iter/s)": 0.238624 }, { "epoch": 2.2802805091851512, "grad_norm": 0.5335673093795776, "learning_rate": 5.688078602958256e-05, "loss": 0.05447434186935425, "memory(GiB)": 122.96, "step": 29915, "token_acc": 0.9786723540389229, "train_speed(iter/s)": 0.238636 }, { "epoch": 2.2806616357954113, "grad_norm": 1.2790799140930176, "learning_rate": 5.686892630582103e-05, "loss": 0.12166061401367187, "memory(GiB)": 122.96, "step": 29920, "token_acc": 0.9546591619762351, "train_speed(iter/s)": 0.238646 }, { "epoch": 2.2810427624056713, "grad_norm": 0.960541844367981, "learning_rate": 5.685706618815845e-05, "loss": 0.08063465356826782, "memory(GiB)": 122.96, "step": 29925, "token_acc": 0.9660474055092889, "train_speed(iter/s)": 0.238653 }, { "epoch": 2.2814238890159313, "grad_norm": 0.9165868163108826, "learning_rate": 5.684520567727492e-05, "loss": 0.08080205917358399, "memory(GiB)": 122.96, "step": 29930, "token_acc": 0.9712894560107455, "train_speed(iter/s)": 0.238659 }, { "epoch": 2.281805015626191, "grad_norm": 0.857231080532074, "learning_rate": 5.683334477385064e-05, "loss": 0.10593725442886352, "memory(GiB)": 122.96, "step": 29935, "token_acc": 0.9586567164179104, "train_speed(iter/s)": 0.238663 }, { "epoch": 2.282186142236451, "grad_norm": 0.5822929739952087, "learning_rate": 5.682148347856574e-05, "loss": 0.07412965297698974, "memory(GiB)": 122.96, "step": 29940, "token_acc": 0.9695712309820194, "train_speed(iter/s)": 0.238666 }, { "epoch": 2.282567268846711, "grad_norm": 1.1302849054336548, "learning_rate": 5.680962179210042e-05, "loss": 0.08544887900352478, "memory(GiB)": 122.96, "step": 29945, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.238672 }, { "epoch": 2.282948395456971, "grad_norm": 1.9605662822723389, "learning_rate": 5.679775971513488e-05, "loss": 0.14741196632385253, "memory(GiB)": 122.96, "step": 29950, "token_acc": 0.9371385466431984, "train_speed(iter/s)": 0.23868 }, { "epoch": 2.2833295220672305, "grad_norm": 1.4360101222991943, "learning_rate": 5.6785897248349375e-05, "loss": 0.12397966384887696, "memory(GiB)": 122.96, "step": 29955, "token_acc": 0.9544223107569721, "train_speed(iter/s)": 0.238686 }, { "epoch": 2.2837106486774905, "grad_norm": 0.766730546951294, "learning_rate": 5.6774034392424146e-05, "loss": 0.09912843704223633, "memory(GiB)": 122.96, "step": 29960, "token_acc": 0.9590075512405609, "train_speed(iter/s)": 0.238692 }, { "epoch": 2.2840917752877505, "grad_norm": 1.454736590385437, "learning_rate": 5.6762171148039476e-05, "loss": 0.09029557704925537, "memory(GiB)": 122.96, "step": 29965, "token_acc": 0.9604547701433515, "train_speed(iter/s)": 0.238699 }, { "epoch": 2.2844729018980106, "grad_norm": 1.0544531345367432, "learning_rate": 5.6750307515875676e-05, "loss": 0.04767584800720215, "memory(GiB)": 122.96, "step": 29970, "token_acc": 0.9779449922158796, "train_speed(iter/s)": 0.238709 }, { "epoch": 2.2848540285082706, "grad_norm": 1.1560943126678467, "learning_rate": 5.673844349661308e-05, "loss": 0.09997313022613526, "memory(GiB)": 122.96, "step": 29975, "token_acc": 0.9546811397557666, "train_speed(iter/s)": 0.238719 }, { "epoch": 2.2852351551185306, "grad_norm": 0.8264760375022888, "learning_rate": 5.672657909093201e-05, "loss": 0.10029778480529786, "memory(GiB)": 122.96, "step": 29980, "token_acc": 0.9584717607973422, "train_speed(iter/s)": 0.238729 }, { "epoch": 2.28561628172879, "grad_norm": 0.8145834803581238, "learning_rate": 5.671471429951284e-05, "loss": 0.10831784009933472, "memory(GiB)": 122.96, "step": 29985, "token_acc": 0.9621152328334649, "train_speed(iter/s)": 0.238736 }, { "epoch": 2.28599740833905, "grad_norm": 0.8219764232635498, "learning_rate": 5.670284912303596e-05, "loss": 0.07515093684196472, "memory(GiB)": 122.96, "step": 29990, "token_acc": 0.9783956415555138, "train_speed(iter/s)": 0.238742 }, { "epoch": 2.28637853494931, "grad_norm": 0.968137800693512, "learning_rate": 5.669098356218181e-05, "loss": 0.09011992216110229, "memory(GiB)": 122.96, "step": 29995, "token_acc": 0.9623911322248614, "train_speed(iter/s)": 0.238753 }, { "epoch": 2.2867596615595702, "grad_norm": 0.8031928539276123, "learning_rate": 5.6679117617630774e-05, "loss": 0.09410815834999084, "memory(GiB)": 122.96, "step": 30000, "token_acc": 0.9696796338672768, "train_speed(iter/s)": 0.238763 }, { "epoch": 2.2867596615595702, "eval_loss": 0.08565158396959305, "eval_runtime": 220.3841, "eval_samples_per_second": 2.405, "eval_steps_per_second": 2.405, "eval_token_acc": 0.9615234022046865, "step": 30000 }, { "epoch": 2.28714078816983, "grad_norm": 1.6018885374069214, "learning_rate": 5.666725129006336e-05, "loss": 0.12226029634475707, "memory(GiB)": 122.96, "step": 30005, "token_acc": 0.9612921787627071, "train_speed(iter/s)": 0.238357 }, { "epoch": 2.28752191478009, "grad_norm": 0.7853828072547913, "learning_rate": 5.665538458016002e-05, "loss": 0.06646475791931153, "memory(GiB)": 122.96, "step": 30010, "token_acc": 0.969885277246654, "train_speed(iter/s)": 0.238366 }, { "epoch": 2.28790304139035, "grad_norm": 1.6445398330688477, "learning_rate": 5.664351748860124e-05, "loss": 0.10731363296508789, "memory(GiB)": 122.96, "step": 30015, "token_acc": 0.9616691842900302, "train_speed(iter/s)": 0.238374 }, { "epoch": 2.28828416800061, "grad_norm": 0.7935417890548706, "learning_rate": 5.663165001606758e-05, "loss": 0.07839056253433227, "memory(GiB)": 122.96, "step": 30020, "token_acc": 0.9686173438447544, "train_speed(iter/s)": 0.238381 }, { "epoch": 2.28866529461087, "grad_norm": 0.8570477366447449, "learning_rate": 5.661978216323957e-05, "loss": 0.11483265161514282, "memory(GiB)": 122.96, "step": 30025, "token_acc": 0.9498491704374057, "train_speed(iter/s)": 0.238392 }, { "epoch": 2.28904642122113, "grad_norm": 0.6342665553092957, "learning_rate": 5.660791393079776e-05, "loss": 0.1066136360168457, "memory(GiB)": 122.96, "step": 30030, "token_acc": 0.9509733237202596, "train_speed(iter/s)": 0.238401 }, { "epoch": 2.2894275478313895, "grad_norm": 0.4324440360069275, "learning_rate": 5.659604531942275e-05, "loss": 0.06097148656845093, "memory(GiB)": 122.96, "step": 30035, "token_acc": 0.9763816721776098, "train_speed(iter/s)": 0.238413 }, { "epoch": 2.2898086744416495, "grad_norm": 1.0332353115081787, "learning_rate": 5.658417632979516e-05, "loss": 0.08567940592765808, "memory(GiB)": 122.96, "step": 30040, "token_acc": 0.9721812434141202, "train_speed(iter/s)": 0.23842 }, { "epoch": 2.2901898010519095, "grad_norm": 0.8696891069412231, "learning_rate": 5.6572306962595614e-05, "loss": 0.10466337203979492, "memory(GiB)": 122.96, "step": 30045, "token_acc": 0.9524828767123288, "train_speed(iter/s)": 0.238432 }, { "epoch": 2.2905709276621695, "grad_norm": 0.5334836840629578, "learning_rate": 5.656043721850475e-05, "loss": 0.08469771146774292, "memory(GiB)": 122.96, "step": 30050, "token_acc": 0.9661883555995382, "train_speed(iter/s)": 0.238437 }, { "epoch": 2.290952054272429, "grad_norm": 0.4797254204750061, "learning_rate": 5.6548567098203264e-05, "loss": 0.08297701478004456, "memory(GiB)": 122.96, "step": 30055, "token_acc": 0.9675855801272342, "train_speed(iter/s)": 0.238448 }, { "epoch": 2.291333180882689, "grad_norm": 0.9111169576644897, "learning_rate": 5.6536696602371844e-05, "loss": 0.0894980251789093, "memory(GiB)": 122.96, "step": 30060, "token_acc": 0.9636470819414423, "train_speed(iter/s)": 0.238456 }, { "epoch": 2.291714307492949, "grad_norm": 1.3819149732589722, "learning_rate": 5.652482573169121e-05, "loss": 0.07388848066329956, "memory(GiB)": 122.96, "step": 30065, "token_acc": 0.9645072363886974, "train_speed(iter/s)": 0.238467 }, { "epoch": 2.292095434103209, "grad_norm": 0.6442959904670715, "learning_rate": 5.6512954486842087e-05, "loss": 0.09306795001029969, "memory(GiB)": 122.96, "step": 30070, "token_acc": 0.9605168700646087, "train_speed(iter/s)": 0.238478 }, { "epoch": 2.292476560713469, "grad_norm": 1.342428207397461, "learning_rate": 5.6501082868505264e-05, "loss": 0.10841739177703857, "memory(GiB)": 122.96, "step": 30075, "token_acc": 0.950969942442976, "train_speed(iter/s)": 0.238487 }, { "epoch": 2.292857687323729, "grad_norm": 1.4731405973434448, "learning_rate": 5.64892108773615e-05, "loss": 0.11086174249649047, "memory(GiB)": 122.96, "step": 30080, "token_acc": 0.9616069661587175, "train_speed(iter/s)": 0.238495 }, { "epoch": 2.2932388139339888, "grad_norm": 1.05408775806427, "learning_rate": 5.6477338514091603e-05, "loss": 0.17720167636871337, "memory(GiB)": 122.96, "step": 30085, "token_acc": 0.938123415046492, "train_speed(iter/s)": 0.238501 }, { "epoch": 2.293619940544249, "grad_norm": 0.8599543571472168, "learning_rate": 5.64654657793764e-05, "loss": 0.08904974460601807, "memory(GiB)": 122.96, "step": 30090, "token_acc": 0.9591010325976919, "train_speed(iter/s)": 0.238509 }, { "epoch": 2.294001067154509, "grad_norm": 1.159589171409607, "learning_rate": 5.645359267389674e-05, "loss": 0.07328202724456787, "memory(GiB)": 122.96, "step": 30095, "token_acc": 0.9715396188565697, "train_speed(iter/s)": 0.238512 }, { "epoch": 2.294382193764769, "grad_norm": 1.2138080596923828, "learning_rate": 5.64417191983335e-05, "loss": 0.12958072423934935, "memory(GiB)": 122.96, "step": 30100, "token_acc": 0.9470134874759152, "train_speed(iter/s)": 0.238521 }, { "epoch": 2.2947633203750284, "grad_norm": 0.5516689419746399, "learning_rate": 5.6429845353367525e-05, "loss": 0.10445159673690796, "memory(GiB)": 122.96, "step": 30105, "token_acc": 0.9620347394540943, "train_speed(iter/s)": 0.238529 }, { "epoch": 2.2951444469852884, "grad_norm": 1.0084072351455688, "learning_rate": 5.6417971139679794e-05, "loss": 0.10232421159744262, "memory(GiB)": 122.96, "step": 30110, "token_acc": 0.9625902012913027, "train_speed(iter/s)": 0.238535 }, { "epoch": 2.2955255735955484, "grad_norm": 1.0957235097885132, "learning_rate": 5.6406096557951184e-05, "loss": 0.08904439210891724, "memory(GiB)": 122.96, "step": 30115, "token_acc": 0.9532142857142857, "train_speed(iter/s)": 0.238548 }, { "epoch": 2.2959067002058084, "grad_norm": 1.4330031871795654, "learning_rate": 5.6394221608862665e-05, "loss": 0.09481902122497558, "memory(GiB)": 122.96, "step": 30120, "token_acc": 0.9638873132917376, "train_speed(iter/s)": 0.238555 }, { "epoch": 2.2962878268160685, "grad_norm": 0.6576471328735352, "learning_rate": 5.638234629309521e-05, "loss": 0.09287800192832947, "memory(GiB)": 122.96, "step": 30125, "token_acc": 0.9619295341688342, "train_speed(iter/s)": 0.238564 }, { "epoch": 2.296668953426328, "grad_norm": 0.6586018800735474, "learning_rate": 5.637047061132982e-05, "loss": 0.10298845767974854, "memory(GiB)": 122.96, "step": 30130, "token_acc": 0.96175, "train_speed(iter/s)": 0.238574 }, { "epoch": 2.297050080036588, "grad_norm": 0.5814892053604126, "learning_rate": 5.635859456424749e-05, "loss": 0.10202263593673706, "memory(GiB)": 122.96, "step": 30135, "token_acc": 0.9631476863397063, "train_speed(iter/s)": 0.238582 }, { "epoch": 2.297431206646848, "grad_norm": 1.1876754760742188, "learning_rate": 5.634671815252928e-05, "loss": 0.11343873739242553, "memory(GiB)": 122.96, "step": 30140, "token_acc": 0.965279730109643, "train_speed(iter/s)": 0.238581 }, { "epoch": 2.297812333257108, "grad_norm": 0.9658925533294678, "learning_rate": 5.633484137685624e-05, "loss": 0.08904585242271423, "memory(GiB)": 122.96, "step": 30145, "token_acc": 0.9680314960629921, "train_speed(iter/s)": 0.238584 }, { "epoch": 2.298193459867368, "grad_norm": 0.9896979331970215, "learning_rate": 5.6322964237909435e-05, "loss": 0.09617437124252319, "memory(GiB)": 122.96, "step": 30150, "token_acc": 0.9694050991501416, "train_speed(iter/s)": 0.238593 }, { "epoch": 2.2985745864776277, "grad_norm": 0.47590625286102295, "learning_rate": 5.631108673636997e-05, "loss": 0.07874135375022888, "memory(GiB)": 122.96, "step": 30155, "token_acc": 0.9712343096234309, "train_speed(iter/s)": 0.2386 }, { "epoch": 2.2989557130878877, "grad_norm": 0.9263352751731873, "learning_rate": 5.6299208872918965e-05, "loss": 0.13139302730560304, "memory(GiB)": 122.96, "step": 30160, "token_acc": 0.9598684210526316, "train_speed(iter/s)": 0.238606 }, { "epoch": 2.2993368396981477, "grad_norm": 1.3084970712661743, "learning_rate": 5.628733064823757e-05, "loss": 0.09020146131515502, "memory(GiB)": 122.96, "step": 30165, "token_acc": 0.9710467706013363, "train_speed(iter/s)": 0.238617 }, { "epoch": 2.2997179663084077, "grad_norm": 0.5923680663108826, "learning_rate": 5.627545206300695e-05, "loss": 0.06907802820205688, "memory(GiB)": 122.96, "step": 30170, "token_acc": 0.9716646989374262, "train_speed(iter/s)": 0.238623 }, { "epoch": 2.3000990929186678, "grad_norm": 0.4494882822036743, "learning_rate": 5.6263573117908254e-05, "loss": 0.08535515666007995, "memory(GiB)": 122.96, "step": 30175, "token_acc": 0.9688961646398503, "train_speed(iter/s)": 0.238627 }, { "epoch": 2.3004802195289273, "grad_norm": 2.483584403991699, "learning_rate": 5.625169381362272e-05, "loss": 0.09593017101287842, "memory(GiB)": 122.96, "step": 30180, "token_acc": 0.9601532567049809, "train_speed(iter/s)": 0.238641 }, { "epoch": 2.3008613461391874, "grad_norm": 1.7419637441635132, "learning_rate": 5.623981415083156e-05, "loss": 0.07688462138175964, "memory(GiB)": 122.96, "step": 30185, "token_acc": 0.9739, "train_speed(iter/s)": 0.238639 }, { "epoch": 2.3012424727494474, "grad_norm": 1.051803469657898, "learning_rate": 5.622793413021601e-05, "loss": 0.11167666912078858, "memory(GiB)": 122.96, "step": 30190, "token_acc": 0.962401055408971, "train_speed(iter/s)": 0.238648 }, { "epoch": 2.3016235993597074, "grad_norm": 0.47380131483078003, "learning_rate": 5.621605375245733e-05, "loss": 0.09182702898979186, "memory(GiB)": 122.96, "step": 30195, "token_acc": 0.9645727406921437, "train_speed(iter/s)": 0.238652 }, { "epoch": 2.3020047259699674, "grad_norm": 0.7379163503646851, "learning_rate": 5.620417301823683e-05, "loss": 0.08092120289802551, "memory(GiB)": 122.96, "step": 30200, "token_acc": 0.9654901960784313, "train_speed(iter/s)": 0.238658 }, { "epoch": 2.3020047259699674, "eval_loss": 0.08588273823261261, "eval_runtime": 217.717, "eval_samples_per_second": 2.434, "eval_steps_per_second": 2.434, "eval_token_acc": 0.9613728088669358, "step": 30200 }, { "epoch": 2.302385852580227, "grad_norm": 1.02250075340271, "learning_rate": 5.619229192823578e-05, "loss": 0.07433177232742309, "memory(GiB)": 122.96, "step": 30205, "token_acc": 0.9615754638504105, "train_speed(iter/s)": 0.238255 }, { "epoch": 2.302766979190487, "grad_norm": 1.0403233766555786, "learning_rate": 5.618041048313555e-05, "loss": 0.09110434055328369, "memory(GiB)": 122.96, "step": 30210, "token_acc": 0.9631776338220253, "train_speed(iter/s)": 0.238265 }, { "epoch": 2.303148105800747, "grad_norm": 1.0276206731796265, "learning_rate": 5.616852868361744e-05, "loss": 0.10374884605407715, "memory(GiB)": 122.96, "step": 30215, "token_acc": 0.9519625073227885, "train_speed(iter/s)": 0.238276 }, { "epoch": 2.303529232411007, "grad_norm": 1.0621287822723389, "learning_rate": 5.615664653036283e-05, "loss": 0.12455391883850098, "memory(GiB)": 122.96, "step": 30220, "token_acc": 0.9605263157894737, "train_speed(iter/s)": 0.238285 }, { "epoch": 2.303910359021267, "grad_norm": 1.6187670230865479, "learning_rate": 5.614476402405313e-05, "loss": 0.1078485131263733, "memory(GiB)": 122.96, "step": 30225, "token_acc": 0.9690721649484536, "train_speed(iter/s)": 0.238292 }, { "epoch": 2.3042914856315266, "grad_norm": 1.8578234910964966, "learning_rate": 5.613288116536971e-05, "loss": 0.10274761915206909, "memory(GiB)": 122.96, "step": 30230, "token_acc": 0.9637345679012346, "train_speed(iter/s)": 0.238302 }, { "epoch": 2.3046726122417867, "grad_norm": 0.608182966709137, "learning_rate": 5.6120997954994024e-05, "loss": 0.11286866664886475, "memory(GiB)": 122.96, "step": 30235, "token_acc": 0.9631260319207485, "train_speed(iter/s)": 0.238306 }, { "epoch": 2.3050537388520467, "grad_norm": 0.7439010143280029, "learning_rate": 5.610911439360751e-05, "loss": 0.11431206464767456, "memory(GiB)": 122.96, "step": 30240, "token_acc": 0.9536665450565487, "train_speed(iter/s)": 0.238318 }, { "epoch": 2.3054348654623067, "grad_norm": 1.0681089162826538, "learning_rate": 5.609723048189164e-05, "loss": 0.08236660957336425, "memory(GiB)": 122.96, "step": 30245, "token_acc": 0.9657303370786516, "train_speed(iter/s)": 0.238332 }, { "epoch": 2.3058159920725663, "grad_norm": 2.441105842590332, "learning_rate": 5.608534622052789e-05, "loss": 0.12681906223297118, "memory(GiB)": 122.96, "step": 30250, "token_acc": 0.9535673839184597, "train_speed(iter/s)": 0.238345 }, { "epoch": 2.3061971186828263, "grad_norm": 1.2382164001464844, "learning_rate": 5.6073461610197776e-05, "loss": 0.08116672039031983, "memory(GiB)": 122.96, "step": 30255, "token_acc": 0.9690574420818787, "train_speed(iter/s)": 0.238351 }, { "epoch": 2.3065782452930863, "grad_norm": 0.8052395582199097, "learning_rate": 5.606157665158281e-05, "loss": 0.10957798957824708, "memory(GiB)": 122.96, "step": 30260, "token_acc": 0.9577836411609498, "train_speed(iter/s)": 0.238355 }, { "epoch": 2.3069593719033463, "grad_norm": 0.6693406701087952, "learning_rate": 5.6049691345364574e-05, "loss": 0.1316436529159546, "memory(GiB)": 122.96, "step": 30265, "token_acc": 0.9623326525981393, "train_speed(iter/s)": 0.238363 }, { "epoch": 2.3073404985136063, "grad_norm": 0.5280429720878601, "learning_rate": 5.603780569222461e-05, "loss": 0.06008493900299072, "memory(GiB)": 122.96, "step": 30270, "token_acc": 0.9800248653128886, "train_speed(iter/s)": 0.238356 }, { "epoch": 2.3077216251238664, "grad_norm": 0.8467960953712463, "learning_rate": 5.60259196928445e-05, "loss": 0.09650521874427795, "memory(GiB)": 122.96, "step": 30275, "token_acc": 0.9627840400787261, "train_speed(iter/s)": 0.238365 }, { "epoch": 2.308102751734126, "grad_norm": 0.07875487208366394, "learning_rate": 5.601403334790586e-05, "loss": 0.07940338253974914, "memory(GiB)": 122.96, "step": 30280, "token_acc": 0.9590368115139773, "train_speed(iter/s)": 0.238373 }, { "epoch": 2.308483878344386, "grad_norm": 0.8582872748374939, "learning_rate": 5.6002146658090325e-05, "loss": 0.06232047080993652, "memory(GiB)": 122.96, "step": 30285, "token_acc": 0.9773503505302894, "train_speed(iter/s)": 0.238378 }, { "epoch": 2.308865004954646, "grad_norm": 0.5808085203170776, "learning_rate": 5.599025962407951e-05, "loss": 0.07400666475296021, "memory(GiB)": 122.96, "step": 30290, "token_acc": 0.974022633744856, "train_speed(iter/s)": 0.238379 }, { "epoch": 2.309246131564906, "grad_norm": 1.0523375272750854, "learning_rate": 5.597837224655512e-05, "loss": 0.07244296073913574, "memory(GiB)": 122.96, "step": 30295, "token_acc": 0.9731990115947539, "train_speed(iter/s)": 0.238385 }, { "epoch": 2.3096272581751656, "grad_norm": 0.644485592842102, "learning_rate": 5.5966484526198825e-05, "loss": 0.10635370016098022, "memory(GiB)": 122.96, "step": 30300, "token_acc": 0.9524408656265727, "train_speed(iter/s)": 0.238395 }, { "epoch": 2.3100083847854256, "grad_norm": 0.8479452133178711, "learning_rate": 5.5954596463692334e-05, "loss": 0.1049458622932434, "memory(GiB)": 122.96, "step": 30305, "token_acc": 0.9634405554768315, "train_speed(iter/s)": 0.238398 }, { "epoch": 2.3103895113956856, "grad_norm": 1.393730640411377, "learning_rate": 5.594270805971735e-05, "loss": 0.08601288795471192, "memory(GiB)": 122.96, "step": 30310, "token_acc": 0.9692253272019808, "train_speed(iter/s)": 0.238405 }, { "epoch": 2.3107706380059456, "grad_norm": 2.3764901161193848, "learning_rate": 5.5930819314955664e-05, "loss": 0.1026681661605835, "memory(GiB)": 122.96, "step": 30315, "token_acc": 0.9744754488427428, "train_speed(iter/s)": 0.238411 }, { "epoch": 2.3111517646162056, "grad_norm": 1.1725142002105713, "learning_rate": 5.591893023008899e-05, "loss": 0.15994828939437866, "memory(GiB)": 122.96, "step": 30320, "token_acc": 0.9348795718108831, "train_speed(iter/s)": 0.238422 }, { "epoch": 2.3115328912264657, "grad_norm": 1.4953439235687256, "learning_rate": 5.590704080579915e-05, "loss": 0.11299149990081787, "memory(GiB)": 122.96, "step": 30325, "token_acc": 0.9575038284839203, "train_speed(iter/s)": 0.238429 }, { "epoch": 2.3119140178367252, "grad_norm": 0.8807202577590942, "learning_rate": 5.5895151042767926e-05, "loss": 0.12499520778656006, "memory(GiB)": 122.96, "step": 30330, "token_acc": 0.9531401905502624, "train_speed(iter/s)": 0.238436 }, { "epoch": 2.3122951444469853, "grad_norm": 1.1041603088378906, "learning_rate": 5.5883260941677154e-05, "loss": 0.09669994115829468, "memory(GiB)": 122.96, "step": 30335, "token_acc": 0.9644835451287064, "train_speed(iter/s)": 0.238439 }, { "epoch": 2.3126762710572453, "grad_norm": 1.1845797300338745, "learning_rate": 5.587137050320865e-05, "loss": 0.0794677495956421, "memory(GiB)": 122.96, "step": 30340, "token_acc": 0.9680120972432243, "train_speed(iter/s)": 0.238439 }, { "epoch": 2.3130573976675053, "grad_norm": 0.8533769249916077, "learning_rate": 5.5859479728044305e-05, "loss": 0.08918778300285339, "memory(GiB)": 122.96, "step": 30345, "token_acc": 0.9625114693931053, "train_speed(iter/s)": 0.238442 }, { "epoch": 2.313438524277765, "grad_norm": 1.1110117435455322, "learning_rate": 5.5847588616865985e-05, "loss": 0.10104190111160279, "memory(GiB)": 122.96, "step": 30350, "token_acc": 0.9604045620830644, "train_speed(iter/s)": 0.238449 }, { "epoch": 2.313819650888025, "grad_norm": 0.9230715036392212, "learning_rate": 5.583569717035561e-05, "loss": 0.12815057039260863, "memory(GiB)": 122.96, "step": 30355, "token_acc": 0.9482706766917294, "train_speed(iter/s)": 0.238456 }, { "epoch": 2.314200777498285, "grad_norm": 0.28639477491378784, "learning_rate": 5.5823805389195064e-05, "loss": 0.0739847481250763, "memory(GiB)": 122.96, "step": 30360, "token_acc": 0.9643794525684289, "train_speed(iter/s)": 0.238458 }, { "epoch": 2.314581904108545, "grad_norm": 0.8794400691986084, "learning_rate": 5.5811913274066294e-05, "loss": 0.10677658319473267, "memory(GiB)": 122.96, "step": 30365, "token_acc": 0.954552858771787, "train_speed(iter/s)": 0.238464 }, { "epoch": 2.314963030718805, "grad_norm": 1.152249813079834, "learning_rate": 5.580002082565129e-05, "loss": 0.06804120540618896, "memory(GiB)": 122.96, "step": 30370, "token_acc": 0.9734090909090909, "train_speed(iter/s)": 0.238473 }, { "epoch": 2.315344157329065, "grad_norm": 1.3469125032424927, "learning_rate": 5.5788128044632015e-05, "loss": 0.10525201559066773, "memory(GiB)": 122.96, "step": 30375, "token_acc": 0.9640547736782047, "train_speed(iter/s)": 0.238481 }, { "epoch": 2.3157252839393245, "grad_norm": 0.6603857278823853, "learning_rate": 5.577623493169043e-05, "loss": 0.11804068088531494, "memory(GiB)": 122.96, "step": 30380, "token_acc": 0.9550961072524633, "train_speed(iter/s)": 0.238485 }, { "epoch": 2.3161064105495845, "grad_norm": 1.9953058958053589, "learning_rate": 5.5764341487508596e-05, "loss": 0.16345130205154418, "memory(GiB)": 122.96, "step": 30385, "token_acc": 0.9404622927854308, "train_speed(iter/s)": 0.238494 }, { "epoch": 2.3164875371598446, "grad_norm": 1.0898399353027344, "learning_rate": 5.575244771276853e-05, "loss": 0.07930009365081787, "memory(GiB)": 122.96, "step": 30390, "token_acc": 0.9742667928098392, "train_speed(iter/s)": 0.2385 }, { "epoch": 2.3168686637701046, "grad_norm": 0.8037872910499573, "learning_rate": 5.5740553608152266e-05, "loss": 0.0851434588432312, "memory(GiB)": 122.96, "step": 30395, "token_acc": 0.962718669343275, "train_speed(iter/s)": 0.238509 }, { "epoch": 2.317249790380364, "grad_norm": 0.6425372362136841, "learning_rate": 5.57286591743419e-05, "loss": 0.056006377935409545, "memory(GiB)": 122.96, "step": 30400, "token_acc": 0.9693749125996364, "train_speed(iter/s)": 0.238511 }, { "epoch": 2.317249790380364, "eval_loss": 0.08501468598842621, "eval_runtime": 220.5787, "eval_samples_per_second": 2.403, "eval_steps_per_second": 2.403, "eval_token_acc": 0.9623366062285404, "step": 30400 }, { "epoch": 2.317630916990624, "grad_norm": 0.8867582082748413, "learning_rate": 5.5716764412019516e-05, "loss": 0.10285605192184448, "memory(GiB)": 122.96, "step": 30405, "token_acc": 0.9623924367571302, "train_speed(iter/s)": 0.238105 }, { "epoch": 2.318012043600884, "grad_norm": 0.6023790240287781, "learning_rate": 5.570486932186721e-05, "loss": 0.07233263254165649, "memory(GiB)": 122.96, "step": 30410, "token_acc": 0.972809112621716, "train_speed(iter/s)": 0.23811 }, { "epoch": 2.318393170211144, "grad_norm": 0.31271272897720337, "learning_rate": 5.569297390456715e-05, "loss": 0.08915624022483826, "memory(GiB)": 122.96, "step": 30415, "token_acc": 0.9625522108598589, "train_speed(iter/s)": 0.238114 }, { "epoch": 2.3187742968214042, "grad_norm": 1.3782713413238525, "learning_rate": 5.568107816080144e-05, "loss": 0.08591740131378174, "memory(GiB)": 122.96, "step": 30420, "token_acc": 0.9625818521983162, "train_speed(iter/s)": 0.238128 }, { "epoch": 2.319155423431664, "grad_norm": 1.0978995561599731, "learning_rate": 5.566918209125226e-05, "loss": 0.12099208831787109, "memory(GiB)": 122.96, "step": 30425, "token_acc": 0.9486416815387666, "train_speed(iter/s)": 0.238134 }, { "epoch": 2.319536550041924, "grad_norm": 0.752592146396637, "learning_rate": 5.565728569660178e-05, "loss": 0.08636369109153748, "memory(GiB)": 122.96, "step": 30430, "token_acc": 0.9668611145587688, "train_speed(iter/s)": 0.238134 }, { "epoch": 2.319917676652184, "grad_norm": 1.2580060958862305, "learning_rate": 5.564538897753223e-05, "loss": 0.0952046811580658, "memory(GiB)": 122.96, "step": 30435, "token_acc": 0.9557873820168902, "train_speed(iter/s)": 0.238138 }, { "epoch": 2.320298803262444, "grad_norm": 1.128401756286621, "learning_rate": 5.563349193472583e-05, "loss": 0.11762738227844238, "memory(GiB)": 122.96, "step": 30440, "token_acc": 0.9588581024349286, "train_speed(iter/s)": 0.238147 }, { "epoch": 2.320679929872704, "grad_norm": 1.2994271516799927, "learning_rate": 5.562159456886481e-05, "loss": 0.084601891040802, "memory(GiB)": 122.96, "step": 30445, "token_acc": 0.9661443760492445, "train_speed(iter/s)": 0.238158 }, { "epoch": 2.3210610564829635, "grad_norm": 0.8972370624542236, "learning_rate": 5.560969688063141e-05, "loss": 0.09172968864440918, "memory(GiB)": 122.96, "step": 30450, "token_acc": 0.9696276943174396, "train_speed(iter/s)": 0.23817 }, { "epoch": 2.3214421830932235, "grad_norm": 0.8366634845733643, "learning_rate": 5.559779887070795e-05, "loss": 0.06990692615509034, "memory(GiB)": 122.96, "step": 30455, "token_acc": 0.9731577614585971, "train_speed(iter/s)": 0.238178 }, { "epoch": 2.3218233097034835, "grad_norm": 0.7747802138328552, "learning_rate": 5.558590053977669e-05, "loss": 0.08920409679412841, "memory(GiB)": 122.96, "step": 30460, "token_acc": 0.9618357487922705, "train_speed(iter/s)": 0.238177 }, { "epoch": 2.3222044363137435, "grad_norm": 0.8025301694869995, "learning_rate": 5.5574001888519946e-05, "loss": 0.08493835926055908, "memory(GiB)": 122.96, "step": 30465, "token_acc": 0.961437908496732, "train_speed(iter/s)": 0.238191 }, { "epoch": 2.3225855629240035, "grad_norm": 0.6486112475395203, "learning_rate": 5.556210291762007e-05, "loss": 0.10490245819091797, "memory(GiB)": 122.96, "step": 30470, "token_acc": 0.9528993091898681, "train_speed(iter/s)": 0.238199 }, { "epoch": 2.322966689534263, "grad_norm": 0.12567712366580963, "learning_rate": 5.555020362775941e-05, "loss": 0.05995995998382568, "memory(GiB)": 122.96, "step": 30475, "token_acc": 0.9672058497673388, "train_speed(iter/s)": 0.238208 }, { "epoch": 2.323347816144523, "grad_norm": 0.4757556915283203, "learning_rate": 5.553830401962031e-05, "loss": 0.07649307250976563, "memory(GiB)": 122.96, "step": 30480, "token_acc": 0.9758926540823288, "train_speed(iter/s)": 0.238216 }, { "epoch": 2.323728942754783, "grad_norm": 0.5417240262031555, "learning_rate": 5.55264040938852e-05, "loss": 0.08219339847564697, "memory(GiB)": 122.96, "step": 30485, "token_acc": 0.9737772539776075, "train_speed(iter/s)": 0.238219 }, { "epoch": 2.324110069365043, "grad_norm": 0.9813774228096008, "learning_rate": 5.5514503851236447e-05, "loss": 0.10128331184387207, "memory(GiB)": 122.96, "step": 30490, "token_acc": 0.9603123799769556, "train_speed(iter/s)": 0.238223 }, { "epoch": 2.324491195975303, "grad_norm": 1.4300525188446045, "learning_rate": 5.550260329235648e-05, "loss": 0.08310458660125733, "memory(GiB)": 122.96, "step": 30495, "token_acc": 0.9635036496350365, "train_speed(iter/s)": 0.238233 }, { "epoch": 2.3248723225855628, "grad_norm": 1.4410200119018555, "learning_rate": 5.5490702417927756e-05, "loss": 0.09691834449768066, "memory(GiB)": 122.96, "step": 30500, "token_acc": 0.9589810017271158, "train_speed(iter/s)": 0.238239 }, { "epoch": 2.3252534491958228, "grad_norm": 1.006060242652893, "learning_rate": 5.547880122863272e-05, "loss": 0.0863929808139801, "memory(GiB)": 122.96, "step": 30505, "token_acc": 0.9656413232733604, "train_speed(iter/s)": 0.238238 }, { "epoch": 2.325634575806083, "grad_norm": 0.7944480180740356, "learning_rate": 5.5466899725153884e-05, "loss": 0.09217605590820313, "memory(GiB)": 122.96, "step": 30510, "token_acc": 0.9659712975292203, "train_speed(iter/s)": 0.238243 }, { "epoch": 2.326015702416343, "grad_norm": 0.5183918476104736, "learning_rate": 5.54549979081737e-05, "loss": 0.09080533981323242, "memory(GiB)": 122.96, "step": 30515, "token_acc": 0.9617161716171617, "train_speed(iter/s)": 0.238254 }, { "epoch": 2.326396829026603, "grad_norm": 0.6285891532897949, "learning_rate": 5.54430957783747e-05, "loss": 0.055918163061141966, "memory(GiB)": 122.96, "step": 30520, "token_acc": 0.9704142011834319, "train_speed(iter/s)": 0.238264 }, { "epoch": 2.3267779556368624, "grad_norm": 1.4783027172088623, "learning_rate": 5.5431193336439426e-05, "loss": 0.11995725631713867, "memory(GiB)": 122.96, "step": 30525, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 0.238274 }, { "epoch": 2.3271590822471224, "grad_norm": 0.9496585130691528, "learning_rate": 5.541929058305041e-05, "loss": 0.1714908242225647, "memory(GiB)": 122.96, "step": 30530, "token_acc": 0.9368179207352096, "train_speed(iter/s)": 0.238282 }, { "epoch": 2.3275402088573824, "grad_norm": 1.06456458568573, "learning_rate": 5.540738751889023e-05, "loss": 0.0750948429107666, "memory(GiB)": 122.96, "step": 30535, "token_acc": 0.96505228398459, "train_speed(iter/s)": 0.238291 }, { "epoch": 2.3279213354676425, "grad_norm": 1.2907204627990723, "learning_rate": 5.5395484144641465e-05, "loss": 0.13376771211624144, "memory(GiB)": 122.96, "step": 30540, "token_acc": 0.9506765604539502, "train_speed(iter/s)": 0.238304 }, { "epoch": 2.328302462077902, "grad_norm": 1.0038177967071533, "learning_rate": 5.5383580460986726e-05, "loss": 0.08661022186279296, "memory(GiB)": 122.96, "step": 30545, "token_acc": 0.9675634900678903, "train_speed(iter/s)": 0.238312 }, { "epoch": 2.328683588688162, "grad_norm": 0.5568806529045105, "learning_rate": 5.537167646860862e-05, "loss": 0.05814990997314453, "memory(GiB)": 122.96, "step": 30550, "token_acc": 0.9705351773902585, "train_speed(iter/s)": 0.238317 }, { "epoch": 2.329064715298422, "grad_norm": 0.6740285754203796, "learning_rate": 5.535977216818982e-05, "loss": 0.11138288974761963, "memory(GiB)": 122.96, "step": 30555, "token_acc": 0.9528246942341293, "train_speed(iter/s)": 0.238325 }, { "epoch": 2.329445841908682, "grad_norm": 1.062430739402771, "learning_rate": 5.534786756041294e-05, "loss": 0.07129503488540649, "memory(GiB)": 122.96, "step": 30560, "token_acc": 0.9656957928802589, "train_speed(iter/s)": 0.238338 }, { "epoch": 2.329826968518942, "grad_norm": 0.6346415281295776, "learning_rate": 5.5335962645960684e-05, "loss": 0.11609103679656982, "memory(GiB)": 122.96, "step": 30565, "token_acc": 0.9612492770387507, "train_speed(iter/s)": 0.23835 }, { "epoch": 2.330208095129202, "grad_norm": 1.1810859441757202, "learning_rate": 5.5324057425515705e-05, "loss": 0.09201788306236267, "memory(GiB)": 122.96, "step": 30570, "token_acc": 0.9695261949488625, "train_speed(iter/s)": 0.238357 }, { "epoch": 2.3305892217394617, "grad_norm": 2.298658609390259, "learning_rate": 5.531215189976077e-05, "loss": 0.10140079259872437, "memory(GiB)": 122.96, "step": 30575, "token_acc": 0.948019801980198, "train_speed(iter/s)": 0.238369 }, { "epoch": 2.3309703483497217, "grad_norm": 1.434770941734314, "learning_rate": 5.530024606937857e-05, "loss": 0.12280269861221313, "memory(GiB)": 122.96, "step": 30580, "token_acc": 0.9463655610444601, "train_speed(iter/s)": 0.238382 }, { "epoch": 2.3313514749599817, "grad_norm": 0.8881643414497375, "learning_rate": 5.528833993505184e-05, "loss": 0.0869701623916626, "memory(GiB)": 122.96, "step": 30585, "token_acc": 0.975200583515682, "train_speed(iter/s)": 0.238389 }, { "epoch": 2.3317326015702418, "grad_norm": 0.9988405108451843, "learning_rate": 5.5276433497463367e-05, "loss": 0.07915791273117065, "memory(GiB)": 122.96, "step": 30590, "token_acc": 0.971214320270924, "train_speed(iter/s)": 0.238397 }, { "epoch": 2.3321137281805013, "grad_norm": 0.6240341663360596, "learning_rate": 5.526452675729592e-05, "loss": 0.09636969566345215, "memory(GiB)": 122.96, "step": 30595, "token_acc": 0.9660879441830278, "train_speed(iter/s)": 0.238402 }, { "epoch": 2.3324948547907614, "grad_norm": 0.8688497543334961, "learning_rate": 5.525261971523228e-05, "loss": 0.09380059242248535, "memory(GiB)": 122.96, "step": 30600, "token_acc": 0.9620174346201743, "train_speed(iter/s)": 0.238415 }, { "epoch": 2.3324948547907614, "eval_loss": 0.08754833787679672, "eval_runtime": 220.4309, "eval_samples_per_second": 2.404, "eval_steps_per_second": 2.404, "eval_token_acc": 0.9620128305523764, "step": 30600 }, { "epoch": 2.3328759814010214, "grad_norm": 0.6813089847564697, "learning_rate": 5.5240712371955295e-05, "loss": 0.12172577381134034, "memory(GiB)": 122.96, "step": 30605, "token_acc": 0.9618011190906287, "train_speed(iter/s)": 0.238016 }, { "epoch": 2.3332571080112814, "grad_norm": 1.0603091716766357, "learning_rate": 5.5228804728147766e-05, "loss": 0.0864871859550476, "memory(GiB)": 122.96, "step": 30610, "token_acc": 0.9667369988829589, "train_speed(iter/s)": 0.23802 }, { "epoch": 2.3336382346215414, "grad_norm": 0.7441304326057434, "learning_rate": 5.521689678449253e-05, "loss": 0.14835765361785888, "memory(GiB)": 122.96, "step": 30615, "token_acc": 0.9504258943781942, "train_speed(iter/s)": 0.238026 }, { "epoch": 2.3340193612318014, "grad_norm": 0.9711832404136658, "learning_rate": 5.5204988541672506e-05, "loss": 0.10581792593002319, "memory(GiB)": 122.96, "step": 30620, "token_acc": 0.960285132382892, "train_speed(iter/s)": 0.238028 }, { "epoch": 2.334400487842061, "grad_norm": 0.7890766263008118, "learning_rate": 5.519308000037054e-05, "loss": 0.081064110994339, "memory(GiB)": 122.96, "step": 30625, "token_acc": 0.9608167770419426, "train_speed(iter/s)": 0.238038 }, { "epoch": 2.334781614452321, "grad_norm": 0.6175696849822998, "learning_rate": 5.518117116126951e-05, "loss": 0.15336552858352662, "memory(GiB)": 122.96, "step": 30630, "token_acc": 0.9558212058212058, "train_speed(iter/s)": 0.238046 }, { "epoch": 2.335162741062581, "grad_norm": 1.2845661640167236, "learning_rate": 5.516926202505236e-05, "loss": 0.042292237281799316, "memory(GiB)": 122.96, "step": 30635, "token_acc": 0.9807057628839807, "train_speed(iter/s)": 0.238057 }, { "epoch": 2.335543867672841, "grad_norm": 0.4597531855106354, "learning_rate": 5.515735259240203e-05, "loss": 0.112229323387146, "memory(GiB)": 122.96, "step": 30640, "token_acc": 0.9665833853841349, "train_speed(iter/s)": 0.238066 }, { "epoch": 2.3359249942831006, "grad_norm": 0.9087921380996704, "learning_rate": 5.5145442864001474e-05, "loss": 0.07683858871459961, "memory(GiB)": 122.96, "step": 30645, "token_acc": 0.969294920394238, "train_speed(iter/s)": 0.238074 }, { "epoch": 2.3363061208933606, "grad_norm": 0.8204307556152344, "learning_rate": 5.513353284053364e-05, "loss": 0.07119760513305665, "memory(GiB)": 122.96, "step": 30650, "token_acc": 0.9740657545435981, "train_speed(iter/s)": 0.238083 }, { "epoch": 2.3366872475036207, "grad_norm": 1.15070641040802, "learning_rate": 5.512162252268151e-05, "loss": 0.07975711822509765, "memory(GiB)": 122.96, "step": 30655, "token_acc": 0.9660400242571255, "train_speed(iter/s)": 0.23809 }, { "epoch": 2.3370683741138807, "grad_norm": 0.8089344501495361, "learning_rate": 5.5109711911128115e-05, "loss": 0.08336615562438965, "memory(GiB)": 122.96, "step": 30660, "token_acc": 0.9606580829756796, "train_speed(iter/s)": 0.238099 }, { "epoch": 2.3374495007241407, "grad_norm": 1.0416001081466675, "learning_rate": 5.509780100655644e-05, "loss": 0.11783276796340943, "memory(GiB)": 122.96, "step": 30665, "token_acc": 0.9591468416735028, "train_speed(iter/s)": 0.238105 }, { "epoch": 2.3378306273344007, "grad_norm": 0.8304465413093567, "learning_rate": 5.5085889809649525e-05, "loss": 0.08402632474899292, "memory(GiB)": 122.96, "step": 30670, "token_acc": 0.959309084869623, "train_speed(iter/s)": 0.238109 }, { "epoch": 2.3382117539446603, "grad_norm": 2.3030290603637695, "learning_rate": 5.5073978321090446e-05, "loss": 0.08283095359802246, "memory(GiB)": 122.96, "step": 30675, "token_acc": 0.9661330049261084, "train_speed(iter/s)": 0.238112 }, { "epoch": 2.3385928805549203, "grad_norm": 1.109972596168518, "learning_rate": 5.506206654156226e-05, "loss": 0.1102400541305542, "memory(GiB)": 122.96, "step": 30680, "token_acc": 0.9617504593285452, "train_speed(iter/s)": 0.238116 }, { "epoch": 2.3389740071651803, "grad_norm": 0.8983370661735535, "learning_rate": 5.505015447174804e-05, "loss": 0.10878502130508423, "memory(GiB)": 122.96, "step": 30685, "token_acc": 0.9644691780821918, "train_speed(iter/s)": 0.238122 }, { "epoch": 2.3393551337754404, "grad_norm": 0.7435944676399231, "learning_rate": 5.503824211233089e-05, "loss": 0.08372299075126648, "memory(GiB)": 122.96, "step": 30690, "token_acc": 0.9688132847306602, "train_speed(iter/s)": 0.238134 }, { "epoch": 2.3397362603857, "grad_norm": 0.8466969132423401, "learning_rate": 5.502632946399394e-05, "loss": 0.11801109313964844, "memory(GiB)": 122.96, "step": 30695, "token_acc": 0.9581090174966352, "train_speed(iter/s)": 0.238141 }, { "epoch": 2.34011738699596, "grad_norm": 0.5676029324531555, "learning_rate": 5.501441652742033e-05, "loss": 0.0825296700000763, "memory(GiB)": 122.96, "step": 30700, "token_acc": 0.9713203463203464, "train_speed(iter/s)": 0.238146 }, { "epoch": 2.34049851360622, "grad_norm": 0.6374625563621521, "learning_rate": 5.5002503303293187e-05, "loss": 0.09023303985595703, "memory(GiB)": 122.96, "step": 30705, "token_acc": 0.9656670113753878, "train_speed(iter/s)": 0.238154 }, { "epoch": 2.34087964021648, "grad_norm": 0.7388700246810913, "learning_rate": 5.499058979229571e-05, "loss": 0.09671454429626465, "memory(GiB)": 122.96, "step": 30710, "token_acc": 0.9724096863834855, "train_speed(iter/s)": 0.23816 }, { "epoch": 2.34126076682674, "grad_norm": 1.1937748193740845, "learning_rate": 5.4978675995111065e-05, "loss": 0.12974437475204467, "memory(GiB)": 122.96, "step": 30715, "token_acc": 0.9429708222811671, "train_speed(iter/s)": 0.238171 }, { "epoch": 2.3416418934369996, "grad_norm": 0.931572437286377, "learning_rate": 5.496676191242244e-05, "loss": 0.08680453300476074, "memory(GiB)": 122.96, "step": 30720, "token_acc": 0.9644277507733098, "train_speed(iter/s)": 0.23818 }, { "epoch": 2.3420230200472596, "grad_norm": 1.726481556892395, "learning_rate": 5.495484754491308e-05, "loss": 0.07619919776916503, "memory(GiB)": 122.96, "step": 30725, "token_acc": 0.9707560627674751, "train_speed(iter/s)": 0.238183 }, { "epoch": 2.3424041466575196, "grad_norm": 0.960767388343811, "learning_rate": 5.494293289326621e-05, "loss": 0.13595058917999267, "memory(GiB)": 122.96, "step": 30730, "token_acc": 0.9485695917711346, "train_speed(iter/s)": 0.238193 }, { "epoch": 2.3427852732677796, "grad_norm": 0.5304681062698364, "learning_rate": 5.493101795816508e-05, "loss": 0.0703538715839386, "memory(GiB)": 122.96, "step": 30735, "token_acc": 0.9667535853976532, "train_speed(iter/s)": 0.238201 }, { "epoch": 2.3431663998780397, "grad_norm": 1.9970533847808838, "learning_rate": 5.4919102740292924e-05, "loss": 0.08296899795532227, "memory(GiB)": 122.96, "step": 30740, "token_acc": 0.9674220963172805, "train_speed(iter/s)": 0.238209 }, { "epoch": 2.3435475264882992, "grad_norm": 0.6064893007278442, "learning_rate": 5.490718724033308e-05, "loss": 0.06556588411331177, "memory(GiB)": 122.96, "step": 30745, "token_acc": 0.9763825729793164, "train_speed(iter/s)": 0.238212 }, { "epoch": 2.3439286530985592, "grad_norm": 1.4180574417114258, "learning_rate": 5.4895271458968824e-05, "loss": 0.1026916742324829, "memory(GiB)": 122.96, "step": 30750, "token_acc": 0.9645010046885466, "train_speed(iter/s)": 0.238219 }, { "epoch": 2.3443097797088193, "grad_norm": 1.172115445137024, "learning_rate": 5.4883355396883454e-05, "loss": 0.06277583837509156, "memory(GiB)": 122.96, "step": 30755, "token_acc": 0.9818640955004592, "train_speed(iter/s)": 0.238223 }, { "epoch": 2.3446909063190793, "grad_norm": 1.2776274681091309, "learning_rate": 5.487143905476031e-05, "loss": 0.07510268688201904, "memory(GiB)": 122.96, "step": 30760, "token_acc": 0.9690693554980242, "train_speed(iter/s)": 0.238225 }, { "epoch": 2.3450720329293393, "grad_norm": 0.7859047651290894, "learning_rate": 5.485952243328274e-05, "loss": 0.11527880430221557, "memory(GiB)": 122.96, "step": 30765, "token_acc": 0.9569112627986348, "train_speed(iter/s)": 0.238229 }, { "epoch": 2.345453159539599, "grad_norm": 0.5625565052032471, "learning_rate": 5.4847605533134125e-05, "loss": 0.07759864926338196, "memory(GiB)": 122.96, "step": 30770, "token_acc": 0.9699163297922346, "train_speed(iter/s)": 0.238226 }, { "epoch": 2.345834286149859, "grad_norm": 0.6491315960884094, "learning_rate": 5.483568835499782e-05, "loss": 0.08491954207420349, "memory(GiB)": 122.96, "step": 30775, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.238231 }, { "epoch": 2.346215412760119, "grad_norm": 0.9136077761650085, "learning_rate": 5.482377089955722e-05, "loss": 0.06941872835159302, "memory(GiB)": 122.96, "step": 30780, "token_acc": 0.9699675324675324, "train_speed(iter/s)": 0.238234 }, { "epoch": 2.346596539370379, "grad_norm": 1.067152976989746, "learning_rate": 5.4811853167495765e-05, "loss": 0.0854988694190979, "memory(GiB)": 122.96, "step": 30785, "token_acc": 0.9662027833001988, "train_speed(iter/s)": 0.238241 }, { "epoch": 2.346977665980639, "grad_norm": 0.7576886415481567, "learning_rate": 5.479993515949684e-05, "loss": 0.07178680300712585, "memory(GiB)": 122.96, "step": 30790, "token_acc": 0.9734554850833921, "train_speed(iter/s)": 0.238249 }, { "epoch": 2.3473587925908985, "grad_norm": 0.692973256111145, "learning_rate": 5.478801687624392e-05, "loss": 0.1055801510810852, "memory(GiB)": 122.96, "step": 30795, "token_acc": 0.959504132231405, "train_speed(iter/s)": 0.238257 }, { "epoch": 2.3477399192011585, "grad_norm": 1.8337305784225464, "learning_rate": 5.477609831842044e-05, "loss": 0.07733943462371826, "memory(GiB)": 122.96, "step": 30800, "token_acc": 0.9659839063643014, "train_speed(iter/s)": 0.238269 }, { "epoch": 2.3477399192011585, "eval_loss": 0.08738161623477936, "eval_runtime": 220.9966, "eval_samples_per_second": 2.398, "eval_steps_per_second": 2.398, "eval_token_acc": 0.9625173182338413, "step": 30800 }, { "epoch": 2.3481210458114186, "grad_norm": 1.3934574127197266, "learning_rate": 5.47641794867099e-05, "loss": 0.07053643465042114, "memory(GiB)": 122.96, "step": 30805, "token_acc": 0.962978571895686, "train_speed(iter/s)": 0.237871 }, { "epoch": 2.3485021724216786, "grad_norm": 0.5910361409187317, "learning_rate": 5.475226038179576e-05, "loss": 0.07433618307113647, "memory(GiB)": 122.96, "step": 30810, "token_acc": 0.96247009569378, "train_speed(iter/s)": 0.237877 }, { "epoch": 2.3488832990319386, "grad_norm": 0.8397935628890991, "learning_rate": 5.474034100436156e-05, "loss": 0.10874111652374267, "memory(GiB)": 122.96, "step": 30815, "token_acc": 0.9580856123662307, "train_speed(iter/s)": 0.237888 }, { "epoch": 2.349264425642198, "grad_norm": 0.6856833100318909, "learning_rate": 5.472842135509079e-05, "loss": 0.09164924025535584, "memory(GiB)": 122.96, "step": 30820, "token_acc": 0.96624529316837, "train_speed(iter/s)": 0.23789 }, { "epoch": 2.349645552252458, "grad_norm": 0.9008845090866089, "learning_rate": 5.471650143466699e-05, "loss": 0.08812724947929382, "memory(GiB)": 122.96, "step": 30825, "token_acc": 0.9629694019471489, "train_speed(iter/s)": 0.237899 }, { "epoch": 2.350026678862718, "grad_norm": 1.0098556280136108, "learning_rate": 5.4704581243773723e-05, "loss": 0.08434792757034301, "memory(GiB)": 122.96, "step": 30830, "token_acc": 0.9605449900652853, "train_speed(iter/s)": 0.23791 }, { "epoch": 2.3504078054729782, "grad_norm": 0.6507514119148254, "learning_rate": 5.469266078309456e-05, "loss": 0.08315092325210571, "memory(GiB)": 122.96, "step": 30835, "token_acc": 0.9692140035417518, "train_speed(iter/s)": 0.237911 }, { "epoch": 2.350788932083238, "grad_norm": 0.6123827695846558, "learning_rate": 5.4680740053313076e-05, "loss": 0.0671958088874817, "memory(GiB)": 122.96, "step": 30840, "token_acc": 0.9727370689655173, "train_speed(iter/s)": 0.237911 }, { "epoch": 2.351170058693498, "grad_norm": 0.9441723823547363, "learning_rate": 5.466881905511286e-05, "loss": 0.06048554182052612, "memory(GiB)": 122.96, "step": 30845, "token_acc": 0.9812657071053232, "train_speed(iter/s)": 0.237917 }, { "epoch": 2.351551185303758, "grad_norm": 1.258835792541504, "learning_rate": 5.4656897789177555e-05, "loss": 0.15067782402038574, "memory(GiB)": 122.96, "step": 30850, "token_acc": 0.9435379991412624, "train_speed(iter/s)": 0.237925 }, { "epoch": 2.351932311914018, "grad_norm": 0.7690373659133911, "learning_rate": 5.464497625619077e-05, "loss": 0.04793847799301147, "memory(GiB)": 122.96, "step": 30855, "token_acc": 0.9804131054131054, "train_speed(iter/s)": 0.237928 }, { "epoch": 2.352313438524278, "grad_norm": 0.670519232749939, "learning_rate": 5.463305445683614e-05, "loss": 0.04852641224861145, "memory(GiB)": 122.96, "step": 30860, "token_acc": 0.980545422963349, "train_speed(iter/s)": 0.237935 }, { "epoch": 2.352694565134538, "grad_norm": 0.7889462113380432, "learning_rate": 5.4621132391797345e-05, "loss": 0.08067988157272339, "memory(GiB)": 122.96, "step": 30865, "token_acc": 0.9665795369678865, "train_speed(iter/s)": 0.23794 }, { "epoch": 2.3530756917447975, "grad_norm": 1.8038569688796997, "learning_rate": 5.460921006175805e-05, "loss": 0.10897238254547119, "memory(GiB)": 122.96, "step": 30870, "token_acc": 0.9682108935840279, "train_speed(iter/s)": 0.237947 }, { "epoch": 2.3534568183550575, "grad_norm": 0.9841631650924683, "learning_rate": 5.4597287467401946e-05, "loss": 0.09772663712501525, "memory(GiB)": 122.96, "step": 30875, "token_acc": 0.9635193133047211, "train_speed(iter/s)": 0.237958 }, { "epoch": 2.3538379449653175, "grad_norm": 0.5393519997596741, "learning_rate": 5.458536460941275e-05, "loss": 0.11086546182632447, "memory(GiB)": 122.96, "step": 30880, "token_acc": 0.9574202743630857, "train_speed(iter/s)": 0.237964 }, { "epoch": 2.3542190715755775, "grad_norm": 1.1484524011611938, "learning_rate": 5.4573441488474164e-05, "loss": 0.09553924202919006, "memory(GiB)": 122.96, "step": 30885, "token_acc": 0.9737765847615114, "train_speed(iter/s)": 0.23797 }, { "epoch": 2.354600198185837, "grad_norm": 0.6212186217308044, "learning_rate": 5.4561518105269924e-05, "loss": 0.1053186297416687, "memory(GiB)": 122.96, "step": 30890, "token_acc": 0.9655647382920111, "train_speed(iter/s)": 0.237979 }, { "epoch": 2.354981324796097, "grad_norm": 1.087348222732544, "learning_rate": 5.45495944604838e-05, "loss": 0.10695401430130005, "memory(GiB)": 122.96, "step": 30895, "token_acc": 0.9630662020905923, "train_speed(iter/s)": 0.23799 }, { "epoch": 2.355362451406357, "grad_norm": 0.8577693700790405, "learning_rate": 5.453767055479955e-05, "loss": 0.0855899691581726, "memory(GiB)": 122.96, "step": 30900, "token_acc": 0.9663339536457453, "train_speed(iter/s)": 0.237999 }, { "epoch": 2.355743578016617, "grad_norm": 0.9391471743583679, "learning_rate": 5.4525746388900945e-05, "loss": 0.06020166873931885, "memory(GiB)": 122.96, "step": 30905, "token_acc": 0.9650382032877981, "train_speed(iter/s)": 0.238007 }, { "epoch": 2.356124704626877, "grad_norm": 0.6207118034362793, "learning_rate": 5.451382196347178e-05, "loss": 0.06447044610977173, "memory(GiB)": 122.96, "step": 30910, "token_acc": 0.9705488621151271, "train_speed(iter/s)": 0.238017 }, { "epoch": 2.356505831237137, "grad_norm": 0.8926740288734436, "learning_rate": 5.450189727919588e-05, "loss": 0.08904388546943665, "memory(GiB)": 122.96, "step": 30915, "token_acc": 0.9619396673244995, "train_speed(iter/s)": 0.238028 }, { "epoch": 2.3568869578473968, "grad_norm": 0.8048908114433289, "learning_rate": 5.448997233675707e-05, "loss": 0.06659048199653625, "memory(GiB)": 122.96, "step": 30920, "token_acc": 0.9750056548292242, "train_speed(iter/s)": 0.238025 }, { "epoch": 2.357268084457657, "grad_norm": 1.5697435140609741, "learning_rate": 5.447804713683917e-05, "loss": 0.13406717777252197, "memory(GiB)": 122.96, "step": 30925, "token_acc": 0.9579354251932697, "train_speed(iter/s)": 0.238032 }, { "epoch": 2.357649211067917, "grad_norm": 3.6107394695281982, "learning_rate": 5.446612168012605e-05, "loss": 0.09263505339622498, "memory(GiB)": 122.96, "step": 30930, "token_acc": 0.9683163131709872, "train_speed(iter/s)": 0.238035 }, { "epoch": 2.358030337678177, "grad_norm": 0.8466642498970032, "learning_rate": 5.445419596730158e-05, "loss": 0.08715442419052125, "memory(GiB)": 122.96, "step": 30935, "token_acc": 0.9661894608589705, "train_speed(iter/s)": 0.238045 }, { "epoch": 2.3584114642884364, "grad_norm": 0.6050650477409363, "learning_rate": 5.444226999904963e-05, "loss": 0.07892669439315796, "memory(GiB)": 122.96, "step": 30940, "token_acc": 0.9704898083358686, "train_speed(iter/s)": 0.238048 }, { "epoch": 2.3587925908986964, "grad_norm": 0.8390781283378601, "learning_rate": 5.443034377605412e-05, "loss": 0.05861798524856567, "memory(GiB)": 122.96, "step": 30945, "token_acc": 0.9712742980561555, "train_speed(iter/s)": 0.238054 }, { "epoch": 2.3591737175089564, "grad_norm": 1.2999714612960815, "learning_rate": 5.441841729899896e-05, "loss": 0.10116490125656127, "memory(GiB)": 122.96, "step": 30950, "token_acc": 0.9555223880597015, "train_speed(iter/s)": 0.238064 }, { "epoch": 2.3595548441192165, "grad_norm": 0.4325985610485077, "learning_rate": 5.440649056856807e-05, "loss": 0.10283323526382446, "memory(GiB)": 122.96, "step": 30955, "token_acc": 0.959199789418268, "train_speed(iter/s)": 0.238074 }, { "epoch": 2.3599359707294765, "grad_norm": 0.48289909958839417, "learning_rate": 5.4394563585445376e-05, "loss": 0.06515552401542664, "memory(GiB)": 122.96, "step": 30960, "token_acc": 0.9715128961888875, "train_speed(iter/s)": 0.238076 }, { "epoch": 2.3603170973397365, "grad_norm": 1.0268217325210571, "learning_rate": 5.4382636350314865e-05, "loss": 0.07753714919090271, "memory(GiB)": 122.96, "step": 30965, "token_acc": 0.9718234027279253, "train_speed(iter/s)": 0.238083 }, { "epoch": 2.360698223949996, "grad_norm": 0.7710219025611877, "learning_rate": 5.4370708863860496e-05, "loss": 0.08886274099349975, "memory(GiB)": 122.96, "step": 30970, "token_acc": 0.9590780023383999, "train_speed(iter/s)": 0.23809 }, { "epoch": 2.361079350560256, "grad_norm": 1.3726928234100342, "learning_rate": 5.4358781126766267e-05, "loss": 0.08288079500198364, "memory(GiB)": 122.96, "step": 30975, "token_acc": 0.9736696230598669, "train_speed(iter/s)": 0.238097 }, { "epoch": 2.361460477170516, "grad_norm": 0.5045683979988098, "learning_rate": 5.4346853139716144e-05, "loss": 0.10357705354690552, "memory(GiB)": 122.96, "step": 30980, "token_acc": 0.965595340811044, "train_speed(iter/s)": 0.238095 }, { "epoch": 2.361841603780776, "grad_norm": 1.2020784616470337, "learning_rate": 5.4334924903394194e-05, "loss": 0.08333129286766053, "memory(GiB)": 122.96, "step": 30985, "token_acc": 0.954796320630749, "train_speed(iter/s)": 0.238104 }, { "epoch": 2.3622227303910357, "grad_norm": 0.7484704256057739, "learning_rate": 5.4322996418484404e-05, "loss": 0.0856864333152771, "memory(GiB)": 122.96, "step": 30990, "token_acc": 0.9686104376674748, "train_speed(iter/s)": 0.238107 }, { "epoch": 2.3626038570012957, "grad_norm": 0.7155664563179016, "learning_rate": 5.4311067685670825e-05, "loss": 0.08418570756912232, "memory(GiB)": 122.96, "step": 30995, "token_acc": 0.9632761536603044, "train_speed(iter/s)": 0.238116 }, { "epoch": 2.3629849836115557, "grad_norm": 0.8586833477020264, "learning_rate": 5.429913870563753e-05, "loss": 0.06655234694480897, "memory(GiB)": 122.96, "step": 31000, "token_acc": 0.971395881006865, "train_speed(iter/s)": 0.238126 }, { "epoch": 2.3629849836115557, "eval_loss": 0.08528787642717361, "eval_runtime": 220.4696, "eval_samples_per_second": 2.404, "eval_steps_per_second": 2.404, "eval_token_acc": 0.9625775555689416, "step": 31000 }, { "epoch": 2.3633661102218158, "grad_norm": 1.2716025114059448, "learning_rate": 5.4287209479068576e-05, "loss": 0.07820500135421753, "memory(GiB)": 122.96, "step": 31005, "token_acc": 0.9626929799009613, "train_speed(iter/s)": 0.237729 }, { "epoch": 2.3637472368320758, "grad_norm": 1.1224666833877563, "learning_rate": 5.427528000664807e-05, "loss": 0.11107146739959717, "memory(GiB)": 122.96, "step": 31010, "token_acc": 0.960948905109489, "train_speed(iter/s)": 0.237734 }, { "epoch": 2.364128363442336, "grad_norm": 0.6373490691184998, "learning_rate": 5.4263350289060066e-05, "loss": 0.08681471943855286, "memory(GiB)": 122.96, "step": 31015, "token_acc": 0.9651859246318942, "train_speed(iter/s)": 0.237736 }, { "epoch": 2.3645094900525954, "grad_norm": 0.619637131690979, "learning_rate": 5.425142032698872e-05, "loss": 0.05805981159210205, "memory(GiB)": 122.96, "step": 31020, "token_acc": 0.9770766576287897, "train_speed(iter/s)": 0.237744 }, { "epoch": 2.3648906166628554, "grad_norm": 1.317243218421936, "learning_rate": 5.423949012111815e-05, "loss": 0.0947724461555481, "memory(GiB)": 122.96, "step": 31025, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.237749 }, { "epoch": 2.3652717432731154, "grad_norm": 0.8387614488601685, "learning_rate": 5.4227559672132485e-05, "loss": 0.11397541761398315, "memory(GiB)": 122.96, "step": 31030, "token_acc": 0.9546788392566026, "train_speed(iter/s)": 0.237761 }, { "epoch": 2.3656528698833754, "grad_norm": 0.7210548520088196, "learning_rate": 5.42156289807159e-05, "loss": 0.07932603359222412, "memory(GiB)": 122.96, "step": 31035, "token_acc": 0.9693877551020408, "train_speed(iter/s)": 0.237759 }, { "epoch": 2.366033996493635, "grad_norm": 0.9409093260765076, "learning_rate": 5.4203698047552564e-05, "loss": 0.08605616688728332, "memory(GiB)": 122.96, "step": 31040, "token_acc": 0.9694438338096285, "train_speed(iter/s)": 0.237765 }, { "epoch": 2.366415123103895, "grad_norm": 0.314769446849823, "learning_rate": 5.4191766873326644e-05, "loss": 0.09566019177436828, "memory(GiB)": 122.96, "step": 31045, "token_acc": 0.9550695825049702, "train_speed(iter/s)": 0.237777 }, { "epoch": 2.366796249714155, "grad_norm": 0.5651035904884338, "learning_rate": 5.4179835458722336e-05, "loss": 0.11585347652435303, "memory(GiB)": 122.96, "step": 31050, "token_acc": 0.9606549920400272, "train_speed(iter/s)": 0.237777 }, { "epoch": 2.367177376324415, "grad_norm": 0.6355889439582825, "learning_rate": 5.416790380442388e-05, "loss": 0.06411581039428711, "memory(GiB)": 122.96, "step": 31055, "token_acc": 0.9647249941981898, "train_speed(iter/s)": 0.237786 }, { "epoch": 2.367558502934675, "grad_norm": 1.4874205589294434, "learning_rate": 5.415597191111548e-05, "loss": 0.06622909903526306, "memory(GiB)": 122.96, "step": 31060, "token_acc": 0.9755409219190969, "train_speed(iter/s)": 0.23779 }, { "epoch": 2.3679396295449346, "grad_norm": 0.42747634649276733, "learning_rate": 5.414403977948136e-05, "loss": 0.0880135715007782, "memory(GiB)": 122.96, "step": 31065, "token_acc": 0.9603246167718665, "train_speed(iter/s)": 0.2378 }, { "epoch": 2.3683207561551947, "grad_norm": 0.9827426671981812, "learning_rate": 5.41321074102058e-05, "loss": 0.07852803468704224, "memory(GiB)": 122.96, "step": 31070, "token_acc": 0.9592189817103312, "train_speed(iter/s)": 0.237809 }, { "epoch": 2.3687018827654547, "grad_norm": 0.9668629169464111, "learning_rate": 5.412017480397306e-05, "loss": 0.07896273136138916, "memory(GiB)": 122.96, "step": 31075, "token_acc": 0.9647932816537468, "train_speed(iter/s)": 0.237819 }, { "epoch": 2.3690830093757147, "grad_norm": 0.571475088596344, "learning_rate": 5.41082419614674e-05, "loss": 0.0755922555923462, "memory(GiB)": 122.96, "step": 31080, "token_acc": 0.9754730203223546, "train_speed(iter/s)": 0.237825 }, { "epoch": 2.3694641359859747, "grad_norm": 0.6273787617683411, "learning_rate": 5.409630888337315e-05, "loss": 0.09668500423431396, "memory(GiB)": 122.96, "step": 31085, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.237835 }, { "epoch": 2.3698452625962343, "grad_norm": 0.8006317615509033, "learning_rate": 5.408437557037458e-05, "loss": 0.07658748030662536, "memory(GiB)": 122.96, "step": 31090, "token_acc": 0.969706947645703, "train_speed(iter/s)": 0.237835 }, { "epoch": 2.3702263892064943, "grad_norm": 0.839894711971283, "learning_rate": 5.407244202315602e-05, "loss": 0.0853196620941162, "memory(GiB)": 122.96, "step": 31095, "token_acc": 0.9693515230797982, "train_speed(iter/s)": 0.237842 }, { "epoch": 2.3706075158167543, "grad_norm": 1.0215001106262207, "learning_rate": 5.4060508242401806e-05, "loss": 0.10421254634857177, "memory(GiB)": 122.96, "step": 31100, "token_acc": 0.9607522485690924, "train_speed(iter/s)": 0.237851 }, { "epoch": 2.3709886424270143, "grad_norm": 0.9255895614624023, "learning_rate": 5.40485742287963e-05, "loss": 0.07981572151184083, "memory(GiB)": 122.96, "step": 31105, "token_acc": 0.9693438054218394, "train_speed(iter/s)": 0.237858 }, { "epoch": 2.3713697690372744, "grad_norm": 0.5459995269775391, "learning_rate": 5.4036639983023853e-05, "loss": 0.09013288617134094, "memory(GiB)": 122.96, "step": 31110, "token_acc": 0.9666237264316665, "train_speed(iter/s)": 0.237861 }, { "epoch": 2.371750895647534, "grad_norm": 0.7635063529014587, "learning_rate": 5.4024705505768826e-05, "loss": 0.10316922664642333, "memory(GiB)": 122.96, "step": 31115, "token_acc": 0.9670306144294584, "train_speed(iter/s)": 0.237866 }, { "epoch": 2.372132022257794, "grad_norm": 1.4979983568191528, "learning_rate": 5.4012770797715616e-05, "loss": 0.09620672464370728, "memory(GiB)": 122.96, "step": 31120, "token_acc": 0.9638344914718888, "train_speed(iter/s)": 0.237872 }, { "epoch": 2.372513148868054, "grad_norm": 0.7405616641044617, "learning_rate": 5.400083585954864e-05, "loss": 0.115581214427948, "memory(GiB)": 122.96, "step": 31125, "token_acc": 0.9531878839228977, "train_speed(iter/s)": 0.23788 }, { "epoch": 2.372894275478314, "grad_norm": 1.1127780675888062, "learning_rate": 5.3988900691952274e-05, "loss": 0.09446353316307068, "memory(GiB)": 122.96, "step": 31130, "token_acc": 0.9630208333333333, "train_speed(iter/s)": 0.237886 }, { "epoch": 2.373275402088574, "grad_norm": 0.5686814188957214, "learning_rate": 5.397696529561096e-05, "loss": 0.0760711371898651, "memory(GiB)": 122.96, "step": 31135, "token_acc": 0.9718289920724802, "train_speed(iter/s)": 0.237892 }, { "epoch": 2.3736565286988336, "grad_norm": 0.8878071904182434, "learning_rate": 5.396502967120916e-05, "loss": 0.09822360873222351, "memory(GiB)": 122.96, "step": 31140, "token_acc": 0.9554766457252117, "train_speed(iter/s)": 0.237903 }, { "epoch": 2.3740376553090936, "grad_norm": 1.384774088859558, "learning_rate": 5.39530938194313e-05, "loss": 0.12208359241485596, "memory(GiB)": 122.96, "step": 31145, "token_acc": 0.9640640313622999, "train_speed(iter/s)": 0.237912 }, { "epoch": 2.3744187819193536, "grad_norm": 1.3508821725845337, "learning_rate": 5.3941157740961844e-05, "loss": 0.1326884627342224, "memory(GiB)": 122.96, "step": 31150, "token_acc": 0.9491114701130856, "train_speed(iter/s)": 0.237921 }, { "epoch": 2.3747999085296136, "grad_norm": 0.31269899010658264, "learning_rate": 5.392922143648529e-05, "loss": 0.06582505106925965, "memory(GiB)": 122.96, "step": 31155, "token_acc": 0.9716687991226467, "train_speed(iter/s)": 0.237928 }, { "epoch": 2.3751810351398737, "grad_norm": 0.7705280184745789, "learning_rate": 5.3917284906686126e-05, "loss": 0.09521127939224243, "memory(GiB)": 122.96, "step": 31160, "token_acc": 0.9682450206689215, "train_speed(iter/s)": 0.237935 }, { "epoch": 2.3755621617501332, "grad_norm": 1.735999584197998, "learning_rate": 5.390534815224884e-05, "loss": 0.0856261670589447, "memory(GiB)": 122.96, "step": 31165, "token_acc": 0.9686552072800809, "train_speed(iter/s)": 0.237946 }, { "epoch": 2.3759432883603933, "grad_norm": 0.7416235208511353, "learning_rate": 5.3893411173857956e-05, "loss": 0.07734074592590331, "memory(GiB)": 122.96, "step": 31170, "token_acc": 0.9677187948350072, "train_speed(iter/s)": 0.237951 }, { "epoch": 2.3763244149706533, "grad_norm": 0.8314568996429443, "learning_rate": 5.3881473972198025e-05, "loss": 0.09363832473754882, "memory(GiB)": 122.96, "step": 31175, "token_acc": 0.9680284191829485, "train_speed(iter/s)": 0.237962 }, { "epoch": 2.3767055415809133, "grad_norm": 0.9788180589675903, "learning_rate": 5.386953654795357e-05, "loss": 0.08723482489585876, "memory(GiB)": 122.96, "step": 31180, "token_acc": 0.9594771241830066, "train_speed(iter/s)": 0.237967 }, { "epoch": 2.377086668191173, "grad_norm": 0.6686174273490906, "learning_rate": 5.385759890180914e-05, "loss": 0.09058893918991089, "memory(GiB)": 122.96, "step": 31185, "token_acc": 0.9657936167167597, "train_speed(iter/s)": 0.237969 }, { "epoch": 2.377467794801433, "grad_norm": 1.2684311866760254, "learning_rate": 5.384566103444932e-05, "loss": 0.10390739440917969, "memory(GiB)": 122.96, "step": 31190, "token_acc": 0.9576976421636616, "train_speed(iter/s)": 0.23798 }, { "epoch": 2.377848921411693, "grad_norm": 1.1410993337631226, "learning_rate": 5.3833722946558695e-05, "loss": 0.08044705390930176, "memory(GiB)": 122.96, "step": 31195, "token_acc": 0.9733996910931869, "train_speed(iter/s)": 0.237985 }, { "epoch": 2.378230048021953, "grad_norm": 0.2866517901420593, "learning_rate": 5.382178463882184e-05, "loss": 0.09050151109695434, "memory(GiB)": 122.96, "step": 31200, "token_acc": 0.9537444933920705, "train_speed(iter/s)": 0.237993 }, { "epoch": 2.378230048021953, "eval_loss": 0.08521706610918045, "eval_runtime": 220.0517, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9630067465815312, "step": 31200 }, { "epoch": 2.378611174632213, "grad_norm": 1.2375469207763672, "learning_rate": 5.380984611192337e-05, "loss": 0.09774636626243591, "memory(GiB)": 122.96, "step": 31205, "token_acc": 0.9630793516231346, "train_speed(iter/s)": 0.237603 }, { "epoch": 2.378992301242473, "grad_norm": 0.6631600260734558, "learning_rate": 5.379790736654792e-05, "loss": 0.06392840147018433, "memory(GiB)": 122.96, "step": 31210, "token_acc": 0.9709886547811993, "train_speed(iter/s)": 0.237608 }, { "epoch": 2.3793734278527325, "grad_norm": 1.1937839984893799, "learning_rate": 5.378596840338009e-05, "loss": 0.07077354788780213, "memory(GiB)": 122.96, "step": 31215, "token_acc": 0.9680775597067864, "train_speed(iter/s)": 0.237618 }, { "epoch": 2.3797545544629926, "grad_norm": 0.5735511183738708, "learning_rate": 5.377402922310455e-05, "loss": 0.0809092402458191, "memory(GiB)": 122.96, "step": 31220, "token_acc": 0.9654485049833887, "train_speed(iter/s)": 0.237632 }, { "epoch": 2.3801356810732526, "grad_norm": 0.6882121562957764, "learning_rate": 5.376208982640595e-05, "loss": 0.1021620512008667, "memory(GiB)": 122.96, "step": 31225, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.23764 }, { "epoch": 2.3805168076835126, "grad_norm": 0.5293640494346619, "learning_rate": 5.375015021396894e-05, "loss": 0.08965572118759155, "memory(GiB)": 122.96, "step": 31230, "token_acc": 0.968724279835391, "train_speed(iter/s)": 0.237641 }, { "epoch": 2.380897934293772, "grad_norm": 1.43839430809021, "learning_rate": 5.3738210386478224e-05, "loss": 0.10873479843139648, "memory(GiB)": 122.96, "step": 31235, "token_acc": 0.9632850241545894, "train_speed(iter/s)": 0.237649 }, { "epoch": 2.381279060904032, "grad_norm": 1.1074553728103638, "learning_rate": 5.372627034461849e-05, "loss": 0.055960172414779664, "memory(GiB)": 122.96, "step": 31240, "token_acc": 0.9799913005654632, "train_speed(iter/s)": 0.237656 }, { "epoch": 2.381660187514292, "grad_norm": 1.7570327520370483, "learning_rate": 5.371433008907446e-05, "loss": 0.09745961427688599, "memory(GiB)": 122.96, "step": 31245, "token_acc": 0.9713310580204778, "train_speed(iter/s)": 0.237664 }, { "epoch": 2.3820413141245522, "grad_norm": 1.0225231647491455, "learning_rate": 5.370238962053083e-05, "loss": 0.11691815853118896, "memory(GiB)": 122.96, "step": 31250, "token_acc": 0.957719814892722, "train_speed(iter/s)": 0.237672 }, { "epoch": 2.3824224407348122, "grad_norm": 0.6122372150421143, "learning_rate": 5.369044893967232e-05, "loss": 0.11683707237243653, "memory(GiB)": 122.96, "step": 31255, "token_acc": 0.9524979524979525, "train_speed(iter/s)": 0.237682 }, { "epoch": 2.3828035673450723, "grad_norm": 0.8814001083374023, "learning_rate": 5.3678508047183706e-05, "loss": 0.0943717122077942, "memory(GiB)": 122.96, "step": 31260, "token_acc": 0.9614955357142857, "train_speed(iter/s)": 0.237692 }, { "epoch": 2.383184693955332, "grad_norm": 1.2693196535110474, "learning_rate": 5.3666566943749716e-05, "loss": 0.07922405004501343, "memory(GiB)": 122.96, "step": 31265, "token_acc": 0.9686299615877081, "train_speed(iter/s)": 0.237701 }, { "epoch": 2.383565820565592, "grad_norm": 1.1481791734695435, "learning_rate": 5.365462563005513e-05, "loss": 0.13680604696273804, "memory(GiB)": 122.96, "step": 31270, "token_acc": 0.9443127962085308, "train_speed(iter/s)": 0.237713 }, { "epoch": 2.383946947175852, "grad_norm": 0.7890597581863403, "learning_rate": 5.3642684106784736e-05, "loss": 0.08330482244491577, "memory(GiB)": 122.96, "step": 31275, "token_acc": 0.9681285753200762, "train_speed(iter/s)": 0.237723 }, { "epoch": 2.384328073786112, "grad_norm": 0.8234835863113403, "learning_rate": 5.36307423746233e-05, "loss": 0.08229019045829773, "memory(GiB)": 122.96, "step": 31280, "token_acc": 0.967327262988752, "train_speed(iter/s)": 0.237733 }, { "epoch": 2.3847092003963715, "grad_norm": 0.667593240737915, "learning_rate": 5.3618800434255634e-05, "loss": 0.08715881705284119, "memory(GiB)": 122.96, "step": 31285, "token_acc": 0.970863405506549, "train_speed(iter/s)": 0.237741 }, { "epoch": 2.3850903270066315, "grad_norm": 0.693841278553009, "learning_rate": 5.3606858286366554e-05, "loss": 0.08429051637649536, "memory(GiB)": 122.96, "step": 31290, "token_acc": 0.9644095913135273, "train_speed(iter/s)": 0.237743 }, { "epoch": 2.3854714536168915, "grad_norm": 0.6991963982582092, "learning_rate": 5.35949159316409e-05, "loss": 0.08095073699951172, "memory(GiB)": 122.96, "step": 31295, "token_acc": 0.9656640181611805, "train_speed(iter/s)": 0.237748 }, { "epoch": 2.3858525802271515, "grad_norm": 1.4625240564346313, "learning_rate": 5.3582973370763503e-05, "loss": 0.08543552160263061, "memory(GiB)": 122.96, "step": 31300, "token_acc": 0.971561230412072, "train_speed(iter/s)": 0.237761 }, { "epoch": 2.3862337068374115, "grad_norm": 0.7467935085296631, "learning_rate": 5.357103060441919e-05, "loss": 0.10993642807006836, "memory(GiB)": 122.96, "step": 31305, "token_acc": 0.9640646232825004, "train_speed(iter/s)": 0.237765 }, { "epoch": 2.3866148334476716, "grad_norm": 1.5048809051513672, "learning_rate": 5.355908763329286e-05, "loss": 0.08198057413101197, "memory(GiB)": 122.96, "step": 31310, "token_acc": 0.963855421686747, "train_speed(iter/s)": 0.23777 }, { "epoch": 2.386995960057931, "grad_norm": 0.866624653339386, "learning_rate": 5.3547144458069375e-05, "loss": 0.09441487193107605, "memory(GiB)": 122.96, "step": 31315, "token_acc": 0.9676794576927733, "train_speed(iter/s)": 0.237771 }, { "epoch": 2.387377086668191, "grad_norm": 0.5799077749252319, "learning_rate": 5.35352010794336e-05, "loss": 0.10726969242095948, "memory(GiB)": 122.96, "step": 31320, "token_acc": 0.9627884499214692, "train_speed(iter/s)": 0.237771 }, { "epoch": 2.387758213278451, "grad_norm": 1.7038251161575317, "learning_rate": 5.3523257498070456e-05, "loss": 0.09630421400070191, "memory(GiB)": 122.96, "step": 31325, "token_acc": 0.95826802507837, "train_speed(iter/s)": 0.23778 }, { "epoch": 2.388139339888711, "grad_norm": 1.609902024269104, "learning_rate": 5.351131371466486e-05, "loss": 0.08274248242378235, "memory(GiB)": 122.96, "step": 31330, "token_acc": 0.9701575020723957, "train_speed(iter/s)": 0.237784 }, { "epoch": 2.3885204664989708, "grad_norm": 1.2762500047683716, "learning_rate": 5.3499369729901694e-05, "loss": 0.0806304931640625, "memory(GiB)": 122.96, "step": 31335, "token_acc": 0.9662607813292745, "train_speed(iter/s)": 0.237792 }, { "epoch": 2.388901593109231, "grad_norm": 1.6169333457946777, "learning_rate": 5.348742554446592e-05, "loss": 0.1230278491973877, "memory(GiB)": 122.96, "step": 31340, "token_acc": 0.9564719358533792, "train_speed(iter/s)": 0.237799 }, { "epoch": 2.389282719719491, "grad_norm": 0.9799315929412842, "learning_rate": 5.347548115904247e-05, "loss": 0.10180585384368897, "memory(GiB)": 122.96, "step": 31345, "token_acc": 0.9672756155679111, "train_speed(iter/s)": 0.237804 }, { "epoch": 2.389663846329751, "grad_norm": 0.9468777775764465, "learning_rate": 5.3463536574316317e-05, "loss": 0.09778773188591003, "memory(GiB)": 122.96, "step": 31350, "token_acc": 0.9572978648932446, "train_speed(iter/s)": 0.237809 }, { "epoch": 2.390044972940011, "grad_norm": 0.6771307587623596, "learning_rate": 5.34515917909724e-05, "loss": 0.10218154191970825, "memory(GiB)": 122.96, "step": 31355, "token_acc": 0.9523296525720442, "train_speed(iter/s)": 0.23782 }, { "epoch": 2.3904260995502704, "grad_norm": 1.0314041376113892, "learning_rate": 5.343964680969573e-05, "loss": 0.07776549458503723, "memory(GiB)": 122.96, "step": 31360, "token_acc": 0.970154686078253, "train_speed(iter/s)": 0.23783 }, { "epoch": 2.3908072261605304, "grad_norm": 0.5972577333450317, "learning_rate": 5.3427701631171255e-05, "loss": 0.08387594223022461, "memory(GiB)": 122.96, "step": 31365, "token_acc": 0.9685911401597677, "train_speed(iter/s)": 0.237836 }, { "epoch": 2.3911883527707904, "grad_norm": 1.6459697484970093, "learning_rate": 5.341575625608402e-05, "loss": 0.090086829662323, "memory(GiB)": 122.96, "step": 31370, "token_acc": 0.9733130003812428, "train_speed(iter/s)": 0.237846 }, { "epoch": 2.3915694793810505, "grad_norm": 1.3109780550003052, "learning_rate": 5.340381068511898e-05, "loss": 0.0913809061050415, "memory(GiB)": 122.96, "step": 31375, "token_acc": 0.9591134064988165, "train_speed(iter/s)": 0.237854 }, { "epoch": 2.3919506059913105, "grad_norm": 0.6141493916511536, "learning_rate": 5.339186491896122e-05, "loss": 0.05290588140487671, "memory(GiB)": 122.96, "step": 31380, "token_acc": 0.9756329716352561, "train_speed(iter/s)": 0.23786 }, { "epoch": 2.39233173260157, "grad_norm": 0.8543851971626282, "learning_rate": 5.337991895829575e-05, "loss": 0.09736396074295044, "memory(GiB)": 122.96, "step": 31385, "token_acc": 0.9574582660204631, "train_speed(iter/s)": 0.237866 }, { "epoch": 2.39271285921183, "grad_norm": 0.7680191993713379, "learning_rate": 5.33679728038076e-05, "loss": 0.09589442014694213, "memory(GiB)": 122.96, "step": 31390, "token_acc": 0.9653802497162316, "train_speed(iter/s)": 0.237873 }, { "epoch": 2.39309398582209, "grad_norm": 1.675175428390503, "learning_rate": 5.335602645618185e-05, "loss": 0.08773276805877686, "memory(GiB)": 122.96, "step": 31395, "token_acc": 0.9674185463659147, "train_speed(iter/s)": 0.237883 }, { "epoch": 2.39347511243235, "grad_norm": 1.1155779361724854, "learning_rate": 5.334407991610355e-05, "loss": 0.11829129457473755, "memory(GiB)": 122.96, "step": 31400, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.237895 }, { "epoch": 2.39347511243235, "eval_loss": 0.08516139537096024, "eval_runtime": 220.5249, "eval_samples_per_second": 2.403, "eval_steps_per_second": 2.403, "eval_token_acc": 0.9628561532437805, "step": 31400 }, { "epoch": 2.39385623904261, "grad_norm": 0.910506546497345, "learning_rate": 5.3332133184257795e-05, "loss": 0.09059352278709412, "memory(GiB)": 122.96, "step": 31405, "token_acc": 0.9629528878955417, "train_speed(iter/s)": 0.237509 }, { "epoch": 2.3942373656528697, "grad_norm": 1.1716959476470947, "learning_rate": 5.3320186261329654e-05, "loss": 0.09969868063926697, "memory(GiB)": 122.96, "step": 31410, "token_acc": 0.9625910612325261, "train_speed(iter/s)": 0.237519 }, { "epoch": 2.3946184922631297, "grad_norm": 0.8604730367660522, "learning_rate": 5.3308239148004234e-05, "loss": 0.09195284843444824, "memory(GiB)": 122.96, "step": 31415, "token_acc": 0.9570867740625423, "train_speed(iter/s)": 0.237524 }, { "epoch": 2.3949996188733897, "grad_norm": 2.213545560836792, "learning_rate": 5.329629184496666e-05, "loss": 0.1034429669380188, "memory(GiB)": 122.96, "step": 31420, "token_acc": 0.9706141639729651, "train_speed(iter/s)": 0.237532 }, { "epoch": 2.3953807454836498, "grad_norm": 0.4884463846683502, "learning_rate": 5.3284344352902035e-05, "loss": 0.09135279059410095, "memory(GiB)": 122.96, "step": 31425, "token_acc": 0.9642961235791314, "train_speed(iter/s)": 0.237533 }, { "epoch": 2.39576187209391, "grad_norm": 0.12757809460163116, "learning_rate": 5.3272396672495504e-05, "loss": 0.050753408670425416, "memory(GiB)": 122.96, "step": 31430, "token_acc": 0.9727436358961172, "train_speed(iter/s)": 0.237539 }, { "epoch": 2.3961429987041694, "grad_norm": 0.5974608659744263, "learning_rate": 5.3260448804432225e-05, "loss": 0.06104531288146973, "memory(GiB)": 122.96, "step": 31435, "token_acc": 0.9754376796446302, "train_speed(iter/s)": 0.237549 }, { "epoch": 2.3965241253144294, "grad_norm": 1.7749061584472656, "learning_rate": 5.324850074939733e-05, "loss": 0.1185562252998352, "memory(GiB)": 122.96, "step": 31440, "token_acc": 0.9510847318870241, "train_speed(iter/s)": 0.237557 }, { "epoch": 2.3969052519246894, "grad_norm": 0.8989559412002563, "learning_rate": 5.323655250807599e-05, "loss": 0.07903689742088318, "memory(GiB)": 122.96, "step": 31445, "token_acc": 0.971736204576043, "train_speed(iter/s)": 0.237557 }, { "epoch": 2.3972863785349494, "grad_norm": 0.763389527797699, "learning_rate": 5.322460408115338e-05, "loss": 0.11701039075851441, "memory(GiB)": 122.96, "step": 31450, "token_acc": 0.9606656580937972, "train_speed(iter/s)": 0.237563 }, { "epoch": 2.3976675051452094, "grad_norm": 1.066260576248169, "learning_rate": 5.32126554693147e-05, "loss": 0.06187049150466919, "memory(GiB)": 122.96, "step": 31455, "token_acc": 0.9700100806451613, "train_speed(iter/s)": 0.237573 }, { "epoch": 2.398048631755469, "grad_norm": 1.5077931880950928, "learning_rate": 5.320070667324513e-05, "loss": 0.10295662879943848, "memory(GiB)": 122.96, "step": 31460, "token_acc": 0.9574521025130629, "train_speed(iter/s)": 0.237582 }, { "epoch": 2.398429758365729, "grad_norm": 1.1629680395126343, "learning_rate": 5.3188757693629884e-05, "loss": 0.10598461627960205, "memory(GiB)": 122.96, "step": 31465, "token_acc": 0.9535163776493256, "train_speed(iter/s)": 0.237591 }, { "epoch": 2.398810884975989, "grad_norm": 1.692908525466919, "learning_rate": 5.3176808531154186e-05, "loss": 0.09750716090202331, "memory(GiB)": 122.96, "step": 31470, "token_acc": 0.9654150197628458, "train_speed(iter/s)": 0.237599 }, { "epoch": 2.399192011586249, "grad_norm": 0.5367060303688049, "learning_rate": 5.3164859186503256e-05, "loss": 0.07754534482955933, "memory(GiB)": 122.96, "step": 31475, "token_acc": 0.9716872110939908, "train_speed(iter/s)": 0.237601 }, { "epoch": 2.3995731381965086, "grad_norm": 1.5796602964401245, "learning_rate": 5.315290966036234e-05, "loss": 0.11748656034469604, "memory(GiB)": 122.96, "step": 31480, "token_acc": 0.9631989596879064, "train_speed(iter/s)": 0.2376 }, { "epoch": 2.3999542648067687, "grad_norm": 0.7455106377601624, "learning_rate": 5.3140959953416693e-05, "loss": 0.08052210211753845, "memory(GiB)": 122.96, "step": 31485, "token_acc": 0.9615822424587365, "train_speed(iter/s)": 0.237612 }, { "epoch": 2.4003353914170287, "grad_norm": 1.794923186302185, "learning_rate": 5.312901006635157e-05, "loss": 0.11531267166137696, "memory(GiB)": 122.96, "step": 31490, "token_acc": 0.9578740157480315, "train_speed(iter/s)": 0.23762 }, { "epoch": 2.4007165180272887, "grad_norm": 1.5555272102355957, "learning_rate": 5.3117059999852214e-05, "loss": 0.09888643026351929, "memory(GiB)": 122.96, "step": 31495, "token_acc": 0.9664864864864865, "train_speed(iter/s)": 0.23763 }, { "epoch": 2.4010976446375487, "grad_norm": 0.6291319727897644, "learning_rate": 5.310510975460395e-05, "loss": 0.08097626566886902, "memory(GiB)": 122.96, "step": 31500, "token_acc": 0.9746247319513939, "train_speed(iter/s)": 0.237634 }, { "epoch": 2.4014787712478087, "grad_norm": 0.5683590173721313, "learning_rate": 5.3093159331292065e-05, "loss": 0.06617435216903686, "memory(GiB)": 122.96, "step": 31505, "token_acc": 0.9741222548608197, "train_speed(iter/s)": 0.237634 }, { "epoch": 2.4018598978580683, "grad_norm": 0.7380191683769226, "learning_rate": 5.308120873060183e-05, "loss": 0.08121579885482788, "memory(GiB)": 122.96, "step": 31510, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.237648 }, { "epoch": 2.4022410244683283, "grad_norm": 0.8455606698989868, "learning_rate": 5.3069257953218576e-05, "loss": 0.08265498876571656, "memory(GiB)": 122.96, "step": 31515, "token_acc": 0.9768191896795001, "train_speed(iter/s)": 0.237654 }, { "epoch": 2.4026221510785883, "grad_norm": 0.8110338449478149, "learning_rate": 5.305730699982763e-05, "loss": 0.07475255727767945, "memory(GiB)": 122.96, "step": 31520, "token_acc": 0.9634574841883345, "train_speed(iter/s)": 0.237658 }, { "epoch": 2.4030032776888484, "grad_norm": 0.9760684370994568, "learning_rate": 5.30453558711143e-05, "loss": 0.06787163615226746, "memory(GiB)": 122.96, "step": 31525, "token_acc": 0.972764078027236, "train_speed(iter/s)": 0.23767 }, { "epoch": 2.403384404299108, "grad_norm": 1.9275811910629272, "learning_rate": 5.3033404567763945e-05, "loss": 0.13564035892486573, "memory(GiB)": 122.96, "step": 31530, "token_acc": 0.9523615635179153, "train_speed(iter/s)": 0.237681 }, { "epoch": 2.403765530909368, "grad_norm": 0.8539645671844482, "learning_rate": 5.3021453090461925e-05, "loss": 0.07939769625663758, "memory(GiB)": 122.96, "step": 31535, "token_acc": 0.9687356848373797, "train_speed(iter/s)": 0.237684 }, { "epoch": 2.404146657519628, "grad_norm": 1.4706199169158936, "learning_rate": 5.3009501439893595e-05, "loss": 0.10877256393432617, "memory(GiB)": 122.96, "step": 31540, "token_acc": 0.9693493794659647, "train_speed(iter/s)": 0.23769 }, { "epoch": 2.404527784129888, "grad_norm": 1.3269944190979004, "learning_rate": 5.29975496167443e-05, "loss": 0.1155362606048584, "memory(GiB)": 122.96, "step": 31545, "token_acc": 0.9621729237770194, "train_speed(iter/s)": 0.237699 }, { "epoch": 2.404908910740148, "grad_norm": 1.5807892084121704, "learning_rate": 5.298559762169947e-05, "loss": 0.09412874579429627, "memory(GiB)": 122.96, "step": 31550, "token_acc": 0.9645469893078221, "train_speed(iter/s)": 0.237704 }, { "epoch": 2.405290037350408, "grad_norm": 0.9008060693740845, "learning_rate": 5.297364545544445e-05, "loss": 0.10727784633636475, "memory(GiB)": 122.96, "step": 31555, "token_acc": 0.9579972606909146, "train_speed(iter/s)": 0.237709 }, { "epoch": 2.4056711639606676, "grad_norm": 1.4330517053604126, "learning_rate": 5.296169311866468e-05, "loss": 0.10135059356689453, "memory(GiB)": 122.96, "step": 31560, "token_acc": 0.9543322636386078, "train_speed(iter/s)": 0.237718 }, { "epoch": 2.4060522905709276, "grad_norm": 0.2918750047683716, "learning_rate": 5.294974061204554e-05, "loss": 0.05997448563575745, "memory(GiB)": 122.96, "step": 31565, "token_acc": 0.972, "train_speed(iter/s)": 0.237724 }, { "epoch": 2.4064334171811876, "grad_norm": 1.6249165534973145, "learning_rate": 5.293778793627247e-05, "loss": 0.09581416249275207, "memory(GiB)": 122.96, "step": 31570, "token_acc": 0.9656072106261859, "train_speed(iter/s)": 0.237732 }, { "epoch": 2.4068145437914477, "grad_norm": 0.9419896602630615, "learning_rate": 5.29258350920309e-05, "loss": 0.08600782155990601, "memory(GiB)": 122.96, "step": 31575, "token_acc": 0.9674609274916731, "train_speed(iter/s)": 0.23774 }, { "epoch": 2.4071956704017072, "grad_norm": 0.696643590927124, "learning_rate": 5.291388208000625e-05, "loss": 0.07383073568344116, "memory(GiB)": 122.96, "step": 31580, "token_acc": 0.969704086425552, "train_speed(iter/s)": 0.237747 }, { "epoch": 2.4075767970119673, "grad_norm": 0.6923823952674866, "learning_rate": 5.2901928900884e-05, "loss": 0.13172988891601561, "memory(GiB)": 122.96, "step": 31585, "token_acc": 0.954599761051374, "train_speed(iter/s)": 0.237751 }, { "epoch": 2.4079579236222273, "grad_norm": 0.5206090807914734, "learning_rate": 5.288997555534959e-05, "loss": 0.1410140037536621, "memory(GiB)": 122.96, "step": 31590, "token_acc": 0.9493966523939276, "train_speed(iter/s)": 0.237756 }, { "epoch": 2.4083390502324873, "grad_norm": 0.9300687909126282, "learning_rate": 5.28780220440885e-05, "loss": 0.0958120882511139, "memory(GiB)": 122.96, "step": 31595, "token_acc": 0.9664555878829368, "train_speed(iter/s)": 0.23776 }, { "epoch": 2.4087201768427473, "grad_norm": 1.3180656433105469, "learning_rate": 5.286606836778619e-05, "loss": 0.09393603801727295, "memory(GiB)": 122.96, "step": 31600, "token_acc": 0.9685620557681793, "train_speed(iter/s)": 0.237762 }, { "epoch": 2.4087201768427473, "eval_loss": 0.08324947953224182, "eval_runtime": 218.9144, "eval_samples_per_second": 2.421, "eval_steps_per_second": 2.421, "eval_token_acc": 0.9632627552557075, "step": 31600 }, { "epoch": 2.4091013034530073, "grad_norm": 0.5603662133216858, "learning_rate": 5.2854114527128164e-05, "loss": 0.0957037091255188, "memory(GiB)": 122.96, "step": 31605, "token_acc": 0.9633915240611003, "train_speed(iter/s)": 0.237376 }, { "epoch": 2.409482430063267, "grad_norm": 0.9108856916427612, "learning_rate": 5.284216052279992e-05, "loss": 0.09463875889778137, "memory(GiB)": 122.96, "step": 31610, "token_acc": 0.963344388200384, "train_speed(iter/s)": 0.237385 }, { "epoch": 2.409863556673527, "grad_norm": 1.821824312210083, "learning_rate": 5.2830206355486945e-05, "loss": 0.13878395557403564, "memory(GiB)": 122.96, "step": 31615, "token_acc": 0.959659004414675, "train_speed(iter/s)": 0.237389 }, { "epoch": 2.410244683283787, "grad_norm": 1.2382431030273438, "learning_rate": 5.281825202587478e-05, "loss": 0.06228114366531372, "memory(GiB)": 122.96, "step": 31620, "token_acc": 0.9710444674250258, "train_speed(iter/s)": 0.2374 }, { "epoch": 2.410625809894047, "grad_norm": 0.6332990527153015, "learning_rate": 5.280629753464894e-05, "loss": 0.06674144864082336, "memory(GiB)": 122.96, "step": 31625, "token_acc": 0.9770469798657718, "train_speed(iter/s)": 0.237402 }, { "epoch": 2.4110069365043065, "grad_norm": 0.8873818516731262, "learning_rate": 5.279434288249495e-05, "loss": 0.11393991708755494, "memory(GiB)": 122.96, "step": 31630, "token_acc": 0.9613229712633291, "train_speed(iter/s)": 0.23741 }, { "epoch": 2.4113880631145665, "grad_norm": 1.110944151878357, "learning_rate": 5.2782388070098365e-05, "loss": 0.06567127704620361, "memory(GiB)": 122.96, "step": 31635, "token_acc": 0.9792456235336582, "train_speed(iter/s)": 0.237414 }, { "epoch": 2.4117691897248266, "grad_norm": 0.9004189372062683, "learning_rate": 5.277043309814475e-05, "loss": 0.0906289279460907, "memory(GiB)": 122.96, "step": 31640, "token_acc": 0.9643046007403491, "train_speed(iter/s)": 0.237417 }, { "epoch": 2.4121503163350866, "grad_norm": 1.5165259838104248, "learning_rate": 5.275847796731964e-05, "loss": 0.10088374614715576, "memory(GiB)": 122.96, "step": 31645, "token_acc": 0.9706678700361011, "train_speed(iter/s)": 0.237428 }, { "epoch": 2.4125314429453466, "grad_norm": 0.7593477368354797, "learning_rate": 5.274652267830862e-05, "loss": 0.08354413509368896, "memory(GiB)": 122.96, "step": 31650, "token_acc": 0.9675805483289974, "train_speed(iter/s)": 0.237435 }, { "epoch": 2.412912569555606, "grad_norm": 0.9742563962936401, "learning_rate": 5.2734567231797286e-05, "loss": 0.07595022916793823, "memory(GiB)": 122.96, "step": 31655, "token_acc": 0.966954851104707, "train_speed(iter/s)": 0.237442 }, { "epoch": 2.413293696165866, "grad_norm": 0.698854386806488, "learning_rate": 5.27226116284712e-05, "loss": 0.09080097079277039, "memory(GiB)": 122.96, "step": 31660, "token_acc": 0.9652884124553344, "train_speed(iter/s)": 0.237446 }, { "epoch": 2.413674822776126, "grad_norm": 0.603148341178894, "learning_rate": 5.271065586901596e-05, "loss": 0.10014762878417968, "memory(GiB)": 122.96, "step": 31665, "token_acc": 0.9520249221183801, "train_speed(iter/s)": 0.237453 }, { "epoch": 2.4140559493863862, "grad_norm": 1.0888844728469849, "learning_rate": 5.269869995411719e-05, "loss": 0.0982728898525238, "memory(GiB)": 122.96, "step": 31670, "token_acc": 0.9598056537102474, "train_speed(iter/s)": 0.23746 }, { "epoch": 2.4144370759966463, "grad_norm": 0.4615923762321472, "learning_rate": 5.268674388446051e-05, "loss": 0.09465991854667663, "memory(GiB)": 122.96, "step": 31675, "token_acc": 0.9689521345407504, "train_speed(iter/s)": 0.237472 }, { "epoch": 2.414818202606906, "grad_norm": 0.7292890548706055, "learning_rate": 5.267478766073154e-05, "loss": 0.06650314331054688, "memory(GiB)": 122.96, "step": 31680, "token_acc": 0.9703663793103449, "train_speed(iter/s)": 0.237475 }, { "epoch": 2.415199329217166, "grad_norm": 1.8328521251678467, "learning_rate": 5.2662831283615896e-05, "loss": 0.10270296335220337, "memory(GiB)": 122.96, "step": 31685, "token_acc": 0.9630599947602829, "train_speed(iter/s)": 0.237485 }, { "epoch": 2.415580455827426, "grad_norm": 0.8092535138130188, "learning_rate": 5.265087475379924e-05, "loss": 0.1117973804473877, "memory(GiB)": 122.96, "step": 31690, "token_acc": 0.9579231808531171, "train_speed(iter/s)": 0.237492 }, { "epoch": 2.415961582437686, "grad_norm": 0.694914698600769, "learning_rate": 5.2638918071967224e-05, "loss": 0.06421082615852355, "memory(GiB)": 122.96, "step": 31695, "token_acc": 0.9745830023828436, "train_speed(iter/s)": 0.237505 }, { "epoch": 2.416342709047946, "grad_norm": 1.161771535873413, "learning_rate": 5.2626961238805486e-05, "loss": 0.06641955971717835, "memory(GiB)": 122.96, "step": 31700, "token_acc": 0.9675284384694933, "train_speed(iter/s)": 0.237511 }, { "epoch": 2.4167238356582055, "grad_norm": 1.4579120874404907, "learning_rate": 5.2615004254999724e-05, "loss": 0.10980154275894165, "memory(GiB)": 122.96, "step": 31705, "token_acc": 0.9523434423001181, "train_speed(iter/s)": 0.237522 }, { "epoch": 2.4171049622684655, "grad_norm": 0.7843745350837708, "learning_rate": 5.26030471212356e-05, "loss": 0.08074135184288025, "memory(GiB)": 122.96, "step": 31710, "token_acc": 0.9611933602512337, "train_speed(iter/s)": 0.23753 }, { "epoch": 2.4174860888787255, "grad_norm": 0.7337310314178467, "learning_rate": 5.2591089838198816e-05, "loss": 0.05520209074020386, "memory(GiB)": 122.96, "step": 31715, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.23754 }, { "epoch": 2.4178672154889855, "grad_norm": 1.0853062868118286, "learning_rate": 5.2579132406575036e-05, "loss": 0.06261842846870422, "memory(GiB)": 122.96, "step": 31720, "token_acc": 0.9723382045929019, "train_speed(iter/s)": 0.237548 }, { "epoch": 2.4182483420992456, "grad_norm": 0.6997091770172119, "learning_rate": 5.256717482704999e-05, "loss": 0.11928043365478516, "memory(GiB)": 122.96, "step": 31725, "token_acc": 0.9498080087767417, "train_speed(iter/s)": 0.237559 }, { "epoch": 2.418629468709505, "grad_norm": 0.8186673521995544, "learning_rate": 5.25552171003094e-05, "loss": 0.05571422576904297, "memory(GiB)": 122.96, "step": 31730, "token_acc": 0.9739985945186226, "train_speed(iter/s)": 0.237568 }, { "epoch": 2.419010595319765, "grad_norm": 0.4593213200569153, "learning_rate": 5.254325922703893e-05, "loss": 0.05884087085723877, "memory(GiB)": 122.96, "step": 31735, "token_acc": 0.9728826151560178, "train_speed(iter/s)": 0.237579 }, { "epoch": 2.419391721930025, "grad_norm": 1.2494239807128906, "learning_rate": 5.253130120792437e-05, "loss": 0.09739054441452026, "memory(GiB)": 122.96, "step": 31740, "token_acc": 0.9652892561983472, "train_speed(iter/s)": 0.237583 }, { "epoch": 2.419772848540285, "grad_norm": 0.7136322259902954, "learning_rate": 5.2519343043651424e-05, "loss": 0.080861896276474, "memory(GiB)": 122.96, "step": 31745, "token_acc": 0.9726266744321491, "train_speed(iter/s)": 0.237593 }, { "epoch": 2.420153975150545, "grad_norm": 0.6551394462585449, "learning_rate": 5.250738473490584e-05, "loss": 0.07178466320037842, "memory(GiB)": 122.96, "step": 31750, "token_acc": 0.9680557381359275, "train_speed(iter/s)": 0.237598 }, { "epoch": 2.4205351017608048, "grad_norm": 0.8119661808013916, "learning_rate": 5.2495426282373385e-05, "loss": 0.06998769044876099, "memory(GiB)": 122.96, "step": 31755, "token_acc": 0.9660215689171222, "train_speed(iter/s)": 0.237604 }, { "epoch": 2.420916228371065, "grad_norm": 0.9762775301933289, "learning_rate": 5.248346768673982e-05, "loss": 0.10886942148208618, "memory(GiB)": 122.96, "step": 31760, "token_acc": 0.9616132167152576, "train_speed(iter/s)": 0.237613 }, { "epoch": 2.421297354981325, "grad_norm": 0.5545241236686707, "learning_rate": 5.2471508948690896e-05, "loss": 0.0667304515838623, "memory(GiB)": 122.96, "step": 31765, "token_acc": 0.9703777335984095, "train_speed(iter/s)": 0.237617 }, { "epoch": 2.421678481591585, "grad_norm": 0.719426691532135, "learning_rate": 5.245955006891239e-05, "loss": 0.09236547946929932, "memory(GiB)": 122.96, "step": 31770, "token_acc": 0.968293700458907, "train_speed(iter/s)": 0.237626 }, { "epoch": 2.4220596082018444, "grad_norm": 1.5304877758026123, "learning_rate": 5.244759104809013e-05, "loss": 0.08164860606193543, "memory(GiB)": 122.96, "step": 31775, "token_acc": 0.9673558215451578, "train_speed(iter/s)": 0.237637 }, { "epoch": 2.4224407348121044, "grad_norm": 1.1216273307800293, "learning_rate": 5.2435631886909864e-05, "loss": 0.08680131435394287, "memory(GiB)": 122.96, "step": 31780, "token_acc": 0.9690166142792995, "train_speed(iter/s)": 0.237645 }, { "epoch": 2.4228218614223644, "grad_norm": 0.9484697580337524, "learning_rate": 5.2423672586057415e-05, "loss": 0.07908720970153808, "memory(GiB)": 122.96, "step": 31785, "token_acc": 0.9670462805912285, "train_speed(iter/s)": 0.237653 }, { "epoch": 2.4232029880326245, "grad_norm": 1.06740140914917, "learning_rate": 5.24117131462186e-05, "loss": 0.09126312732696533, "memory(GiB)": 122.96, "step": 31790, "token_acc": 0.966627138743012, "train_speed(iter/s)": 0.237657 }, { "epoch": 2.4235841146428845, "grad_norm": 1.2163888216018677, "learning_rate": 5.2399753568079225e-05, "loss": 0.11822844743728637, "memory(GiB)": 122.96, "step": 31795, "token_acc": 0.9621054930654145, "train_speed(iter/s)": 0.237656 }, { "epoch": 2.4239652412531445, "grad_norm": 0.6266806125640869, "learning_rate": 5.2387793852325115e-05, "loss": 0.10553070306777954, "memory(GiB)": 122.96, "step": 31800, "token_acc": 0.9536397711580193, "train_speed(iter/s)": 0.237664 }, { "epoch": 2.4239652412531445, "eval_loss": 0.08320986479520798, "eval_runtime": 221.7066, "eval_samples_per_second": 2.391, "eval_steps_per_second": 2.391, "eval_token_acc": 0.9634961749292211, "step": 31800 }, { "epoch": 2.424346367863404, "grad_norm": 0.6699129939079285, "learning_rate": 5.23758339996421e-05, "loss": 0.060913360118865965, "memory(GiB)": 122.96, "step": 31805, "token_acc": 0.9639290527005864, "train_speed(iter/s)": 0.23728 }, { "epoch": 2.424727494473664, "grad_norm": 0.8798589110374451, "learning_rate": 5.236387401071604e-05, "loss": 0.08053375482559204, "memory(GiB)": 122.96, "step": 31810, "token_acc": 0.9691436918020815, "train_speed(iter/s)": 0.237286 }, { "epoch": 2.425108621083924, "grad_norm": 1.0643125772476196, "learning_rate": 5.235191388623277e-05, "loss": 0.06070588231086731, "memory(GiB)": 122.96, "step": 31815, "token_acc": 0.9715695952615993, "train_speed(iter/s)": 0.237294 }, { "epoch": 2.425489747694184, "grad_norm": 0.7604033946990967, "learning_rate": 5.2339953626878156e-05, "loss": 0.0696878433227539, "memory(GiB)": 122.96, "step": 31820, "token_acc": 0.969078947368421, "train_speed(iter/s)": 0.237304 }, { "epoch": 2.4258708743044437, "grad_norm": 0.8127695322036743, "learning_rate": 5.232799323333807e-05, "loss": 0.09960362315177917, "memory(GiB)": 122.96, "step": 31825, "token_acc": 0.9623539593249676, "train_speed(iter/s)": 0.237316 }, { "epoch": 2.4262520009147037, "grad_norm": 0.5826697945594788, "learning_rate": 5.2316032706298355e-05, "loss": 0.09174591302871704, "memory(GiB)": 122.96, "step": 31830, "token_acc": 0.9666083406240886, "train_speed(iter/s)": 0.237317 }, { "epoch": 2.4266331275249637, "grad_norm": 0.7676007747650146, "learning_rate": 5.230407204644493e-05, "loss": 0.06858834028244018, "memory(GiB)": 122.96, "step": 31835, "token_acc": 0.9712663454920853, "train_speed(iter/s)": 0.237323 }, { "epoch": 2.4270142541352238, "grad_norm": 0.6547728180885315, "learning_rate": 5.229211125446365e-05, "loss": 0.09747377634048462, "memory(GiB)": 122.96, "step": 31840, "token_acc": 0.9576492981203902, "train_speed(iter/s)": 0.237331 }, { "epoch": 2.4273953807454838, "grad_norm": 0.6518384218215942, "learning_rate": 5.2280150331040436e-05, "loss": 0.10961241722106933, "memory(GiB)": 122.96, "step": 31845, "token_acc": 0.9594496741491673, "train_speed(iter/s)": 0.237342 }, { "epoch": 2.427776507355744, "grad_norm": 0.8992350101470947, "learning_rate": 5.226818927686118e-05, "loss": 0.08256965279579162, "memory(GiB)": 122.96, "step": 31850, "token_acc": 0.9724241664577589, "train_speed(iter/s)": 0.237349 }, { "epoch": 2.4281576339660034, "grad_norm": 1.2267297506332397, "learning_rate": 5.225622809261179e-05, "loss": 0.09309107065200806, "memory(GiB)": 122.96, "step": 31855, "token_acc": 0.9603156184729635, "train_speed(iter/s)": 0.237358 }, { "epoch": 2.4285387605762634, "grad_norm": 0.6713018417358398, "learning_rate": 5.2244266778978205e-05, "loss": 0.07914117574691773, "memory(GiB)": 122.96, "step": 31860, "token_acc": 0.9637027491408935, "train_speed(iter/s)": 0.237366 }, { "epoch": 2.4289198871865234, "grad_norm": 1.339560866355896, "learning_rate": 5.223230533664631e-05, "loss": 0.12258504629135132, "memory(GiB)": 122.96, "step": 31865, "token_acc": 0.950259067357513, "train_speed(iter/s)": 0.237371 }, { "epoch": 2.4293010137967834, "grad_norm": 0.9964811205863953, "learning_rate": 5.2220343766302084e-05, "loss": 0.09028789401054382, "memory(GiB)": 122.96, "step": 31870, "token_acc": 0.9670249590089269, "train_speed(iter/s)": 0.237377 }, { "epoch": 2.429682140407043, "grad_norm": 0.7002070546150208, "learning_rate": 5.220838206863143e-05, "loss": 0.07981270551681519, "memory(GiB)": 122.96, "step": 31875, "token_acc": 0.9696598222494637, "train_speed(iter/s)": 0.237378 }, { "epoch": 2.430063267017303, "grad_norm": 1.0039584636688232, "learning_rate": 5.219642024432033e-05, "loss": 0.08711093664169312, "memory(GiB)": 122.96, "step": 31880, "token_acc": 0.9690210656753407, "train_speed(iter/s)": 0.237386 }, { "epoch": 2.430444393627563, "grad_norm": 1.5410128831863403, "learning_rate": 5.218445829405472e-05, "loss": 0.10010253190994263, "memory(GiB)": 122.96, "step": 31885, "token_acc": 0.9681790186819955, "train_speed(iter/s)": 0.237393 }, { "epoch": 2.430825520237823, "grad_norm": 0.6407871246337891, "learning_rate": 5.217249621852055e-05, "loss": 0.07102736234664916, "memory(GiB)": 122.96, "step": 31890, "token_acc": 0.9688877365828364, "train_speed(iter/s)": 0.237402 }, { "epoch": 2.431206646848083, "grad_norm": 0.611670732498169, "learning_rate": 5.216053401840382e-05, "loss": 0.06450393795967102, "memory(GiB)": 122.96, "step": 31895, "token_acc": 0.9769192172604114, "train_speed(iter/s)": 0.237414 }, { "epoch": 2.431587773458343, "grad_norm": 1.593155860900879, "learning_rate": 5.2148571694390466e-05, "loss": 0.1161083459854126, "memory(GiB)": 122.96, "step": 31900, "token_acc": 0.9509619011693701, "train_speed(iter/s)": 0.237426 }, { "epoch": 2.4319689000686027, "grad_norm": 1.0629907846450806, "learning_rate": 5.2136609247166514e-05, "loss": 0.11345916986465454, "memory(GiB)": 122.96, "step": 31905, "token_acc": 0.9641387419165197, "train_speed(iter/s)": 0.237438 }, { "epoch": 2.4323500266788627, "grad_norm": 0.7532215714454651, "learning_rate": 5.212464667741793e-05, "loss": 0.11763770580291748, "memory(GiB)": 122.96, "step": 31910, "token_acc": 0.9644532916251956, "train_speed(iter/s)": 0.237441 }, { "epoch": 2.4327311532891227, "grad_norm": 1.4345853328704834, "learning_rate": 5.211268398583072e-05, "loss": 0.1153026819229126, "memory(GiB)": 122.96, "step": 31915, "token_acc": 0.9670843463624462, "train_speed(iter/s)": 0.237443 }, { "epoch": 2.4331122798993827, "grad_norm": 0.7158430814743042, "learning_rate": 5.2100721173090896e-05, "loss": 0.11205878257751464, "memory(GiB)": 122.96, "step": 31920, "token_acc": 0.9625949736995909, "train_speed(iter/s)": 0.237445 }, { "epoch": 2.4334934065096423, "grad_norm": 1.5121842622756958, "learning_rate": 5.208875823988445e-05, "loss": 0.12104721069335937, "memory(GiB)": 122.96, "step": 31925, "token_acc": 0.9494358545758462, "train_speed(iter/s)": 0.237456 }, { "epoch": 2.4338745331199023, "grad_norm": 1.26921808719635, "learning_rate": 5.207679518689742e-05, "loss": 0.09834056496620178, "memory(GiB)": 122.96, "step": 31930, "token_acc": 0.959081987014948, "train_speed(iter/s)": 0.237461 }, { "epoch": 2.4342556597301623, "grad_norm": 1.066515326499939, "learning_rate": 5.2064832014815823e-05, "loss": 0.10878283977508545, "memory(GiB)": 122.96, "step": 31935, "token_acc": 0.9667946690761238, "train_speed(iter/s)": 0.237469 }, { "epoch": 2.4346367863404224, "grad_norm": 0.9966956973075867, "learning_rate": 5.2052868724325686e-05, "loss": 0.11512922048568726, "memory(GiB)": 122.96, "step": 31940, "token_acc": 0.9642196175200494, "train_speed(iter/s)": 0.23748 }, { "epoch": 2.4350179129506824, "grad_norm": 1.0083377361297607, "learning_rate": 5.204090531611307e-05, "loss": 0.1125170111656189, "memory(GiB)": 122.96, "step": 31945, "token_acc": 0.9462599854756718, "train_speed(iter/s)": 0.237492 }, { "epoch": 2.4353990395609424, "grad_norm": 0.6221123933792114, "learning_rate": 5.202894179086398e-05, "loss": 0.08732749223709106, "memory(GiB)": 122.96, "step": 31950, "token_acc": 0.9691863995142683, "train_speed(iter/s)": 0.237496 }, { "epoch": 2.435780166171202, "grad_norm": 1.417541265487671, "learning_rate": 5.2016978149264505e-05, "loss": 0.10020771026611328, "memory(GiB)": 122.96, "step": 31955, "token_acc": 0.9634536317953404, "train_speed(iter/s)": 0.237504 }, { "epoch": 2.436161292781462, "grad_norm": 0.6154100894927979, "learning_rate": 5.2005014392000694e-05, "loss": 0.11524808406829834, "memory(GiB)": 122.96, "step": 31960, "token_acc": 0.9590846047156727, "train_speed(iter/s)": 0.237511 }, { "epoch": 2.436542419391722, "grad_norm": 1.0033057928085327, "learning_rate": 5.199305051975861e-05, "loss": 0.06674709320068359, "memory(GiB)": 122.96, "step": 31965, "token_acc": 0.974304068522484, "train_speed(iter/s)": 0.237521 }, { "epoch": 2.436923546001982, "grad_norm": 1.5526834726333618, "learning_rate": 5.1981086533224335e-05, "loss": 0.11083319187164306, "memory(GiB)": 122.96, "step": 31970, "token_acc": 0.9573333333333334, "train_speed(iter/s)": 0.237527 }, { "epoch": 2.4373046726122416, "grad_norm": 1.394752025604248, "learning_rate": 5.1969122433083925e-05, "loss": 0.07061379551887512, "memory(GiB)": 122.96, "step": 31975, "token_acc": 0.9705153617443013, "train_speed(iter/s)": 0.237536 }, { "epoch": 2.4376857992225016, "grad_norm": 1.3300479650497437, "learning_rate": 5.195715822002349e-05, "loss": 0.07141281962394715, "memory(GiB)": 122.96, "step": 31980, "token_acc": 0.9783606557377049, "train_speed(iter/s)": 0.237549 }, { "epoch": 2.4380669258327616, "grad_norm": 0.6959165334701538, "learning_rate": 5.194519389472913e-05, "loss": 0.07534806132316589, "memory(GiB)": 122.96, "step": 31985, "token_acc": 0.9744908048249951, "train_speed(iter/s)": 0.237556 }, { "epoch": 2.4384480524430217, "grad_norm": 0.9855327010154724, "learning_rate": 5.1933229457886914e-05, "loss": 0.103554368019104, "memory(GiB)": 122.96, "step": 31990, "token_acc": 0.962838968865082, "train_speed(iter/s)": 0.237567 }, { "epoch": 2.4388291790532817, "grad_norm": 0.8330475091934204, "learning_rate": 5.1921264910182955e-05, "loss": 0.07543789148330689, "memory(GiB)": 122.96, "step": 31995, "token_acc": 0.9707169303264894, "train_speed(iter/s)": 0.237567 }, { "epoch": 2.4392103056635412, "grad_norm": 0.6359584927558899, "learning_rate": 5.1909300252303374e-05, "loss": 0.10583755970001221, "memory(GiB)": 122.96, "step": 32000, "token_acc": 0.9592153542195879, "train_speed(iter/s)": 0.237571 }, { "epoch": 2.4392103056635412, "eval_loss": 0.08176108449697495, "eval_runtime": 221.086, "eval_samples_per_second": 2.397, "eval_steps_per_second": 2.397, "eval_token_acc": 0.9633455815914704, "step": 32000 }, { "epoch": 2.4395914322738013, "grad_norm": 1.109655737876892, "learning_rate": 5.189733548493428e-05, "loss": 0.08499435186386109, "memory(GiB)": 122.96, "step": 32005, "token_acc": 0.9633421762294183, "train_speed(iter/s)": 0.237189 }, { "epoch": 2.4399725588840613, "grad_norm": 0.7547616958618164, "learning_rate": 5.1885370608761794e-05, "loss": 0.07540545463562012, "memory(GiB)": 122.96, "step": 32010, "token_acc": 0.9722627737226277, "train_speed(iter/s)": 0.237195 }, { "epoch": 2.4403536854943213, "grad_norm": 0.6588416695594788, "learning_rate": 5.187340562447207e-05, "loss": 0.11663752794265747, "memory(GiB)": 122.96, "step": 32015, "token_acc": 0.9494510521500458, "train_speed(iter/s)": 0.237204 }, { "epoch": 2.4407348121045813, "grad_norm": 1.456199049949646, "learning_rate": 5.1861440532751225e-05, "loss": 0.08906418681144715, "memory(GiB)": 122.96, "step": 32020, "token_acc": 0.9550970873786407, "train_speed(iter/s)": 0.237216 }, { "epoch": 2.441115938714841, "grad_norm": 0.9228630661964417, "learning_rate": 5.184947533428539e-05, "loss": 0.10259878635406494, "memory(GiB)": 122.96, "step": 32025, "token_acc": 0.956642800318218, "train_speed(iter/s)": 0.237223 }, { "epoch": 2.441497065325101, "grad_norm": 0.683269202709198, "learning_rate": 5.1837510029760727e-05, "loss": 0.07409683465957642, "memory(GiB)": 122.96, "step": 32030, "token_acc": 0.9642149434666045, "train_speed(iter/s)": 0.237229 }, { "epoch": 2.441878191935361, "grad_norm": 0.9451534152030945, "learning_rate": 5.18255446198634e-05, "loss": 0.10500092506408691, "memory(GiB)": 122.96, "step": 32035, "token_acc": 0.9587143252978665, "train_speed(iter/s)": 0.237238 }, { "epoch": 2.442259318545621, "grad_norm": 1.662619709968567, "learning_rate": 5.1813579105279565e-05, "loss": 0.08998711705207825, "memory(GiB)": 122.96, "step": 32040, "token_acc": 0.9619985925404645, "train_speed(iter/s)": 0.237251 }, { "epoch": 2.442640445155881, "grad_norm": 2.0273852348327637, "learning_rate": 5.180161348669538e-05, "loss": 0.08959287405014038, "memory(GiB)": 122.96, "step": 32045, "token_acc": 0.9716544017443445, "train_speed(iter/s)": 0.237261 }, { "epoch": 2.4430215717661405, "grad_norm": 1.5230754613876343, "learning_rate": 5.178964776479701e-05, "loss": 0.06429438591003418, "memory(GiB)": 122.96, "step": 32050, "token_acc": 0.9695378151260504, "train_speed(iter/s)": 0.23727 }, { "epoch": 2.4434026983764006, "grad_norm": 0.7076734900474548, "learning_rate": 5.177768194027066e-05, "loss": 0.13213027715682985, "memory(GiB)": 122.96, "step": 32055, "token_acc": 0.9546652609383237, "train_speed(iter/s)": 0.237279 }, { "epoch": 2.4437838249866606, "grad_norm": 1.375850796699524, "learning_rate": 5.176571601380249e-05, "loss": 0.09209928512573243, "memory(GiB)": 122.96, "step": 32060, "token_acc": 0.9630838774485183, "train_speed(iter/s)": 0.237287 }, { "epoch": 2.4441649515969206, "grad_norm": 1.167485237121582, "learning_rate": 5.175374998607872e-05, "loss": 0.06853286027908326, "memory(GiB)": 122.96, "step": 32065, "token_acc": 0.9645819323021744, "train_speed(iter/s)": 0.237296 }, { "epoch": 2.4445460782071806, "grad_norm": 0.7112610936164856, "learning_rate": 5.174178385778551e-05, "loss": 0.05861258506774902, "memory(GiB)": 122.96, "step": 32070, "token_acc": 0.9736473226801233, "train_speed(iter/s)": 0.237306 }, { "epoch": 2.44492720481744, "grad_norm": 1.023407220840454, "learning_rate": 5.172981762960909e-05, "loss": 0.10661687850952148, "memory(GiB)": 122.96, "step": 32075, "token_acc": 0.9602234258632363, "train_speed(iter/s)": 0.237312 }, { "epoch": 2.4453083314277, "grad_norm": 1.1306768655776978, "learning_rate": 5.171785130223564e-05, "loss": 0.0817100465297699, "memory(GiB)": 122.96, "step": 32080, "token_acc": 0.9638140970673984, "train_speed(iter/s)": 0.237317 }, { "epoch": 2.4456894580379602, "grad_norm": 1.5240305662155151, "learning_rate": 5.1705884876351406e-05, "loss": 0.06689391136169434, "memory(GiB)": 122.96, "step": 32085, "token_acc": 0.970879440885265, "train_speed(iter/s)": 0.237327 }, { "epoch": 2.4460705846482202, "grad_norm": 0.48312103748321533, "learning_rate": 5.1693918352642576e-05, "loss": 0.06792814135551453, "memory(GiB)": 122.96, "step": 32090, "token_acc": 0.9712855637513171, "train_speed(iter/s)": 0.237334 }, { "epoch": 2.4464517112584803, "grad_norm": 0.8024948239326477, "learning_rate": 5.16819517317954e-05, "loss": 0.09426245093345642, "memory(GiB)": 122.96, "step": 32095, "token_acc": 0.9680248246087426, "train_speed(iter/s)": 0.237336 }, { "epoch": 2.44683283786874, "grad_norm": 1.1475082635879517, "learning_rate": 5.166998501449608e-05, "loss": 0.13640437126159669, "memory(GiB)": 122.96, "step": 32100, "token_acc": 0.9470935130581297, "train_speed(iter/s)": 0.237341 }, { "epoch": 2.447213964479, "grad_norm": 0.8022327423095703, "learning_rate": 5.1658018201430894e-05, "loss": 0.08888157606124877, "memory(GiB)": 122.96, "step": 32105, "token_acc": 0.9677274764679515, "train_speed(iter/s)": 0.237353 }, { "epoch": 2.44759509108926, "grad_norm": 0.8285499215126038, "learning_rate": 5.1646051293286045e-05, "loss": 0.08603664636611938, "memory(GiB)": 122.96, "step": 32110, "token_acc": 0.9610083424011607, "train_speed(iter/s)": 0.237358 }, { "epoch": 2.44797621769952, "grad_norm": 1.042164921760559, "learning_rate": 5.163408429074778e-05, "loss": 0.07164736390113831, "memory(GiB)": 122.96, "step": 32115, "token_acc": 0.9697325199436884, "train_speed(iter/s)": 0.237367 }, { "epoch": 2.4483573443097795, "grad_norm": 0.26407742500305176, "learning_rate": 5.162211719450238e-05, "loss": 0.07417976260185241, "memory(GiB)": 122.96, "step": 32120, "token_acc": 0.9642629227823867, "train_speed(iter/s)": 0.237373 }, { "epoch": 2.4487384709200395, "grad_norm": 1.2943451404571533, "learning_rate": 5.16101500052361e-05, "loss": 0.10034610033035278, "memory(GiB)": 122.96, "step": 32125, "token_acc": 0.9652587310294386, "train_speed(iter/s)": 0.237378 }, { "epoch": 2.4491195975302995, "grad_norm": 1.2685195207595825, "learning_rate": 5.159818272363517e-05, "loss": 0.1436115860939026, "memory(GiB)": 122.96, "step": 32130, "token_acc": 0.9471303364433136, "train_speed(iter/s)": 0.237387 }, { "epoch": 2.4495007241405595, "grad_norm": 0.9950312376022339, "learning_rate": 5.1586215350385884e-05, "loss": 0.10590897798538208, "memory(GiB)": 122.96, "step": 32135, "token_acc": 0.9515877147319105, "train_speed(iter/s)": 0.237397 }, { "epoch": 2.4498818507508195, "grad_norm": 0.6982214450836182, "learning_rate": 5.157424788617451e-05, "loss": 0.0680645227432251, "memory(GiB)": 122.96, "step": 32140, "token_acc": 0.9668825161887141, "train_speed(iter/s)": 0.237404 }, { "epoch": 2.4502629773610796, "grad_norm": 0.9895561337471008, "learning_rate": 5.1562280331687305e-05, "loss": 0.1087909460067749, "memory(GiB)": 122.96, "step": 32145, "token_acc": 0.9603812371519342, "train_speed(iter/s)": 0.237412 }, { "epoch": 2.450644103971339, "grad_norm": 1.4580472707748413, "learning_rate": 5.1550312687610606e-05, "loss": 0.08812410831451416, "memory(GiB)": 122.96, "step": 32150, "token_acc": 0.9710865561694291, "train_speed(iter/s)": 0.237417 }, { "epoch": 2.451025230581599, "grad_norm": 0.0010332155507057905, "learning_rate": 5.153834495463065e-05, "loss": 0.06268866062164306, "memory(GiB)": 122.96, "step": 32155, "token_acc": 0.9725125822686799, "train_speed(iter/s)": 0.237424 }, { "epoch": 2.451406357191859, "grad_norm": 0.8241733312606812, "learning_rate": 5.1526377133433756e-05, "loss": 0.06694316864013672, "memory(GiB)": 122.96, "step": 32160, "token_acc": 0.9651191969887076, "train_speed(iter/s)": 0.237433 }, { "epoch": 2.451787483802119, "grad_norm": 1.466530680656433, "learning_rate": 5.151440922470619e-05, "loss": 0.10299503803253174, "memory(GiB)": 122.96, "step": 32165, "token_acc": 0.9574243490499648, "train_speed(iter/s)": 0.237445 }, { "epoch": 2.4521686104123788, "grad_norm": 1.1249780654907227, "learning_rate": 5.1502441229134314e-05, "loss": 0.08127434253692627, "memory(GiB)": 122.96, "step": 32170, "token_acc": 0.9702473620480886, "train_speed(iter/s)": 0.237451 }, { "epoch": 2.452549737022639, "grad_norm": 0.5738515853881836, "learning_rate": 5.14904731474044e-05, "loss": 0.09366962313652039, "memory(GiB)": 122.96, "step": 32175, "token_acc": 0.9709753628079649, "train_speed(iter/s)": 0.237455 }, { "epoch": 2.452930863632899, "grad_norm": 0.8114838004112244, "learning_rate": 5.147850498020276e-05, "loss": 0.09987412095069885, "memory(GiB)": 122.96, "step": 32180, "token_acc": 0.9584398976982097, "train_speed(iter/s)": 0.237462 }, { "epoch": 2.453311990243159, "grad_norm": 1.7480616569519043, "learning_rate": 5.146653672821572e-05, "loss": 0.10303734540939331, "memory(GiB)": 122.96, "step": 32185, "token_acc": 0.9683773976153447, "train_speed(iter/s)": 0.237471 }, { "epoch": 2.453693116853419, "grad_norm": 0.6125215888023376, "learning_rate": 5.1454568392129606e-05, "loss": 0.09398956298828125, "memory(GiB)": 122.96, "step": 32190, "token_acc": 0.963462163039948, "train_speed(iter/s)": 0.237476 }, { "epoch": 2.454074243463679, "grad_norm": 1.0182191133499146, "learning_rate": 5.1442599972630754e-05, "loss": 0.05556373000144958, "memory(GiB)": 122.96, "step": 32195, "token_acc": 0.9768033047346679, "train_speed(iter/s)": 0.237485 }, { "epoch": 2.4544553700739384, "grad_norm": 1.0362367630004883, "learning_rate": 5.143063147040548e-05, "loss": 0.14264739751815797, "memory(GiB)": 122.96, "step": 32200, "token_acc": 0.9368177307310619, "train_speed(iter/s)": 0.237496 }, { "epoch": 2.4544553700739384, "eval_loss": 0.08190025389194489, "eval_runtime": 217.6576, "eval_samples_per_second": 2.435, "eval_steps_per_second": 2.435, "eval_token_acc": 0.963887717607373, "step": 32200 }, { "epoch": 2.4548364966841985, "grad_norm": 1.763527274131775, "learning_rate": 5.141866288614013e-05, "loss": 0.0791084349155426, "memory(GiB)": 122.96, "step": 32205, "token_acc": 0.9637296181175554, "train_speed(iter/s)": 0.237125 }, { "epoch": 2.4552176232944585, "grad_norm": 2.057387590408325, "learning_rate": 5.140669422052106e-05, "loss": 0.0668079674243927, "memory(GiB)": 122.96, "step": 32210, "token_acc": 0.9732885476647473, "train_speed(iter/s)": 0.237131 }, { "epoch": 2.4555987499047185, "grad_norm": 0.7704806327819824, "learning_rate": 5.139472547423458e-05, "loss": 0.09443586468696594, "memory(GiB)": 122.96, "step": 32215, "token_acc": 0.965258786589689, "train_speed(iter/s)": 0.237133 }, { "epoch": 2.455979876514978, "grad_norm": 0.8204208016395569, "learning_rate": 5.138275664796708e-05, "loss": 0.10365757942199708, "memory(GiB)": 122.96, "step": 32220, "token_acc": 0.9612956810631229, "train_speed(iter/s)": 0.237136 }, { "epoch": 2.456361003125238, "grad_norm": 1.1584113836288452, "learning_rate": 5.1370787742404914e-05, "loss": 0.08756630420684815, "memory(GiB)": 122.96, "step": 32225, "token_acc": 0.9699730527396382, "train_speed(iter/s)": 0.23714 }, { "epoch": 2.456742129735498, "grad_norm": 1.3593182563781738, "learning_rate": 5.135881875823441e-05, "loss": 0.08980016112327575, "memory(GiB)": 122.96, "step": 32230, "token_acc": 0.9585994795363142, "train_speed(iter/s)": 0.237148 }, { "epoch": 2.457123256345758, "grad_norm": 1.0349663496017456, "learning_rate": 5.1346849696141965e-05, "loss": 0.06763052344322204, "memory(GiB)": 122.96, "step": 32235, "token_acc": 0.9732796486090776, "train_speed(iter/s)": 0.237161 }, { "epoch": 2.457504382956018, "grad_norm": 0.23163004219532013, "learning_rate": 5.1334880556813934e-05, "loss": 0.05159831643104553, "memory(GiB)": 122.96, "step": 32240, "token_acc": 0.9736616702355461, "train_speed(iter/s)": 0.237168 }, { "epoch": 2.457885509566278, "grad_norm": 1.4182651042938232, "learning_rate": 5.132291134093672e-05, "loss": 0.10441603660583496, "memory(GiB)": 122.96, "step": 32245, "token_acc": 0.9577167019027484, "train_speed(iter/s)": 0.237179 }, { "epoch": 2.4582666361765377, "grad_norm": 0.584679901599884, "learning_rate": 5.1310942049196655e-05, "loss": 0.06659343838691711, "memory(GiB)": 122.96, "step": 32250, "token_acc": 0.97393297049556, "train_speed(iter/s)": 0.23719 }, { "epoch": 2.4586477627867978, "grad_norm": 1.250818133354187, "learning_rate": 5.1298972682280155e-05, "loss": 0.09552850127220154, "memory(GiB)": 122.96, "step": 32255, "token_acc": 0.9624123128468223, "train_speed(iter/s)": 0.237187 }, { "epoch": 2.4590288893970578, "grad_norm": 1.342724084854126, "learning_rate": 5.128700324087361e-05, "loss": 0.08111636638641358, "memory(GiB)": 122.96, "step": 32260, "token_acc": 0.9734351652000665, "train_speed(iter/s)": 0.237195 }, { "epoch": 2.459410016007318, "grad_norm": 0.6169635057449341, "learning_rate": 5.127503372566338e-05, "loss": 0.0718221127986908, "memory(GiB)": 122.96, "step": 32265, "token_acc": 0.9734405458089669, "train_speed(iter/s)": 0.237204 }, { "epoch": 2.4597911426175774, "grad_norm": 0.701077401638031, "learning_rate": 5.1263064137335905e-05, "loss": 0.08208571672439575, "memory(GiB)": 122.96, "step": 32270, "token_acc": 0.9626849667595968, "train_speed(iter/s)": 0.237212 }, { "epoch": 2.4601722692278374, "grad_norm": 1.0186433792114258, "learning_rate": 5.1251094476577557e-05, "loss": 0.09782501459121704, "memory(GiB)": 122.96, "step": 32275, "token_acc": 0.9590246354952238, "train_speed(iter/s)": 0.237222 }, { "epoch": 2.4605533958380974, "grad_norm": 1.129246711730957, "learning_rate": 5.123912474407474e-05, "loss": 0.06929408311843872, "memory(GiB)": 122.96, "step": 32280, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.237231 }, { "epoch": 2.4609345224483574, "grad_norm": 2.0745253562927246, "learning_rate": 5.1227154940513865e-05, "loss": 0.0999297857284546, "memory(GiB)": 122.96, "step": 32285, "token_acc": 0.9669434194341944, "train_speed(iter/s)": 0.237235 }, { "epoch": 2.4613156490586174, "grad_norm": 0.9055485129356384, "learning_rate": 5.121518506658136e-05, "loss": 0.09268057346343994, "memory(GiB)": 122.96, "step": 32290, "token_acc": 0.970393931979447, "train_speed(iter/s)": 0.237244 }, { "epoch": 2.461696775668877, "grad_norm": 0.9835166931152344, "learning_rate": 5.120321512296363e-05, "loss": 0.09732171297073364, "memory(GiB)": 122.96, "step": 32295, "token_acc": 0.9665970772442589, "train_speed(iter/s)": 0.237253 }, { "epoch": 2.462077902279137, "grad_norm": 0.525734543800354, "learning_rate": 5.1191245110347084e-05, "loss": 0.09012429118156433, "memory(GiB)": 122.96, "step": 32300, "token_acc": 0.9686050660007135, "train_speed(iter/s)": 0.237253 }, { "epoch": 2.462459028889397, "grad_norm": 1.1260920763015747, "learning_rate": 5.117927502941817e-05, "loss": 0.10239927768707276, "memory(GiB)": 122.96, "step": 32305, "token_acc": 0.9647347439435756, "train_speed(iter/s)": 0.237264 }, { "epoch": 2.462840155499657, "grad_norm": 1.496146321296692, "learning_rate": 5.116730488086331e-05, "loss": 0.06680032014846801, "memory(GiB)": 122.96, "step": 32310, "token_acc": 0.9731223789553946, "train_speed(iter/s)": 0.237273 }, { "epoch": 2.463221282109917, "grad_norm": 0.7390111088752747, "learning_rate": 5.1155334665368925e-05, "loss": 0.05874691605567932, "memory(GiB)": 122.96, "step": 32315, "token_acc": 0.9803505045140732, "train_speed(iter/s)": 0.237285 }, { "epoch": 2.4636024087201767, "grad_norm": 0.8469942212104797, "learning_rate": 5.114336438362145e-05, "loss": 0.12634363174438476, "memory(GiB)": 122.96, "step": 32320, "token_acc": 0.9625903803835272, "train_speed(iter/s)": 0.237291 }, { "epoch": 2.4639835353304367, "grad_norm": 0.937544584274292, "learning_rate": 5.113139403630734e-05, "loss": 0.11960816383361816, "memory(GiB)": 122.96, "step": 32325, "token_acc": 0.9656526005888125, "train_speed(iter/s)": 0.237298 }, { "epoch": 2.4643646619406967, "grad_norm": 1.0568403005599976, "learning_rate": 5.111942362411305e-05, "loss": 0.10082534551620484, "memory(GiB)": 122.96, "step": 32330, "token_acc": 0.9664414414414414, "train_speed(iter/s)": 0.237306 }, { "epoch": 2.4647457885509567, "grad_norm": 0.9876269698143005, "learning_rate": 5.110745314772498e-05, "loss": 0.09214457869529724, "memory(GiB)": 122.96, "step": 32335, "token_acc": 0.9545157780195865, "train_speed(iter/s)": 0.237313 }, { "epoch": 2.4651269151612167, "grad_norm": 1.2181813716888428, "learning_rate": 5.109548260782964e-05, "loss": 0.06972017884254456, "memory(GiB)": 122.96, "step": 32340, "token_acc": 0.9724256785868161, "train_speed(iter/s)": 0.237315 }, { "epoch": 2.4655080417714763, "grad_norm": 0.6257801651954651, "learning_rate": 5.108351200511344e-05, "loss": 0.08288097381591797, "memory(GiB)": 122.96, "step": 32345, "token_acc": 0.961265332472563, "train_speed(iter/s)": 0.237324 }, { "epoch": 2.4658891683817363, "grad_norm": 1.714722752571106, "learning_rate": 5.107154134026285e-05, "loss": 0.08178079128265381, "memory(GiB)": 122.96, "step": 32350, "token_acc": 0.9637262984336357, "train_speed(iter/s)": 0.237333 }, { "epoch": 2.4662702949919963, "grad_norm": 0.4944014549255371, "learning_rate": 5.105957061396435e-05, "loss": 0.0699450671672821, "memory(GiB)": 122.96, "step": 32355, "token_acc": 0.9712465878070974, "train_speed(iter/s)": 0.237337 }, { "epoch": 2.4666514216022564, "grad_norm": 0.852733850479126, "learning_rate": 5.104759982690439e-05, "loss": 0.07634921073913574, "memory(GiB)": 122.96, "step": 32360, "token_acc": 0.9722544844496064, "train_speed(iter/s)": 0.237341 }, { "epoch": 2.4670325482125164, "grad_norm": 0.6916747093200684, "learning_rate": 5.103562897976942e-05, "loss": 0.09792242050170899, "memory(GiB)": 122.96, "step": 32365, "token_acc": 0.9680723920095612, "train_speed(iter/s)": 0.237348 }, { "epoch": 2.467413674822776, "grad_norm": 1.089674949645996, "learning_rate": 5.102365807324595e-05, "loss": 0.10554548501968383, "memory(GiB)": 122.96, "step": 32370, "token_acc": 0.9529812606473594, "train_speed(iter/s)": 0.237358 }, { "epoch": 2.467794801433036, "grad_norm": 0.9333384037017822, "learning_rate": 5.101168710802045e-05, "loss": 0.09772891998291015, "memory(GiB)": 122.96, "step": 32375, "token_acc": 0.9537664408130729, "train_speed(iter/s)": 0.237366 }, { "epoch": 2.468175928043296, "grad_norm": 0.8465971350669861, "learning_rate": 5.099971608477938e-05, "loss": 0.09505209922790528, "memory(GiB)": 122.96, "step": 32380, "token_acc": 0.9656504345426955, "train_speed(iter/s)": 0.237369 }, { "epoch": 2.468557054653556, "grad_norm": 0.45627671480178833, "learning_rate": 5.098774500420923e-05, "loss": 0.10004132986068726, "memory(GiB)": 122.96, "step": 32385, "token_acc": 0.9627329192546584, "train_speed(iter/s)": 0.237378 }, { "epoch": 2.468938181263816, "grad_norm": 1.5973087549209595, "learning_rate": 5.097577386699649e-05, "loss": 0.07462520599365234, "memory(GiB)": 122.96, "step": 32390, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.237387 }, { "epoch": 2.4693193078740756, "grad_norm": 0.6236518025398254, "learning_rate": 5.0963802673827646e-05, "loss": 0.08998031616210937, "memory(GiB)": 122.96, "step": 32395, "token_acc": 0.9699539617155318, "train_speed(iter/s)": 0.237396 }, { "epoch": 2.4697004344843356, "grad_norm": 0.6644592881202698, "learning_rate": 5.0951831425389196e-05, "loss": 0.10645793676376343, "memory(GiB)": 122.96, "step": 32400, "token_acc": 0.9565501470107808, "train_speed(iter/s)": 0.237406 }, { "epoch": 2.4697004344843356, "eval_loss": 0.0816883072257042, "eval_runtime": 212.524, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "eval_token_acc": 0.9644750316246009, "step": 32400 }, { "epoch": 2.4700815610945956, "grad_norm": 1.0610525608062744, "learning_rate": 5.0939860122367625e-05, "loss": 0.12554062604904176, "memory(GiB)": 122.96, "step": 32405, "token_acc": 0.9644872984030429, "train_speed(iter/s)": 0.237038 }, { "epoch": 2.4704626877048557, "grad_norm": 1.3927549123764038, "learning_rate": 5.0927888765449435e-05, "loss": 0.06789535880088807, "memory(GiB)": 122.96, "step": 32410, "token_acc": 0.9719830959461575, "train_speed(iter/s)": 0.237042 }, { "epoch": 2.4708438143151152, "grad_norm": 1.6130789518356323, "learning_rate": 5.0915917355321144e-05, "loss": 0.12432518005371093, "memory(GiB)": 122.96, "step": 32415, "token_acc": 0.9431325301204819, "train_speed(iter/s)": 0.237054 }, { "epoch": 2.4712249409253753, "grad_norm": 1.081120252609253, "learning_rate": 5.090394589266923e-05, "loss": 0.09105119705200196, "memory(GiB)": 122.96, "step": 32420, "token_acc": 0.963895486935867, "train_speed(iter/s)": 0.237063 }, { "epoch": 2.4716060675356353, "grad_norm": 0.9023790955543518, "learning_rate": 5.089197437818021e-05, "loss": 0.083616441488266, "memory(GiB)": 122.96, "step": 32425, "token_acc": 0.9638490164805954, "train_speed(iter/s)": 0.237072 }, { "epoch": 2.4719871941458953, "grad_norm": 0.7962257862091064, "learning_rate": 5.088000281254062e-05, "loss": 0.07093601226806641, "memory(GiB)": 122.96, "step": 32430, "token_acc": 0.9747744430123366, "train_speed(iter/s)": 0.237079 }, { "epoch": 2.4723683207561553, "grad_norm": 0.9732542634010315, "learning_rate": 5.086803119643693e-05, "loss": 0.13298027515411376, "memory(GiB)": 122.96, "step": 32435, "token_acc": 0.9567832832040526, "train_speed(iter/s)": 0.237082 }, { "epoch": 2.4727494473664153, "grad_norm": 0.8206663727760315, "learning_rate": 5.085605953055568e-05, "loss": 0.08403475880622864, "memory(GiB)": 122.96, "step": 32440, "token_acc": 0.9632409251033849, "train_speed(iter/s)": 0.237088 }, { "epoch": 2.473130573976675, "grad_norm": 3.106858968734741, "learning_rate": 5.0844087815583394e-05, "loss": 0.08172794580459594, "memory(GiB)": 122.96, "step": 32445, "token_acc": 0.9618705035971223, "train_speed(iter/s)": 0.237099 }, { "epoch": 2.473511700586935, "grad_norm": 0.8670427203178406, "learning_rate": 5.083211605220659e-05, "loss": 0.06163949966430664, "memory(GiB)": 122.96, "step": 32450, "token_acc": 0.9773175542406312, "train_speed(iter/s)": 0.237106 }, { "epoch": 2.473892827197195, "grad_norm": 0.6251036524772644, "learning_rate": 5.082014424111179e-05, "loss": 0.09013078212738038, "memory(GiB)": 122.96, "step": 32455, "token_acc": 0.9733815313835031, "train_speed(iter/s)": 0.23711 }, { "epoch": 2.474273953807455, "grad_norm": 0.7533485293388367, "learning_rate": 5.080817238298553e-05, "loss": 0.07429801225662232, "memory(GiB)": 122.96, "step": 32460, "token_acc": 0.969092441697106, "train_speed(iter/s)": 0.237121 }, { "epoch": 2.4746550804177145, "grad_norm": 0.8771314024925232, "learning_rate": 5.0796200478514336e-05, "loss": 0.0762300968170166, "memory(GiB)": 122.96, "step": 32465, "token_acc": 0.9670215528434173, "train_speed(iter/s)": 0.23713 }, { "epoch": 2.4750362070279746, "grad_norm": 1.2116272449493408, "learning_rate": 5.0784228528384733e-05, "loss": 0.1233478307723999, "memory(GiB)": 122.96, "step": 32470, "token_acc": 0.9426981008513425, "train_speed(iter/s)": 0.23714 }, { "epoch": 2.4754173336382346, "grad_norm": 0.6252954006195068, "learning_rate": 5.077225653328326e-05, "loss": 0.10040969848632812, "memory(GiB)": 122.96, "step": 32475, "token_acc": 0.9704091985120054, "train_speed(iter/s)": 0.237143 }, { "epoch": 2.4757984602484946, "grad_norm": 0.6778857111930847, "learning_rate": 5.076028449389646e-05, "loss": 0.10226720571517944, "memory(GiB)": 122.96, "step": 32480, "token_acc": 0.9625074390001984, "train_speed(iter/s)": 0.237148 }, { "epoch": 2.4761795868587546, "grad_norm": 1.8147715330123901, "learning_rate": 5.074831241091088e-05, "loss": 0.11114569902420043, "memory(GiB)": 122.96, "step": 32485, "token_acc": 0.9587847385774847, "train_speed(iter/s)": 0.237155 }, { "epoch": 2.4765607134690146, "grad_norm": 1.2582870721817017, "learning_rate": 5.073634028501304e-05, "loss": 0.12235462665557861, "memory(GiB)": 122.96, "step": 32490, "token_acc": 0.9492472073822243, "train_speed(iter/s)": 0.237164 }, { "epoch": 2.476941840079274, "grad_norm": 1.1435779333114624, "learning_rate": 5.072436811688952e-05, "loss": 0.10029060840606689, "memory(GiB)": 122.96, "step": 32495, "token_acc": 0.9632842445027234, "train_speed(iter/s)": 0.237172 }, { "epoch": 2.4773229666895342, "grad_norm": 1.160099744796753, "learning_rate": 5.071239590722684e-05, "loss": 0.08358358144760132, "memory(GiB)": 122.96, "step": 32500, "token_acc": 0.9670468948035488, "train_speed(iter/s)": 0.237176 }, { "epoch": 2.4777040932997942, "grad_norm": 0.5981438755989075, "learning_rate": 5.070042365671156e-05, "loss": 0.11172574758529663, "memory(GiB)": 122.96, "step": 32505, "token_acc": 0.9650209874075555, "train_speed(iter/s)": 0.237182 }, { "epoch": 2.4780852199100543, "grad_norm": 1.1696103811264038, "learning_rate": 5.068845136603023e-05, "loss": 0.0808282732963562, "memory(GiB)": 122.96, "step": 32510, "token_acc": 0.9771428571428571, "train_speed(iter/s)": 0.237191 }, { "epoch": 2.478466346520314, "grad_norm": 1.181359052658081, "learning_rate": 5.067647903586942e-05, "loss": 0.12710686922073364, "memory(GiB)": 122.96, "step": 32515, "token_acc": 0.9504666188083274, "train_speed(iter/s)": 0.237198 }, { "epoch": 2.478847473130574, "grad_norm": 0.8165749311447144, "learning_rate": 5.066450666691568e-05, "loss": 0.08523765206336975, "memory(GiB)": 122.96, "step": 32520, "token_acc": 0.9665626393223361, "train_speed(iter/s)": 0.237208 }, { "epoch": 2.479228599740834, "grad_norm": 0.7761129140853882, "learning_rate": 5.065253425985554e-05, "loss": 0.088229900598526, "memory(GiB)": 122.96, "step": 32525, "token_acc": 0.9609175870858114, "train_speed(iter/s)": 0.237218 }, { "epoch": 2.479609726351094, "grad_norm": 1.4951775074005127, "learning_rate": 5.064056181537562e-05, "loss": 0.10930988788604737, "memory(GiB)": 122.96, "step": 32530, "token_acc": 0.9547635366689513, "train_speed(iter/s)": 0.237228 }, { "epoch": 2.479990852961354, "grad_norm": 0.6839568614959717, "learning_rate": 5.062858933416245e-05, "loss": 0.08014906644821167, "memory(GiB)": 122.96, "step": 32535, "token_acc": 0.9687781061287422, "train_speed(iter/s)": 0.23723 }, { "epoch": 2.480371979571614, "grad_norm": 1.291909098625183, "learning_rate": 5.061661681690257e-05, "loss": 0.11091808080673218, "memory(GiB)": 122.96, "step": 32540, "token_acc": 0.9527059196952865, "train_speed(iter/s)": 0.237235 }, { "epoch": 2.4807531061818735, "grad_norm": 0.5007871985435486, "learning_rate": 5.0604644264282594e-05, "loss": 0.0896936297416687, "memory(GiB)": 122.96, "step": 32545, "token_acc": 0.9664624314737181, "train_speed(iter/s)": 0.237241 }, { "epoch": 2.4811342327921335, "grad_norm": 1.0064418315887451, "learning_rate": 5.059267167698909e-05, "loss": 0.08060833811759949, "memory(GiB)": 122.96, "step": 32550, "token_acc": 0.9675810473815462, "train_speed(iter/s)": 0.237254 }, { "epoch": 2.4815153594023935, "grad_norm": 0.6136196255683899, "learning_rate": 5.0580699055708604e-05, "loss": 0.07388052940368653, "memory(GiB)": 122.96, "step": 32555, "token_acc": 0.9740701001430615, "train_speed(iter/s)": 0.237262 }, { "epoch": 2.4818964860126536, "grad_norm": 1.825529932975769, "learning_rate": 5.0568726401127706e-05, "loss": 0.07615472674369812, "memory(GiB)": 122.96, "step": 32560, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.237275 }, { "epoch": 2.482277612622913, "grad_norm": 0.8835644125938416, "learning_rate": 5.0556753713933004e-05, "loss": 0.09656510949134826, "memory(GiB)": 122.96, "step": 32565, "token_acc": 0.9659300184162063, "train_speed(iter/s)": 0.237281 }, { "epoch": 2.482658739233173, "grad_norm": 0.981692910194397, "learning_rate": 5.054478099481106e-05, "loss": 0.11027920246124268, "memory(GiB)": 122.96, "step": 32570, "token_acc": 0.9472352728586917, "train_speed(iter/s)": 0.237292 }, { "epoch": 2.483039865843433, "grad_norm": 0.7605816125869751, "learning_rate": 5.053280824444845e-05, "loss": 0.07528504729270935, "memory(GiB)": 122.96, "step": 32575, "token_acc": 0.9708705726580602, "train_speed(iter/s)": 0.237302 }, { "epoch": 2.483420992453693, "grad_norm": 0.9227256774902344, "learning_rate": 5.0520835463531755e-05, "loss": 0.08171619176864624, "memory(GiB)": 122.96, "step": 32580, "token_acc": 0.9631407179880975, "train_speed(iter/s)": 0.237311 }, { "epoch": 2.483802119063953, "grad_norm": 1.7167404890060425, "learning_rate": 5.050886265274758e-05, "loss": 0.10844322443008422, "memory(GiB)": 122.96, "step": 32585, "token_acc": 0.9682051282051282, "train_speed(iter/s)": 0.23731 }, { "epoch": 2.484183245674213, "grad_norm": 1.1114164590835571, "learning_rate": 5.049688981278249e-05, "loss": 0.08117426633834839, "memory(GiB)": 122.96, "step": 32590, "token_acc": 0.9695102173207915, "train_speed(iter/s)": 0.237315 }, { "epoch": 2.484564372284473, "grad_norm": 1.3448941707611084, "learning_rate": 5.0484916944323056e-05, "loss": 0.1279611110687256, "memory(GiB)": 122.96, "step": 32595, "token_acc": 0.9558310055865922, "train_speed(iter/s)": 0.237322 }, { "epoch": 2.484945498894733, "grad_norm": 0.5457672476768494, "learning_rate": 5.04729440480559e-05, "loss": 0.059603911638259885, "memory(GiB)": 122.96, "step": 32600, "token_acc": 0.975054704595186, "train_speed(iter/s)": 0.237321 }, { "epoch": 2.484945498894733, "eval_loss": 0.08149267733097076, "eval_runtime": 216.4152, "eval_samples_per_second": 2.449, "eval_steps_per_second": 2.449, "eval_token_acc": 0.9638952472742606, "step": 32600 }, { "epoch": 2.485326625504993, "grad_norm": 0.746606171131134, "learning_rate": 5.04609711246676e-05, "loss": 0.08202637434005737, "memory(GiB)": 122.96, "step": 32605, "token_acc": 0.9638942286527653, "train_speed(iter/s)": 0.236956 }, { "epoch": 2.485707752115253, "grad_norm": 1.0643306970596313, "learning_rate": 5.044899817484473e-05, "loss": 0.06239247918128967, "memory(GiB)": 122.96, "step": 32610, "token_acc": 0.9693151798414956, "train_speed(iter/s)": 0.236964 }, { "epoch": 2.4860888787255124, "grad_norm": 0.467007577419281, "learning_rate": 5.043702519927391e-05, "loss": 0.08941280841827393, "memory(GiB)": 122.96, "step": 32615, "token_acc": 0.9706031424227065, "train_speed(iter/s)": 0.236964 }, { "epoch": 2.4864700053357724, "grad_norm": 0.6772825121879578, "learning_rate": 5.042505219864173e-05, "loss": 0.08697628974914551, "memory(GiB)": 122.96, "step": 32620, "token_acc": 0.9757437070938215, "train_speed(iter/s)": 0.236973 }, { "epoch": 2.4868511319460325, "grad_norm": 1.7354190349578857, "learning_rate": 5.041307917363477e-05, "loss": 0.11443369388580323, "memory(GiB)": 122.96, "step": 32625, "token_acc": 0.9590340664079344, "train_speed(iter/s)": 0.236985 }, { "epoch": 2.4872322585562925, "grad_norm": 1.1378047466278076, "learning_rate": 5.0401106124939634e-05, "loss": 0.0767454445362091, "memory(GiB)": 122.96, "step": 32630, "token_acc": 0.9658858312485786, "train_speed(iter/s)": 0.236991 }, { "epoch": 2.4876133851665525, "grad_norm": 1.932583212852478, "learning_rate": 5.038913305324292e-05, "loss": 0.1118741512298584, "memory(GiB)": 122.96, "step": 32635, "token_acc": 0.9592220828105396, "train_speed(iter/s)": 0.237001 }, { "epoch": 2.487994511776812, "grad_norm": 1.2275440692901611, "learning_rate": 5.037715995923125e-05, "loss": 0.1123130202293396, "memory(GiB)": 122.96, "step": 32640, "token_acc": 0.9516611003691334, "train_speed(iter/s)": 0.237006 }, { "epoch": 2.488375638387072, "grad_norm": 0.9071910381317139, "learning_rate": 5.036518684359119e-05, "loss": 0.11677471399307252, "memory(GiB)": 122.96, "step": 32645, "token_acc": 0.9598548972188634, "train_speed(iter/s)": 0.237015 }, { "epoch": 2.488756764997332, "grad_norm": 1.4639593362808228, "learning_rate": 5.035321370700935e-05, "loss": 0.08172268867492676, "memory(GiB)": 122.96, "step": 32650, "token_acc": 0.9727140783744558, "train_speed(iter/s)": 0.237019 }, { "epoch": 2.489137891607592, "grad_norm": 0.6283960342407227, "learning_rate": 5.034124055017236e-05, "loss": 0.08754828572273254, "memory(GiB)": 122.96, "step": 32655, "token_acc": 0.9662680931403398, "train_speed(iter/s)": 0.237023 }, { "epoch": 2.489519018217852, "grad_norm": 1.2513079643249512, "learning_rate": 5.032926737376681e-05, "loss": 0.09514535665512085, "memory(GiB)": 122.96, "step": 32660, "token_acc": 0.9640985833495052, "train_speed(iter/s)": 0.237032 }, { "epoch": 2.4899001448281117, "grad_norm": 1.6153745651245117, "learning_rate": 5.031729417847929e-05, "loss": 0.06783218383789062, "memory(GiB)": 122.96, "step": 32665, "token_acc": 0.9734939759036144, "train_speed(iter/s)": 0.237046 }, { "epoch": 2.4902812714383717, "grad_norm": 1.0578604936599731, "learning_rate": 5.0305320964996436e-05, "loss": 0.10022616386413574, "memory(GiB)": 122.96, "step": 32670, "token_acc": 0.9581824168587422, "train_speed(iter/s)": 0.237056 }, { "epoch": 2.4906623980486318, "grad_norm": 0.7264026999473572, "learning_rate": 5.0293347734004834e-05, "loss": 0.07359199523925782, "memory(GiB)": 122.96, "step": 32675, "token_acc": 0.970320905212391, "train_speed(iter/s)": 0.237062 }, { "epoch": 2.491043524658892, "grad_norm": 1.8340282440185547, "learning_rate": 5.02813744861911e-05, "loss": 0.08453892469406128, "memory(GiB)": 122.96, "step": 32680, "token_acc": 0.97015326700726, "train_speed(iter/s)": 0.23707 }, { "epoch": 2.491424651269152, "grad_norm": 0.7447597980499268, "learning_rate": 5.0269401222241855e-05, "loss": 0.09541623592376709, "memory(GiB)": 122.96, "step": 32685, "token_acc": 0.9597787081339713, "train_speed(iter/s)": 0.237074 }, { "epoch": 2.4918057778794114, "grad_norm": 0.9106724262237549, "learning_rate": 5.0257427942843695e-05, "loss": 0.0983817994594574, "memory(GiB)": 122.96, "step": 32690, "token_acc": 0.9641767068273093, "train_speed(iter/s)": 0.23708 }, { "epoch": 2.4921869044896714, "grad_norm": 1.3889763355255127, "learning_rate": 5.024545464868322e-05, "loss": 0.11346976757049561, "memory(GiB)": 122.96, "step": 32695, "token_acc": 0.9637648131267093, "train_speed(iter/s)": 0.237087 }, { "epoch": 2.4925680310999314, "grad_norm": 1.0603346824645996, "learning_rate": 5.023348134044707e-05, "loss": 0.084490305185318, "memory(GiB)": 122.96, "step": 32700, "token_acc": 0.9665205261060184, "train_speed(iter/s)": 0.237094 }, { "epoch": 2.4929491577101914, "grad_norm": 1.000857949256897, "learning_rate": 5.022150801882186e-05, "loss": 0.11221163272857666, "memory(GiB)": 122.96, "step": 32705, "token_acc": 0.9651517873181247, "train_speed(iter/s)": 0.2371 }, { "epoch": 2.493330284320451, "grad_norm": 0.9973498582839966, "learning_rate": 5.020953468449418e-05, "loss": 0.10742229223251343, "memory(GiB)": 122.96, "step": 32710, "token_acc": 0.9544967880085653, "train_speed(iter/s)": 0.237107 }, { "epoch": 2.493711410930711, "grad_norm": 0.9577050805091858, "learning_rate": 5.0197561338150666e-05, "loss": 0.07961418628692626, "memory(GiB)": 122.96, "step": 32715, "token_acc": 0.9570397111913357, "train_speed(iter/s)": 0.237117 }, { "epoch": 2.494092537540971, "grad_norm": 2.2972452640533447, "learning_rate": 5.018558798047792e-05, "loss": 0.09894939661026, "memory(GiB)": 122.96, "step": 32720, "token_acc": 0.9663604766633564, "train_speed(iter/s)": 0.237121 }, { "epoch": 2.494473664151231, "grad_norm": 0.49451929330825806, "learning_rate": 5.017361461216258e-05, "loss": 0.07018018364906312, "memory(GiB)": 122.96, "step": 32725, "token_acc": 0.9690272373540856, "train_speed(iter/s)": 0.237126 }, { "epoch": 2.494854790761491, "grad_norm": 1.0628807544708252, "learning_rate": 5.0161641233891224e-05, "loss": 0.08428190350532531, "memory(GiB)": 122.96, "step": 32730, "token_acc": 0.9710033752684872, "train_speed(iter/s)": 0.237132 }, { "epoch": 2.495235917371751, "grad_norm": 2.2226288318634033, "learning_rate": 5.014966784635051e-05, "loss": 0.16030316352844237, "memory(GiB)": 122.96, "step": 32735, "token_acc": 0.9154310818231741, "train_speed(iter/s)": 0.237145 }, { "epoch": 2.4956170439820107, "grad_norm": 1.2904126644134521, "learning_rate": 5.013769445022704e-05, "loss": 0.08412414193153381, "memory(GiB)": 122.96, "step": 32740, "token_acc": 0.9722882026920031, "train_speed(iter/s)": 0.237154 }, { "epoch": 2.4959981705922707, "grad_norm": 1.032455563545227, "learning_rate": 5.012572104620743e-05, "loss": 0.08627579808235168, "memory(GiB)": 122.96, "step": 32745, "token_acc": 0.9690436241610738, "train_speed(iter/s)": 0.23715 }, { "epoch": 2.4963792972025307, "grad_norm": 1.027165174484253, "learning_rate": 5.011374763497829e-05, "loss": 0.11428292989730834, "memory(GiB)": 122.96, "step": 32750, "token_acc": 0.9526191877575044, "train_speed(iter/s)": 0.237159 }, { "epoch": 2.4967604238127907, "grad_norm": 0.879060685634613, "learning_rate": 5.010177421722626e-05, "loss": 0.09785357713699341, "memory(GiB)": 122.96, "step": 32755, "token_acc": 0.9598981540420115, "train_speed(iter/s)": 0.237169 }, { "epoch": 2.4971415504230503, "grad_norm": 0.7778221964836121, "learning_rate": 5.008980079363795e-05, "loss": 0.10571023225784301, "memory(GiB)": 122.96, "step": 32760, "token_acc": 0.9539170506912442, "train_speed(iter/s)": 0.237177 }, { "epoch": 2.4975226770333103, "grad_norm": 0.9055548906326294, "learning_rate": 5.007782736489997e-05, "loss": 0.10453786849975585, "memory(GiB)": 122.96, "step": 32765, "token_acc": 0.962536023054755, "train_speed(iter/s)": 0.237183 }, { "epoch": 2.4979038036435703, "grad_norm": 1.0745505094528198, "learning_rate": 5.006585393169897e-05, "loss": 0.11275142431259155, "memory(GiB)": 122.96, "step": 32770, "token_acc": 0.9532545422473099, "train_speed(iter/s)": 0.237188 }, { "epoch": 2.4982849302538304, "grad_norm": 0.63880854845047, "learning_rate": 5.005388049472155e-05, "loss": 0.07066336870193482, "memory(GiB)": 122.96, "step": 32775, "token_acc": 0.9725378787878788, "train_speed(iter/s)": 0.237196 }, { "epoch": 2.4986660568640904, "grad_norm": 0.6250942349433899, "learning_rate": 5.004190705465434e-05, "loss": 0.14020938873291017, "memory(GiB)": 122.96, "step": 32780, "token_acc": 0.9461376773515502, "train_speed(iter/s)": 0.237205 }, { "epoch": 2.4990471834743504, "grad_norm": 2.016357183456421, "learning_rate": 5.002993361218393e-05, "loss": 0.11702556610107422, "memory(GiB)": 122.96, "step": 32785, "token_acc": 0.9425625920471281, "train_speed(iter/s)": 0.237218 }, { "epoch": 2.49942831008461, "grad_norm": 0.5274804830551147, "learning_rate": 5.001796016799698e-05, "loss": 0.09401137232780457, "memory(GiB)": 122.96, "step": 32790, "token_acc": 0.9655782531767734, "train_speed(iter/s)": 0.237224 }, { "epoch": 2.49980943669487, "grad_norm": 1.6258679628372192, "learning_rate": 5.000598672278011e-05, "loss": 0.09447575807571411, "memory(GiB)": 122.96, "step": 32795, "token_acc": 0.9593118514472966, "train_speed(iter/s)": 0.237231 }, { "epoch": 2.50019056330513, "grad_norm": 0.7354521155357361, "learning_rate": 4.99940132772199e-05, "loss": 0.07882640957832336, "memory(GiB)": 122.96, "step": 32800, "token_acc": 0.969311377245509, "train_speed(iter/s)": 0.237241 }, { "epoch": 2.50019056330513, "eval_loss": 0.08330094069242477, "eval_runtime": 218.2823, "eval_samples_per_second": 2.428, "eval_steps_per_second": 2.428, "eval_token_acc": 0.9645578579603639, "step": 32800 }, { "epoch": 2.50057168991539, "grad_norm": 1.1690751314163208, "learning_rate": 4.998203983200302e-05, "loss": 0.13596681356430054, "memory(GiB)": 122.96, "step": 32805, "token_acc": 0.9635512744133966, "train_speed(iter/s)": 0.236871 }, { "epoch": 2.5009528165256496, "grad_norm": 0.3608216643333435, "learning_rate": 4.997006638781607e-05, "loss": 0.08115845918655396, "memory(GiB)": 122.96, "step": 32810, "token_acc": 0.972549527385731, "train_speed(iter/s)": 0.236873 }, { "epoch": 2.5013339431359096, "grad_norm": 1.266127109527588, "learning_rate": 4.995809294534568e-05, "loss": 0.15216610431671143, "memory(GiB)": 122.96, "step": 32815, "token_acc": 0.9563456345634563, "train_speed(iter/s)": 0.236879 }, { "epoch": 2.5017150697461696, "grad_norm": 1.284793734550476, "learning_rate": 4.9946119505278455e-05, "loss": 0.0846735954284668, "memory(GiB)": 122.96, "step": 32820, "token_acc": 0.9573447753659767, "train_speed(iter/s)": 0.236889 }, { "epoch": 2.5020961963564297, "grad_norm": 1.336588740348816, "learning_rate": 4.993414606830103e-05, "loss": 0.11855581998825074, "memory(GiB)": 122.96, "step": 32825, "token_acc": 0.9601393188854489, "train_speed(iter/s)": 0.2369 }, { "epoch": 2.5024773229666897, "grad_norm": 0.7253202795982361, "learning_rate": 4.992217263510003e-05, "loss": 0.07139829397201539, "memory(GiB)": 122.96, "step": 32830, "token_acc": 0.9735243731505996, "train_speed(iter/s)": 0.236903 }, { "epoch": 2.5028584495769497, "grad_norm": 1.466415286064148, "learning_rate": 4.9910199206362053e-05, "loss": 0.10076133012771607, "memory(GiB)": 122.96, "step": 32835, "token_acc": 0.9591709844559585, "train_speed(iter/s)": 0.23691 }, { "epoch": 2.5032395761872093, "grad_norm": 1.0126689672470093, "learning_rate": 4.989822578277375e-05, "loss": 0.06510155200958252, "memory(GiB)": 122.96, "step": 32840, "token_acc": 0.963081130355515, "train_speed(iter/s)": 0.236921 }, { "epoch": 2.5036207027974693, "grad_norm": 1.7200003862380981, "learning_rate": 4.9886252365021726e-05, "loss": 0.09717998504638672, "memory(GiB)": 122.96, "step": 32845, "token_acc": 0.9697180647406892, "train_speed(iter/s)": 0.236924 }, { "epoch": 2.5040018294077293, "grad_norm": 1.1946361064910889, "learning_rate": 4.987427895379258e-05, "loss": 0.07672379612922668, "memory(GiB)": 122.96, "step": 32850, "token_acc": 0.9713162155910248, "train_speed(iter/s)": 0.236931 }, { "epoch": 2.5043829560179893, "grad_norm": 1.2628095149993896, "learning_rate": 4.9862305549772974e-05, "loss": 0.124867844581604, "memory(GiB)": 122.96, "step": 32855, "token_acc": 0.9618026930475405, "train_speed(iter/s)": 0.23694 }, { "epoch": 2.504764082628249, "grad_norm": 0.9964984059333801, "learning_rate": 4.985033215364951e-05, "loss": 0.10146132707595826, "memory(GiB)": 122.96, "step": 32860, "token_acc": 0.951005380911923, "train_speed(iter/s)": 0.236949 }, { "epoch": 2.505145209238509, "grad_norm": 1.2708353996276855, "learning_rate": 4.983835876610879e-05, "loss": 0.11411185264587402, "memory(GiB)": 122.96, "step": 32865, "token_acc": 0.9526542324246772, "train_speed(iter/s)": 0.236957 }, { "epoch": 2.505526335848769, "grad_norm": 1.0460419654846191, "learning_rate": 4.982638538783744e-05, "loss": 0.11193795204162597, "memory(GiB)": 122.96, "step": 32870, "token_acc": 0.9579656666131878, "train_speed(iter/s)": 0.236963 }, { "epoch": 2.505907462459029, "grad_norm": 0.6659510731697083, "learning_rate": 4.981441201952209e-05, "loss": 0.0615727961063385, "memory(GiB)": 122.96, "step": 32875, "token_acc": 0.9716690042075736, "train_speed(iter/s)": 0.236973 }, { "epoch": 2.506288589069289, "grad_norm": 1.6163833141326904, "learning_rate": 4.9802438661849346e-05, "loss": 0.07601369619369507, "memory(GiB)": 122.96, "step": 32880, "token_acc": 0.9714932126696832, "train_speed(iter/s)": 0.236984 }, { "epoch": 2.506669715679549, "grad_norm": 0.6211439967155457, "learning_rate": 4.979046531550582e-05, "loss": 0.07712869048118591, "memory(GiB)": 122.96, "step": 32885, "token_acc": 0.9697311524722693, "train_speed(iter/s)": 0.23699 }, { "epoch": 2.5070508422898086, "grad_norm": 1.671754240989685, "learning_rate": 4.977849198117815e-05, "loss": 0.08751252889633179, "memory(GiB)": 122.96, "step": 32890, "token_acc": 0.969023569023569, "train_speed(iter/s)": 0.236994 }, { "epoch": 2.5074319689000686, "grad_norm": 3.0128824710845947, "learning_rate": 4.976651865955294e-05, "loss": 0.09953057765960693, "memory(GiB)": 122.96, "step": 32895, "token_acc": 0.9661486054928677, "train_speed(iter/s)": 0.237001 }, { "epoch": 2.5078130955103286, "grad_norm": 1.0042357444763184, "learning_rate": 4.975454535131679e-05, "loss": 0.08191831111907959, "memory(GiB)": 122.96, "step": 32900, "token_acc": 0.9649820177929207, "train_speed(iter/s)": 0.237008 }, { "epoch": 2.5081942221205886, "grad_norm": 1.1863443851470947, "learning_rate": 4.974257205715633e-05, "loss": 0.11387544870376587, "memory(GiB)": 122.96, "step": 32905, "token_acc": 0.9572406881576749, "train_speed(iter/s)": 0.237013 }, { "epoch": 2.508575348730848, "grad_norm": 1.049683928489685, "learning_rate": 4.973059877775817e-05, "loss": 0.09124519824981689, "memory(GiB)": 122.96, "step": 32910, "token_acc": 0.9624971929036604, "train_speed(iter/s)": 0.237022 }, { "epoch": 2.508956475341108, "grad_norm": 2.242917776107788, "learning_rate": 4.971862551380891e-05, "loss": 0.07547228932380676, "memory(GiB)": 122.96, "step": 32915, "token_acc": 0.9665361338554646, "train_speed(iter/s)": 0.237031 }, { "epoch": 2.5093376019513682, "grad_norm": 0.7965443134307861, "learning_rate": 4.970665226599518e-05, "loss": 0.09978560209274293, "memory(GiB)": 122.96, "step": 32920, "token_acc": 0.965947096381879, "train_speed(iter/s)": 0.237039 }, { "epoch": 2.5097187285616283, "grad_norm": 0.7460707426071167, "learning_rate": 4.969467903500358e-05, "loss": 0.09241698980331421, "memory(GiB)": 122.96, "step": 32925, "token_acc": 0.9590717299578059, "train_speed(iter/s)": 0.237048 }, { "epoch": 2.5100998551718883, "grad_norm": 1.74571692943573, "learning_rate": 4.9682705821520726e-05, "loss": 0.09563430547714233, "memory(GiB)": 122.96, "step": 32930, "token_acc": 0.9725533684502357, "train_speed(iter/s)": 0.237057 }, { "epoch": 2.5104809817821483, "grad_norm": 0.9395322203636169, "learning_rate": 4.9670732626233196e-05, "loss": 0.07740887999534607, "memory(GiB)": 122.96, "step": 32935, "token_acc": 0.9729110231298187, "train_speed(iter/s)": 0.237064 }, { "epoch": 2.510862108392408, "grad_norm": 1.4174041748046875, "learning_rate": 4.9658759449827635e-05, "loss": 0.10924084186553955, "memory(GiB)": 122.96, "step": 32940, "token_acc": 0.9576306913996627, "train_speed(iter/s)": 0.237072 }, { "epoch": 2.511243235002668, "grad_norm": 2.8372652530670166, "learning_rate": 4.9646786292990643e-05, "loss": 0.12027863264083863, "memory(GiB)": 122.96, "step": 32945, "token_acc": 0.9643424281795872, "train_speed(iter/s)": 0.237076 }, { "epoch": 2.511624361612928, "grad_norm": 0.7366938591003418, "learning_rate": 4.963481315640882e-05, "loss": 0.09025511145591736, "memory(GiB)": 122.96, "step": 32950, "token_acc": 0.9659314690823159, "train_speed(iter/s)": 0.237082 }, { "epoch": 2.5120054882231875, "grad_norm": 0.6312406659126282, "learning_rate": 4.962284004076876e-05, "loss": 0.09684789776802064, "memory(GiB)": 122.96, "step": 32955, "token_acc": 0.9691833590138675, "train_speed(iter/s)": 0.237086 }, { "epoch": 2.5123866148334475, "grad_norm": 0.40779992938041687, "learning_rate": 4.961086694675708e-05, "loss": 0.09252734780311585, "memory(GiB)": 122.96, "step": 32960, "token_acc": 0.9666412795125666, "train_speed(iter/s)": 0.237088 }, { "epoch": 2.5127677414437075, "grad_norm": 0.7484343647956848, "learning_rate": 4.959889387506038e-05, "loss": 0.07744455337524414, "memory(GiB)": 122.96, "step": 32965, "token_acc": 0.9603399433427762, "train_speed(iter/s)": 0.237096 }, { "epoch": 2.5131488680539675, "grad_norm": 0.31623175740242004, "learning_rate": 4.958692082636525e-05, "loss": 0.07016698122024537, "memory(GiB)": 122.96, "step": 32970, "token_acc": 0.9698924731182795, "train_speed(iter/s)": 0.237107 }, { "epoch": 2.5135299946642276, "grad_norm": 0.489571750164032, "learning_rate": 4.957494780135828e-05, "loss": 0.09530118703842164, "memory(GiB)": 122.96, "step": 32975, "token_acc": 0.9624320316361838, "train_speed(iter/s)": 0.237116 }, { "epoch": 2.5139111212744876, "grad_norm": 2.6379051208496094, "learning_rate": 4.956297480072609e-05, "loss": 0.10669958591461182, "memory(GiB)": 122.96, "step": 32980, "token_acc": 0.9630372492836676, "train_speed(iter/s)": 0.237125 }, { "epoch": 2.5142922478847476, "grad_norm": 2.743637800216675, "learning_rate": 4.9551001825155275e-05, "loss": 0.10632832050323486, "memory(GiB)": 122.96, "step": 32985, "token_acc": 0.9646503123571537, "train_speed(iter/s)": 0.237132 }, { "epoch": 2.514673374495007, "grad_norm": 0.660750150680542, "learning_rate": 4.953902887533241e-05, "loss": 0.10157712697982788, "memory(GiB)": 122.96, "step": 32990, "token_acc": 0.9565509819747108, "train_speed(iter/s)": 0.237135 }, { "epoch": 2.515054501105267, "grad_norm": 0.4104747176170349, "learning_rate": 4.952705595194411e-05, "loss": 0.08774518966674805, "memory(GiB)": 122.96, "step": 32995, "token_acc": 0.9661757565949183, "train_speed(iter/s)": 0.237141 }, { "epoch": 2.515435627715527, "grad_norm": 0.5099527835845947, "learning_rate": 4.9515083055676956e-05, "loss": 0.0770055592060089, "memory(GiB)": 122.96, "step": 33000, "token_acc": 0.9769099808469888, "train_speed(iter/s)": 0.237138 }, { "epoch": 2.515435627715527, "eval_loss": 0.08091820776462555, "eval_runtime": 220.4141, "eval_samples_per_second": 2.405, "eval_steps_per_second": 2.405, "eval_token_acc": 0.965107523643154, "step": 33000 }, { "epoch": 2.5158167543257868, "grad_norm": 1.264560580253601, "learning_rate": 4.950311018721753e-05, "loss": 0.1001753568649292, "memory(GiB)": 122.96, "step": 33005, "token_acc": 0.965364236493138, "train_speed(iter/s)": 0.236768 }, { "epoch": 2.516197880936047, "grad_norm": 1.266026258468628, "learning_rate": 4.949113734725243e-05, "loss": 0.09502356648445129, "memory(GiB)": 122.96, "step": 33010, "token_acc": 0.9639344262295082, "train_speed(iter/s)": 0.236777 }, { "epoch": 2.516579007546307, "grad_norm": 0.1107417643070221, "learning_rate": 4.9479164536468257e-05, "loss": 0.05333621501922607, "memory(GiB)": 122.96, "step": 33015, "token_acc": 0.9713159623385155, "train_speed(iter/s)": 0.236785 }, { "epoch": 2.516960134156567, "grad_norm": 1.4980623722076416, "learning_rate": 4.9467191755551554e-05, "loss": 0.08645542860031127, "memory(GiB)": 122.96, "step": 33020, "token_acc": 0.973791233619521, "train_speed(iter/s)": 0.236789 }, { "epoch": 2.517341260766827, "grad_norm": 0.646247148513794, "learning_rate": 4.945521900518895e-05, "loss": 0.08119407892227173, "memory(GiB)": 122.96, "step": 33025, "token_acc": 0.9709443099273608, "train_speed(iter/s)": 0.23679 }, { "epoch": 2.517722387377087, "grad_norm": 1.3206604719161987, "learning_rate": 4.9443246286067015e-05, "loss": 0.06108865737915039, "memory(GiB)": 122.96, "step": 33030, "token_acc": 0.9638386648122392, "train_speed(iter/s)": 0.236802 }, { "epoch": 2.5181035139873464, "grad_norm": 1.316069483757019, "learning_rate": 4.943127359887231e-05, "loss": 0.08659700751304626, "memory(GiB)": 122.96, "step": 33035, "token_acc": 0.9596478356566398, "train_speed(iter/s)": 0.236814 }, { "epoch": 2.5184846405976065, "grad_norm": 0.6473144292831421, "learning_rate": 4.941930094429142e-05, "loss": 0.0971682071685791, "memory(GiB)": 122.96, "step": 33040, "token_acc": 0.9584726376913602, "train_speed(iter/s)": 0.236822 }, { "epoch": 2.5188657672078665, "grad_norm": 1.000908374786377, "learning_rate": 4.940732832301094e-05, "loss": 0.09670069217681884, "memory(GiB)": 122.96, "step": 33045, "token_acc": 0.966520073769329, "train_speed(iter/s)": 0.236823 }, { "epoch": 2.5192468938181265, "grad_norm": 2.267770528793335, "learning_rate": 4.939535573571742e-05, "loss": 0.08770031332969666, "memory(GiB)": 122.96, "step": 33050, "token_acc": 0.9587753864807518, "train_speed(iter/s)": 0.236833 }, { "epoch": 2.519628020428386, "grad_norm": 1.1274231672286987, "learning_rate": 4.938338318309744e-05, "loss": 0.09292935132980347, "memory(GiB)": 122.96, "step": 33055, "token_acc": 0.9640145763741269, "train_speed(iter/s)": 0.236838 }, { "epoch": 2.520009147038646, "grad_norm": 0.7619070410728455, "learning_rate": 4.9371410665837584e-05, "loss": 0.09132999777793885, "memory(GiB)": 122.96, "step": 33060, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.236843 }, { "epoch": 2.520390273648906, "grad_norm": 0.8996629118919373, "learning_rate": 4.935943818462438e-05, "loss": 0.09562448263168336, "memory(GiB)": 122.96, "step": 33065, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.236847 }, { "epoch": 2.520771400259166, "grad_norm": 1.0069319009780884, "learning_rate": 4.934746574014445e-05, "loss": 0.08897706270217895, "memory(GiB)": 122.96, "step": 33070, "token_acc": 0.9688449848024316, "train_speed(iter/s)": 0.236854 }, { "epoch": 2.521152526869426, "grad_norm": 1.075402021408081, "learning_rate": 4.933549333308433e-05, "loss": 0.059208142757415774, "memory(GiB)": 122.96, "step": 33075, "token_acc": 0.9721552291164218, "train_speed(iter/s)": 0.236862 }, { "epoch": 2.521533653479686, "grad_norm": 1.320214867591858, "learning_rate": 4.9323520964130574e-05, "loss": 0.10810576677322388, "memory(GiB)": 122.96, "step": 33080, "token_acc": 0.9589005235602094, "train_speed(iter/s)": 0.236869 }, { "epoch": 2.5219147800899457, "grad_norm": 1.0526854991912842, "learning_rate": 4.9311548633969766e-05, "loss": 0.0891953706741333, "memory(GiB)": 122.96, "step": 33085, "token_acc": 0.9650180940892642, "train_speed(iter/s)": 0.236879 }, { "epoch": 2.5222959067002058, "grad_norm": 0.8374066948890686, "learning_rate": 4.929957634328845e-05, "loss": 0.07507234215736389, "memory(GiB)": 122.96, "step": 33090, "token_acc": 0.9682658670664668, "train_speed(iter/s)": 0.236889 }, { "epoch": 2.5226770333104658, "grad_norm": 1.2207252979278564, "learning_rate": 4.9287604092773164e-05, "loss": 0.10953775644302369, "memory(GiB)": 122.96, "step": 33095, "token_acc": 0.947047619047619, "train_speed(iter/s)": 0.236899 }, { "epoch": 2.523058159920726, "grad_norm": 0.8483012318611145, "learning_rate": 4.927563188311049e-05, "loss": 0.10648869276046753, "memory(GiB)": 122.96, "step": 33100, "token_acc": 0.9603404913909848, "train_speed(iter/s)": 0.236905 }, { "epoch": 2.5234392865309854, "grad_norm": 1.0070604085922241, "learning_rate": 4.9263659714986964e-05, "loss": 0.10854912996292114, "memory(GiB)": 122.96, "step": 33105, "token_acc": 0.9606423885998643, "train_speed(iter/s)": 0.236912 }, { "epoch": 2.5238204131412454, "grad_norm": 0.5746860504150391, "learning_rate": 4.925168758908913e-05, "loss": 0.09892846345901489, "memory(GiB)": 122.96, "step": 33110, "token_acc": 0.9561986703167775, "train_speed(iter/s)": 0.236923 }, { "epoch": 2.5242015397515054, "grad_norm": 0.7289799451828003, "learning_rate": 4.923971550610355e-05, "loss": 0.10281234979629517, "memory(GiB)": 122.96, "step": 33115, "token_acc": 0.9611271081859317, "train_speed(iter/s)": 0.23693 }, { "epoch": 2.5245826663617654, "grad_norm": 0.8372538089752197, "learning_rate": 4.9227743466716754e-05, "loss": 0.08764684200286865, "memory(GiB)": 122.96, "step": 33120, "token_acc": 0.9693137437232657, "train_speed(iter/s)": 0.236935 }, { "epoch": 2.5249637929720254, "grad_norm": 0.5898918509483337, "learning_rate": 4.921577147161528e-05, "loss": 0.12236213684082031, "memory(GiB)": 122.96, "step": 33125, "token_acc": 0.9655566998512416, "train_speed(iter/s)": 0.236934 }, { "epoch": 2.5253449195822855, "grad_norm": 0.863740086555481, "learning_rate": 4.9203799521485675e-05, "loss": 0.06973938941955567, "memory(GiB)": 122.96, "step": 33130, "token_acc": 0.966984126984127, "train_speed(iter/s)": 0.236943 }, { "epoch": 2.525726046192545, "grad_norm": 2.110180377960205, "learning_rate": 4.919182761701449e-05, "loss": 0.1176151156425476, "memory(GiB)": 122.96, "step": 33135, "token_acc": 0.9378980891719745, "train_speed(iter/s)": 0.236954 }, { "epoch": 2.526107172802805, "grad_norm": 2.325512647628784, "learning_rate": 4.917985575888822e-05, "loss": 0.0875206708908081, "memory(GiB)": 122.96, "step": 33140, "token_acc": 0.9636697247706422, "train_speed(iter/s)": 0.236964 }, { "epoch": 2.526488299413065, "grad_norm": 0.5914955735206604, "learning_rate": 4.916788394779342e-05, "loss": 0.09069373607635497, "memory(GiB)": 122.96, "step": 33145, "token_acc": 0.964942387840157, "train_speed(iter/s)": 0.236963 }, { "epoch": 2.526869426023325, "grad_norm": 0.7067722678184509, "learning_rate": 4.915591218441662e-05, "loss": 0.07858939170837402, "memory(GiB)": 122.96, "step": 33150, "token_acc": 0.9713375796178344, "train_speed(iter/s)": 0.236972 }, { "epoch": 2.5272505526335847, "grad_norm": 1.1293522119522095, "learning_rate": 4.914394046944434e-05, "loss": 0.11027237176895141, "memory(GiB)": 122.96, "step": 33155, "token_acc": 0.9603634957463264, "train_speed(iter/s)": 0.236978 }, { "epoch": 2.5276316792438447, "grad_norm": 0.6791089773178101, "learning_rate": 4.913196880356309e-05, "loss": 0.08462151885032654, "memory(GiB)": 122.96, "step": 33160, "token_acc": 0.9706898935434812, "train_speed(iter/s)": 0.23698 }, { "epoch": 2.5280128058541047, "grad_norm": 0.5614945888519287, "learning_rate": 4.911999718745941e-05, "loss": 0.0936067521572113, "memory(GiB)": 122.96, "step": 33165, "token_acc": 0.9645569620253165, "train_speed(iter/s)": 0.236988 }, { "epoch": 2.5283939324643647, "grad_norm": 0.7882682085037231, "learning_rate": 4.910802562181981e-05, "loss": 0.08904505968093872, "memory(GiB)": 122.96, "step": 33170, "token_acc": 0.9600062735257214, "train_speed(iter/s)": 0.236992 }, { "epoch": 2.5287750590746247, "grad_norm": 1.6088447570800781, "learning_rate": 4.909605410733079e-05, "loss": 0.10349856615066529, "memory(GiB)": 122.96, "step": 33175, "token_acc": 0.9642389882250327, "train_speed(iter/s)": 0.237001 }, { "epoch": 2.5291561856848848, "grad_norm": 1.3977211713790894, "learning_rate": 4.908408264467888e-05, "loss": 0.09457155466079711, "memory(GiB)": 122.96, "step": 33180, "token_acc": 0.9677938808373591, "train_speed(iter/s)": 0.237006 }, { "epoch": 2.5295373122951443, "grad_norm": 1.0954550504684448, "learning_rate": 4.907211123455059e-05, "loss": 0.10187983512878418, "memory(GiB)": 122.96, "step": 33185, "token_acc": 0.9604072398190046, "train_speed(iter/s)": 0.237013 }, { "epoch": 2.5299184389054044, "grad_norm": 1.302017092704773, "learning_rate": 4.9060139877632393e-05, "loss": 0.07724708318710327, "memory(GiB)": 122.96, "step": 33190, "token_acc": 0.9681744511566229, "train_speed(iter/s)": 0.237017 }, { "epoch": 2.5302995655156644, "grad_norm": 0.8362134099006653, "learning_rate": 4.904816857461082e-05, "loss": 0.10180256366729737, "memory(GiB)": 122.96, "step": 33195, "token_acc": 0.9605902465359006, "train_speed(iter/s)": 0.237022 }, { "epoch": 2.5306806921259244, "grad_norm": 0.7535756826400757, "learning_rate": 4.903619732617236e-05, "loss": 0.11541591882705689, "memory(GiB)": 122.96, "step": 33200, "token_acc": 0.9631078086299049, "train_speed(iter/s)": 0.237026 }, { "epoch": 2.5306806921259244, "eval_loss": 0.08056715130805969, "eval_runtime": 221.3678, "eval_samples_per_second": 2.394, "eval_steps_per_second": 2.394, "eval_token_acc": 0.9655065959881934, "step": 33200 }, { "epoch": 2.531061818736184, "grad_norm": 0.8629365563392639, "learning_rate": 4.9024226133003514e-05, "loss": 0.09183005094528199, "memory(GiB)": 122.96, "step": 33205, "token_acc": 0.9653119071859139, "train_speed(iter/s)": 0.236661 }, { "epoch": 2.531442945346444, "grad_norm": 0.4623248279094696, "learning_rate": 4.901225499579078e-05, "loss": 0.10580255985260009, "memory(GiB)": 122.96, "step": 33210, "token_acc": 0.962425840474621, "train_speed(iter/s)": 0.236665 }, { "epoch": 2.531824071956704, "grad_norm": 1.6338202953338623, "learning_rate": 4.9000283915220623e-05, "loss": 0.0989622414112091, "memory(GiB)": 122.96, "step": 33215, "token_acc": 0.9758694109297374, "train_speed(iter/s)": 0.236674 }, { "epoch": 2.532205198566964, "grad_norm": 0.5987167358398438, "learning_rate": 4.898831289197956e-05, "loss": 0.08414579629898071, "memory(GiB)": 122.96, "step": 33220, "token_acc": 0.9705044751830757, "train_speed(iter/s)": 0.236681 }, { "epoch": 2.532586325177224, "grad_norm": 1.4805432558059692, "learning_rate": 4.897634192675405e-05, "loss": 0.08905951976776123, "memory(GiB)": 122.96, "step": 33225, "token_acc": 0.9667215815485997, "train_speed(iter/s)": 0.236691 }, { "epoch": 2.532967451787484, "grad_norm": 0.5591229796409607, "learning_rate": 4.896437102023058e-05, "loss": 0.09001799821853637, "memory(GiB)": 122.96, "step": 33230, "token_acc": 0.9690829694323144, "train_speed(iter/s)": 0.236698 }, { "epoch": 2.5333485783977436, "grad_norm": 0.9160152077674866, "learning_rate": 4.8952400173095624e-05, "loss": 0.09925960898399352, "memory(GiB)": 122.96, "step": 33235, "token_acc": 0.9535120461486257, "train_speed(iter/s)": 0.236709 }, { "epoch": 2.5337297050080037, "grad_norm": 0.6434415578842163, "learning_rate": 4.894042938603566e-05, "loss": 0.08380405902862549, "memory(GiB)": 122.96, "step": 33240, "token_acc": 0.9625796178343949, "train_speed(iter/s)": 0.236712 }, { "epoch": 2.5341108316182637, "grad_norm": 0.8504883050918579, "learning_rate": 4.892845865973717e-05, "loss": 0.093232262134552, "memory(GiB)": 122.96, "step": 33245, "token_acc": 0.9706653947054615, "train_speed(iter/s)": 0.23672 }, { "epoch": 2.5344919582285232, "grad_norm": 1.1412131786346436, "learning_rate": 4.8916487994886576e-05, "loss": 0.0829300343990326, "memory(GiB)": 122.96, "step": 33250, "token_acc": 0.9662456445993032, "train_speed(iter/s)": 0.236728 }, { "epoch": 2.5348730848387833, "grad_norm": 0.47249430418014526, "learning_rate": 4.890451739217038e-05, "loss": 0.09913569092750549, "memory(GiB)": 122.96, "step": 33255, "token_acc": 0.9645363408521304, "train_speed(iter/s)": 0.236731 }, { "epoch": 2.5352542114490433, "grad_norm": 1.1726270914077759, "learning_rate": 4.889254685227503e-05, "loss": 0.09261062145233154, "memory(GiB)": 122.96, "step": 33260, "token_acc": 0.9631779067440465, "train_speed(iter/s)": 0.236738 }, { "epoch": 2.5356353380593033, "grad_norm": 1.6553479433059692, "learning_rate": 4.8880576375886966e-05, "loss": 0.08000626564025878, "memory(GiB)": 122.96, "step": 33265, "token_acc": 0.9698612029081295, "train_speed(iter/s)": 0.23674 }, { "epoch": 2.5360164646695633, "grad_norm": 1.0415784120559692, "learning_rate": 4.886860596369266e-05, "loss": 0.10430415868759155, "memory(GiB)": 122.96, "step": 33270, "token_acc": 0.963512241054614, "train_speed(iter/s)": 0.236747 }, { "epoch": 2.5363975912798233, "grad_norm": 0.9149861931800842, "learning_rate": 4.885663561637856e-05, "loss": 0.10411295890808106, "memory(GiB)": 122.96, "step": 33275, "token_acc": 0.9532362960668938, "train_speed(iter/s)": 0.236752 }, { "epoch": 2.5367787178900834, "grad_norm": 0.8235099911689758, "learning_rate": 4.884466533463109e-05, "loss": 0.08204564452171326, "memory(GiB)": 122.96, "step": 33280, "token_acc": 0.9597948717948718, "train_speed(iter/s)": 0.23676 }, { "epoch": 2.537159844500343, "grad_norm": 0.8802794814109802, "learning_rate": 4.8832695119136706e-05, "loss": 0.05346314907073975, "memory(GiB)": 122.96, "step": 33285, "token_acc": 0.9816130380275805, "train_speed(iter/s)": 0.23677 }, { "epoch": 2.537540971110603, "grad_norm": 1.432608962059021, "learning_rate": 4.882072497058184e-05, "loss": 0.08534727096557618, "memory(GiB)": 122.96, "step": 33290, "token_acc": 0.9660447133972097, "train_speed(iter/s)": 0.236777 }, { "epoch": 2.537922097720863, "grad_norm": 0.5062547922134399, "learning_rate": 4.880875488965292e-05, "loss": 0.10437676906585694, "memory(GiB)": 122.96, "step": 33295, "token_acc": 0.9598208035839283, "train_speed(iter/s)": 0.236783 }, { "epoch": 2.5383032243311225, "grad_norm": 0.35453522205352783, "learning_rate": 4.879678487703639e-05, "loss": 0.07997508645057679, "memory(GiB)": 122.96, "step": 33300, "token_acc": 0.96015625, "train_speed(iter/s)": 0.236792 }, { "epoch": 2.5386843509413826, "grad_norm": 1.3946725130081177, "learning_rate": 4.878481493341866e-05, "loss": 0.07111991047859192, "memory(GiB)": 122.96, "step": 33305, "token_acc": 0.9650565262076053, "train_speed(iter/s)": 0.236804 }, { "epoch": 2.5390654775516426, "grad_norm": 0.9492907524108887, "learning_rate": 4.877284505948615e-05, "loss": 0.06252399682998658, "memory(GiB)": 122.96, "step": 33310, "token_acc": 0.9749406489053021, "train_speed(iter/s)": 0.236809 }, { "epoch": 2.5394466041619026, "grad_norm": 1.1537877321243286, "learning_rate": 4.876087525592527e-05, "loss": 0.13176884651184081, "memory(GiB)": 122.96, "step": 33315, "token_acc": 0.9507692307692308, "train_speed(iter/s)": 0.236818 }, { "epoch": 2.5398277307721626, "grad_norm": 1.2672069072723389, "learning_rate": 4.874890552342247e-05, "loss": 0.10590393543243408, "memory(GiB)": 122.96, "step": 33320, "token_acc": 0.9595410628019324, "train_speed(iter/s)": 0.236828 }, { "epoch": 2.5402088573824226, "grad_norm": 0.8286433219909668, "learning_rate": 4.873693586266412e-05, "loss": 0.07820298671722412, "memory(GiB)": 122.96, "step": 33325, "token_acc": 0.9624116114625977, "train_speed(iter/s)": 0.236836 }, { "epoch": 2.540589983992682, "grad_norm": 0.7912158966064453, "learning_rate": 4.872496627433663e-05, "loss": 0.07073242068290711, "memory(GiB)": 122.96, "step": 33330, "token_acc": 0.9690402476780186, "train_speed(iter/s)": 0.236848 }, { "epoch": 2.5409711106029422, "grad_norm": 0.5380409359931946, "learning_rate": 4.871299675912639e-05, "loss": 0.060371577739715576, "memory(GiB)": 122.96, "step": 33335, "token_acc": 0.9742755307608864, "train_speed(iter/s)": 0.236854 }, { "epoch": 2.5413522372132022, "grad_norm": 0.9365635514259338, "learning_rate": 4.870102731771984e-05, "loss": 0.08260161876678467, "memory(GiB)": 122.96, "step": 33340, "token_acc": 0.9674459252785668, "train_speed(iter/s)": 0.236862 }, { "epoch": 2.5417333638234623, "grad_norm": 1.0397971868515015, "learning_rate": 4.8689057950803343e-05, "loss": 0.07976720333099366, "memory(GiB)": 122.96, "step": 33345, "token_acc": 0.9676891615541923, "train_speed(iter/s)": 0.236873 }, { "epoch": 2.542114490433722, "grad_norm": 2.4838805198669434, "learning_rate": 4.867708865906329e-05, "loss": 0.10098025798797608, "memory(GiB)": 122.96, "step": 33350, "token_acc": 0.9538152610441767, "train_speed(iter/s)": 0.236886 }, { "epoch": 2.542495617043982, "grad_norm": 0.7624874114990234, "learning_rate": 4.8665119443186064e-05, "loss": 0.11523323059082032, "memory(GiB)": 122.96, "step": 33355, "token_acc": 0.9676827029012119, "train_speed(iter/s)": 0.236886 }, { "epoch": 2.542876743654242, "grad_norm": 0.9284722805023193, "learning_rate": 4.865315030385804e-05, "loss": 0.0676472544670105, "memory(GiB)": 122.96, "step": 33360, "token_acc": 0.9760348583877996, "train_speed(iter/s)": 0.236886 }, { "epoch": 2.543257870264502, "grad_norm": 0.7425903081893921, "learning_rate": 4.86411812417656e-05, "loss": 0.06258687376976013, "memory(GiB)": 122.96, "step": 33365, "token_acc": 0.9756049960967994, "train_speed(iter/s)": 0.236891 }, { "epoch": 2.543638996874762, "grad_norm": 0.6436979174613953, "learning_rate": 4.8629212257595104e-05, "loss": 0.07466110587120056, "memory(GiB)": 122.96, "step": 33370, "token_acc": 0.9699947359185822, "train_speed(iter/s)": 0.236896 }, { "epoch": 2.544020123485022, "grad_norm": 0.6491569876670837, "learning_rate": 4.861724335203293e-05, "loss": 0.07887136936187744, "memory(GiB)": 122.96, "step": 33375, "token_acc": 0.9620853080568721, "train_speed(iter/s)": 0.236907 }, { "epoch": 2.5444012500952815, "grad_norm": 0.7132535576820374, "learning_rate": 4.8605274525765435e-05, "loss": 0.07716236114501954, "memory(GiB)": 122.96, "step": 33380, "token_acc": 0.9678779069767441, "train_speed(iter/s)": 0.23691 }, { "epoch": 2.5447823767055415, "grad_norm": 0.8511771559715271, "learning_rate": 4.8593305779478954e-05, "loss": 0.07430952787399292, "memory(GiB)": 122.96, "step": 33385, "token_acc": 0.9683947248749432, "train_speed(iter/s)": 0.236918 }, { "epoch": 2.5451635033158015, "grad_norm": 0.6393892765045166, "learning_rate": 4.858133711385987e-05, "loss": 0.09449993371963501, "memory(GiB)": 122.96, "step": 33390, "token_acc": 0.9645984012181196, "train_speed(iter/s)": 0.236919 }, { "epoch": 2.5455446299260616, "grad_norm": 0.9555995464324951, "learning_rate": 4.856936852959453e-05, "loss": 0.08707157969474792, "memory(GiB)": 122.96, "step": 33395, "token_acc": 0.9683009298393914, "train_speed(iter/s)": 0.236926 }, { "epoch": 2.545925756536321, "grad_norm": 1.0605794191360474, "learning_rate": 4.855740002736925e-05, "loss": 0.10160677433013916, "memory(GiB)": 122.96, "step": 33400, "token_acc": 0.9627870150435471, "train_speed(iter/s)": 0.236937 }, { "epoch": 2.545925756536321, "eval_loss": 0.08211491256952286, "eval_runtime": 218.5514, "eval_samples_per_second": 2.425, "eval_steps_per_second": 2.425, "eval_token_acc": 0.9645653876272514, "step": 33400 }, { "epoch": 2.546306883146581, "grad_norm": 0.6244673132896423, "learning_rate": 4.85454316078704e-05, "loss": 0.08155235052108764, "memory(GiB)": 122.96, "step": 33405, "token_acc": 0.9645719947941316, "train_speed(iter/s)": 0.236581 }, { "epoch": 2.546688009756841, "grad_norm": 0.7880488634109497, "learning_rate": 4.853346327178429e-05, "loss": 0.08855304718017579, "memory(GiB)": 122.96, "step": 33410, "token_acc": 0.9702399028241725, "train_speed(iter/s)": 0.23659 }, { "epoch": 2.547069136367101, "grad_norm": 1.204325795173645, "learning_rate": 4.8521495019797246e-05, "loss": 0.10192062854766845, "memory(GiB)": 122.96, "step": 33415, "token_acc": 0.9516196886832141, "train_speed(iter/s)": 0.236598 }, { "epoch": 2.547450262977361, "grad_norm": 2.4195854663848877, "learning_rate": 4.8509526852595614e-05, "loss": 0.08239102959632874, "memory(GiB)": 122.96, "step": 33420, "token_acc": 0.9693574958813839, "train_speed(iter/s)": 0.236604 }, { "epoch": 2.5478313895876212, "grad_norm": 1.7310869693756104, "learning_rate": 4.8497558770865705e-05, "loss": 0.10298590660095215, "memory(GiB)": 122.96, "step": 33425, "token_acc": 0.9588815789473685, "train_speed(iter/s)": 0.236612 }, { "epoch": 2.548212516197881, "grad_norm": 0.06103397533297539, "learning_rate": 4.848559077529382e-05, "loss": 0.0938825011253357, "memory(GiB)": 122.96, "step": 33430, "token_acc": 0.9493708807669263, "train_speed(iter/s)": 0.236622 }, { "epoch": 2.548593642808141, "grad_norm": 0.7581799626350403, "learning_rate": 4.847362286656627e-05, "loss": 0.08775216937065125, "memory(GiB)": 122.96, "step": 33435, "token_acc": 0.9559623948540327, "train_speed(iter/s)": 0.236634 }, { "epoch": 2.548974769418401, "grad_norm": 0.9391042590141296, "learning_rate": 4.8461655045369376e-05, "loss": 0.1017961859703064, "memory(GiB)": 122.96, "step": 33440, "token_acc": 0.9657534246575342, "train_speed(iter/s)": 0.236643 }, { "epoch": 2.549355896028661, "grad_norm": 0.9612947702407837, "learning_rate": 4.8449687312389426e-05, "loss": 0.07786097526550292, "memory(GiB)": 122.96, "step": 33445, "token_acc": 0.9660706743629452, "train_speed(iter/s)": 0.23665 }, { "epoch": 2.5497370226389204, "grad_norm": 0.6162891387939453, "learning_rate": 4.8437719668312706e-05, "loss": 0.08796676397323608, "memory(GiB)": 122.96, "step": 33450, "token_acc": 0.9658412887828163, "train_speed(iter/s)": 0.236655 }, { "epoch": 2.5501181492491805, "grad_norm": 0.988123893737793, "learning_rate": 4.8425752113825516e-05, "loss": 0.09662120938301086, "memory(GiB)": 122.96, "step": 33455, "token_acc": 0.9647553918990005, "train_speed(iter/s)": 0.23666 }, { "epoch": 2.5504992758594405, "grad_norm": 1.270635724067688, "learning_rate": 4.841378464961414e-05, "loss": 0.12485998868942261, "memory(GiB)": 122.96, "step": 33460, "token_acc": 0.9636215334420881, "train_speed(iter/s)": 0.236668 }, { "epoch": 2.5508804024697005, "grad_norm": 0.6873846650123596, "learning_rate": 4.840181727636485e-05, "loss": 0.03777420520782471, "memory(GiB)": 122.96, "step": 33465, "token_acc": 0.9738812177674264, "train_speed(iter/s)": 0.236674 }, { "epoch": 2.5512615290799605, "grad_norm": 0.6051129698753357, "learning_rate": 4.838984999476391e-05, "loss": 0.10263659954071044, "memory(GiB)": 122.96, "step": 33470, "token_acc": 0.9713574097135741, "train_speed(iter/s)": 0.236682 }, { "epoch": 2.5516426556902205, "grad_norm": 0.9470630884170532, "learning_rate": 4.837788280549761e-05, "loss": 0.06979877352714539, "memory(GiB)": 122.96, "step": 33475, "token_acc": 0.9727708961053454, "train_speed(iter/s)": 0.236682 }, { "epoch": 2.55202378230048, "grad_norm": 0.6842067241668701, "learning_rate": 4.8365915709252215e-05, "loss": 0.07424346804618835, "memory(GiB)": 122.96, "step": 33480, "token_acc": 0.966411860088024, "train_speed(iter/s)": 0.236689 }, { "epoch": 2.55240490891074, "grad_norm": 0.8614187240600586, "learning_rate": 4.8353948706713967e-05, "loss": 0.09496914744377136, "memory(GiB)": 122.96, "step": 33485, "token_acc": 0.9640435262576881, "train_speed(iter/s)": 0.236691 }, { "epoch": 2.552786035521, "grad_norm": 1.8952419757843018, "learning_rate": 4.834198179856911e-05, "loss": 0.1354023337364197, "memory(GiB)": 122.96, "step": 33490, "token_acc": 0.9382278481012658, "train_speed(iter/s)": 0.2367 }, { "epoch": 2.55316716213126, "grad_norm": 0.6305596232414246, "learning_rate": 4.833001498550392e-05, "loss": 0.10297071933746338, "memory(GiB)": 122.96, "step": 33495, "token_acc": 0.9641983544585279, "train_speed(iter/s)": 0.236706 }, { "epoch": 2.5535482887415197, "grad_norm": 0.8222968578338623, "learning_rate": 4.831804826820462e-05, "loss": 0.08174842000007629, "memory(GiB)": 122.96, "step": 33500, "token_acc": 0.9749226510340335, "train_speed(iter/s)": 0.23671 }, { "epoch": 2.5539294153517798, "grad_norm": 0.7846235036849976, "learning_rate": 4.830608164735743e-05, "loss": 0.09607842564582825, "memory(GiB)": 122.96, "step": 33505, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.236713 }, { "epoch": 2.5543105419620398, "grad_norm": 0.5848549604415894, "learning_rate": 4.8294115123648606e-05, "loss": 0.07423696517944336, "memory(GiB)": 122.96, "step": 33510, "token_acc": 0.9716919739696313, "train_speed(iter/s)": 0.236711 }, { "epoch": 2.5546916685723, "grad_norm": 1.5587016344070435, "learning_rate": 4.828214869776437e-05, "loss": 0.08522294759750366, "memory(GiB)": 122.96, "step": 33515, "token_acc": 0.9646941819990055, "train_speed(iter/s)": 0.236716 }, { "epoch": 2.55507279518256, "grad_norm": 1.374883770942688, "learning_rate": 4.827018237039092e-05, "loss": 0.08336693048477173, "memory(GiB)": 122.96, "step": 33520, "token_acc": 0.9652918638719856, "train_speed(iter/s)": 0.236724 }, { "epoch": 2.55545392179282, "grad_norm": 0.24044804275035858, "learning_rate": 4.8258216142214496e-05, "loss": 0.06760537624359131, "memory(GiB)": 122.96, "step": 33525, "token_acc": 0.9660141966014196, "train_speed(iter/s)": 0.23673 }, { "epoch": 2.5558350484030794, "grad_norm": 0.538702130317688, "learning_rate": 4.8246250013921294e-05, "loss": 0.10059034824371338, "memory(GiB)": 122.96, "step": 33530, "token_acc": 0.9557291666666666, "train_speed(iter/s)": 0.236742 }, { "epoch": 2.5562161750133394, "grad_norm": 0.5170353651046753, "learning_rate": 4.823428398619751e-05, "loss": 0.09949968457221985, "memory(GiB)": 122.96, "step": 33535, "token_acc": 0.9570856379935152, "train_speed(iter/s)": 0.236749 }, { "epoch": 2.5565973016235994, "grad_norm": 0.8190162181854248, "learning_rate": 4.8222318059729345e-05, "loss": 0.1120996356010437, "memory(GiB)": 122.96, "step": 33540, "token_acc": 0.9583609820836099, "train_speed(iter/s)": 0.236756 }, { "epoch": 2.556978428233859, "grad_norm": 0.9716420769691467, "learning_rate": 4.8210352235202995e-05, "loss": 0.13419430255889891, "memory(GiB)": 122.96, "step": 33545, "token_acc": 0.9571564582672167, "train_speed(iter/s)": 0.236766 }, { "epoch": 2.557359554844119, "grad_norm": 0.5284759998321533, "learning_rate": 4.819838651330464e-05, "loss": 0.05544393062591553, "memory(GiB)": 122.96, "step": 33550, "token_acc": 0.968358602504944, "train_speed(iter/s)": 0.236773 }, { "epoch": 2.557740681454379, "grad_norm": 0.613286018371582, "learning_rate": 4.8186420894720446e-05, "loss": 0.08546789288520813, "memory(GiB)": 122.96, "step": 33555, "token_acc": 0.9723865877712031, "train_speed(iter/s)": 0.236774 }, { "epoch": 2.558121808064639, "grad_norm": 0.8269124627113342, "learning_rate": 4.8174455380136604e-05, "loss": 0.09444112181663514, "memory(GiB)": 122.96, "step": 33560, "token_acc": 0.9647275158809763, "train_speed(iter/s)": 0.236781 }, { "epoch": 2.558502934674899, "grad_norm": 0.8729132413864136, "learning_rate": 4.8162489970239285e-05, "loss": 0.07265368700027466, "memory(GiB)": 122.96, "step": 33565, "token_acc": 0.9692544196771714, "train_speed(iter/s)": 0.236791 }, { "epoch": 2.558884061285159, "grad_norm": 1.0262267589569092, "learning_rate": 4.815052466571462e-05, "loss": 0.08108082413673401, "memory(GiB)": 122.96, "step": 33570, "token_acc": 0.9633333333333334, "train_speed(iter/s)": 0.2368 }, { "epoch": 2.559265187895419, "grad_norm": 0.4291098713874817, "learning_rate": 4.813855946724879e-05, "loss": 0.06907997727394104, "memory(GiB)": 122.96, "step": 33575, "token_acc": 0.9684456564082586, "train_speed(iter/s)": 0.236806 }, { "epoch": 2.5596463145056787, "grad_norm": 0.8573706150054932, "learning_rate": 4.812659437552795e-05, "loss": 0.06109694242477417, "memory(GiB)": 122.96, "step": 33580, "token_acc": 0.9668737060041408, "train_speed(iter/s)": 0.236815 }, { "epoch": 2.5600274411159387, "grad_norm": 0.5125099420547485, "learning_rate": 4.811462939123821e-05, "loss": 0.07955032587051392, "memory(GiB)": 122.96, "step": 33585, "token_acc": 0.9675855028075548, "train_speed(iter/s)": 0.236815 }, { "epoch": 2.5604085677261987, "grad_norm": 1.281698226928711, "learning_rate": 4.810266451506574e-05, "loss": 0.12990150451660157, "memory(GiB)": 122.96, "step": 33590, "token_acc": 0.946571887181839, "train_speed(iter/s)": 0.236823 }, { "epoch": 2.5607896943364583, "grad_norm": 0.7742004990577698, "learning_rate": 4.809069974769665e-05, "loss": 0.1321187734603882, "memory(GiB)": 122.96, "step": 33595, "token_acc": 0.956989247311828, "train_speed(iter/s)": 0.236828 }, { "epoch": 2.5611708209467183, "grad_norm": 0.9070193767547607, "learning_rate": 4.807873508981704e-05, "loss": 0.09043740034103394, "memory(GiB)": 122.96, "step": 33600, "token_acc": 0.9668783250386134, "train_speed(iter/s)": 0.236833 }, { "epoch": 2.5611708209467183, "eval_loss": 0.0816020593047142, "eval_runtime": 217.9084, "eval_samples_per_second": 2.432, "eval_steps_per_second": 2.432, "eval_token_acc": 0.9646708029636769, "step": 33600 }, { "epoch": 2.5615519475569783, "grad_norm": 0.8128228187561035, "learning_rate": 4.806677054211309e-05, "loss": 0.06645612716674805, "memory(GiB)": 122.96, "step": 33605, "token_acc": 0.9647396927158174, "train_speed(iter/s)": 0.236481 }, { "epoch": 2.5619330741672384, "grad_norm": 1.1780979633331299, "learning_rate": 4.805480610527087e-05, "loss": 0.08472345471382141, "memory(GiB)": 122.96, "step": 33610, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.236494 }, { "epoch": 2.5623142007774984, "grad_norm": 0.883957028388977, "learning_rate": 4.80428417799765e-05, "loss": 0.07519138455390931, "memory(GiB)": 122.96, "step": 33615, "token_acc": 0.9706596897776116, "train_speed(iter/s)": 0.23649 }, { "epoch": 2.5626953273877584, "grad_norm": 0.9901317954063416, "learning_rate": 4.8030877566916073e-05, "loss": 0.12461764812469482, "memory(GiB)": 122.96, "step": 33620, "token_acc": 0.9537185660781167, "train_speed(iter/s)": 0.236499 }, { "epoch": 2.563076453998018, "grad_norm": 0.7721173167228699, "learning_rate": 4.8018913466775663e-05, "loss": 0.054832732677459715, "memory(GiB)": 122.96, "step": 33625, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.236506 }, { "epoch": 2.563457580608278, "grad_norm": 1.161210298538208, "learning_rate": 4.8006949480241386e-05, "loss": 0.05779210329055786, "memory(GiB)": 122.96, "step": 33630, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.236514 }, { "epoch": 2.563838707218538, "grad_norm": 0.5758131146430969, "learning_rate": 4.799498560799931e-05, "loss": 0.07993063926696778, "memory(GiB)": 122.96, "step": 33635, "token_acc": 0.9707413835854203, "train_speed(iter/s)": 0.236523 }, { "epoch": 2.564219833828798, "grad_norm": 1.2821311950683594, "learning_rate": 4.7983021850735506e-05, "loss": 0.11013473272323608, "memory(GiB)": 122.96, "step": 33640, "token_acc": 0.9503612943763745, "train_speed(iter/s)": 0.236533 }, { "epoch": 2.5646009604390576, "grad_norm": 0.6820307970046997, "learning_rate": 4.797105820913602e-05, "loss": 0.07046456933021546, "memory(GiB)": 122.96, "step": 33645, "token_acc": 0.9645152995628696, "train_speed(iter/s)": 0.236541 }, { "epoch": 2.5649820870493176, "grad_norm": 0.7819390892982483, "learning_rate": 4.795909468388695e-05, "loss": 0.058859622478485106, "memory(GiB)": 122.96, "step": 33650, "token_acc": 0.977808923148797, "train_speed(iter/s)": 0.236541 }, { "epoch": 2.5653632136595776, "grad_norm": 1.3476794958114624, "learning_rate": 4.7947131275674325e-05, "loss": 0.136098051071167, "memory(GiB)": 122.96, "step": 33655, "token_acc": 0.956744305093422, "train_speed(iter/s)": 0.236549 }, { "epoch": 2.5657443402698377, "grad_norm": 0.7969303727149963, "learning_rate": 4.793516798518418e-05, "loss": 0.0974457859992981, "memory(GiB)": 122.96, "step": 33660, "token_acc": 0.9627249357326478, "train_speed(iter/s)": 0.236557 }, { "epoch": 2.5661254668800977, "grad_norm": 1.2593172788619995, "learning_rate": 4.792320481310259e-05, "loss": 0.07500631809234619, "memory(GiB)": 122.96, "step": 33665, "token_acc": 0.9696492805755396, "train_speed(iter/s)": 0.236566 }, { "epoch": 2.5665065934903577, "grad_norm": 0.9430801272392273, "learning_rate": 4.791124176011556e-05, "loss": 0.09069485664367676, "memory(GiB)": 122.96, "step": 33670, "token_acc": 0.9606525911708254, "train_speed(iter/s)": 0.236575 }, { "epoch": 2.5668877201006173, "grad_norm": 0.5406455397605896, "learning_rate": 4.789927882690911e-05, "loss": 0.0647216260433197, "memory(GiB)": 122.96, "step": 33675, "token_acc": 0.9731534288496314, "train_speed(iter/s)": 0.236576 }, { "epoch": 2.5672688467108773, "grad_norm": 1.2473437786102295, "learning_rate": 4.7887316014169284e-05, "loss": 0.11939506530761719, "memory(GiB)": 122.96, "step": 33680, "token_acc": 0.9607751575998132, "train_speed(iter/s)": 0.236585 }, { "epoch": 2.5676499733211373, "grad_norm": 0.6951276063919067, "learning_rate": 4.787535332258209e-05, "loss": 0.05691378116607666, "memory(GiB)": 122.96, "step": 33685, "token_acc": 0.9744813961145867, "train_speed(iter/s)": 0.236591 }, { "epoch": 2.5680310999313973, "grad_norm": 0.9069197177886963, "learning_rate": 4.786339075283349e-05, "loss": 0.1023928165435791, "memory(GiB)": 122.96, "step": 33690, "token_acc": 0.9609745579991376, "train_speed(iter/s)": 0.236599 }, { "epoch": 2.568412226541657, "grad_norm": 0.7306497693061829, "learning_rate": 4.785142830560954e-05, "loss": 0.07926180362701415, "memory(GiB)": 122.96, "step": 33695, "token_acc": 0.9649344096871847, "train_speed(iter/s)": 0.236608 }, { "epoch": 2.568793353151917, "grad_norm": 0.7546234130859375, "learning_rate": 4.783946598159621e-05, "loss": 0.09403859376907349, "memory(GiB)": 122.96, "step": 33700, "token_acc": 0.9664646464646465, "train_speed(iter/s)": 0.236619 }, { "epoch": 2.569174479762177, "grad_norm": 0.5300365686416626, "learning_rate": 4.782750378147947e-05, "loss": 0.0779864490032196, "memory(GiB)": 122.96, "step": 33705, "token_acc": 0.9661495063469676, "train_speed(iter/s)": 0.236628 }, { "epoch": 2.569555606372437, "grad_norm": 0.8069491982460022, "learning_rate": 4.781554170594531e-05, "loss": 0.1019016981124878, "memory(GiB)": 122.96, "step": 33710, "token_acc": 0.9743051914001049, "train_speed(iter/s)": 0.236635 }, { "epoch": 2.569936732982697, "grad_norm": 0.8162585496902466, "learning_rate": 4.780357975567969e-05, "loss": 0.07748110294342041, "memory(GiB)": 122.96, "step": 33715, "token_acc": 0.969002201027146, "train_speed(iter/s)": 0.236642 }, { "epoch": 2.570317859592957, "grad_norm": 0.4098800718784332, "learning_rate": 4.779161793136859e-05, "loss": 0.11620439291000366, "memory(GiB)": 122.96, "step": 33720, "token_acc": 0.961318407960199, "train_speed(iter/s)": 0.236646 }, { "epoch": 2.5706989862032166, "grad_norm": 1.5975258350372314, "learning_rate": 4.7779656233697934e-05, "loss": 0.09769705533981324, "memory(GiB)": 122.96, "step": 33725, "token_acc": 0.9570537981990302, "train_speed(iter/s)": 0.236654 }, { "epoch": 2.5710801128134766, "grad_norm": 0.7343229055404663, "learning_rate": 4.77676946633537e-05, "loss": 0.10105082988739014, "memory(GiB)": 122.96, "step": 33730, "token_acc": 0.9581847649918963, "train_speed(iter/s)": 0.236659 }, { "epoch": 2.5714612394237366, "grad_norm": 1.2869561910629272, "learning_rate": 4.77557332210218e-05, "loss": 0.06251505613327027, "memory(GiB)": 122.96, "step": 33735, "token_acc": 0.9619796091758709, "train_speed(iter/s)": 0.236667 }, { "epoch": 2.5718423660339966, "grad_norm": 0.9730122685432434, "learning_rate": 4.7743771907388215e-05, "loss": 0.0996316134929657, "memory(GiB)": 122.96, "step": 33740, "token_acc": 0.9511354420113545, "train_speed(iter/s)": 0.236674 }, { "epoch": 2.572223492644256, "grad_norm": 1.0387439727783203, "learning_rate": 4.773181072313883e-05, "loss": 0.06243879199028015, "memory(GiB)": 122.96, "step": 33745, "token_acc": 0.9706390328151986, "train_speed(iter/s)": 0.236684 }, { "epoch": 2.5726046192545162, "grad_norm": 0.6113041043281555, "learning_rate": 4.771984966895957e-05, "loss": 0.09206275343894958, "memory(GiB)": 122.96, "step": 33750, "token_acc": 0.96468330134357, "train_speed(iter/s)": 0.236689 }, { "epoch": 2.5729857458647762, "grad_norm": 0.6980882883071899, "learning_rate": 4.7707888745536355e-05, "loss": 0.05328338742256165, "memory(GiB)": 122.96, "step": 33755, "token_acc": 0.9793628531356442, "train_speed(iter/s)": 0.236694 }, { "epoch": 2.5733668724750363, "grad_norm": 0.49913209676742554, "learning_rate": 4.7695927953555085e-05, "loss": 0.04082399010658264, "memory(GiB)": 122.96, "step": 33760, "token_acc": 0.9796593352075409, "train_speed(iter/s)": 0.2367 }, { "epoch": 2.5737479990852963, "grad_norm": 2.7099111080169678, "learning_rate": 4.768396729370165e-05, "loss": 0.12803636789321898, "memory(GiB)": 122.96, "step": 33765, "token_acc": 0.9585960129493951, "train_speed(iter/s)": 0.236705 }, { "epoch": 2.5741291256955563, "grad_norm": 0.996671736240387, "learning_rate": 4.7672006766661945e-05, "loss": 0.06688524484634399, "memory(GiB)": 122.96, "step": 33770, "token_acc": 0.9710306406685236, "train_speed(iter/s)": 0.236714 }, { "epoch": 2.574510252305816, "grad_norm": 1.3469046354293823, "learning_rate": 4.7660046373121856e-05, "loss": 0.09164313077926636, "memory(GiB)": 122.96, "step": 33775, "token_acc": 0.9573152781866353, "train_speed(iter/s)": 0.236719 }, { "epoch": 2.574891378916076, "grad_norm": 1.5829466581344604, "learning_rate": 4.7648086113767235e-05, "loss": 0.1119532585144043, "memory(GiB)": 122.96, "step": 33780, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.236726 }, { "epoch": 2.575272505526336, "grad_norm": 1.006494164466858, "learning_rate": 4.763612598928397e-05, "loss": 0.06549522280693054, "memory(GiB)": 122.96, "step": 33785, "token_acc": 0.9716684155299056, "train_speed(iter/s)": 0.236732 }, { "epoch": 2.575653632136596, "grad_norm": 0.7849476933479309, "learning_rate": 4.762416600035791e-05, "loss": 0.07857672572135925, "memory(GiB)": 122.96, "step": 33790, "token_acc": 0.9698446542796223, "train_speed(iter/s)": 0.23674 }, { "epoch": 2.5760347587468555, "grad_norm": 1.603631854057312, "learning_rate": 4.7612206147674896e-05, "loss": 0.1249001145362854, "memory(GiB)": 122.96, "step": 33795, "token_acc": 0.9456478928712091, "train_speed(iter/s)": 0.236747 }, { "epoch": 2.5764158853571155, "grad_norm": 0.8279714584350586, "learning_rate": 4.760024643192079e-05, "loss": 0.08458157777786254, "memory(GiB)": 122.96, "step": 33800, "token_acc": 0.9757952973720608, "train_speed(iter/s)": 0.236753 }, { "epoch": 2.5764158853571155, "eval_loss": 0.08176679164171219, "eval_runtime": 215.3403, "eval_samples_per_second": 2.461, "eval_steps_per_second": 2.461, "eval_token_acc": 0.9646030359616891, "step": 33800 }, { "epoch": 2.5767970119673755, "grad_norm": 0.930053174495697, "learning_rate": 4.7588286853781416e-05, "loss": 0.12198359966278076, "memory(GiB)": 122.96, "step": 33805, "token_acc": 0.9641474547870804, "train_speed(iter/s)": 0.236403 }, { "epoch": 2.5771781385776356, "grad_norm": 0.646510899066925, "learning_rate": 4.75763274139426e-05, "loss": 0.09420689940452576, "memory(GiB)": 122.96, "step": 33810, "token_acc": 0.9687130052426856, "train_speed(iter/s)": 0.236408 }, { "epoch": 2.5775592651878956, "grad_norm": 0.6364647150039673, "learning_rate": 4.756436811309014e-05, "loss": 0.09182687401771546, "memory(GiB)": 122.96, "step": 33815, "token_acc": 0.9671465968586388, "train_speed(iter/s)": 0.236413 }, { "epoch": 2.5779403917981556, "grad_norm": 0.7364826202392578, "learning_rate": 4.755240895190989e-05, "loss": 0.06027455329895019, "memory(GiB)": 122.96, "step": 33820, "token_acc": 0.9749351771823682, "train_speed(iter/s)": 0.236425 }, { "epoch": 2.578321518408415, "grad_norm": 1.3709372282028198, "learning_rate": 4.7540449931087615e-05, "loss": 0.09123446941375732, "memory(GiB)": 122.96, "step": 33825, "token_acc": 0.9706484641638226, "train_speed(iter/s)": 0.236432 }, { "epoch": 2.578702645018675, "grad_norm": 1.148686170578003, "learning_rate": 4.752849105130912e-05, "loss": 0.09112793207168579, "memory(GiB)": 122.96, "step": 33830, "token_acc": 0.9667121884602253, "train_speed(iter/s)": 0.236439 }, { "epoch": 2.579083771628935, "grad_norm": 1.0731024742126465, "learning_rate": 4.7516532313260206e-05, "loss": 0.07172578573226929, "memory(GiB)": 122.96, "step": 33835, "token_acc": 0.973458667403926, "train_speed(iter/s)": 0.236442 }, { "epoch": 2.5794648982391952, "grad_norm": 0.6122640371322632, "learning_rate": 4.7504573717626634e-05, "loss": 0.08957872390747071, "memory(GiB)": 122.96, "step": 33840, "token_acc": 0.9698054474708171, "train_speed(iter/s)": 0.236446 }, { "epoch": 2.579846024849455, "grad_norm": 1.6825331449508667, "learning_rate": 4.749261526509417e-05, "loss": 0.10790429115295411, "memory(GiB)": 122.96, "step": 33845, "token_acc": 0.958005249343832, "train_speed(iter/s)": 0.236455 }, { "epoch": 2.580227151459715, "grad_norm": 0.9899312853813171, "learning_rate": 4.7480656956348594e-05, "loss": 0.09846560955047608, "memory(GiB)": 122.96, "step": 33850, "token_acc": 0.9565010319098269, "train_speed(iter/s)": 0.23646 }, { "epoch": 2.580608278069975, "grad_norm": 0.3949052393436432, "learning_rate": 4.746869879207566e-05, "loss": 0.07196462750434876, "memory(GiB)": 122.96, "step": 33855, "token_acc": 0.9754135002235136, "train_speed(iter/s)": 0.236464 }, { "epoch": 2.580989404680235, "grad_norm": 0.6894046068191528, "learning_rate": 4.745674077296109e-05, "loss": 0.08135765194892883, "memory(GiB)": 122.96, "step": 33860, "token_acc": 0.967979002624672, "train_speed(iter/s)": 0.23647 }, { "epoch": 2.581370531290495, "grad_norm": 1.2190715074539185, "learning_rate": 4.7444782899690635e-05, "loss": 0.07030567526817322, "memory(GiB)": 122.96, "step": 33865, "token_acc": 0.9694897422409259, "train_speed(iter/s)": 0.236482 }, { "epoch": 2.581751657900755, "grad_norm": 0.7777606844902039, "learning_rate": 4.743282517295e-05, "loss": 0.07832826375961303, "memory(GiB)": 122.96, "step": 33870, "token_acc": 0.963064584212748, "train_speed(iter/s)": 0.236491 }, { "epoch": 2.5821327845110145, "grad_norm": 0.735948920249939, "learning_rate": 4.742086759342496e-05, "loss": 0.06766526699066162, "memory(GiB)": 122.96, "step": 33875, "token_acc": 0.973505853357979, "train_speed(iter/s)": 0.236486 }, { "epoch": 2.5825139111212745, "grad_norm": 0.6310897469520569, "learning_rate": 4.740891016180119e-05, "loss": 0.0898415744304657, "memory(GiB)": 122.96, "step": 33880, "token_acc": 0.9646126520016396, "train_speed(iter/s)": 0.236488 }, { "epoch": 2.5828950377315345, "grad_norm": 1.298627257347107, "learning_rate": 4.739695287876439e-05, "loss": 0.07887614965438842, "memory(GiB)": 122.96, "step": 33885, "token_acc": 0.9557838320678874, "train_speed(iter/s)": 0.2365 }, { "epoch": 2.583276164341794, "grad_norm": 0.655427873134613, "learning_rate": 4.7384995745000274e-05, "loss": 0.05766052007675171, "memory(GiB)": 122.96, "step": 33890, "token_acc": 0.97771124417831, "train_speed(iter/s)": 0.236505 }, { "epoch": 2.583657290952054, "grad_norm": 0.07144086807966232, "learning_rate": 4.737303876119452e-05, "loss": 0.06853293180465699, "memory(GiB)": 122.96, "step": 33895, "token_acc": 0.9732635060639471, "train_speed(iter/s)": 0.236514 }, { "epoch": 2.584038417562314, "grad_norm": 0.7946116328239441, "learning_rate": 4.736108192803278e-05, "loss": 0.0999443531036377, "memory(GiB)": 122.96, "step": 33900, "token_acc": 0.9604875998318622, "train_speed(iter/s)": 0.23652 }, { "epoch": 2.584419544172574, "grad_norm": 0.9583191275596619, "learning_rate": 4.7349125246200764e-05, "loss": 0.0689189076423645, "memory(GiB)": 122.96, "step": 33905, "token_acc": 0.9728395061728395, "train_speed(iter/s)": 0.236533 }, { "epoch": 2.584800670782834, "grad_norm": 1.0344456434249878, "learning_rate": 4.7337168716384116e-05, "loss": 0.0747263789176941, "memory(GiB)": 122.96, "step": 33910, "token_acc": 0.9644519678374948, "train_speed(iter/s)": 0.236544 }, { "epoch": 2.585181797393094, "grad_norm": 0.7092944383621216, "learning_rate": 4.7325212339268474e-05, "loss": 0.07923012375831603, "memory(GiB)": 122.96, "step": 33915, "token_acc": 0.9615674453654861, "train_speed(iter/s)": 0.236549 }, { "epoch": 2.585562924003354, "grad_norm": 0.9079461097717285, "learning_rate": 4.7313256115539495e-05, "loss": 0.131829571723938, "memory(GiB)": 122.96, "step": 33920, "token_acc": 0.9617667597765364, "train_speed(iter/s)": 0.236556 }, { "epoch": 2.5859440506136138, "grad_norm": 0.6713945865631104, "learning_rate": 4.730130004588282e-05, "loss": 0.12849587202072144, "memory(GiB)": 122.96, "step": 33925, "token_acc": 0.9636678200692042, "train_speed(iter/s)": 0.236562 }, { "epoch": 2.586325177223874, "grad_norm": 1.377274751663208, "learning_rate": 4.728934413098405e-05, "loss": 0.08038285970687867, "memory(GiB)": 122.96, "step": 33930, "token_acc": 0.9719534732036091, "train_speed(iter/s)": 0.236565 }, { "epoch": 2.586706303834134, "grad_norm": 0.7679956555366516, "learning_rate": 4.727738837152882e-05, "loss": 0.07744968533515931, "memory(GiB)": 122.96, "step": 33935, "token_acc": 0.9686815593517302, "train_speed(iter/s)": 0.236573 }, { "epoch": 2.5870874304443934, "grad_norm": 1.3349090814590454, "learning_rate": 4.726543276820273e-05, "loss": 0.09478347301483155, "memory(GiB)": 122.96, "step": 33940, "token_acc": 0.9586449626044875, "train_speed(iter/s)": 0.236581 }, { "epoch": 2.5874685570546534, "grad_norm": 1.4836622476577759, "learning_rate": 4.725347732169139e-05, "loss": 0.07638683319091796, "memory(GiB)": 122.96, "step": 33945, "token_acc": 0.9700149925037481, "train_speed(iter/s)": 0.23659 }, { "epoch": 2.5878496836649134, "grad_norm": 1.7623060941696167, "learning_rate": 4.7241522032680366e-05, "loss": 0.10199128389358521, "memory(GiB)": 122.96, "step": 33950, "token_acc": 0.958287249170485, "train_speed(iter/s)": 0.236597 }, { "epoch": 2.5882308102751734, "grad_norm": 0.7934370040893555, "learning_rate": 4.7229566901855264e-05, "loss": 0.08000107407569886, "memory(GiB)": 122.96, "step": 33955, "token_acc": 0.9671862182116489, "train_speed(iter/s)": 0.2366 }, { "epoch": 2.5886119368854335, "grad_norm": 0.8853175044059753, "learning_rate": 4.721761192990165e-05, "loss": 0.09526806473731994, "memory(GiB)": 122.96, "step": 33960, "token_acc": 0.96590761223162, "train_speed(iter/s)": 0.236604 }, { "epoch": 2.5889930634956935, "grad_norm": 1.670579195022583, "learning_rate": 4.7205657117505056e-05, "loss": 0.08847002387046814, "memory(GiB)": 122.96, "step": 33965, "token_acc": 0.9604708362614195, "train_speed(iter/s)": 0.236612 }, { "epoch": 2.589374190105953, "grad_norm": 1.0070867538452148, "learning_rate": 4.719370246535107e-05, "loss": 0.07580370903015136, "memory(GiB)": 122.96, "step": 33970, "token_acc": 0.9637305699481865, "train_speed(iter/s)": 0.23662 }, { "epoch": 2.589755316716213, "grad_norm": 1.007646918296814, "learning_rate": 4.718174797412523e-05, "loss": 0.08596599698066712, "memory(GiB)": 122.96, "step": 33975, "token_acc": 0.9657534246575342, "train_speed(iter/s)": 0.236629 }, { "epoch": 2.590136443326473, "grad_norm": 1.1688297986984253, "learning_rate": 4.716979364451307e-05, "loss": 0.10353718996047974, "memory(GiB)": 122.96, "step": 33980, "token_acc": 0.9624230644638808, "train_speed(iter/s)": 0.236638 }, { "epoch": 2.590517569936733, "grad_norm": 0.5601897239685059, "learning_rate": 4.71578394772001e-05, "loss": 0.08620396852493287, "memory(GiB)": 122.96, "step": 33985, "token_acc": 0.9678846883297509, "train_speed(iter/s)": 0.236641 }, { "epoch": 2.5908986965469927, "grad_norm": 0.8004137873649597, "learning_rate": 4.7145885472871855e-05, "loss": 0.1362424373626709, "memory(GiB)": 122.96, "step": 33990, "token_acc": 0.9486543233441497, "train_speed(iter/s)": 0.236645 }, { "epoch": 2.5912798231572527, "grad_norm": 0.7461467981338501, "learning_rate": 4.713393163221383e-05, "loss": 0.09784333109855652, "memory(GiB)": 122.96, "step": 33995, "token_acc": 0.9658884172225591, "train_speed(iter/s)": 0.236649 }, { "epoch": 2.5916609497675127, "grad_norm": 0.62941974401474, "learning_rate": 4.712197795591151e-05, "loss": 0.08269661664962769, "memory(GiB)": 122.96, "step": 34000, "token_acc": 0.9646071515446364, "train_speed(iter/s)": 0.236651 }, { "epoch": 2.5916609497675127, "eval_loss": 0.07932724803686142, "eval_runtime": 216.8181, "eval_samples_per_second": 2.444, "eval_steps_per_second": 2.444, "eval_token_acc": 0.9654312993193181, "step": 34000 }, { "epoch": 2.5920420763777727, "grad_norm": 0.721224844455719, "learning_rate": 4.71100244446504e-05, "loss": 0.09640024304389953, "memory(GiB)": 122.96, "step": 34005, "token_acc": 0.965474959373615, "train_speed(iter/s)": 0.236303 }, { "epoch": 2.5924232029880327, "grad_norm": 0.8289339542388916, "learning_rate": 4.7098071099116e-05, "loss": 0.06655012965202331, "memory(GiB)": 122.96, "step": 34010, "token_acc": 0.9638339920948616, "train_speed(iter/s)": 0.236308 }, { "epoch": 2.5928043295982928, "grad_norm": 0.7720751166343689, "learning_rate": 4.7086117919993746e-05, "loss": 0.09038561582565308, "memory(GiB)": 122.96, "step": 34015, "token_acc": 0.9700520833333334, "train_speed(iter/s)": 0.236312 }, { "epoch": 2.5931854562085523, "grad_norm": 0.9583210945129395, "learning_rate": 4.70741649079691e-05, "loss": 0.09396907687187195, "memory(GiB)": 122.96, "step": 34020, "token_acc": 0.9642210144927537, "train_speed(iter/s)": 0.236318 }, { "epoch": 2.5935665828188124, "grad_norm": 0.8889279365539551, "learning_rate": 4.706221206372753e-05, "loss": 0.07941424250602722, "memory(GiB)": 122.96, "step": 34025, "token_acc": 0.9647732276530163, "train_speed(iter/s)": 0.236325 }, { "epoch": 2.5939477094290724, "grad_norm": 1.2236301898956299, "learning_rate": 4.7050259387954466e-05, "loss": 0.09041731357574463, "memory(GiB)": 122.96, "step": 34030, "token_acc": 0.9669894366197183, "train_speed(iter/s)": 0.236331 }, { "epoch": 2.5943288360393324, "grad_norm": 0.6097456216812134, "learning_rate": 4.703830688133534e-05, "loss": 0.12461872100830078, "memory(GiB)": 122.96, "step": 34035, "token_acc": 0.9614762058918744, "train_speed(iter/s)": 0.23634 }, { "epoch": 2.594709962649592, "grad_norm": 0.7389066219329834, "learning_rate": 4.702635454455555e-05, "loss": 0.06718681454658508, "memory(GiB)": 122.96, "step": 34040, "token_acc": 0.9681686647374949, "train_speed(iter/s)": 0.236344 }, { "epoch": 2.595091089259852, "grad_norm": 1.731662392616272, "learning_rate": 4.701440237830055e-05, "loss": 0.08666484355926514, "memory(GiB)": 122.96, "step": 34045, "token_acc": 0.9756372659598466, "train_speed(iter/s)": 0.23635 }, { "epoch": 2.595472215870112, "grad_norm": 0.5735782980918884, "learning_rate": 4.700245038325571e-05, "loss": 0.0789783775806427, "memory(GiB)": 122.96, "step": 34050, "token_acc": 0.9643734643734644, "train_speed(iter/s)": 0.236353 }, { "epoch": 2.595853342480372, "grad_norm": 0.7593801617622375, "learning_rate": 4.699049856010642e-05, "loss": 0.12437083721160888, "memory(GiB)": 122.96, "step": 34055, "token_acc": 0.947808764940239, "train_speed(iter/s)": 0.236364 }, { "epoch": 2.596234469090632, "grad_norm": 1.0809094905853271, "learning_rate": 4.697854690953809e-05, "loss": 0.10753762722015381, "memory(GiB)": 122.96, "step": 34060, "token_acc": 0.9555898226676947, "train_speed(iter/s)": 0.236368 }, { "epoch": 2.596615595700892, "grad_norm": 1.0418771505355835, "learning_rate": 4.6966595432236066e-05, "loss": 0.071216881275177, "memory(GiB)": 122.96, "step": 34065, "token_acc": 0.9752042007001167, "train_speed(iter/s)": 0.236378 }, { "epoch": 2.5969967223111516, "grad_norm": 0.8582795262336731, "learning_rate": 4.695464412888571e-05, "loss": 0.09296025633811951, "memory(GiB)": 122.96, "step": 34070, "token_acc": 0.9615834633385335, "train_speed(iter/s)": 0.236386 }, { "epoch": 2.5973778489214117, "grad_norm": 0.9182913303375244, "learning_rate": 4.694269300017239e-05, "loss": 0.1090484619140625, "memory(GiB)": 122.96, "step": 34075, "token_acc": 0.9610413935193857, "train_speed(iter/s)": 0.236392 }, { "epoch": 2.5977589755316717, "grad_norm": 0.5515517592430115, "learning_rate": 4.693074204678144e-05, "loss": 0.09550970792770386, "memory(GiB)": 122.96, "step": 34080, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.2364 }, { "epoch": 2.5981401021419317, "grad_norm": 0.982452392578125, "learning_rate": 4.6918791269398176e-05, "loss": 0.06814967393875122, "memory(GiB)": 122.96, "step": 34085, "token_acc": 0.9737440562332024, "train_speed(iter/s)": 0.236408 }, { "epoch": 2.5985212287521913, "grad_norm": 1.2840639352798462, "learning_rate": 4.6906840668707954e-05, "loss": 0.09271828532218933, "memory(GiB)": 122.96, "step": 34090, "token_acc": 0.9676825526692575, "train_speed(iter/s)": 0.236413 }, { "epoch": 2.5989023553624513, "grad_norm": 1.064578652381897, "learning_rate": 4.689489024539605e-05, "loss": 0.06382675766944886, "memory(GiB)": 122.96, "step": 34095, "token_acc": 0.9740885808978608, "train_speed(iter/s)": 0.236424 }, { "epoch": 2.5992834819727113, "grad_norm": 1.1126813888549805, "learning_rate": 4.68829400001478e-05, "loss": 0.07332289218902588, "memory(GiB)": 122.96, "step": 34100, "token_acc": 0.9697160883280758, "train_speed(iter/s)": 0.236436 }, { "epoch": 2.5996646085829713, "grad_norm": 2.0775439739227295, "learning_rate": 4.687098993364845e-05, "loss": 0.08412699699401856, "memory(GiB)": 122.96, "step": 34105, "token_acc": 0.9709972189114024, "train_speed(iter/s)": 0.236448 }, { "epoch": 2.6000457351932313, "grad_norm": 0.6218855381011963, "learning_rate": 4.685904004658333e-05, "loss": 0.049913495779037476, "memory(GiB)": 122.96, "step": 34110, "token_acc": 0.9776476849387973, "train_speed(iter/s)": 0.236454 }, { "epoch": 2.6004268618034914, "grad_norm": 1.2839341163635254, "learning_rate": 4.684709033963768e-05, "loss": 0.10951352119445801, "memory(GiB)": 122.96, "step": 34115, "token_acc": 0.9625977558653519, "train_speed(iter/s)": 0.236461 }, { "epoch": 2.600807988413751, "grad_norm": 1.1918044090270996, "learning_rate": 4.6835140813496756e-05, "loss": 0.10197027921676635, "memory(GiB)": 122.96, "step": 34120, "token_acc": 0.9588928922371438, "train_speed(iter/s)": 0.236469 }, { "epoch": 2.601189115024011, "grad_norm": 0.5429685115814209, "learning_rate": 4.682319146884583e-05, "loss": 0.049181267619132996, "memory(GiB)": 122.96, "step": 34125, "token_acc": 0.9732910819375283, "train_speed(iter/s)": 0.236479 }, { "epoch": 2.601570241634271, "grad_norm": 0.6397847533226013, "learning_rate": 4.681124230637014e-05, "loss": 0.06147398948669434, "memory(GiB)": 122.96, "step": 34130, "token_acc": 0.9713966123362097, "train_speed(iter/s)": 0.236482 }, { "epoch": 2.601951368244531, "grad_norm": 1.107353925704956, "learning_rate": 4.679929332675488e-05, "loss": 0.055589312314987184, "memory(GiB)": 122.96, "step": 34135, "token_acc": 0.9698383084577115, "train_speed(iter/s)": 0.23649 }, { "epoch": 2.6023324948547906, "grad_norm": 1.1172703504562378, "learning_rate": 4.678734453068531e-05, "loss": 0.14241528511047363, "memory(GiB)": 122.96, "step": 34140, "token_acc": 0.938179888564606, "train_speed(iter/s)": 0.236499 }, { "epoch": 2.6027136214650506, "grad_norm": 0.479903906583786, "learning_rate": 4.6775395918846615e-05, "loss": 0.07070747613906861, "memory(GiB)": 122.96, "step": 34145, "token_acc": 0.9689644416718652, "train_speed(iter/s)": 0.236503 }, { "epoch": 2.6030947480753106, "grad_norm": 1.0528767108917236, "learning_rate": 4.676344749192401e-05, "loss": 0.11165710687637329, "memory(GiB)": 122.96, "step": 34150, "token_acc": 0.9578863726658721, "train_speed(iter/s)": 0.236514 }, { "epoch": 2.6034758746855706, "grad_norm": 0.9184074401855469, "learning_rate": 4.675149925060268e-05, "loss": 0.05879241824150085, "memory(GiB)": 122.96, "step": 34155, "token_acc": 0.9746740596433389, "train_speed(iter/s)": 0.236518 }, { "epoch": 2.6038570012958306, "grad_norm": 1.0138955116271973, "learning_rate": 4.673955119556778e-05, "loss": 0.08034753799438477, "memory(GiB)": 122.96, "step": 34160, "token_acc": 0.964819033156163, "train_speed(iter/s)": 0.236527 }, { "epoch": 2.6042381279060907, "grad_norm": 0.942582368850708, "learning_rate": 4.672760332750449e-05, "loss": 0.11882693767547607, "memory(GiB)": 122.96, "step": 34165, "token_acc": 0.9565393988627132, "train_speed(iter/s)": 0.236535 }, { "epoch": 2.6046192545163502, "grad_norm": 0.8357480764389038, "learning_rate": 4.671565564709797e-05, "loss": 0.07874206304550171, "memory(GiB)": 122.96, "step": 34170, "token_acc": 0.967147575178168, "train_speed(iter/s)": 0.23654 }, { "epoch": 2.6050003811266103, "grad_norm": 0.4317990243434906, "learning_rate": 4.670370815503334e-05, "loss": 0.079747474193573, "memory(GiB)": 122.96, "step": 34175, "token_acc": 0.9684882329477463, "train_speed(iter/s)": 0.236549 }, { "epoch": 2.6053815077368703, "grad_norm": 1.011272668838501, "learning_rate": 4.669176085199578e-05, "loss": 0.08705169558525086, "memory(GiB)": 122.96, "step": 34180, "token_acc": 0.9692724807953005, "train_speed(iter/s)": 0.236552 }, { "epoch": 2.60576263434713, "grad_norm": 0.8848428130149841, "learning_rate": 4.6679813738670364e-05, "loss": 0.07391495704650879, "memory(GiB)": 122.96, "step": 34185, "token_acc": 0.9688715953307393, "train_speed(iter/s)": 0.236561 }, { "epoch": 2.60614376095739, "grad_norm": 0.760168731212616, "learning_rate": 4.6667866815742216e-05, "loss": 0.05986350178718567, "memory(GiB)": 122.96, "step": 34190, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.236561 }, { "epoch": 2.60652488756765, "grad_norm": 1.9547309875488281, "learning_rate": 4.6655920083896455e-05, "loss": 0.09228652715682983, "memory(GiB)": 122.96, "step": 34195, "token_acc": 0.9695381591876843, "train_speed(iter/s)": 0.236571 }, { "epoch": 2.60690601417791, "grad_norm": 0.6332152485847473, "learning_rate": 4.6643973543818166e-05, "loss": 0.07325916290283203, "memory(GiB)": 122.96, "step": 34200, "token_acc": 0.9658072773816054, "train_speed(iter/s)": 0.236577 }, { "epoch": 2.60690601417791, "eval_loss": 0.07826592773199081, "eval_runtime": 219.994, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9656195409915065, "step": 34200 }, { "epoch": 2.60728714078817, "grad_norm": 2.0419859886169434, "learning_rate": 4.6632027196192404e-05, "loss": 0.1078346848487854, "memory(GiB)": 122.96, "step": 34205, "token_acc": 0.9655729874710659, "train_speed(iter/s)": 0.236227 }, { "epoch": 2.60766826739843, "grad_norm": 0.9725856184959412, "learning_rate": 4.6620081041704256e-05, "loss": 0.07571849822998047, "memory(GiB)": 122.96, "step": 34210, "token_acc": 0.9690079164561226, "train_speed(iter/s)": 0.236232 }, { "epoch": 2.60804939400869, "grad_norm": 0.5028820633888245, "learning_rate": 4.660813508103879e-05, "loss": 0.10436105728149414, "memory(GiB)": 122.96, "step": 34215, "token_acc": 0.9638102623755977, "train_speed(iter/s)": 0.236235 }, { "epoch": 2.6084305206189495, "grad_norm": 1.1707018613815308, "learning_rate": 4.6596189314881025e-05, "loss": 0.10353643894195556, "memory(GiB)": 122.96, "step": 34220, "token_acc": 0.9608872555453472, "train_speed(iter/s)": 0.236236 }, { "epoch": 2.6088116472292096, "grad_norm": 1.5673580169677734, "learning_rate": 4.6584243743916e-05, "loss": 0.12903823852539062, "memory(GiB)": 122.96, "step": 34225, "token_acc": 0.9620152993932999, "train_speed(iter/s)": 0.236245 }, { "epoch": 2.6091927738394696, "grad_norm": 0.8953158855438232, "learning_rate": 4.6572298368828756e-05, "loss": 0.10350924730300903, "memory(GiB)": 122.96, "step": 34230, "token_acc": 0.9681528662420382, "train_speed(iter/s)": 0.236249 }, { "epoch": 2.609573900449729, "grad_norm": 2.281813144683838, "learning_rate": 4.6560353190304295e-05, "loss": 0.14158824682235718, "memory(GiB)": 122.96, "step": 34235, "token_acc": 0.9587194608256108, "train_speed(iter/s)": 0.236257 }, { "epoch": 2.609955027059989, "grad_norm": 0.7127488255500793, "learning_rate": 4.6548408209027604e-05, "loss": 0.09629364609718323, "memory(GiB)": 122.96, "step": 34240, "token_acc": 0.9737075857168236, "train_speed(iter/s)": 0.236262 }, { "epoch": 2.610336153670249, "grad_norm": 1.2858225107192993, "learning_rate": 4.65364634256837e-05, "loss": 0.10234858989715576, "memory(GiB)": 122.96, "step": 34245, "token_acc": 0.9632252792154726, "train_speed(iter/s)": 0.236271 }, { "epoch": 2.610717280280509, "grad_norm": 0.5620526671409607, "learning_rate": 4.652451884095754e-05, "loss": 0.07174274921417237, "memory(GiB)": 122.96, "step": 34250, "token_acc": 0.9766204675906481, "train_speed(iter/s)": 0.236276 }, { "epoch": 2.611098406890769, "grad_norm": 0.612727701663971, "learning_rate": 4.651257445553409e-05, "loss": 0.098134446144104, "memory(GiB)": 122.96, "step": 34255, "token_acc": 0.9655744504355039, "train_speed(iter/s)": 0.236279 }, { "epoch": 2.6114795335010292, "grad_norm": 1.245408535003662, "learning_rate": 4.6500630270098324e-05, "loss": 0.09175491333007812, "memory(GiB)": 122.96, "step": 34260, "token_acc": 0.9687647151153665, "train_speed(iter/s)": 0.236283 }, { "epoch": 2.611860660111289, "grad_norm": 1.2608333826065063, "learning_rate": 4.648868628533517e-05, "loss": 0.06942033767700195, "memory(GiB)": 122.96, "step": 34265, "token_acc": 0.9719435154217763, "train_speed(iter/s)": 0.236289 }, { "epoch": 2.612241786721549, "grad_norm": 0.10658451914787292, "learning_rate": 4.6476742501929536e-05, "loss": 0.09777196049690247, "memory(GiB)": 122.96, "step": 34270, "token_acc": 0.9512699350265801, "train_speed(iter/s)": 0.236298 }, { "epoch": 2.612622913331809, "grad_norm": 1.457679033279419, "learning_rate": 4.64647989205664e-05, "loss": 0.06456367373466491, "memory(GiB)": 122.96, "step": 34275, "token_acc": 0.9778663125276671, "train_speed(iter/s)": 0.236304 }, { "epoch": 2.613004039942069, "grad_norm": 0.9247164130210876, "learning_rate": 4.645285554193063e-05, "loss": 0.0771110475063324, "memory(GiB)": 122.96, "step": 34280, "token_acc": 0.9781893716461831, "train_speed(iter/s)": 0.236308 }, { "epoch": 2.6133851665523284, "grad_norm": 1.0543620586395264, "learning_rate": 4.644091236670714e-05, "loss": 0.09944462776184082, "memory(GiB)": 122.96, "step": 34285, "token_acc": 0.9634343434343434, "train_speed(iter/s)": 0.236314 }, { "epoch": 2.6137662931625885, "grad_norm": 0.7869449853897095, "learning_rate": 4.6428969395580806e-05, "loss": 0.05566803216934204, "memory(GiB)": 122.96, "step": 34290, "token_acc": 0.9765702891326022, "train_speed(iter/s)": 0.236323 }, { "epoch": 2.6141474197728485, "grad_norm": 0.6862897872924805, "learning_rate": 4.64170266292365e-05, "loss": 0.0813014566898346, "memory(GiB)": 122.96, "step": 34295, "token_acc": 0.9644549763033176, "train_speed(iter/s)": 0.23633 }, { "epoch": 2.6145285463831085, "grad_norm": 1.1799038648605347, "learning_rate": 4.64050840683591e-05, "loss": 0.10632283687591552, "memory(GiB)": 122.96, "step": 34300, "token_acc": 0.9444816053511705, "train_speed(iter/s)": 0.236341 }, { "epoch": 2.6149096729933685, "grad_norm": 0.7651465535163879, "learning_rate": 4.639314171363345e-05, "loss": 0.08166947364807128, "memory(GiB)": 122.96, "step": 34305, "token_acc": 0.9670388091440723, "train_speed(iter/s)": 0.236344 }, { "epoch": 2.6152907996036285, "grad_norm": 0.7753387093544006, "learning_rate": 4.6381199565744384e-05, "loss": 0.08617087006568909, "memory(GiB)": 122.96, "step": 34310, "token_acc": 0.9631986027944112, "train_speed(iter/s)": 0.236345 }, { "epoch": 2.615671926213888, "grad_norm": 0.48653778433799744, "learning_rate": 4.636925762537671e-05, "loss": 0.0615780770778656, "memory(GiB)": 122.96, "step": 34315, "token_acc": 0.9754108488817497, "train_speed(iter/s)": 0.236343 }, { "epoch": 2.616053052824148, "grad_norm": 0.7974960207939148, "learning_rate": 4.635731589321528e-05, "loss": 0.08526462316513062, "memory(GiB)": 122.96, "step": 34320, "token_acc": 0.9729839528742636, "train_speed(iter/s)": 0.236348 }, { "epoch": 2.616434179434408, "grad_norm": 0.8235100507736206, "learning_rate": 4.634537436994488e-05, "loss": 0.0655815064907074, "memory(GiB)": 122.96, "step": 34325, "token_acc": 0.9673704414587332, "train_speed(iter/s)": 0.236358 }, { "epoch": 2.616815306044668, "grad_norm": 0.983639657497406, "learning_rate": 4.633343305625029e-05, "loss": 0.0683695912361145, "memory(GiB)": 122.96, "step": 34330, "token_acc": 0.968528553563985, "train_speed(iter/s)": 0.236365 }, { "epoch": 2.6171964326549277, "grad_norm": 0.7712479829788208, "learning_rate": 4.6321491952816306e-05, "loss": 0.07709214687347413, "memory(GiB)": 122.96, "step": 34335, "token_acc": 0.9675601374570447, "train_speed(iter/s)": 0.236369 }, { "epoch": 2.6175775592651878, "grad_norm": 0.6983458399772644, "learning_rate": 4.6309551060327686e-05, "loss": 0.10948750972747803, "memory(GiB)": 122.96, "step": 34340, "token_acc": 0.9647239263803681, "train_speed(iter/s)": 0.236375 }, { "epoch": 2.6179586858754478, "grad_norm": 1.4490485191345215, "learning_rate": 4.6297610379469184e-05, "loss": 0.09112035036087036, "memory(GiB)": 122.96, "step": 34345, "token_acc": 0.9557222356641665, "train_speed(iter/s)": 0.236382 }, { "epoch": 2.618339812485708, "grad_norm": 0.7367695569992065, "learning_rate": 4.6285669910925555e-05, "loss": 0.07221097350120545, "memory(GiB)": 122.96, "step": 34350, "token_acc": 0.9737195759295206, "train_speed(iter/s)": 0.236387 }, { "epoch": 2.618720939095968, "grad_norm": 1.4930578470230103, "learning_rate": 4.6273729655381514e-05, "loss": 0.09764638543128967, "memory(GiB)": 122.96, "step": 34355, "token_acc": 0.9698052322231185, "train_speed(iter/s)": 0.236389 }, { "epoch": 2.619102065706228, "grad_norm": 2.204122543334961, "learning_rate": 4.626178961352178e-05, "loss": 0.08748858571052551, "memory(GiB)": 122.96, "step": 34360, "token_acc": 0.9633484162895928, "train_speed(iter/s)": 0.236397 }, { "epoch": 2.6194831923164874, "grad_norm": 1.1176056861877441, "learning_rate": 4.624984978603108e-05, "loss": 0.08757224082946777, "memory(GiB)": 122.96, "step": 34365, "token_acc": 0.9719454640797064, "train_speed(iter/s)": 0.236405 }, { "epoch": 2.6198643189267474, "grad_norm": 0.5771187543869019, "learning_rate": 4.623791017359408e-05, "loss": 0.0635875105857849, "memory(GiB)": 122.96, "step": 34370, "token_acc": 0.9740003586157432, "train_speed(iter/s)": 0.236413 }, { "epoch": 2.6202454455370074, "grad_norm": 0.8891420364379883, "learning_rate": 4.622597077689548e-05, "loss": 0.0915436565876007, "memory(GiB)": 122.96, "step": 34375, "token_acc": 0.9587258860475549, "train_speed(iter/s)": 0.23642 }, { "epoch": 2.6206265721472675, "grad_norm": 0.8134907484054565, "learning_rate": 4.621403159661993e-05, "loss": 0.08667811155319213, "memory(GiB)": 122.96, "step": 34380, "token_acc": 0.9635809113742375, "train_speed(iter/s)": 0.236425 }, { "epoch": 2.621007698757527, "grad_norm": 0.9435305595397949, "learning_rate": 4.620209263345211e-05, "loss": 0.06328303813934326, "memory(GiB)": 122.96, "step": 34385, "token_acc": 0.9718939635899074, "train_speed(iter/s)": 0.236435 }, { "epoch": 2.621388825367787, "grad_norm": 1.4771685600280762, "learning_rate": 4.619015388807665e-05, "loss": 0.10097131729125977, "memory(GiB)": 122.96, "step": 34390, "token_acc": 0.9589855979962429, "train_speed(iter/s)": 0.236439 }, { "epoch": 2.621769951978047, "grad_norm": 0.770915150642395, "learning_rate": 4.617821536117818e-05, "loss": 0.10863854885101318, "memory(GiB)": 122.96, "step": 34395, "token_acc": 0.9614181438998958, "train_speed(iter/s)": 0.236447 }, { "epoch": 2.622151078588307, "grad_norm": 0.6036670207977295, "learning_rate": 4.6166277053441324e-05, "loss": 0.06730060577392578, "memory(GiB)": 122.96, "step": 34400, "token_acc": 0.97227385377943, "train_speed(iter/s)": 0.236449 }, { "epoch": 2.622151078588307, "eval_loss": 0.07841651886701584, "eval_runtime": 221.542, "eval_samples_per_second": 2.392, "eval_steps_per_second": 2.392, "eval_token_acc": 0.965612011324619, "step": 34400 }, { "epoch": 2.622532205198567, "grad_norm": 1.0552517175674438, "learning_rate": 4.6154338965550675e-05, "loss": 0.07920050024986267, "memory(GiB)": 122.96, "step": 34405, "token_acc": 0.9656754962001033, "train_speed(iter/s)": 0.236101 }, { "epoch": 2.622913331808827, "grad_norm": 0.8265538811683655, "learning_rate": 4.614240109819086e-05, "loss": 0.09569973349571229, "memory(GiB)": 122.96, "step": 34410, "token_acc": 0.9692712906057945, "train_speed(iter/s)": 0.236107 }, { "epoch": 2.6232944584190867, "grad_norm": 0.9442800283432007, "learning_rate": 4.6130463452046434e-05, "loss": 0.08906965851783752, "memory(GiB)": 122.96, "step": 34415, "token_acc": 0.9592048401037165, "train_speed(iter/s)": 0.236113 }, { "epoch": 2.6236755850293467, "grad_norm": 0.81524258852005, "learning_rate": 4.611852602780198e-05, "loss": 0.1175311803817749, "memory(GiB)": 122.96, "step": 34420, "token_acc": 0.9656293543892244, "train_speed(iter/s)": 0.236121 }, { "epoch": 2.6240567116396067, "grad_norm": 0.35128647089004517, "learning_rate": 4.610658882614204e-05, "loss": 0.07521740198135377, "memory(GiB)": 122.96, "step": 34425, "token_acc": 0.9692671394799054, "train_speed(iter/s)": 0.23613 }, { "epoch": 2.6244378382498668, "grad_norm": 0.818597674369812, "learning_rate": 4.6094651847751174e-05, "loss": 0.07586504220962524, "memory(GiB)": 122.96, "step": 34430, "token_acc": 0.9737335834896811, "train_speed(iter/s)": 0.236142 }, { "epoch": 2.6248189648601263, "grad_norm": 0.611804723739624, "learning_rate": 4.6082715093313886e-05, "loss": 0.07701766490936279, "memory(GiB)": 122.96, "step": 34435, "token_acc": 0.9679613117727066, "train_speed(iter/s)": 0.236147 }, { "epoch": 2.6252000914703864, "grad_norm": 0.6534541249275208, "learning_rate": 4.6070778563514715e-05, "loss": 0.08279744982719421, "memory(GiB)": 122.96, "step": 34440, "token_acc": 0.9711340206185567, "train_speed(iter/s)": 0.236154 }, { "epoch": 2.6255812180806464, "grad_norm": 1.4544756412506104, "learning_rate": 4.605884225903817e-05, "loss": 0.10027914047241211, "memory(GiB)": 122.96, "step": 34445, "token_acc": 0.9603927986906711, "train_speed(iter/s)": 0.236165 }, { "epoch": 2.6259623446909064, "grad_norm": 0.6570330858230591, "learning_rate": 4.604690618056871e-05, "loss": 0.09477045536041259, "memory(GiB)": 122.96, "step": 34450, "token_acc": 0.9606537530266344, "train_speed(iter/s)": 0.236174 }, { "epoch": 2.6263434713011664, "grad_norm": 1.2199070453643799, "learning_rate": 4.6034970328790855e-05, "loss": 0.12491008043289184, "memory(GiB)": 122.96, "step": 34455, "token_acc": 0.9294392523364486, "train_speed(iter/s)": 0.236186 }, { "epoch": 2.6267245979114264, "grad_norm": 1.0403878688812256, "learning_rate": 4.602303470438905e-05, "loss": 0.11044152975082397, "memory(GiB)": 122.96, "step": 34460, "token_acc": 0.9571013087736306, "train_speed(iter/s)": 0.236193 }, { "epoch": 2.627105724521686, "grad_norm": 0.9080274105072021, "learning_rate": 4.601109930804773e-05, "loss": 0.09239108562469482, "memory(GiB)": 122.96, "step": 34465, "token_acc": 0.9684952632738488, "train_speed(iter/s)": 0.2362 }, { "epoch": 2.627486851131946, "grad_norm": 0.6978354454040527, "learning_rate": 4.599916414045138e-05, "loss": 0.07672637701034546, "memory(GiB)": 122.96, "step": 34470, "token_acc": 0.9695885509838998, "train_speed(iter/s)": 0.23621 }, { "epoch": 2.627867977742206, "grad_norm": 1.0192087888717651, "learning_rate": 4.598722920228439e-05, "loss": 0.08601888418197631, "memory(GiB)": 122.96, "step": 34475, "token_acc": 0.9651821862348178, "train_speed(iter/s)": 0.236214 }, { "epoch": 2.6282491043524656, "grad_norm": 1.034632921218872, "learning_rate": 4.5975294494231186e-05, "loss": 0.07509363889694214, "memory(GiB)": 122.96, "step": 34480, "token_acc": 0.9722658294086866, "train_speed(iter/s)": 0.236218 }, { "epoch": 2.6286302309627256, "grad_norm": 1.8488266468048096, "learning_rate": 4.596336001697615e-05, "loss": 0.15468306541442872, "memory(GiB)": 122.96, "step": 34485, "token_acc": 0.9424117085587019, "train_speed(iter/s)": 0.236227 }, { "epoch": 2.6290113575729857, "grad_norm": 0.2803126275539398, "learning_rate": 4.59514257712037e-05, "loss": 0.0838808000087738, "memory(GiB)": 122.96, "step": 34490, "token_acc": 0.9511868940153795, "train_speed(iter/s)": 0.236238 }, { "epoch": 2.6293924841832457, "grad_norm": 1.0675890445709229, "learning_rate": 4.59394917575982e-05, "loss": 0.06667059659957886, "memory(GiB)": 122.96, "step": 34495, "token_acc": 0.9711664482306684, "train_speed(iter/s)": 0.236249 }, { "epoch": 2.6297736107935057, "grad_norm": 1.1665489673614502, "learning_rate": 4.5927557976843985e-05, "loss": 0.07805976867675782, "memory(GiB)": 122.96, "step": 34500, "token_acc": 0.9691195795006571, "train_speed(iter/s)": 0.236257 }, { "epoch": 2.6301547374037657, "grad_norm": 0.6722052097320557, "learning_rate": 4.591562442962544e-05, "loss": 0.07837527990341187, "memory(GiB)": 122.96, "step": 34505, "token_acc": 0.9660901538880566, "train_speed(iter/s)": 0.236256 }, { "epoch": 2.6305358640140257, "grad_norm": 1.3864669799804688, "learning_rate": 4.5903691116626877e-05, "loss": 0.08717117309570313, "memory(GiB)": 122.96, "step": 34510, "token_acc": 0.9631190727081138, "train_speed(iter/s)": 0.236266 }, { "epoch": 2.6309169906242853, "grad_norm": 0.6589874029159546, "learning_rate": 4.5891758038532605e-05, "loss": 0.07707738876342773, "memory(GiB)": 122.96, "step": 34515, "token_acc": 0.9712643678160919, "train_speed(iter/s)": 0.236272 }, { "epoch": 2.6312981172345453, "grad_norm": 1.6244090795516968, "learning_rate": 4.587982519602696e-05, "loss": 0.09339557886123658, "memory(GiB)": 122.96, "step": 34520, "token_acc": 0.9690402476780186, "train_speed(iter/s)": 0.23628 }, { "epoch": 2.6316792438448053, "grad_norm": 1.2460570335388184, "learning_rate": 4.586789258979422e-05, "loss": 0.10717564821243286, "memory(GiB)": 122.96, "step": 34525, "token_acc": 0.9616153538584566, "train_speed(iter/s)": 0.236286 }, { "epoch": 2.632060370455065, "grad_norm": 1.3849495649337769, "learning_rate": 4.585596022051865e-05, "loss": 0.0729071855545044, "memory(GiB)": 122.96, "step": 34530, "token_acc": 0.9726708074534162, "train_speed(iter/s)": 0.236296 }, { "epoch": 2.632441497065325, "grad_norm": 1.2723926305770874, "learning_rate": 4.584402808888454e-05, "loss": 0.07066272497177124, "memory(GiB)": 122.96, "step": 34535, "token_acc": 0.9671686746987952, "train_speed(iter/s)": 0.236305 }, { "epoch": 2.632822623675585, "grad_norm": 1.6462208032608032, "learning_rate": 4.5832096195576127e-05, "loss": 0.1126067042350769, "memory(GiB)": 122.96, "step": 34540, "token_acc": 0.954985754985755, "train_speed(iter/s)": 0.236311 }, { "epoch": 2.633203750285845, "grad_norm": 0.8074650168418884, "learning_rate": 4.582016454127766e-05, "loss": 0.07615478038787842, "memory(GiB)": 122.96, "step": 34545, "token_acc": 0.9675218340611353, "train_speed(iter/s)": 0.23632 }, { "epoch": 2.633584876896105, "grad_norm": 0.7124179005622864, "learning_rate": 4.580823312667337e-05, "loss": 0.07518311142921448, "memory(GiB)": 122.96, "step": 34550, "token_acc": 0.9666004469828656, "train_speed(iter/s)": 0.236324 }, { "epoch": 2.633966003506365, "grad_norm": 2.025308847427368, "learning_rate": 4.579630195244744e-05, "loss": 0.10309662818908691, "memory(GiB)": 122.96, "step": 34555, "token_acc": 0.9636650868878357, "train_speed(iter/s)": 0.236331 }, { "epoch": 2.6343471301166246, "grad_norm": 0.6852911710739136, "learning_rate": 4.57843710192841e-05, "loss": 0.10235599279403687, "memory(GiB)": 122.96, "step": 34560, "token_acc": 0.9630718954248366, "train_speed(iter/s)": 0.236335 }, { "epoch": 2.6347282567268846, "grad_norm": 1.2011756896972656, "learning_rate": 4.577244032786752e-05, "loss": 0.1060101866722107, "memory(GiB)": 122.96, "step": 34565, "token_acc": 0.9709509899383317, "train_speed(iter/s)": 0.236339 }, { "epoch": 2.6351093833371446, "grad_norm": 1.8492302894592285, "learning_rate": 4.5760509878881855e-05, "loss": 0.09034164547920227, "memory(GiB)": 122.96, "step": 34570, "token_acc": 0.9711451758340848, "train_speed(iter/s)": 0.236348 }, { "epoch": 2.6354905099474046, "grad_norm": 0.9346972107887268, "learning_rate": 4.574857967301128e-05, "loss": 0.06038922071456909, "memory(GiB)": 122.96, "step": 34575, "token_acc": 0.9797410510281798, "train_speed(iter/s)": 0.236351 }, { "epoch": 2.635871636557664, "grad_norm": 0.839268147945404, "learning_rate": 4.5736649710939946e-05, "loss": 0.07569088339805603, "memory(GiB)": 122.96, "step": 34580, "token_acc": 0.963881636205396, "train_speed(iter/s)": 0.236357 }, { "epoch": 2.6362527631679242, "grad_norm": 1.0588699579238892, "learning_rate": 4.5724719993351944e-05, "loss": 0.053468060493469236, "memory(GiB)": 122.96, "step": 34585, "token_acc": 0.9770075497597803, "train_speed(iter/s)": 0.236367 }, { "epoch": 2.6366338897781842, "grad_norm": 0.828446626663208, "learning_rate": 4.571279052093143e-05, "loss": 0.06658456325531006, "memory(GiB)": 122.96, "step": 34590, "token_acc": 0.9712987817468511, "train_speed(iter/s)": 0.236374 }, { "epoch": 2.6370150163884443, "grad_norm": 0.724871814250946, "learning_rate": 4.570086129436248e-05, "loss": 0.08491186499595642, "memory(GiB)": 122.96, "step": 34595, "token_acc": 0.9675888034048126, "train_speed(iter/s)": 0.236379 }, { "epoch": 2.6373961429987043, "grad_norm": 0.9738374352455139, "learning_rate": 4.5688932314329187e-05, "loss": 0.10163453817367554, "memory(GiB)": 122.96, "step": 34600, "token_acc": 0.9572787650332076, "train_speed(iter/s)": 0.236385 }, { "epoch": 2.6373961429987043, "eval_loss": 0.0791676789522171, "eval_runtime": 221.4526, "eval_samples_per_second": 2.393, "eval_steps_per_second": 2.393, "eval_token_acc": 0.9654614179868682, "step": 34600 }, { "epoch": 2.6377772696089643, "grad_norm": 0.7109556198120117, "learning_rate": 4.56770035815156e-05, "loss": 0.08453396558761597, "memory(GiB)": 122.96, "step": 34605, "token_acc": 0.9658202078645666, "train_speed(iter/s)": 0.23603 }, { "epoch": 2.638158396219224, "grad_norm": 1.3445689678192139, "learning_rate": 4.566507509660582e-05, "loss": 0.06915705800056457, "memory(GiB)": 122.96, "step": 34610, "token_acc": 0.9722976643128735, "train_speed(iter/s)": 0.236041 }, { "epoch": 2.638539522829484, "grad_norm": 0.7363599538803101, "learning_rate": 4.565314686028386e-05, "loss": 0.11613163948059083, "memory(GiB)": 122.96, "step": 34615, "token_acc": 0.9658826570173171, "train_speed(iter/s)": 0.236048 }, { "epoch": 2.638920649439744, "grad_norm": 0.8980638980865479, "learning_rate": 4.5641218873233745e-05, "loss": 0.06864437460899353, "memory(GiB)": 122.96, "step": 34620, "token_acc": 0.9783931832014607, "train_speed(iter/s)": 0.236052 }, { "epoch": 2.639301776050004, "grad_norm": 0.8212231397628784, "learning_rate": 4.5629291136139515e-05, "loss": 0.10776060819625854, "memory(GiB)": 122.96, "step": 34625, "token_acc": 0.9527777777777777, "train_speed(iter/s)": 0.23606 }, { "epoch": 2.6396829026602635, "grad_norm": 0.8818819522857666, "learning_rate": 4.561736364968515e-05, "loss": 0.09207921624183654, "memory(GiB)": 122.96, "step": 34630, "token_acc": 0.9676248953391013, "train_speed(iter/s)": 0.236069 }, { "epoch": 2.6400640292705235, "grad_norm": 0.7599839568138123, "learning_rate": 4.5605436414554635e-05, "loss": 0.10315999984741211, "memory(GiB)": 122.96, "step": 34635, "token_acc": 0.9658712541620422, "train_speed(iter/s)": 0.236076 }, { "epoch": 2.6404451558807835, "grad_norm": 1.2587523460388184, "learning_rate": 4.559350943143196e-05, "loss": 0.0902387797832489, "memory(GiB)": 122.96, "step": 34640, "token_acc": 0.9664131812420785, "train_speed(iter/s)": 0.236087 }, { "epoch": 2.6408262824910436, "grad_norm": 0.5499386787414551, "learning_rate": 4.558158270100106e-05, "loss": 0.11591588258743286, "memory(GiB)": 122.96, "step": 34645, "token_acc": 0.967198711377947, "train_speed(iter/s)": 0.23609 }, { "epoch": 2.6412074091013036, "grad_norm": 0.7512726783752441, "learning_rate": 4.556965622394589e-05, "loss": 0.0806598722934723, "memory(GiB)": 122.96, "step": 34650, "token_acc": 0.9680350987151363, "train_speed(iter/s)": 0.2361 }, { "epoch": 2.6415885357115636, "grad_norm": 0.6224662661552429, "learning_rate": 4.5557730000950386e-05, "loss": 0.057606637477874756, "memory(GiB)": 122.96, "step": 34655, "token_acc": 0.9749192947603675, "train_speed(iter/s)": 0.236109 }, { "epoch": 2.641969662321823, "grad_norm": 0.6146666407585144, "learning_rate": 4.5545804032698444e-05, "loss": 0.08936265110969543, "memory(GiB)": 122.96, "step": 34660, "token_acc": 0.9682687820811946, "train_speed(iter/s)": 0.23611 }, { "epoch": 2.642350788932083, "grad_norm": 1.0363690853118896, "learning_rate": 4.553387831987398e-05, "loss": 0.09350624680519104, "memory(GiB)": 122.96, "step": 34665, "token_acc": 0.963336875664187, "train_speed(iter/s)": 0.236117 }, { "epoch": 2.642731915542343, "grad_norm": 0.6018379330635071, "learning_rate": 4.552195286316084e-05, "loss": 0.09945698380470276, "memory(GiB)": 122.96, "step": 34670, "token_acc": 0.9687707641196013, "train_speed(iter/s)": 0.236124 }, { "epoch": 2.6431130421526032, "grad_norm": 1.0576268434524536, "learning_rate": 4.5510027663242936e-05, "loss": 0.04971529841423035, "memory(GiB)": 122.96, "step": 34675, "token_acc": 0.9785310734463277, "train_speed(iter/s)": 0.23613 }, { "epoch": 2.643494168762863, "grad_norm": 0.6960410475730896, "learning_rate": 4.549810272080412e-05, "loss": 0.10189125537872315, "memory(GiB)": 122.96, "step": 34680, "token_acc": 0.9600430305824497, "train_speed(iter/s)": 0.236134 }, { "epoch": 2.643875295373123, "grad_norm": 0.993836522102356, "learning_rate": 4.5486178036528226e-05, "loss": 0.07377658486366272, "memory(GiB)": 122.96, "step": 34685, "token_acc": 0.975119482710149, "train_speed(iter/s)": 0.236136 }, { "epoch": 2.644256421983383, "grad_norm": 0.7220819592475891, "learning_rate": 4.547425361109906e-05, "loss": 0.10406787395477295, "memory(GiB)": 122.96, "step": 34690, "token_acc": 0.9599456890699253, "train_speed(iter/s)": 0.236142 }, { "epoch": 2.644637548593643, "grad_norm": 1.2464371919631958, "learning_rate": 4.5462329445200455e-05, "loss": 0.11260309219360351, "memory(GiB)": 122.96, "step": 34695, "token_acc": 0.9605858281283652, "train_speed(iter/s)": 0.236147 }, { "epoch": 2.645018675203903, "grad_norm": 1.0397944450378418, "learning_rate": 4.545040553951621e-05, "loss": 0.08344818353652954, "memory(GiB)": 122.96, "step": 34700, "token_acc": 0.9697944593386952, "train_speed(iter/s)": 0.236153 }, { "epoch": 2.645399801814163, "grad_norm": 1.0590509176254272, "learning_rate": 4.543848189473008e-05, "loss": 0.08904408216476441, "memory(GiB)": 122.96, "step": 34705, "token_acc": 0.9598864039758609, "train_speed(iter/s)": 0.236162 }, { "epoch": 2.6457809284244225, "grad_norm": 1.2606360912322998, "learning_rate": 4.542655851152584e-05, "loss": 0.057963895797729495, "memory(GiB)": 122.96, "step": 34710, "token_acc": 0.9766917293233083, "train_speed(iter/s)": 0.236171 }, { "epoch": 2.6461620550346825, "grad_norm": 0.48929363489151, "learning_rate": 4.541463539058726e-05, "loss": 0.07432994842529297, "memory(GiB)": 122.96, "step": 34715, "token_acc": 0.9669599867175827, "train_speed(iter/s)": 0.236176 }, { "epoch": 2.6465431816449425, "grad_norm": 1.1390858888626099, "learning_rate": 4.540271253259806e-05, "loss": 0.06637262105941773, "memory(GiB)": 122.96, "step": 34720, "token_acc": 0.9737895158063226, "train_speed(iter/s)": 0.236183 }, { "epoch": 2.6469243082552025, "grad_norm": 0.6493496894836426, "learning_rate": 4.539078993824195e-05, "loss": 0.09039289951324463, "memory(GiB)": 122.96, "step": 34725, "token_acc": 0.9628175236225304, "train_speed(iter/s)": 0.236182 }, { "epoch": 2.647305434865462, "grad_norm": 0.9344501495361328, "learning_rate": 4.537886760820266e-05, "loss": 0.09192712903022766, "memory(GiB)": 122.96, "step": 34730, "token_acc": 0.9593086064097948, "train_speed(iter/s)": 0.236192 }, { "epoch": 2.647686561475722, "grad_norm": 1.6318135261535645, "learning_rate": 4.5366945543163866e-05, "loss": 0.08626474142074585, "memory(GiB)": 122.96, "step": 34735, "token_acc": 0.966497461928934, "train_speed(iter/s)": 0.2362 }, { "epoch": 2.648067688085982, "grad_norm": 0.7969309091567993, "learning_rate": 4.535502374380924e-05, "loss": 0.08600993752479554, "memory(GiB)": 122.96, "step": 34740, "token_acc": 0.9753017120404154, "train_speed(iter/s)": 0.236208 }, { "epoch": 2.648448814696242, "grad_norm": 0.6688978672027588, "learning_rate": 4.534310221082245e-05, "loss": 0.08370350599288941, "memory(GiB)": 122.96, "step": 34745, "token_acc": 0.9664916229057264, "train_speed(iter/s)": 0.236211 }, { "epoch": 2.648829941306502, "grad_norm": 0.9750869870185852, "learning_rate": 4.533118094488715e-05, "loss": 0.0799898386001587, "memory(GiB)": 122.96, "step": 34750, "token_acc": 0.9691934925579785, "train_speed(iter/s)": 0.236221 }, { "epoch": 2.649211067916762, "grad_norm": 0.5955497026443481, "learning_rate": 4.531925994668693e-05, "loss": 0.04989679157733917, "memory(GiB)": 122.96, "step": 34755, "token_acc": 0.9765209940017138, "train_speed(iter/s)": 0.236225 }, { "epoch": 2.6495921945270218, "grad_norm": 1.219254732131958, "learning_rate": 4.530733921690545e-05, "loss": 0.09081840515136719, "memory(GiB)": 122.96, "step": 34760, "token_acc": 0.9646643109540636, "train_speed(iter/s)": 0.236235 }, { "epoch": 2.649973321137282, "grad_norm": 0.6814606189727783, "learning_rate": 4.5295418756226295e-05, "loss": 0.08430290222167969, "memory(GiB)": 122.96, "step": 34765, "token_acc": 0.9655781112091791, "train_speed(iter/s)": 0.236248 }, { "epoch": 2.650354447747542, "grad_norm": 0.6479282975196838, "learning_rate": 4.5283498565333034e-05, "loss": 0.07764970064163208, "memory(GiB)": 122.96, "step": 34770, "token_acc": 0.9713102632357291, "train_speed(iter/s)": 0.236252 }, { "epoch": 2.650735574357802, "grad_norm": 0.6928173899650574, "learning_rate": 4.527157864490923e-05, "loss": 0.06987233161926269, "memory(GiB)": 122.96, "step": 34775, "token_acc": 0.9752208989627353, "train_speed(iter/s)": 0.236257 }, { "epoch": 2.6511167009680614, "grad_norm": 1.6042349338531494, "learning_rate": 4.525965899563846e-05, "loss": 0.07601243257522583, "memory(GiB)": 122.96, "step": 34780, "token_acc": 0.9701639344262295, "train_speed(iter/s)": 0.236263 }, { "epoch": 2.6514978275783214, "grad_norm": 1.5826935768127441, "learning_rate": 4.5247739618204256e-05, "loss": 0.053401076793670656, "memory(GiB)": 122.96, "step": 34785, "token_acc": 0.9798668615034908, "train_speed(iter/s)": 0.236265 }, { "epoch": 2.6518789541885814, "grad_norm": 0.8021280169487, "learning_rate": 4.5235820513290114e-05, "loss": 0.10230897665023804, "memory(GiB)": 122.96, "step": 34790, "token_acc": 0.9583741429970617, "train_speed(iter/s)": 0.236269 }, { "epoch": 2.6522600807988415, "grad_norm": 1.4549806118011475, "learning_rate": 4.522390168157957e-05, "loss": 0.08673296570777893, "memory(GiB)": 122.96, "step": 34795, "token_acc": 0.9665292662819456, "train_speed(iter/s)": 0.236274 }, { "epoch": 2.6526412074091015, "grad_norm": 1.0394068956375122, "learning_rate": 4.521198312375611e-05, "loss": 0.10868560075759888, "memory(GiB)": 122.96, "step": 34800, "token_acc": 0.9650593990216632, "train_speed(iter/s)": 0.23628 }, { "epoch": 2.6526412074091015, "eval_loss": 0.07940098643302917, "eval_runtime": 223.0293, "eval_samples_per_second": 2.376, "eval_steps_per_second": 2.376, "eval_token_acc": 0.96582284199747, "step": 34800 }, { "epoch": 2.6530223340193615, "grad_norm": 1.1645346879959106, "learning_rate": 4.5200064840503166e-05, "loss": 0.09724587202072144, "memory(GiB)": 122.96, "step": 34805, "token_acc": 0.9657538616402135, "train_speed(iter/s)": 0.235925 }, { "epoch": 2.653403460629621, "grad_norm": 1.0690507888793945, "learning_rate": 4.518814683250425e-05, "loss": 0.09641849994659424, "memory(GiB)": 122.96, "step": 34810, "token_acc": 0.9623484365028717, "train_speed(iter/s)": 0.23593 }, { "epoch": 2.653784587239881, "grad_norm": 0.934097945690155, "learning_rate": 4.5176229100442775e-05, "loss": 0.08912160992622375, "memory(GiB)": 122.96, "step": 34815, "token_acc": 0.9702051739518287, "train_speed(iter/s)": 0.235935 }, { "epoch": 2.654165713850141, "grad_norm": 1.263421654701233, "learning_rate": 4.5164311645002187e-05, "loss": 0.10587561130523682, "memory(GiB)": 122.96, "step": 34820, "token_acc": 0.9575185434929198, "train_speed(iter/s)": 0.235944 }, { "epoch": 2.6545468404604007, "grad_norm": 0.6879926323890686, "learning_rate": 4.515239446686589e-05, "loss": 0.0826075553894043, "memory(GiB)": 122.96, "step": 34825, "token_acc": 0.9800173761946134, "train_speed(iter/s)": 0.235952 }, { "epoch": 2.6549279670706607, "grad_norm": 0.5690597891807556, "learning_rate": 4.514047756671726e-05, "loss": 0.075517076253891, "memory(GiB)": 122.96, "step": 34830, "token_acc": 0.9727932285368803, "train_speed(iter/s)": 0.23596 }, { "epoch": 2.6553090936809207, "grad_norm": 1.035203218460083, "learning_rate": 4.51285609452397e-05, "loss": 0.0786234200000763, "memory(GiB)": 122.96, "step": 34835, "token_acc": 0.9708981435022579, "train_speed(iter/s)": 0.235968 }, { "epoch": 2.6556902202911807, "grad_norm": 0.5182098150253296, "learning_rate": 4.5116644603116564e-05, "loss": 0.06986383199691773, "memory(GiB)": 122.96, "step": 34840, "token_acc": 0.9712529079428381, "train_speed(iter/s)": 0.235971 }, { "epoch": 2.6560713469014408, "grad_norm": 1.005347728729248, "learning_rate": 4.510472854103119e-05, "loss": 0.07075812816619872, "memory(GiB)": 122.96, "step": 34845, "token_acc": 0.9737609329446064, "train_speed(iter/s)": 0.23598 }, { "epoch": 2.6564524735117008, "grad_norm": 1.1412445306777954, "learning_rate": 4.509281275966692e-05, "loss": 0.07964443564414977, "memory(GiB)": 122.96, "step": 34850, "token_acc": 0.9679331716518459, "train_speed(iter/s)": 0.235983 }, { "epoch": 2.656833600121961, "grad_norm": 1.3200820684432983, "learning_rate": 4.508089725970708e-05, "loss": 0.06194390654563904, "memory(GiB)": 122.96, "step": 34855, "token_acc": 0.9711128344319504, "train_speed(iter/s)": 0.235988 }, { "epoch": 2.6572147267322204, "grad_norm": 0.878207266330719, "learning_rate": 4.506898204183494e-05, "loss": 0.07742317914962768, "memory(GiB)": 122.96, "step": 34860, "token_acc": 0.9720848056537102, "train_speed(iter/s)": 0.235998 }, { "epoch": 2.6575958533424804, "grad_norm": 0.9697880148887634, "learning_rate": 4.5057067106733804e-05, "loss": 0.07053429484367371, "memory(GiB)": 122.96, "step": 34865, "token_acc": 0.9701715137956749, "train_speed(iter/s)": 0.236009 }, { "epoch": 2.6579769799527404, "grad_norm": 1.1064485311508179, "learning_rate": 4.504515245508693e-05, "loss": 0.1030411958694458, "memory(GiB)": 122.96, "step": 34870, "token_acc": 0.9622799664710813, "train_speed(iter/s)": 0.236013 }, { "epoch": 2.658358106563, "grad_norm": 1.1469590663909912, "learning_rate": 4.5033238087577574e-05, "loss": 0.08747999668121338, "memory(GiB)": 122.96, "step": 34875, "token_acc": 0.9740829346092504, "train_speed(iter/s)": 0.236024 }, { "epoch": 2.65873923317326, "grad_norm": 1.5519354343414307, "learning_rate": 4.5021324004888946e-05, "loss": 0.11580581665039062, "memory(GiB)": 122.96, "step": 34880, "token_acc": 0.9597875569044007, "train_speed(iter/s)": 0.236031 }, { "epoch": 2.65912035978352, "grad_norm": 0.8507601022720337, "learning_rate": 4.500941020770431e-05, "loss": 0.10885385274887086, "memory(GiB)": 122.96, "step": 34885, "token_acc": 0.954343245141653, "train_speed(iter/s)": 0.236039 }, { "epoch": 2.65950148639378, "grad_norm": 0.9800879955291748, "learning_rate": 4.499749669670682e-05, "loss": 0.10078145265579223, "memory(GiB)": 122.96, "step": 34890, "token_acc": 0.9641991156753673, "train_speed(iter/s)": 0.236042 }, { "epoch": 2.65988261300404, "grad_norm": 0.9540125727653503, "learning_rate": 4.4985583472579677e-05, "loss": 0.10268805027008057, "memory(GiB)": 122.96, "step": 34895, "token_acc": 0.9660601367918441, "train_speed(iter/s)": 0.236045 }, { "epoch": 2.6602637396143, "grad_norm": 1.3989028930664062, "learning_rate": 4.497367053600607e-05, "loss": 0.100931715965271, "memory(GiB)": 122.96, "step": 34900, "token_acc": 0.9643870967741935, "train_speed(iter/s)": 0.236053 }, { "epoch": 2.6606448662245596, "grad_norm": 1.0339680910110474, "learning_rate": 4.4961757887669125e-05, "loss": 0.04833863079547882, "memory(GiB)": 122.96, "step": 34905, "token_acc": 0.9741492146596858, "train_speed(iter/s)": 0.236062 }, { "epoch": 2.6610259928348197, "grad_norm": 0.45887675881385803, "learning_rate": 4.494984552825198e-05, "loss": 0.05490332841873169, "memory(GiB)": 122.96, "step": 34910, "token_acc": 0.9739551786795881, "train_speed(iter/s)": 0.236071 }, { "epoch": 2.6614071194450797, "grad_norm": 0.45629504323005676, "learning_rate": 4.493793345843776e-05, "loss": 0.08238070011138916, "memory(GiB)": 122.96, "step": 34915, "token_acc": 0.9706798866855524, "train_speed(iter/s)": 0.236077 }, { "epoch": 2.6617882460553397, "grad_norm": 1.3043519258499146, "learning_rate": 4.4926021678909566e-05, "loss": 0.10910717248916627, "memory(GiB)": 122.96, "step": 34920, "token_acc": 0.9578012215435869, "train_speed(iter/s)": 0.236085 }, { "epoch": 2.6621693726655993, "grad_norm": 0.9262687563896179, "learning_rate": 4.491411019035048e-05, "loss": 0.05665872097015381, "memory(GiB)": 122.96, "step": 34925, "token_acc": 0.9800573514077163, "train_speed(iter/s)": 0.236086 }, { "epoch": 2.6625504992758593, "grad_norm": 0.7802358865737915, "learning_rate": 4.490219899344358e-05, "loss": 0.08006370067596436, "memory(GiB)": 122.96, "step": 34930, "token_acc": 0.967284904688305, "train_speed(iter/s)": 0.236094 }, { "epoch": 2.6629316258861193, "grad_norm": 1.1045026779174805, "learning_rate": 4.489028808887191e-05, "loss": 0.057595640420913696, "memory(GiB)": 122.96, "step": 34935, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.236101 }, { "epoch": 2.6633127524963793, "grad_norm": 0.6823529005050659, "learning_rate": 4.4878377477318486e-05, "loss": 0.049222621321678164, "memory(GiB)": 122.96, "step": 34940, "token_acc": 0.9706933523945676, "train_speed(iter/s)": 0.236113 }, { "epoch": 2.6636938791066394, "grad_norm": 0.6395122408866882, "learning_rate": 4.486646715946637e-05, "loss": 0.07279379963874817, "memory(GiB)": 122.96, "step": 34945, "token_acc": 0.9730856051397813, "train_speed(iter/s)": 0.236118 }, { "epoch": 2.6640750057168994, "grad_norm": 0.7404492497444153, "learning_rate": 4.4854557135998524e-05, "loss": 0.062296736240386966, "memory(GiB)": 122.96, "step": 34950, "token_acc": 0.97497308934338, "train_speed(iter/s)": 0.236128 }, { "epoch": 2.664456132327159, "grad_norm": 0.6181774735450745, "learning_rate": 4.484264740759796e-05, "loss": 0.08760073781013489, "memory(GiB)": 122.96, "step": 34955, "token_acc": 0.9676403708238587, "train_speed(iter/s)": 0.236132 }, { "epoch": 2.664837258937419, "grad_norm": 0.8381205797195435, "learning_rate": 4.483073797494764e-05, "loss": 0.08710880279541015, "memory(GiB)": 122.96, "step": 34960, "token_acc": 0.9685323703894764, "train_speed(iter/s)": 0.236141 }, { "epoch": 2.665218385547679, "grad_norm": 0.5959903001785278, "learning_rate": 4.4818828838730495e-05, "loss": 0.05549919009208679, "memory(GiB)": 122.96, "step": 34965, "token_acc": 0.970108695652174, "train_speed(iter/s)": 0.236148 }, { "epoch": 2.665599512157939, "grad_norm": 1.6724343299865723, "learning_rate": 4.480691999962948e-05, "loss": 0.09415339231491089, "memory(GiB)": 122.96, "step": 34970, "token_acc": 0.9653301886792452, "train_speed(iter/s)": 0.236156 }, { "epoch": 2.6659806387681986, "grad_norm": 0.5706411004066467, "learning_rate": 4.4795011458327506e-05, "loss": 0.10845847129821777, "memory(GiB)": 122.96, "step": 34975, "token_acc": 0.9669487043892121, "train_speed(iter/s)": 0.236166 }, { "epoch": 2.6663617653784586, "grad_norm": 1.0162218809127808, "learning_rate": 4.478310321550747e-05, "loss": 0.12566871643066407, "memory(GiB)": 122.96, "step": 34980, "token_acc": 0.9613578424795385, "train_speed(iter/s)": 0.236169 }, { "epoch": 2.6667428919887186, "grad_norm": 1.5870065689086914, "learning_rate": 4.4771195271852245e-05, "loss": 0.08670580983161927, "memory(GiB)": 122.96, "step": 34985, "token_acc": 0.9604340945059914, "train_speed(iter/s)": 0.236179 }, { "epoch": 2.6671240185989786, "grad_norm": 0.8754083514213562, "learning_rate": 4.475928762804472e-05, "loss": 0.09162315130233764, "memory(GiB)": 122.96, "step": 34990, "token_acc": 0.9638689048760991, "train_speed(iter/s)": 0.236186 }, { "epoch": 2.6675051452092386, "grad_norm": 1.5178778171539307, "learning_rate": 4.474738028476772e-05, "loss": 0.09392979741096497, "memory(GiB)": 122.96, "step": 34995, "token_acc": 0.9613100938517526, "train_speed(iter/s)": 0.236191 }, { "epoch": 2.6678862718194987, "grad_norm": 0.9080075621604919, "learning_rate": 4.473547324270409e-05, "loss": 0.062468111515045166, "memory(GiB)": 122.96, "step": 35000, "token_acc": 0.9735263702171665, "train_speed(iter/s)": 0.236199 }, { "epoch": 2.6678862718194987, "eval_loss": 0.07843136787414551, "eval_runtime": 221.7515, "eval_samples_per_second": 2.39, "eval_steps_per_second": 2.39, "eval_token_acc": 0.9663499186795976, "step": 35000 }, { "epoch": 2.6682673984297582, "grad_norm": 0.6307786107063293, "learning_rate": 4.4723566502536645e-05, "loss": 0.07848840951919556, "memory(GiB)": 122.96, "step": 35005, "token_acc": 0.9669153581649802, "train_speed(iter/s)": 0.235846 }, { "epoch": 2.6686485250400183, "grad_norm": 1.0642178058624268, "learning_rate": 4.471166006494817e-05, "loss": 0.0710215449333191, "memory(GiB)": 122.96, "step": 35010, "token_acc": 0.9727775414717141, "train_speed(iter/s)": 0.235856 }, { "epoch": 2.6690296516502783, "grad_norm": 1.6664384603500366, "learning_rate": 4.469975393062144e-05, "loss": 0.11612780094146728, "memory(GiB)": 122.96, "step": 35015, "token_acc": 0.9497319034852547, "train_speed(iter/s)": 0.235865 }, { "epoch": 2.6694107782605383, "grad_norm": 0.5637938380241394, "learning_rate": 4.468784810023924e-05, "loss": 0.07632217407226563, "memory(GiB)": 122.96, "step": 35020, "token_acc": 0.9769769769769769, "train_speed(iter/s)": 0.23587 }, { "epoch": 2.669791904870798, "grad_norm": 1.5170615911483765, "learning_rate": 4.46759425744843e-05, "loss": 0.11441630125045776, "memory(GiB)": 122.96, "step": 35025, "token_acc": 0.9504504504504504, "train_speed(iter/s)": 0.23588 }, { "epoch": 2.670173031481058, "grad_norm": 0.9731893539428711, "learning_rate": 4.4664037354039334e-05, "loss": 0.07293434739112854, "memory(GiB)": 122.96, "step": 35030, "token_acc": 0.9725170068027211, "train_speed(iter/s)": 0.235888 }, { "epoch": 2.670554158091318, "grad_norm": 0.5590035319328308, "learning_rate": 4.4652132439587074e-05, "loss": 0.1052127718925476, "memory(GiB)": 122.96, "step": 35035, "token_acc": 0.9679885332059245, "train_speed(iter/s)": 0.235892 }, { "epoch": 2.670935284701578, "grad_norm": 1.074646234512329, "learning_rate": 4.464022783181021e-05, "loss": 0.11190294027328491, "memory(GiB)": 122.96, "step": 35040, "token_acc": 0.9625638116846285, "train_speed(iter/s)": 0.2359 }, { "epoch": 2.671316411311838, "grad_norm": 1.4665356874465942, "learning_rate": 4.462832353139139e-05, "loss": 0.06761512160301208, "memory(GiB)": 122.96, "step": 35045, "token_acc": 0.9757510729613734, "train_speed(iter/s)": 0.235906 }, { "epoch": 2.671697537922098, "grad_norm": 0.6132632493972778, "learning_rate": 4.4616419539013286e-05, "loss": 0.0890347719192505, "memory(GiB)": 122.96, "step": 35050, "token_acc": 0.9690256615878108, "train_speed(iter/s)": 0.235906 }, { "epoch": 2.6720786645323575, "grad_norm": 1.828651785850525, "learning_rate": 4.460451585535855e-05, "loss": 0.10179684162139893, "memory(GiB)": 122.96, "step": 35055, "token_acc": 0.9708496267330252, "train_speed(iter/s)": 0.235913 }, { "epoch": 2.6724597911426176, "grad_norm": 0.4339956045150757, "learning_rate": 4.45926124811098e-05, "loss": 0.12122589349746704, "memory(GiB)": 122.96, "step": 35060, "token_acc": 0.9500728104847098, "train_speed(iter/s)": 0.235921 }, { "epoch": 2.6728409177528776, "grad_norm": 0.9354884028434753, "learning_rate": 4.4580709416949606e-05, "loss": 0.09570226073265076, "memory(GiB)": 122.96, "step": 35065, "token_acc": 0.967329220927576, "train_speed(iter/s)": 0.235928 }, { "epoch": 2.6732220443631376, "grad_norm": 1.366164207458496, "learning_rate": 4.456880666356057e-05, "loss": 0.11747571229934692, "memory(GiB)": 122.96, "step": 35070, "token_acc": 0.9557640750670241, "train_speed(iter/s)": 0.235934 }, { "epoch": 2.673603170973397, "grad_norm": 0.7041143774986267, "learning_rate": 4.4556904221625294e-05, "loss": 0.1636034607887268, "memory(GiB)": 122.96, "step": 35075, "token_acc": 0.940415964024733, "train_speed(iter/s)": 0.235943 }, { "epoch": 2.673984297583657, "grad_norm": 1.0360685586929321, "learning_rate": 4.4545002091826307e-05, "loss": 0.07659584879875184, "memory(GiB)": 122.96, "step": 35080, "token_acc": 0.9676798768757214, "train_speed(iter/s)": 0.235953 }, { "epoch": 2.674365424193917, "grad_norm": 0.9309723377227783, "learning_rate": 4.453310027484612e-05, "loss": 0.10948672294616699, "memory(GiB)": 122.96, "step": 35085, "token_acc": 0.9570124481327801, "train_speed(iter/s)": 0.235958 }, { "epoch": 2.6747465508041772, "grad_norm": 0.9038699269294739, "learning_rate": 4.452119877136727e-05, "loss": 0.08245308995246887, "memory(GiB)": 122.96, "step": 35090, "token_acc": 0.9656876200933296, "train_speed(iter/s)": 0.235966 }, { "epoch": 2.6751276774144372, "grad_norm": 0.8327280879020691, "learning_rate": 4.450929758207225e-05, "loss": 0.11698803901672364, "memory(GiB)": 122.96, "step": 35095, "token_acc": 0.9545177045177046, "train_speed(iter/s)": 0.235974 }, { "epoch": 2.6755088040246973, "grad_norm": 0.5915766358375549, "learning_rate": 4.449739670764353e-05, "loss": 0.08347607851028442, "memory(GiB)": 122.96, "step": 35100, "token_acc": 0.9618403837767118, "train_speed(iter/s)": 0.235983 }, { "epoch": 2.675889930634957, "grad_norm": 0.9511887431144714, "learning_rate": 4.448549614876356e-05, "loss": 0.07888557910919189, "memory(GiB)": 122.96, "step": 35105, "token_acc": 0.9698665518725785, "train_speed(iter/s)": 0.235986 }, { "epoch": 2.676271057245217, "grad_norm": 1.0451788902282715, "learning_rate": 4.447359590611481e-05, "loss": 0.10269246101379395, "memory(GiB)": 122.96, "step": 35110, "token_acc": 0.9631013545072397, "train_speed(iter/s)": 0.235992 }, { "epoch": 2.676652183855477, "grad_norm": 1.0318127870559692, "learning_rate": 4.446169598037969e-05, "loss": 0.11112785339355469, "memory(GiB)": 122.96, "step": 35115, "token_acc": 0.9528130671506352, "train_speed(iter/s)": 0.235999 }, { "epoch": 2.6770333104657364, "grad_norm": 0.5985041856765747, "learning_rate": 4.44497963722406e-05, "loss": 0.07435340881347656, "memory(GiB)": 122.96, "step": 35120, "token_acc": 0.9672811825278192, "train_speed(iter/s)": 0.236007 }, { "epoch": 2.6774144370759965, "grad_norm": 1.1406794786453247, "learning_rate": 4.443789708237993e-05, "loss": 0.09671497344970703, "memory(GiB)": 122.96, "step": 35125, "token_acc": 0.965949565465166, "train_speed(iter/s)": 0.236009 }, { "epoch": 2.6777955636862565, "grad_norm": 0.7000481486320496, "learning_rate": 4.442599811148006e-05, "loss": 0.09976692199707031, "memory(GiB)": 122.96, "step": 35130, "token_acc": 0.9629327902240326, "train_speed(iter/s)": 0.236016 }, { "epoch": 2.6781766902965165, "grad_norm": 1.3462473154067993, "learning_rate": 4.441409946022332e-05, "loss": 0.08998606204986573, "memory(GiB)": 122.96, "step": 35135, "token_acc": 0.9713412304165074, "train_speed(iter/s)": 0.23602 }, { "epoch": 2.6785578169067765, "grad_norm": 0.6552445292472839, "learning_rate": 4.440220112929207e-05, "loss": 0.06629632711410523, "memory(GiB)": 122.96, "step": 35140, "token_acc": 0.9716259560819146, "train_speed(iter/s)": 0.236028 }, { "epoch": 2.6789389435170365, "grad_norm": 1.8415154218673706, "learning_rate": 4.4390303119368595e-05, "loss": 0.07014608979225159, "memory(GiB)": 122.96, "step": 35145, "token_acc": 0.9727582292849035, "train_speed(iter/s)": 0.236036 }, { "epoch": 2.6793200701272966, "grad_norm": 0.6169350743293762, "learning_rate": 4.43784054311352e-05, "loss": 0.07307873964309693, "memory(GiB)": 122.96, "step": 35150, "token_acc": 0.9694462126034373, "train_speed(iter/s)": 0.236044 }, { "epoch": 2.679701196737556, "grad_norm": 0.9401827454566956, "learning_rate": 4.436650806527417e-05, "loss": 0.07986955642700196, "memory(GiB)": 122.96, "step": 35155, "token_acc": 0.9596744358120607, "train_speed(iter/s)": 0.236054 }, { "epoch": 2.680082323347816, "grad_norm": 0.6553975939750671, "learning_rate": 4.435461102246777e-05, "loss": 0.069408118724823, "memory(GiB)": 122.96, "step": 35160, "token_acc": 0.97165991902834, "train_speed(iter/s)": 0.236058 }, { "epoch": 2.680463449958076, "grad_norm": 0.9611994624137878, "learning_rate": 4.434271430339823e-05, "loss": 0.08336496353149414, "memory(GiB)": 122.96, "step": 35165, "token_acc": 0.9704125950054289, "train_speed(iter/s)": 0.236065 }, { "epoch": 2.6808445765683357, "grad_norm": 1.9711670875549316, "learning_rate": 4.433081790874776e-05, "loss": 0.14030874967575074, "memory(GiB)": 122.96, "step": 35170, "token_acc": 0.9618764172335601, "train_speed(iter/s)": 0.236069 }, { "epoch": 2.6812257031785958, "grad_norm": 0.95865398645401, "learning_rate": 4.431892183919858e-05, "loss": 0.10978789329528808, "memory(GiB)": 122.96, "step": 35175, "token_acc": 0.9548705302096178, "train_speed(iter/s)": 0.236077 }, { "epoch": 2.681606829788856, "grad_norm": 0.9708633422851562, "learning_rate": 4.430702609543288e-05, "loss": 0.055616730451583864, "memory(GiB)": 122.96, "step": 35180, "token_acc": 0.9767213918157315, "train_speed(iter/s)": 0.236083 }, { "epoch": 2.681987956399116, "grad_norm": 1.6855332851409912, "learning_rate": 4.429513067813279e-05, "loss": 0.08995945453643799, "memory(GiB)": 122.96, "step": 35185, "token_acc": 0.965467102871683, "train_speed(iter/s)": 0.236088 }, { "epoch": 2.682369083009376, "grad_norm": 1.0304208993911743, "learning_rate": 4.4283235587980496e-05, "loss": 0.08290210366249084, "memory(GiB)": 122.96, "step": 35190, "token_acc": 0.9692720399951226, "train_speed(iter/s)": 0.236089 }, { "epoch": 2.682750209619636, "grad_norm": 1.3022607564926147, "learning_rate": 4.427134082565812e-05, "loss": 0.08793768286705017, "memory(GiB)": 122.96, "step": 35195, "token_acc": 0.9703631010794896, "train_speed(iter/s)": 0.236093 }, { "epoch": 2.6831313362298954, "grad_norm": 1.4607073068618774, "learning_rate": 4.4259446391847746e-05, "loss": 0.054417884349823, "memory(GiB)": 122.96, "step": 35200, "token_acc": 0.9796545105566219, "train_speed(iter/s)": 0.236103 }, { "epoch": 2.6831313362298954, "eval_loss": 0.07838715612888336, "eval_runtime": 221.4811, "eval_samples_per_second": 2.393, "eval_steps_per_second": 2.393, "eval_token_acc": 0.965905668333233, "step": 35200 }, { "epoch": 2.6835124628401554, "grad_norm": 1.0164872407913208, "learning_rate": 4.424755228723148e-05, "loss": 0.09090142250061035, "memory(GiB)": 122.96, "step": 35205, "token_acc": 0.9659274734681146, "train_speed(iter/s)": 0.23576 }, { "epoch": 2.6838935894504155, "grad_norm": 1.0169868469238281, "learning_rate": 4.42356585124914e-05, "loss": 0.09625781774520874, "memory(GiB)": 122.96, "step": 35210, "token_acc": 0.9569789674952199, "train_speed(iter/s)": 0.235768 }, { "epoch": 2.6842747160606755, "grad_norm": 1.0060405731201172, "learning_rate": 4.422376506830957e-05, "loss": 0.07743846774101257, "memory(GiB)": 122.96, "step": 35215, "token_acc": 0.9656850192061459, "train_speed(iter/s)": 0.235776 }, { "epoch": 2.684655842670935, "grad_norm": 0.9206529259681702, "learning_rate": 4.4211871955368004e-05, "loss": 0.12534357309341432, "memory(GiB)": 122.96, "step": 35220, "token_acc": 0.9616989002654531, "train_speed(iter/s)": 0.235786 }, { "epoch": 2.685036969281195, "grad_norm": 0.8882681727409363, "learning_rate": 4.419997917434871e-05, "loss": 0.08128911256790161, "memory(GiB)": 122.96, "step": 35225, "token_acc": 0.9594957081545065, "train_speed(iter/s)": 0.235794 }, { "epoch": 2.685418095891455, "grad_norm": 0.9535582661628723, "learning_rate": 4.4188086725933704e-05, "loss": 0.09914633631706238, "memory(GiB)": 122.96, "step": 35230, "token_acc": 0.9584382871536524, "train_speed(iter/s)": 0.235802 }, { "epoch": 2.685799222501715, "grad_norm": 0.8457511067390442, "learning_rate": 4.417619461080495e-05, "loss": 0.06210626363754272, "memory(GiB)": 122.96, "step": 35235, "token_acc": 0.9732243517474634, "train_speed(iter/s)": 0.23581 }, { "epoch": 2.686180349111975, "grad_norm": 0.745723307132721, "learning_rate": 4.41643028296444e-05, "loss": 0.05522758960723877, "memory(GiB)": 122.96, "step": 35240, "token_acc": 0.9686977299880526, "train_speed(iter/s)": 0.235815 }, { "epoch": 2.686561475722235, "grad_norm": 0.7693453431129456, "learning_rate": 4.415241138313402e-05, "loss": 0.06324333548545838, "memory(GiB)": 122.96, "step": 35245, "token_acc": 0.9785001279754287, "train_speed(iter/s)": 0.235823 }, { "epoch": 2.6869426023324947, "grad_norm": 0.8835068941116333, "learning_rate": 4.414052027195571e-05, "loss": 0.06384479999542236, "memory(GiB)": 122.96, "step": 35250, "token_acc": 0.9685055165496489, "train_speed(iter/s)": 0.235829 }, { "epoch": 2.6873237289427547, "grad_norm": 1.2129287719726562, "learning_rate": 4.412862949679135e-05, "loss": 0.09133310914039612, "memory(GiB)": 122.96, "step": 35255, "token_acc": 0.970947705870567, "train_speed(iter/s)": 0.235833 }, { "epoch": 2.6877048555530147, "grad_norm": 0.8428898453712463, "learning_rate": 4.4116739058322864e-05, "loss": 0.07776565551757812, "memory(GiB)": 122.96, "step": 35260, "token_acc": 0.9685181471826749, "train_speed(iter/s)": 0.235839 }, { "epoch": 2.6880859821632748, "grad_norm": 0.7238909602165222, "learning_rate": 4.4104848957232086e-05, "loss": 0.07144677639007568, "memory(GiB)": 122.96, "step": 35265, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.235848 }, { "epoch": 2.6884671087735343, "grad_norm": 1.4484970569610596, "learning_rate": 4.4092959194200864e-05, "loss": 0.07191640138626099, "memory(GiB)": 122.96, "step": 35270, "token_acc": 0.9660889223813113, "train_speed(iter/s)": 0.235858 }, { "epoch": 2.6888482353837944, "grad_norm": 0.7443024516105652, "learning_rate": 4.408106976991101e-05, "loss": 0.08877195715904236, "memory(GiB)": 122.96, "step": 35275, "token_acc": 0.963632972771354, "train_speed(iter/s)": 0.235864 }, { "epoch": 2.6892293619940544, "grad_norm": 0.6300280690193176, "learning_rate": 4.4069180685044354e-05, "loss": 0.10613809823989868, "memory(GiB)": 122.96, "step": 35280, "token_acc": 0.9689975887013434, "train_speed(iter/s)": 0.235874 }, { "epoch": 2.6896104886043144, "grad_norm": 0.6633759140968323, "learning_rate": 4.405729194028265e-05, "loss": 0.08017803430557251, "memory(GiB)": 122.96, "step": 35285, "token_acc": 0.9680973524091432, "train_speed(iter/s)": 0.235878 }, { "epoch": 2.6899916152145744, "grad_norm": 1.4697518348693848, "learning_rate": 4.404540353630768e-05, "loss": 0.1642252564430237, "memory(GiB)": 122.96, "step": 35290, "token_acc": 0.9571195947168446, "train_speed(iter/s)": 0.235882 }, { "epoch": 2.6903727418248344, "grad_norm": 1.1984577178955078, "learning_rate": 4.4033515473801186e-05, "loss": 0.09471780061721802, "memory(GiB)": 122.96, "step": 35295, "token_acc": 0.9654477397063058, "train_speed(iter/s)": 0.235885 }, { "epoch": 2.690753868435094, "grad_norm": 1.1482563018798828, "learning_rate": 4.402162775344489e-05, "loss": 0.08634262084960938, "memory(GiB)": 122.96, "step": 35300, "token_acc": 0.9611430921052632, "train_speed(iter/s)": 0.235893 }, { "epoch": 2.691134995045354, "grad_norm": 1.1681169271469116, "learning_rate": 4.4009740375920496e-05, "loss": 0.13199710845947266, "memory(GiB)": 122.96, "step": 35305, "token_acc": 0.9558158651870119, "train_speed(iter/s)": 0.2359 }, { "epoch": 2.691516121655614, "grad_norm": 0.6707178354263306, "learning_rate": 4.39978533419097e-05, "loss": 0.06836973428726197, "memory(GiB)": 122.96, "step": 35310, "token_acc": 0.9733653269346131, "train_speed(iter/s)": 0.235903 }, { "epoch": 2.691897248265874, "grad_norm": 0.9165451526641846, "learning_rate": 4.3985966652094165e-05, "loss": 0.057117462158203125, "memory(GiB)": 122.96, "step": 35315, "token_acc": 0.9778398185307974, "train_speed(iter/s)": 0.235909 }, { "epoch": 2.6922783748761336, "grad_norm": 1.8033150434494019, "learning_rate": 4.3974080307155514e-05, "loss": 0.1039546012878418, "memory(GiB)": 122.96, "step": 35320, "token_acc": 0.9637387387387387, "train_speed(iter/s)": 0.235916 }, { "epoch": 2.6926595014863937, "grad_norm": 0.8163173794746399, "learning_rate": 4.396219430777541e-05, "loss": 0.052589583396911624, "memory(GiB)": 122.96, "step": 35325, "token_acc": 0.9711415134672937, "train_speed(iter/s)": 0.235923 }, { "epoch": 2.6930406280966537, "grad_norm": 0.9008318185806274, "learning_rate": 4.3950308654635444e-05, "loss": 0.0991385817527771, "memory(GiB)": 122.96, "step": 35330, "token_acc": 0.9486984609538286, "train_speed(iter/s)": 0.235931 }, { "epoch": 2.6934217547069137, "grad_norm": 0.9078161716461182, "learning_rate": 4.39384233484172e-05, "loss": 0.10175211429595947, "memory(GiB)": 122.96, "step": 35335, "token_acc": 0.958712259003274, "train_speed(iter/s)": 0.235935 }, { "epoch": 2.6938028813171737, "grad_norm": 1.0569475889205933, "learning_rate": 4.392653838980223e-05, "loss": 0.12729686498641968, "memory(GiB)": 122.96, "step": 35340, "token_acc": 0.9584384740982633, "train_speed(iter/s)": 0.235938 }, { "epoch": 2.6941840079274337, "grad_norm": 1.5204691886901855, "learning_rate": 4.391465377947211e-05, "loss": 0.09144845008850097, "memory(GiB)": 122.96, "step": 35345, "token_acc": 0.9566058595909342, "train_speed(iter/s)": 0.235946 }, { "epoch": 2.6945651345376933, "grad_norm": 1.054661750793457, "learning_rate": 4.390276951810837e-05, "loss": 0.09276120662689209, "memory(GiB)": 122.96, "step": 35350, "token_acc": 0.9654731457800512, "train_speed(iter/s)": 0.235951 }, { "epoch": 2.6949462611479533, "grad_norm": 0.9206229448318481, "learning_rate": 4.3890885606392495e-05, "loss": 0.0787214994430542, "memory(GiB)": 122.96, "step": 35355, "token_acc": 0.9708500222518914, "train_speed(iter/s)": 0.23596 }, { "epoch": 2.6953273877582133, "grad_norm": 0.46435466408729553, "learning_rate": 4.3879002045005974e-05, "loss": 0.09731523394584655, "memory(GiB)": 122.96, "step": 35360, "token_acc": 0.9547619047619048, "train_speed(iter/s)": 0.235968 }, { "epoch": 2.6957085143684734, "grad_norm": 1.0577620267868042, "learning_rate": 4.386711883463029e-05, "loss": 0.08033180236816406, "memory(GiB)": 122.96, "step": 35365, "token_acc": 0.9697762970014279, "train_speed(iter/s)": 0.235975 }, { "epoch": 2.696089640978733, "grad_norm": 0.63516765832901, "learning_rate": 4.3855235975946885e-05, "loss": 0.07029534578323364, "memory(GiB)": 122.96, "step": 35370, "token_acc": 0.9751265365148228, "train_speed(iter/s)": 0.235977 }, { "epoch": 2.696470767588993, "grad_norm": 0.7372981309890747, "learning_rate": 4.384335346963718e-05, "loss": 0.0874361515045166, "memory(GiB)": 122.96, "step": 35375, "token_acc": 0.969290082424887, "train_speed(iter/s)": 0.23598 }, { "epoch": 2.696851894199253, "grad_norm": 0.8637972474098206, "learning_rate": 4.383147131638257e-05, "loss": 0.07841692566871643, "memory(GiB)": 122.96, "step": 35380, "token_acc": 0.974401913875598, "train_speed(iter/s)": 0.235986 }, { "epoch": 2.697233020809513, "grad_norm": 1.342454195022583, "learning_rate": 4.381958951686447e-05, "loss": 0.12298245429992676, "memory(GiB)": 122.96, "step": 35385, "token_acc": 0.949094781682641, "train_speed(iter/s)": 0.235994 }, { "epoch": 2.697614147419773, "grad_norm": 0.06202472001314163, "learning_rate": 4.380770807176422e-05, "loss": 0.04612143635749817, "memory(GiB)": 122.96, "step": 35390, "token_acc": 0.972885032537961, "train_speed(iter/s)": 0.236002 }, { "epoch": 2.697995274030033, "grad_norm": 1.1343315839767456, "learning_rate": 4.3795826981763176e-05, "loss": 0.07536575198173523, "memory(GiB)": 122.96, "step": 35395, "token_acc": 0.9724073787009766, "train_speed(iter/s)": 0.236004 }, { "epoch": 2.6983764006402926, "grad_norm": 0.5838271975517273, "learning_rate": 4.3783946247542674e-05, "loss": 0.06245092153549194, "memory(GiB)": 122.96, "step": 35400, "token_acc": 0.9750859106529209, "train_speed(iter/s)": 0.236012 }, { "epoch": 2.6983764006402926, "eval_loss": 0.07676771283149719, "eval_runtime": 221.0225, "eval_samples_per_second": 2.398, "eval_steps_per_second": 2.398, "eval_token_acc": 0.9666887536895368, "step": 35400 }, { "epoch": 2.6987575272505526, "grad_norm": 0.6321262717247009, "learning_rate": 4.3772065869784006e-05, "loss": 0.07267065048217773, "memory(GiB)": 122.96, "step": 35405, "token_acc": 0.9667638816591676, "train_speed(iter/s)": 0.235667 }, { "epoch": 2.6991386538608126, "grad_norm": 0.6317225694656372, "learning_rate": 4.376018584916845e-05, "loss": 0.10444469451904297, "memory(GiB)": 122.96, "step": 35410, "token_acc": 0.9656783468104223, "train_speed(iter/s)": 0.235673 }, { "epoch": 2.699519780471072, "grad_norm": 0.7578514814376831, "learning_rate": 4.3748306186377287e-05, "loss": 0.07969510555267334, "memory(GiB)": 122.96, "step": 35415, "token_acc": 0.9730958007634976, "train_speed(iter/s)": 0.235676 }, { "epoch": 2.6999009070813322, "grad_norm": 0.682076096534729, "learning_rate": 4.373642688209175e-05, "loss": 0.07491014003753663, "memory(GiB)": 122.96, "step": 35420, "token_acc": 0.9719636524822695, "train_speed(iter/s)": 0.235677 }, { "epoch": 2.7002820336915923, "grad_norm": 0.5768642425537109, "learning_rate": 4.3724547936993064e-05, "loss": 0.07084066867828369, "memory(GiB)": 122.96, "step": 35425, "token_acc": 0.9730959446092977, "train_speed(iter/s)": 0.235676 }, { "epoch": 2.7006631603018523, "grad_norm": 1.3360315561294556, "learning_rate": 4.371266935176244e-05, "loss": 0.08262470364570618, "memory(GiB)": 122.96, "step": 35430, "token_acc": 0.9668810289389067, "train_speed(iter/s)": 0.23568 }, { "epoch": 2.7010442869121123, "grad_norm": 1.9561270475387573, "learning_rate": 4.3700791127081046e-05, "loss": 0.11395291090011597, "memory(GiB)": 122.96, "step": 35435, "token_acc": 0.9612511671335201, "train_speed(iter/s)": 0.23569 }, { "epoch": 2.7014254135223723, "grad_norm": 0.5722203850746155, "learning_rate": 4.368891326363005e-05, "loss": 0.08316536545753479, "memory(GiB)": 122.96, "step": 35440, "token_acc": 0.9677377049180328, "train_speed(iter/s)": 0.235693 }, { "epoch": 2.7018065401326323, "grad_norm": 1.0655966997146606, "learning_rate": 4.3677035762090584e-05, "loss": 0.09825537204742432, "memory(GiB)": 122.96, "step": 35445, "token_acc": 0.9580042381044115, "train_speed(iter/s)": 0.235699 }, { "epoch": 2.702187666742892, "grad_norm": 0.6710293889045715, "learning_rate": 4.3665158623143784e-05, "loss": 0.110711669921875, "memory(GiB)": 122.96, "step": 35450, "token_acc": 0.9649851632047478, "train_speed(iter/s)": 0.235705 }, { "epoch": 2.702568793353152, "grad_norm": 1.0139358043670654, "learning_rate": 4.365328184747074e-05, "loss": 0.06675156354904174, "memory(GiB)": 122.96, "step": 35455, "token_acc": 0.9730354391371341, "train_speed(iter/s)": 0.235711 }, { "epoch": 2.702949919963412, "grad_norm": 0.40842220187187195, "learning_rate": 4.364140543575252e-05, "loss": 0.06650227308273315, "memory(GiB)": 122.96, "step": 35460, "token_acc": 0.9776948114110664, "train_speed(iter/s)": 0.235711 }, { "epoch": 2.7033310465736715, "grad_norm": 1.171823263168335, "learning_rate": 4.36295293886702e-05, "loss": 0.10579248666763305, "memory(GiB)": 122.96, "step": 35465, "token_acc": 0.9702427564604542, "train_speed(iter/s)": 0.235718 }, { "epoch": 2.7037121731839315, "grad_norm": 0.8871923685073853, "learning_rate": 4.361765370690479e-05, "loss": 0.07406131625175476, "memory(GiB)": 122.96, "step": 35470, "token_acc": 0.9685615848406546, "train_speed(iter/s)": 0.235726 }, { "epoch": 2.7040932997941916, "grad_norm": 1.2977409362792969, "learning_rate": 4.360577839113733e-05, "loss": 0.057308930158615115, "memory(GiB)": 122.96, "step": 35475, "token_acc": 0.9779913205207688, "train_speed(iter/s)": 0.235735 }, { "epoch": 2.7044744264044516, "grad_norm": 0.5309045314788818, "learning_rate": 4.359390344204882e-05, "loss": 0.06825003623962403, "memory(GiB)": 122.96, "step": 35480, "token_acc": 0.9718936311157377, "train_speed(iter/s)": 0.235736 }, { "epoch": 2.7048555530147116, "grad_norm": 1.5621318817138672, "learning_rate": 4.358202886032021e-05, "loss": 0.11710309982299805, "memory(GiB)": 122.96, "step": 35485, "token_acc": 0.9452508276037688, "train_speed(iter/s)": 0.235744 }, { "epoch": 2.7052366796249716, "grad_norm": 0.8814067840576172, "learning_rate": 4.3570154646632466e-05, "loss": 0.08137757778167724, "memory(GiB)": 122.96, "step": 35490, "token_acc": 0.9631255487269534, "train_speed(iter/s)": 0.235749 }, { "epoch": 2.705617806235231, "grad_norm": 0.44718262553215027, "learning_rate": 4.355828080166652e-05, "loss": 0.06542729139328003, "memory(GiB)": 122.96, "step": 35495, "token_acc": 0.9787934186471664, "train_speed(iter/s)": 0.235754 }, { "epoch": 2.705998932845491, "grad_norm": 1.2874548435211182, "learning_rate": 4.354640732610326e-05, "loss": 0.09437135457992554, "memory(GiB)": 122.96, "step": 35500, "token_acc": 0.9589568191534844, "train_speed(iter/s)": 0.235766 }, { "epoch": 2.706380059455751, "grad_norm": 0.6346542239189148, "learning_rate": 4.353453422062361e-05, "loss": 0.08073940873146057, "memory(GiB)": 122.96, "step": 35505, "token_acc": 0.9720422775315377, "train_speed(iter/s)": 0.235762 }, { "epoch": 2.7067611860660112, "grad_norm": 0.8158250451087952, "learning_rate": 4.352266148590841e-05, "loss": 0.08905956745147706, "memory(GiB)": 122.96, "step": 35510, "token_acc": 0.9604043807919124, "train_speed(iter/s)": 0.235771 }, { "epoch": 2.707142312676271, "grad_norm": 1.4863781929016113, "learning_rate": 4.3510789122638506e-05, "loss": 0.08909804224967957, "memory(GiB)": 122.96, "step": 35515, "token_acc": 0.9595701125895599, "train_speed(iter/s)": 0.235778 }, { "epoch": 2.707523439286531, "grad_norm": 1.2774780988693237, "learning_rate": 4.349891713149475e-05, "loss": 0.0735629141330719, "memory(GiB)": 122.96, "step": 35520, "token_acc": 0.9732120914229541, "train_speed(iter/s)": 0.235787 }, { "epoch": 2.707904565896791, "grad_norm": 0.25067007541656494, "learning_rate": 4.348704551315792e-05, "loss": 0.07424221634864807, "memory(GiB)": 122.96, "step": 35525, "token_acc": 0.9670747150696496, "train_speed(iter/s)": 0.235795 }, { "epoch": 2.708285692507051, "grad_norm": 0.23121044039726257, "learning_rate": 4.34751742683088e-05, "loss": 0.06941035389900208, "memory(GiB)": 122.96, "step": 35530, "token_acc": 0.9664019547953574, "train_speed(iter/s)": 0.235805 }, { "epoch": 2.708666819117311, "grad_norm": 0.5919772386550903, "learning_rate": 4.346330339762816e-05, "loss": 0.04191494584083557, "memory(GiB)": 122.96, "step": 35535, "token_acc": 0.9754028837998303, "train_speed(iter/s)": 0.235815 }, { "epoch": 2.709047945727571, "grad_norm": 1.6588588953018188, "learning_rate": 4.345143290179675e-05, "loss": 0.0560515820980072, "memory(GiB)": 122.96, "step": 35540, "token_acc": 0.9829399392381397, "train_speed(iter/s)": 0.235823 }, { "epoch": 2.7094290723378305, "grad_norm": 0.5652149319648743, "learning_rate": 4.343956278149526e-05, "loss": 0.08185315132141113, "memory(GiB)": 122.96, "step": 35545, "token_acc": 0.9704336399474376, "train_speed(iter/s)": 0.23583 }, { "epoch": 2.7098101989480905, "grad_norm": 0.7423208951950073, "learning_rate": 4.34276930374044e-05, "loss": 0.08300833702087403, "memory(GiB)": 122.96, "step": 35550, "token_acc": 0.9750412919801799, "train_speed(iter/s)": 0.235837 }, { "epoch": 2.7101913255583505, "grad_norm": 0.6640967130661011, "learning_rate": 4.341582367020485e-05, "loss": 0.08577765226364135, "memory(GiB)": 122.96, "step": 35555, "token_acc": 0.9705380679175066, "train_speed(iter/s)": 0.235841 }, { "epoch": 2.7105724521686105, "grad_norm": 2.084167003631592, "learning_rate": 4.3403954680577265e-05, "loss": 0.08674956560134887, "memory(GiB)": 122.96, "step": 35560, "token_acc": 0.96529284164859, "train_speed(iter/s)": 0.23585 }, { "epoch": 2.71095357877887, "grad_norm": 2.4054014682769775, "learning_rate": 4.3392086069202256e-05, "loss": 0.09079451560974121, "memory(GiB)": 122.96, "step": 35565, "token_acc": 0.9716574245224893, "train_speed(iter/s)": 0.235858 }, { "epoch": 2.71133470538913, "grad_norm": 1.6501370668411255, "learning_rate": 4.338021783676045e-05, "loss": 0.06017959117889404, "memory(GiB)": 122.96, "step": 35570, "token_acc": 0.9750849377123443, "train_speed(iter/s)": 0.235869 }, { "epoch": 2.71171583199939, "grad_norm": 0.9408222436904907, "learning_rate": 4.3368349983932435e-05, "loss": 0.11018457412719726, "memory(GiB)": 122.96, "step": 35575, "token_acc": 0.9599325179249262, "train_speed(iter/s)": 0.235877 }, { "epoch": 2.71209695860965, "grad_norm": 1.2673035860061646, "learning_rate": 4.3356482511398766e-05, "loss": 0.12338459491729736, "memory(GiB)": 122.96, "step": 35580, "token_acc": 0.9590800951625694, "train_speed(iter/s)": 0.235883 }, { "epoch": 2.71247808521991, "grad_norm": 0.9863390922546387, "learning_rate": 4.3344615419840004e-05, "loss": 0.07662172317504883, "memory(GiB)": 122.96, "step": 35585, "token_acc": 0.9697829479358774, "train_speed(iter/s)": 0.235887 }, { "epoch": 2.71285921183017, "grad_norm": 1.6998751163482666, "learning_rate": 4.3332748709936664e-05, "loss": 0.1071923017501831, "memory(GiB)": 122.96, "step": 35590, "token_acc": 0.9733029801324503, "train_speed(iter/s)": 0.235893 }, { "epoch": 2.7132403384404298, "grad_norm": 0.5378970503807068, "learning_rate": 4.332088238236923e-05, "loss": 0.06603131294250489, "memory(GiB)": 122.96, "step": 35595, "token_acc": 0.9764941235308827, "train_speed(iter/s)": 0.2359 }, { "epoch": 2.71362146505069, "grad_norm": 1.0697497129440308, "learning_rate": 4.330901643781822e-05, "loss": 0.06301345229148865, "memory(GiB)": 122.96, "step": 35600, "token_acc": 0.9752047315741583, "train_speed(iter/s)": 0.235907 }, { "epoch": 2.71362146505069, "eval_loss": 0.07654815167188644, "eval_runtime": 222.3236, "eval_samples_per_second": 2.384, "eval_steps_per_second": 2.384, "eval_token_acc": 0.9669522920306005, "step": 35600 }, { "epoch": 2.71400259166095, "grad_norm": 1.0057942867279053, "learning_rate": 4.3297150876964035e-05, "loss": 0.07763549089431762, "memory(GiB)": 122.96, "step": 35605, "token_acc": 0.967178607624542, "train_speed(iter/s)": 0.235559 }, { "epoch": 2.71438371827121, "grad_norm": 1.7471017837524414, "learning_rate": 4.328528570048716e-05, "loss": 0.09439860582351685, "memory(GiB)": 122.96, "step": 35610, "token_acc": 0.9654680817361001, "train_speed(iter/s)": 0.235563 }, { "epoch": 2.7147648448814694, "grad_norm": 1.3044712543487549, "learning_rate": 4.3273420909068e-05, "loss": 0.1035967230796814, "memory(GiB)": 122.96, "step": 35615, "token_acc": 0.9638532763532763, "train_speed(iter/s)": 0.235571 }, { "epoch": 2.7151459714917294, "grad_norm": 0.9582646489143372, "learning_rate": 4.326155650338692e-05, "loss": 0.09638741016387939, "memory(GiB)": 122.96, "step": 35620, "token_acc": 0.960453466912734, "train_speed(iter/s)": 0.235579 }, { "epoch": 2.7155270981019894, "grad_norm": 1.506386399269104, "learning_rate": 4.3249692484124315e-05, "loss": 0.09296506643295288, "memory(GiB)": 122.96, "step": 35625, "token_acc": 0.9633603768646951, "train_speed(iter/s)": 0.235586 }, { "epoch": 2.7159082247122495, "grad_norm": 2.1300253868103027, "learning_rate": 4.323782885196053e-05, "loss": 0.08713527917861938, "memory(GiB)": 122.96, "step": 35630, "token_acc": 0.9475917144996256, "train_speed(iter/s)": 0.235595 }, { "epoch": 2.7162893513225095, "grad_norm": 1.2594480514526367, "learning_rate": 4.322596560757586e-05, "loss": 0.07735421657562255, "memory(GiB)": 122.96, "step": 35635, "token_acc": 0.9593088071348941, "train_speed(iter/s)": 0.235604 }, { "epoch": 2.7166704779327695, "grad_norm": 0.6773355603218079, "learning_rate": 4.3214102751650636e-05, "loss": 0.09126784801483154, "memory(GiB)": 122.96, "step": 35640, "token_acc": 0.9579944154425155, "train_speed(iter/s)": 0.235606 }, { "epoch": 2.717051604543029, "grad_norm": 0.9887690544128418, "learning_rate": 4.320224028486513e-05, "loss": 0.07618552446365356, "memory(GiB)": 122.96, "step": 35645, "token_acc": 0.9724828810550342, "train_speed(iter/s)": 0.235611 }, { "epoch": 2.717432731153289, "grad_norm": 0.5562832355499268, "learning_rate": 4.319037820789959e-05, "loss": 0.10555518865585327, "memory(GiB)": 122.96, "step": 35650, "token_acc": 0.9642756283535724, "train_speed(iter/s)": 0.235611 }, { "epoch": 2.717813857763549, "grad_norm": 0.8524538278579712, "learning_rate": 4.317851652143427e-05, "loss": 0.09457230567932129, "memory(GiB)": 122.96, "step": 35655, "token_acc": 0.9680173661360347, "train_speed(iter/s)": 0.235615 }, { "epoch": 2.718194984373809, "grad_norm": 1.82323157787323, "learning_rate": 4.316665522614937e-05, "loss": 0.09357575178146363, "memory(GiB)": 122.96, "step": 35660, "token_acc": 0.9685564554549786, "train_speed(iter/s)": 0.235622 }, { "epoch": 2.7185761109840687, "grad_norm": 0.729487955570221, "learning_rate": 4.315479432272509e-05, "loss": 0.09518301486968994, "memory(GiB)": 122.96, "step": 35665, "token_acc": 0.9584583429494576, "train_speed(iter/s)": 0.23563 }, { "epoch": 2.7189572375943287, "grad_norm": 0.9275891184806824, "learning_rate": 4.314293381184157e-05, "loss": 0.08640878200531006, "memory(GiB)": 122.96, "step": 35670, "token_acc": 0.96630859375, "train_speed(iter/s)": 0.23564 }, { "epoch": 2.7193383642045887, "grad_norm": 0.6889150738716125, "learning_rate": 4.3131073694178985e-05, "loss": 0.08236033320426941, "memory(GiB)": 122.96, "step": 35675, "token_acc": 0.9703252805782766, "train_speed(iter/s)": 0.235645 }, { "epoch": 2.7197194908148488, "grad_norm": 1.7570236921310425, "learning_rate": 4.311921397041745e-05, "loss": 0.07088986039161682, "memory(GiB)": 122.96, "step": 35680, "token_acc": 0.9709618874773139, "train_speed(iter/s)": 0.235655 }, { "epoch": 2.720100617425109, "grad_norm": 1.919663429260254, "learning_rate": 4.310735464123706e-05, "loss": 0.06675441265106201, "memory(GiB)": 122.96, "step": 35685, "token_acc": 0.9667412855772306, "train_speed(iter/s)": 0.235664 }, { "epoch": 2.720481744035369, "grad_norm": 1.0758261680603027, "learning_rate": 4.30954957073179e-05, "loss": 0.07461090087890625, "memory(GiB)": 122.96, "step": 35690, "token_acc": 0.9665, "train_speed(iter/s)": 0.235671 }, { "epoch": 2.7208628706456284, "grad_norm": 0.07010713964700699, "learning_rate": 4.3083637169340016e-05, "loss": 0.07521390914916992, "memory(GiB)": 122.96, "step": 35695, "token_acc": 0.9730363423212193, "train_speed(iter/s)": 0.235677 }, { "epoch": 2.7212439972558884, "grad_norm": 0.9437703490257263, "learning_rate": 4.307177902798344e-05, "loss": 0.08875989317893981, "memory(GiB)": 122.96, "step": 35700, "token_acc": 0.9645206766917294, "train_speed(iter/s)": 0.235684 }, { "epoch": 2.7216251238661484, "grad_norm": 1.4150831699371338, "learning_rate": 4.30599212839282e-05, "loss": 0.05708191394805908, "memory(GiB)": 122.96, "step": 35705, "token_acc": 0.9749911940824234, "train_speed(iter/s)": 0.235694 }, { "epoch": 2.7220062504764084, "grad_norm": 0.6397532820701599, "learning_rate": 4.3048063937854264e-05, "loss": 0.043326807022094724, "memory(GiB)": 122.96, "step": 35710, "token_acc": 0.9814707585408222, "train_speed(iter/s)": 0.235698 }, { "epoch": 2.722387377086668, "grad_norm": 0.6481029987335205, "learning_rate": 4.3036206990441584e-05, "loss": 0.08643834590911866, "memory(GiB)": 122.96, "step": 35715, "token_acc": 0.9651303820497271, "train_speed(iter/s)": 0.235707 }, { "epoch": 2.722768503696928, "grad_norm": 0.2683565318584442, "learning_rate": 4.302435044237013e-05, "loss": 0.08062859177589417, "memory(GiB)": 122.96, "step": 35720, "token_acc": 0.968184939345388, "train_speed(iter/s)": 0.235715 }, { "epoch": 2.723149630307188, "grad_norm": 0.9589257836341858, "learning_rate": 4.301249429431982e-05, "loss": 0.11361410617828369, "memory(GiB)": 122.96, "step": 35725, "token_acc": 0.9534569983136594, "train_speed(iter/s)": 0.235721 }, { "epoch": 2.723530756917448, "grad_norm": 0.6809784770011902, "learning_rate": 4.300063854697052e-05, "loss": 0.08955263495445251, "memory(GiB)": 122.96, "step": 35730, "token_acc": 0.9636320418350368, "train_speed(iter/s)": 0.235729 }, { "epoch": 2.723911883527708, "grad_norm": 1.0372300148010254, "learning_rate": 4.2988783201002115e-05, "loss": 0.09896026253700256, "memory(GiB)": 122.96, "step": 35735, "token_acc": 0.9587747581759558, "train_speed(iter/s)": 0.235736 }, { "epoch": 2.724293010137968, "grad_norm": 0.5639796853065491, "learning_rate": 4.2976928257094444e-05, "loss": 0.06577342748641968, "memory(GiB)": 122.96, "step": 35740, "token_acc": 0.9760128711423138, "train_speed(iter/s)": 0.235738 }, { "epoch": 2.7246741367482277, "grad_norm": 0.27676689624786377, "learning_rate": 4.296507371592737e-05, "loss": 0.0534085214138031, "memory(GiB)": 122.96, "step": 35745, "token_acc": 0.9780424264979531, "train_speed(iter/s)": 0.235747 }, { "epoch": 2.7250552633584877, "grad_norm": 0.723821759223938, "learning_rate": 4.295321957818067e-05, "loss": 0.08273829221725464, "memory(GiB)": 122.96, "step": 35750, "token_acc": 0.97031652279267, "train_speed(iter/s)": 0.235749 }, { "epoch": 2.7254363899687477, "grad_norm": 0.9038600921630859, "learning_rate": 4.294136584453412e-05, "loss": 0.08306689858436585, "memory(GiB)": 122.96, "step": 35755, "token_acc": 0.9651022864019254, "train_speed(iter/s)": 0.235756 }, { "epoch": 2.7258175165790073, "grad_norm": 1.0152894258499146, "learning_rate": 4.2929512515667494e-05, "loss": 0.05955403447151184, "memory(GiB)": 122.96, "step": 35760, "token_acc": 0.9746783465644676, "train_speed(iter/s)": 0.235761 }, { "epoch": 2.7261986431892673, "grad_norm": 1.4407148361206055, "learning_rate": 4.2917659592260506e-05, "loss": 0.105423903465271, "memory(GiB)": 122.96, "step": 35765, "token_acc": 0.9615483870967741, "train_speed(iter/s)": 0.235769 }, { "epoch": 2.7265797697995273, "grad_norm": 1.5723509788513184, "learning_rate": 4.290580707499288e-05, "loss": 0.08945835828781128, "memory(GiB)": 122.96, "step": 35770, "token_acc": 0.958538706871557, "train_speed(iter/s)": 0.235777 }, { "epoch": 2.7269608964097873, "grad_norm": 0.7911795973777771, "learning_rate": 4.289395496454429e-05, "loss": 0.0904066801071167, "memory(GiB)": 122.96, "step": 35775, "token_acc": 0.9648199901364458, "train_speed(iter/s)": 0.235783 }, { "epoch": 2.7273420230200474, "grad_norm": 0.6524139642715454, "learning_rate": 4.288210326159441e-05, "loss": 0.09201250076293946, "memory(GiB)": 122.96, "step": 35780, "token_acc": 0.9642485248177716, "train_speed(iter/s)": 0.235789 }, { "epoch": 2.7277231496303074, "grad_norm": 1.8482624292373657, "learning_rate": 4.2870251966822875e-05, "loss": 0.0814014732837677, "memory(GiB)": 122.96, "step": 35785, "token_acc": 0.9702445358147587, "train_speed(iter/s)": 0.23579 }, { "epoch": 2.728104276240567, "grad_norm": 1.2952357530593872, "learning_rate": 4.28584010809093e-05, "loss": 0.08730539679527283, "memory(GiB)": 122.96, "step": 35790, "token_acc": 0.9600997506234414, "train_speed(iter/s)": 0.235798 }, { "epoch": 2.728485402850827, "grad_norm": 2.185176372528076, "learning_rate": 4.284655060453329e-05, "loss": 0.09840134382247925, "memory(GiB)": 122.96, "step": 35795, "token_acc": 0.9675144261594357, "train_speed(iter/s)": 0.235805 }, { "epoch": 2.728866529461087, "grad_norm": 1.3329147100448608, "learning_rate": 4.2834700538374404e-05, "loss": 0.0935293436050415, "memory(GiB)": 122.96, "step": 35800, "token_acc": 0.9588189588189588, "train_speed(iter/s)": 0.235815 }, { "epoch": 2.728866529461087, "eval_loss": 0.07659797370433807, "eval_runtime": 221.8952, "eval_samples_per_second": 2.389, "eval_steps_per_second": 2.389, "eval_token_acc": 0.9673061863743148, "step": 35800 }, { "epoch": 2.729247656071347, "grad_norm": 1.1892683506011963, "learning_rate": 4.282285088311219e-05, "loss": 0.12913265228271484, "memory(GiB)": 122.96, "step": 35805, "token_acc": 0.966738126769735, "train_speed(iter/s)": 0.235479 }, { "epoch": 2.7296287826816066, "grad_norm": 1.7173138856887817, "learning_rate": 4.281100163942617e-05, "loss": 0.08970249295234681, "memory(GiB)": 122.96, "step": 35810, "token_acc": 0.9653624118024374, "train_speed(iter/s)": 0.235488 }, { "epoch": 2.7300099092918666, "grad_norm": 1.1461225748062134, "learning_rate": 4.279915280799586e-05, "loss": 0.10853818655014039, "memory(GiB)": 122.96, "step": 35815, "token_acc": 0.9583919597989949, "train_speed(iter/s)": 0.235495 }, { "epoch": 2.7303910359021266, "grad_norm": 1.7309094667434692, "learning_rate": 4.2787304389500695e-05, "loss": 0.08615018129348755, "memory(GiB)": 122.96, "step": 35820, "token_acc": 0.9712896503820329, "train_speed(iter/s)": 0.235501 }, { "epoch": 2.7307721625123866, "grad_norm": 0.33034655451774597, "learning_rate": 4.277545638462017e-05, "loss": 0.07479963898658752, "memory(GiB)": 122.96, "step": 35825, "token_acc": 0.9548180727708916, "train_speed(iter/s)": 0.235512 }, { "epoch": 2.7311532891226467, "grad_norm": 1.2293055057525635, "learning_rate": 4.2763608794033696e-05, "loss": 0.09643290042877198, "memory(GiB)": 122.96, "step": 35830, "token_acc": 0.964583719636566, "train_speed(iter/s)": 0.23552 }, { "epoch": 2.7315344157329067, "grad_norm": 0.8170815706253052, "learning_rate": 4.275176161842067e-05, "loss": 0.05468939542770386, "memory(GiB)": 122.96, "step": 35835, "token_acc": 0.9758793969849247, "train_speed(iter/s)": 0.235525 }, { "epoch": 2.7319155423431662, "grad_norm": 0.6521604061126709, "learning_rate": 4.273991485846048e-05, "loss": 0.10435469150543213, "memory(GiB)": 122.96, "step": 35840, "token_acc": 0.9631218988869519, "train_speed(iter/s)": 0.235528 }, { "epoch": 2.7322966689534263, "grad_norm": 0.9644380807876587, "learning_rate": 4.272806851483248e-05, "loss": 0.07631123661994935, "memory(GiB)": 122.96, "step": 35845, "token_acc": 0.9694821815317975, "train_speed(iter/s)": 0.235533 }, { "epoch": 2.7326777955636863, "grad_norm": 0.5921468734741211, "learning_rate": 4.2716222588216016e-05, "loss": 0.08040010333061218, "memory(GiB)": 122.96, "step": 35850, "token_acc": 0.9668999432570455, "train_speed(iter/s)": 0.235541 }, { "epoch": 2.7330589221739463, "grad_norm": 2.4007863998413086, "learning_rate": 4.270437707929037e-05, "loss": 0.09538315534591675, "memory(GiB)": 122.96, "step": 35855, "token_acc": 0.95274040039383, "train_speed(iter/s)": 0.235551 }, { "epoch": 2.733440048784206, "grad_norm": 0.4892181158065796, "learning_rate": 4.2692531988734854e-05, "loss": 0.08161606788635253, "memory(GiB)": 122.96, "step": 35860, "token_acc": 0.968683776783612, "train_speed(iter/s)": 0.23556 }, { "epoch": 2.733821175394466, "grad_norm": 0.49300989508628845, "learning_rate": 4.268068731722871e-05, "loss": 0.09424226880073547, "memory(GiB)": 122.96, "step": 35865, "token_acc": 0.9658042744656918, "train_speed(iter/s)": 0.235568 }, { "epoch": 2.734202302004726, "grad_norm": 0.6563447117805481, "learning_rate": 4.266884306545118e-05, "loss": 0.06082687973976135, "memory(GiB)": 122.96, "step": 35870, "token_acc": 0.9720372836218375, "train_speed(iter/s)": 0.235576 }, { "epoch": 2.734583428614986, "grad_norm": 1.0380146503448486, "learning_rate": 4.265699923408148e-05, "loss": 0.10046892166137696, "memory(GiB)": 122.96, "step": 35875, "token_acc": 0.9567316620241412, "train_speed(iter/s)": 0.235582 }, { "epoch": 2.734964555225246, "grad_norm": 0.7814728617668152, "learning_rate": 4.26451558237988e-05, "loss": 0.08002877831459046, "memory(GiB)": 122.96, "step": 35880, "token_acc": 0.9673309376020908, "train_speed(iter/s)": 0.235592 }, { "epoch": 2.735345681835506, "grad_norm": 0.6951321363449097, "learning_rate": 4.2633312835282307e-05, "loss": 0.11272075176239013, "memory(GiB)": 122.96, "step": 35885, "token_acc": 0.9558074965293846, "train_speed(iter/s)": 0.2356 }, { "epoch": 2.7357268084457655, "grad_norm": 1.0905017852783203, "learning_rate": 4.2621470269211136e-05, "loss": 0.1267564296722412, "memory(GiB)": 122.96, "step": 35890, "token_acc": 0.9562699102229945, "train_speed(iter/s)": 0.235609 }, { "epoch": 2.7361079350560256, "grad_norm": 0.7702677249908447, "learning_rate": 4.26096281262644e-05, "loss": 0.08950303196907043, "memory(GiB)": 122.96, "step": 35895, "token_acc": 0.9655516171909615, "train_speed(iter/s)": 0.235611 }, { "epoch": 2.7364890616662856, "grad_norm": 1.2690509557724, "learning_rate": 4.2597786407121205e-05, "loss": 0.06723722815513611, "memory(GiB)": 122.96, "step": 35900, "token_acc": 0.9746919746919747, "train_speed(iter/s)": 0.23562 }, { "epoch": 2.7368701882765456, "grad_norm": 0.8746135830879211, "learning_rate": 4.258594511246061e-05, "loss": 0.09514663219451905, "memory(GiB)": 122.96, "step": 35905, "token_acc": 0.9686898723106524, "train_speed(iter/s)": 0.235625 }, { "epoch": 2.737251314886805, "grad_norm": 1.2319340705871582, "learning_rate": 4.2574104242961644e-05, "loss": 0.08544286489486694, "memory(GiB)": 122.96, "step": 35910, "token_acc": 0.9564544650751547, "train_speed(iter/s)": 0.235632 }, { "epoch": 2.737632441497065, "grad_norm": 2.426100969314575, "learning_rate": 4.256226379930335e-05, "loss": 0.11443558931350709, "memory(GiB)": 122.96, "step": 35915, "token_acc": 0.9564274875352266, "train_speed(iter/s)": 0.235639 }, { "epoch": 2.738013568107325, "grad_norm": 0.5768752098083496, "learning_rate": 4.255042378216472e-05, "loss": 0.050571000576019286, "memory(GiB)": 122.96, "step": 35920, "token_acc": 0.9793021880544057, "train_speed(iter/s)": 0.235644 }, { "epoch": 2.7383946947175852, "grad_norm": 0.6836011409759521, "learning_rate": 4.25385841922247e-05, "loss": 0.07122111916542054, "memory(GiB)": 122.96, "step": 35925, "token_acc": 0.9679544240697882, "train_speed(iter/s)": 0.235649 }, { "epoch": 2.7387758213278453, "grad_norm": 0.9548466205596924, "learning_rate": 4.252674503016225e-05, "loss": 0.0785791277885437, "memory(GiB)": 122.96, "step": 35930, "token_acc": 0.9606393129770993, "train_speed(iter/s)": 0.235657 }, { "epoch": 2.7391569479381053, "grad_norm": 1.5440421104431152, "learning_rate": 4.251490629665631e-05, "loss": 0.10118836164474487, "memory(GiB)": 122.96, "step": 35935, "token_acc": 0.9595189205045468, "train_speed(iter/s)": 0.235666 }, { "epoch": 2.739538074548365, "grad_norm": 0.4116368889808655, "learning_rate": 4.2503067992385745e-05, "loss": 0.062047290802001956, "memory(GiB)": 122.96, "step": 35940, "token_acc": 0.9608091024020228, "train_speed(iter/s)": 0.235676 }, { "epoch": 2.739919201158625, "grad_norm": 0.7505683898925781, "learning_rate": 4.249123011802943e-05, "loss": 0.0996061086654663, "memory(GiB)": 122.96, "step": 35945, "token_acc": 0.9582403570290086, "train_speed(iter/s)": 0.23568 }, { "epoch": 2.740300327768885, "grad_norm": 1.2447725534439087, "learning_rate": 4.247939267426625e-05, "loss": 0.10817514657974243, "memory(GiB)": 122.96, "step": 35950, "token_acc": 0.9645390070921985, "train_speed(iter/s)": 0.235685 }, { "epoch": 2.740681454379145, "grad_norm": 0.7793630361557007, "learning_rate": 4.246755566177499e-05, "loss": 0.09425817728042603, "memory(GiB)": 122.96, "step": 35955, "token_acc": 0.9649171270718232, "train_speed(iter/s)": 0.235695 }, { "epoch": 2.7410625809894045, "grad_norm": 0.6527905464172363, "learning_rate": 4.2455719081234443e-05, "loss": 0.10564165115356446, "memory(GiB)": 122.96, "step": 35960, "token_acc": 0.9579503227068258, "train_speed(iter/s)": 0.235702 }, { "epoch": 2.7414437075996645, "grad_norm": 1.1404554843902588, "learning_rate": 4.244388293332341e-05, "loss": 0.05614688992500305, "memory(GiB)": 122.96, "step": 35965, "token_acc": 0.9776863084922011, "train_speed(iter/s)": 0.235707 }, { "epoch": 2.7418248342099245, "grad_norm": 0.8685212135314941, "learning_rate": 4.243204721872063e-05, "loss": 0.08530694842338563, "memory(GiB)": 122.96, "step": 35970, "token_acc": 0.9601139601139601, "train_speed(iter/s)": 0.235713 }, { "epoch": 2.7422059608201845, "grad_norm": 1.1547967195510864, "learning_rate": 4.2420211938104806e-05, "loss": 0.07745405435562133, "memory(GiB)": 122.96, "step": 35975, "token_acc": 0.9659376814399071, "train_speed(iter/s)": 0.235721 }, { "epoch": 2.7425870874304445, "grad_norm": 1.0924409627914429, "learning_rate": 4.240837709215467e-05, "loss": 0.1073150873184204, "memory(GiB)": 122.96, "step": 35980, "token_acc": 0.9615785813630042, "train_speed(iter/s)": 0.235725 }, { "epoch": 2.7429682140407046, "grad_norm": 1.2724134922027588, "learning_rate": 4.239654268154888e-05, "loss": 0.11150245666503907, "memory(GiB)": 122.96, "step": 35985, "token_acc": 0.9666751462732129, "train_speed(iter/s)": 0.235734 }, { "epoch": 2.743349340650964, "grad_norm": 3.8650100231170654, "learning_rate": 4.2384708706966064e-05, "loss": 0.11465519666671753, "memory(GiB)": 122.96, "step": 35990, "token_acc": 0.9554395126196693, "train_speed(iter/s)": 0.235737 }, { "epoch": 2.743730467261224, "grad_norm": 1.478115439414978, "learning_rate": 4.237287516908488e-05, "loss": 0.07770415544509887, "memory(GiB)": 122.96, "step": 35995, "token_acc": 0.9673375123721544, "train_speed(iter/s)": 0.235746 }, { "epoch": 2.744111593871484, "grad_norm": 2.0596556663513184, "learning_rate": 4.236104206858391e-05, "loss": 0.10138874053955078, "memory(GiB)": 122.96, "step": 36000, "token_acc": 0.9723790322580645, "train_speed(iter/s)": 0.235754 }, { "epoch": 2.744111593871484, "eval_loss": 0.0774289146065712, "eval_runtime": 221.8175, "eval_samples_per_second": 2.389, "eval_steps_per_second": 2.389, "eval_token_acc": 0.9666962833564243, "step": 36000 }, { "epoch": 2.744492720481744, "grad_norm": 0.5009119510650635, "learning_rate": 4.23492094061417e-05, "loss": 0.10316756963729859, "memory(GiB)": 122.96, "step": 36005, "token_acc": 0.9663777405492024, "train_speed(iter/s)": 0.235419 }, { "epoch": 2.7448738470920038, "grad_norm": 1.0636125802993774, "learning_rate": 4.233737718243686e-05, "loss": 0.09585130214691162, "memory(GiB)": 122.96, "step": 36010, "token_acc": 0.965531142914735, "train_speed(iter/s)": 0.235425 }, { "epoch": 2.745254973702264, "grad_norm": 0.8922129273414612, "learning_rate": 4.232554539814787e-05, "loss": 0.09847974181175231, "memory(GiB)": 122.96, "step": 36015, "token_acc": 0.9599400171379606, "train_speed(iter/s)": 0.235432 }, { "epoch": 2.745636100312524, "grad_norm": 1.1830281019210815, "learning_rate": 4.2313714053953233e-05, "loss": 0.10700086355209351, "memory(GiB)": 122.96, "step": 36020, "token_acc": 0.9602494154325799, "train_speed(iter/s)": 0.23544 }, { "epoch": 2.746017226922784, "grad_norm": 0.5644654035568237, "learning_rate": 4.230188315053143e-05, "loss": 0.06363803744316102, "memory(GiB)": 122.96, "step": 36025, "token_acc": 0.9823337982333799, "train_speed(iter/s)": 0.235451 }, { "epoch": 2.746398353533044, "grad_norm": 0.8629458546638489, "learning_rate": 4.229005268856091e-05, "loss": 0.07743685245513916, "memory(GiB)": 122.96, "step": 36030, "token_acc": 0.9692559280457891, "train_speed(iter/s)": 0.235455 }, { "epoch": 2.746779480143304, "grad_norm": 0.7974511981010437, "learning_rate": 4.227822266872008e-05, "loss": 0.09857755899429321, "memory(GiB)": 122.96, "step": 36035, "token_acc": 0.9558498896247241, "train_speed(iter/s)": 0.235463 }, { "epoch": 2.7471606067535634, "grad_norm": 1.6877200603485107, "learning_rate": 4.226639309168736e-05, "loss": 0.10738775730133057, "memory(GiB)": 122.96, "step": 36040, "token_acc": 0.968281797896079, "train_speed(iter/s)": 0.235467 }, { "epoch": 2.7475417333638235, "grad_norm": 0.8187024593353271, "learning_rate": 4.2254563958141104e-05, "loss": 0.10633816719055175, "memory(GiB)": 122.96, "step": 36045, "token_acc": 0.9596676475679419, "train_speed(iter/s)": 0.235473 }, { "epoch": 2.7479228599740835, "grad_norm": 1.002465009689331, "learning_rate": 4.2242735268759655e-05, "loss": 0.09708261489868164, "memory(GiB)": 122.96, "step": 36050, "token_acc": 0.9631416441528368, "train_speed(iter/s)": 0.235478 }, { "epoch": 2.748303986584343, "grad_norm": 1.0996912717819214, "learning_rate": 4.223090702422134e-05, "loss": 0.05363468527793884, "memory(GiB)": 122.96, "step": 36055, "token_acc": 0.9757229560871118, "train_speed(iter/s)": 0.235488 }, { "epoch": 2.748685113194603, "grad_norm": 1.7137305736541748, "learning_rate": 4.2219079225204464e-05, "loss": 0.088137286901474, "memory(GiB)": 122.96, "step": 36060, "token_acc": 0.9664082687338501, "train_speed(iter/s)": 0.235494 }, { "epoch": 2.749066239804863, "grad_norm": 0.7687479257583618, "learning_rate": 4.220725187238728e-05, "loss": 0.08112077713012696, "memory(GiB)": 122.96, "step": 36065, "token_acc": 0.9572716581732654, "train_speed(iter/s)": 0.235501 }, { "epoch": 2.749447366415123, "grad_norm": 0.8418876528739929, "learning_rate": 4.2195424966448037e-05, "loss": 0.06631267070770264, "memory(GiB)": 122.96, "step": 36070, "token_acc": 0.9775967413441955, "train_speed(iter/s)": 0.235507 }, { "epoch": 2.749828493025383, "grad_norm": 1.9810903072357178, "learning_rate": 4.218359850806496e-05, "loss": 0.07221702933311462, "memory(GiB)": 122.96, "step": 36075, "token_acc": 0.9710560625814864, "train_speed(iter/s)": 0.235513 }, { "epoch": 2.750209619635643, "grad_norm": 1.4238126277923584, "learning_rate": 4.217177249791622e-05, "loss": 0.09562355279922485, "memory(GiB)": 122.96, "step": 36080, "token_acc": 0.9725823591923486, "train_speed(iter/s)": 0.235522 }, { "epoch": 2.750590746245903, "grad_norm": 1.0349708795547485, "learning_rate": 4.2159946936680016e-05, "loss": 0.06820365190505981, "memory(GiB)": 122.96, "step": 36085, "token_acc": 0.9720750101916021, "train_speed(iter/s)": 0.23553 }, { "epoch": 2.7509718728561627, "grad_norm": 1.4965686798095703, "learning_rate": 4.214812182503447e-05, "loss": 0.10692830085754394, "memory(GiB)": 122.96, "step": 36090, "token_acc": 0.9572536410062418, "train_speed(iter/s)": 0.235537 }, { "epoch": 2.7513529994664228, "grad_norm": 1.4444934129714966, "learning_rate": 4.213629716365769e-05, "loss": 0.10088772773742676, "memory(GiB)": 122.96, "step": 36095, "token_acc": 0.9628722970216238, "train_speed(iter/s)": 0.235546 }, { "epoch": 2.7517341260766828, "grad_norm": 0.7864841818809509, "learning_rate": 4.2124472953227795e-05, "loss": 0.09247565865516663, "memory(GiB)": 122.96, "step": 36100, "token_acc": 0.9658377875900535, "train_speed(iter/s)": 0.235555 }, { "epoch": 2.7521152526869423, "grad_norm": 1.0814357995986938, "learning_rate": 4.211264919442282e-05, "loss": 0.0770712673664093, "memory(GiB)": 122.96, "step": 36105, "token_acc": 0.9675842552096733, "train_speed(iter/s)": 0.235562 }, { "epoch": 2.7524963792972024, "grad_norm": 1.9760977029800415, "learning_rate": 4.210082588792082e-05, "loss": 0.06944127678871155, "memory(GiB)": 122.96, "step": 36110, "token_acc": 0.9683648315529991, "train_speed(iter/s)": 0.235572 }, { "epoch": 2.7528775059074624, "grad_norm": 0.44257479906082153, "learning_rate": 4.2089003034399774e-05, "loss": 0.058572965860366824, "memory(GiB)": 122.96, "step": 36115, "token_acc": 0.9805712065280746, "train_speed(iter/s)": 0.235576 }, { "epoch": 2.7532586325177224, "grad_norm": 1.3039697408676147, "learning_rate": 4.207718063453771e-05, "loss": 0.10856097936630249, "memory(GiB)": 122.96, "step": 36120, "token_acc": 0.9571256038647343, "train_speed(iter/s)": 0.235585 }, { "epoch": 2.7536397591279824, "grad_norm": 1.5724818706512451, "learning_rate": 4.206535868901258e-05, "loss": 0.043909657001495364, "memory(GiB)": 122.96, "step": 36125, "token_acc": 0.979800853485064, "train_speed(iter/s)": 0.235594 }, { "epoch": 2.7540208857382424, "grad_norm": 2.18611741065979, "learning_rate": 4.205353719850229e-05, "loss": 0.1262003540992737, "memory(GiB)": 122.96, "step": 36130, "token_acc": 0.9484777517564403, "train_speed(iter/s)": 0.235604 }, { "epoch": 2.754402012348502, "grad_norm": 0.7887604236602783, "learning_rate": 4.204171616368477e-05, "loss": 0.04816741347312927, "memory(GiB)": 122.96, "step": 36135, "token_acc": 0.9738871363477042, "train_speed(iter/s)": 0.235608 }, { "epoch": 2.754783138958762, "grad_norm": 1.8722261190414429, "learning_rate": 4.202989558523788e-05, "loss": 0.09338799715042115, "memory(GiB)": 122.96, "step": 36140, "token_acc": 0.9633044787354159, "train_speed(iter/s)": 0.235613 }, { "epoch": 2.755164265569022, "grad_norm": 0.8806177973747253, "learning_rate": 4.201807546383952e-05, "loss": 0.06579349040985108, "memory(GiB)": 122.96, "step": 36145, "token_acc": 0.9666836647968724, "train_speed(iter/s)": 0.235621 }, { "epoch": 2.755545392179282, "grad_norm": 1.2966192960739136, "learning_rate": 4.200625580016747e-05, "loss": 0.10801869630813599, "memory(GiB)": 122.96, "step": 36150, "token_acc": 0.9462809917355371, "train_speed(iter/s)": 0.235631 }, { "epoch": 2.7559265187895416, "grad_norm": 0.7677803635597229, "learning_rate": 4.1994436594899575e-05, "loss": 0.08398632407188415, "memory(GiB)": 122.96, "step": 36155, "token_acc": 0.9646994931209268, "train_speed(iter/s)": 0.235638 }, { "epoch": 2.7563076453998017, "grad_norm": 1.025064468383789, "learning_rate": 4.198261784871358e-05, "loss": 0.08445631265640259, "memory(GiB)": 122.96, "step": 36160, "token_acc": 0.9668772146048374, "train_speed(iter/s)": 0.235642 }, { "epoch": 2.7566887720100617, "grad_norm": 0.7825415134429932, "learning_rate": 4.1970799562287256e-05, "loss": 0.11279771327972413, "memory(GiB)": 122.96, "step": 36165, "token_acc": 0.9606834771068348, "train_speed(iter/s)": 0.23565 }, { "epoch": 2.7570698986203217, "grad_norm": 0.9214193820953369, "learning_rate": 4.19589817362983e-05, "loss": 0.07024667263031006, "memory(GiB)": 122.96, "step": 36170, "token_acc": 0.9709062315706704, "train_speed(iter/s)": 0.235655 }, { "epoch": 2.7574510252305817, "grad_norm": 1.081828236579895, "learning_rate": 4.194716437142444e-05, "loss": 0.09184709787368775, "memory(GiB)": 122.96, "step": 36175, "token_acc": 0.9617028551889212, "train_speed(iter/s)": 0.235662 }, { "epoch": 2.7578321518408417, "grad_norm": 1.376746416091919, "learning_rate": 4.1935347468343334e-05, "loss": 0.10770206451416016, "memory(GiB)": 122.96, "step": 36180, "token_acc": 0.9561137179133314, "train_speed(iter/s)": 0.23567 }, { "epoch": 2.7582132784511013, "grad_norm": 0.4956839680671692, "learning_rate": 4.1923531027732615e-05, "loss": 0.0525756299495697, "memory(GiB)": 122.96, "step": 36185, "token_acc": 0.9780015902464883, "train_speed(iter/s)": 0.235676 }, { "epoch": 2.7585944050613613, "grad_norm": 1.7300044298171997, "learning_rate": 4.191171505026993e-05, "loss": 0.112065589427948, "memory(GiB)": 122.96, "step": 36190, "token_acc": 0.9597423510466989, "train_speed(iter/s)": 0.235684 }, { "epoch": 2.7589755316716214, "grad_norm": 1.3968719244003296, "learning_rate": 4.1899899536632844e-05, "loss": 0.09584531784057618, "memory(GiB)": 122.96, "step": 36195, "token_acc": 0.9601630535482676, "train_speed(iter/s)": 0.235688 }, { "epoch": 2.7593566582818814, "grad_norm": 1.2060410976409912, "learning_rate": 4.1888084487498916e-05, "loss": 0.071506929397583, "memory(GiB)": 122.96, "step": 36200, "token_acc": 0.97599451303155, "train_speed(iter/s)": 0.235699 }, { "epoch": 2.7593566582818814, "eval_loss": 0.07753386348485947, "eval_runtime": 220.4791, "eval_samples_per_second": 2.404, "eval_steps_per_second": 2.404, "eval_token_acc": 0.9670125293657008, "step": 36200 }, { "epoch": 2.759737784892141, "grad_norm": 0.6607219576835632, "learning_rate": 4.187626990354572e-05, "loss": 0.06384857892990112, "memory(GiB)": 122.96, "step": 36205, "token_acc": 0.967335919631742, "train_speed(iter/s)": 0.235367 }, { "epoch": 2.760118911502401, "grad_norm": 1.0810904502868652, "learning_rate": 4.186445578545074e-05, "loss": 0.06676793694496155, "memory(GiB)": 122.96, "step": 36210, "token_acc": 0.9709618874773139, "train_speed(iter/s)": 0.235375 }, { "epoch": 2.760500038112661, "grad_norm": 0.5588495135307312, "learning_rate": 4.1852642133891455e-05, "loss": 0.09877266883850097, "memory(GiB)": 122.96, "step": 36215, "token_acc": 0.9699953117674637, "train_speed(iter/s)": 0.235375 }, { "epoch": 2.760881164722921, "grad_norm": 0.8207181096076965, "learning_rate": 4.184082894954535e-05, "loss": 0.07206475734710693, "memory(GiB)": 122.96, "step": 36220, "token_acc": 0.9704906782568913, "train_speed(iter/s)": 0.235377 }, { "epoch": 2.761262291333181, "grad_norm": 0.9270002245903015, "learning_rate": 4.182901623308984e-05, "loss": 0.09352295398712158, "memory(GiB)": 122.96, "step": 36225, "token_acc": 0.9625393194166428, "train_speed(iter/s)": 0.235386 }, { "epoch": 2.761643417943441, "grad_norm": 0.8764816522598267, "learning_rate": 4.181720398520233e-05, "loss": 0.07189087271690368, "memory(GiB)": 122.96, "step": 36230, "token_acc": 0.9690821256038648, "train_speed(iter/s)": 0.235397 }, { "epoch": 2.7620245445537006, "grad_norm": 1.3416770696640015, "learning_rate": 4.180539220656019e-05, "loss": 0.08117685317993165, "memory(GiB)": 122.96, "step": 36235, "token_acc": 0.9683760683760684, "train_speed(iter/s)": 0.235404 }, { "epoch": 2.7624056711639606, "grad_norm": 0.9607065916061401, "learning_rate": 4.1793580897840787e-05, "loss": 0.10312105417251587, "memory(GiB)": 122.96, "step": 36240, "token_acc": 0.9559063690800218, "train_speed(iter/s)": 0.235413 }, { "epoch": 2.7627867977742206, "grad_norm": 1.0895726680755615, "learning_rate": 4.1781770059721445e-05, "loss": 0.08235690593719483, "memory(GiB)": 122.96, "step": 36245, "token_acc": 0.9700996677740864, "train_speed(iter/s)": 0.23542 }, { "epoch": 2.7631679243844807, "grad_norm": 0.39658495783805847, "learning_rate": 4.176995969287943e-05, "loss": 0.06402266621589661, "memory(GiB)": 122.96, "step": 36250, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.23543 }, { "epoch": 2.7635490509947402, "grad_norm": 1.457228422164917, "learning_rate": 4.1758149797992054e-05, "loss": 0.09143427014350891, "memory(GiB)": 122.96, "step": 36255, "token_acc": 0.9628732849071832, "train_speed(iter/s)": 0.235439 }, { "epoch": 2.7639301776050003, "grad_norm": 0.8689384460449219, "learning_rate": 4.174634037573654e-05, "loss": 0.058904063701629636, "memory(GiB)": 122.96, "step": 36260, "token_acc": 0.974091260634184, "train_speed(iter/s)": 0.235445 }, { "epoch": 2.7643113042152603, "grad_norm": 0.46016544103622437, "learning_rate": 4.173453142679009e-05, "loss": 0.068360435962677, "memory(GiB)": 122.96, "step": 36265, "token_acc": 0.9778495102404274, "train_speed(iter/s)": 0.235448 }, { "epoch": 2.7646924308255203, "grad_norm": 0.8173233866691589, "learning_rate": 4.1722722951829916e-05, "loss": 0.060930836200714114, "memory(GiB)": 122.96, "step": 36270, "token_acc": 0.9720767888307156, "train_speed(iter/s)": 0.235455 }, { "epoch": 2.7650735574357803, "grad_norm": 1.0083820819854736, "learning_rate": 4.1710914951533156e-05, "loss": 0.08112999200820922, "memory(GiB)": 122.96, "step": 36275, "token_acc": 0.97109375, "train_speed(iter/s)": 0.235464 }, { "epoch": 2.7654546840460403, "grad_norm": 2.1769845485687256, "learning_rate": 4.169910742657697e-05, "loss": 0.06332104802131652, "memory(GiB)": 122.96, "step": 36280, "token_acc": 0.9739736070381232, "train_speed(iter/s)": 0.23547 }, { "epoch": 2.7658358106563, "grad_norm": 0.9891211986541748, "learning_rate": 4.1687300377638455e-05, "loss": 0.05588276386260986, "memory(GiB)": 122.96, "step": 36285, "token_acc": 0.9818644607063315, "train_speed(iter/s)": 0.235479 }, { "epoch": 2.76621693726656, "grad_norm": 2.373494863510132, "learning_rate": 4.167549380539467e-05, "loss": 0.0994698464870453, "memory(GiB)": 122.96, "step": 36290, "token_acc": 0.9686591276252019, "train_speed(iter/s)": 0.235487 }, { "epoch": 2.76659806387682, "grad_norm": 0.922339677810669, "learning_rate": 4.166368771052271e-05, "loss": 0.07894558906555176, "memory(GiB)": 122.96, "step": 36295, "token_acc": 0.9682322801024765, "train_speed(iter/s)": 0.235492 }, { "epoch": 2.76697919048708, "grad_norm": 0.22008441388607025, "learning_rate": 4.165188209369957e-05, "loss": 0.062323343753814694, "memory(GiB)": 122.96, "step": 36300, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.2355 }, { "epoch": 2.7673603170973395, "grad_norm": 1.758994698524475, "learning_rate": 4.164007695560224e-05, "loss": 0.07008575797080993, "memory(GiB)": 122.96, "step": 36305, "token_acc": 0.9783503310073358, "train_speed(iter/s)": 0.235507 }, { "epoch": 2.7677414437075996, "grad_norm": 0.9090166091918945, "learning_rate": 4.162827229690771e-05, "loss": 0.09704723358154296, "memory(GiB)": 122.96, "step": 36310, "token_acc": 0.9657799944582987, "train_speed(iter/s)": 0.235509 }, { "epoch": 2.7681225703178596, "grad_norm": 1.928450584411621, "learning_rate": 4.161646811829291e-05, "loss": 0.09053519368171692, "memory(GiB)": 122.96, "step": 36315, "token_acc": 0.9680191603268526, "train_speed(iter/s)": 0.235514 }, { "epoch": 2.7685036969281196, "grad_norm": 1.560502052307129, "learning_rate": 4.160466442043475e-05, "loss": 0.13967348337173463, "memory(GiB)": 122.96, "step": 36320, "token_acc": 0.9474010861132661, "train_speed(iter/s)": 0.235519 }, { "epoch": 2.7688848235383796, "grad_norm": 0.6211332082748413, "learning_rate": 4.159286120401015e-05, "loss": 0.07417197227478027, "memory(GiB)": 122.96, "step": 36325, "token_acc": 0.9697370001200912, "train_speed(iter/s)": 0.235521 }, { "epoch": 2.7692659501486396, "grad_norm": 0.8066635131835938, "learning_rate": 4.158105846969593e-05, "loss": 0.08970252871513366, "memory(GiB)": 122.96, "step": 36330, "token_acc": 0.9618456078083407, "train_speed(iter/s)": 0.235529 }, { "epoch": 2.769647076758899, "grad_norm": 1.2891541719436646, "learning_rate": 4.156925621816894e-05, "loss": 0.08030766844749451, "memory(GiB)": 122.96, "step": 36335, "token_acc": 0.9679044597872738, "train_speed(iter/s)": 0.235535 }, { "epoch": 2.7700282033691592, "grad_norm": 0.8287607431411743, "learning_rate": 4.155745445010598e-05, "loss": 0.08223112225532532, "memory(GiB)": 122.96, "step": 36340, "token_acc": 0.9674971126876754, "train_speed(iter/s)": 0.235539 }, { "epoch": 2.7704093299794192, "grad_norm": 0.5788384675979614, "learning_rate": 4.154565316618384e-05, "loss": 0.08237577080726624, "memory(GiB)": 122.96, "step": 36345, "token_acc": 0.9748587570621469, "train_speed(iter/s)": 0.235546 }, { "epoch": 2.770790456589679, "grad_norm": 0.8958272337913513, "learning_rate": 4.153385236707925e-05, "loss": 0.0916076123714447, "memory(GiB)": 122.96, "step": 36350, "token_acc": 0.9666374012291484, "train_speed(iter/s)": 0.235551 }, { "epoch": 2.771171583199939, "grad_norm": 0.7509499788284302, "learning_rate": 4.152205205346894e-05, "loss": 0.06918643712997437, "memory(GiB)": 122.96, "step": 36355, "token_acc": 0.9744045989597591, "train_speed(iter/s)": 0.235552 }, { "epoch": 2.771552709810199, "grad_norm": 0.718990683555603, "learning_rate": 4.15102522260296e-05, "loss": 0.09540314078330994, "memory(GiB)": 122.96, "step": 36360, "token_acc": 0.9716114210699048, "train_speed(iter/s)": 0.235559 }, { "epoch": 2.771933836420459, "grad_norm": 0.884028434753418, "learning_rate": 4.149845288543791e-05, "loss": 0.06851948499679565, "memory(GiB)": 122.96, "step": 36365, "token_acc": 0.9720868409393, "train_speed(iter/s)": 0.235564 }, { "epoch": 2.772314963030719, "grad_norm": 1.6565873622894287, "learning_rate": 4.148665403237047e-05, "loss": 0.11777944564819336, "memory(GiB)": 122.96, "step": 36370, "token_acc": 0.9572309801233722, "train_speed(iter/s)": 0.235568 }, { "epoch": 2.772696089640979, "grad_norm": 1.0289907455444336, "learning_rate": 4.147485566750393e-05, "loss": 0.07789106369018554, "memory(GiB)": 122.96, "step": 36375, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.235574 }, { "epoch": 2.773077216251239, "grad_norm": 0.5598178505897522, "learning_rate": 4.1463057791514866e-05, "loss": 0.08800234794616699, "memory(GiB)": 122.96, "step": 36380, "token_acc": 0.9632784958871915, "train_speed(iter/s)": 0.235579 }, { "epoch": 2.7734583428614985, "grad_norm": 0.6030678749084473, "learning_rate": 4.14512604050798e-05, "loss": 0.058179455995559695, "memory(GiB)": 122.96, "step": 36385, "token_acc": 0.9785207700101317, "train_speed(iter/s)": 0.235586 }, { "epoch": 2.7738394694717585, "grad_norm": 0.25762659311294556, "learning_rate": 4.143946350887529e-05, "loss": 0.10248603820800781, "memory(GiB)": 122.96, "step": 36390, "token_acc": 0.9730348511829051, "train_speed(iter/s)": 0.235594 }, { "epoch": 2.7742205960820185, "grad_norm": 1.3151499032974243, "learning_rate": 4.1427667103577824e-05, "loss": 0.07895773649215698, "memory(GiB)": 122.96, "step": 36395, "token_acc": 0.967479674796748, "train_speed(iter/s)": 0.2356 }, { "epoch": 2.774601722692278, "grad_norm": 0.522243082523346, "learning_rate": 4.141587118986388e-05, "loss": 0.07168622016906738, "memory(GiB)": 122.96, "step": 36400, "token_acc": 0.9727078891257995, "train_speed(iter/s)": 0.235601 }, { "epoch": 2.774601722692278, "eval_loss": 0.07645303755998611, "eval_runtime": 220.8907, "eval_samples_per_second": 2.399, "eval_steps_per_second": 2.399, "eval_token_acc": 0.9676977290524668, "step": 36400 }, { "epoch": 2.774982849302538, "grad_norm": 1.2162169218063354, "learning_rate": 4.140407576840985e-05, "loss": 0.13256523609161378, "memory(GiB)": 122.96, "step": 36405, "token_acc": 0.9675535271414525, "train_speed(iter/s)": 0.235274 }, { "epoch": 2.775363975912798, "grad_norm": 0.6755419373512268, "learning_rate": 4.13922808398922e-05, "loss": 0.08124409914016724, "memory(GiB)": 122.96, "step": 36410, "token_acc": 0.9695206428373511, "train_speed(iter/s)": 0.235283 }, { "epoch": 2.775745102523058, "grad_norm": 2.6815364360809326, "learning_rate": 4.138048640498731e-05, "loss": 0.0857742428779602, "memory(GiB)": 122.96, "step": 36415, "token_acc": 0.9713563605728728, "train_speed(iter/s)": 0.235292 }, { "epoch": 2.776126229133318, "grad_norm": 1.8331245183944702, "learning_rate": 4.136869246437153e-05, "loss": 0.11509518623352051, "memory(GiB)": 122.96, "step": 36420, "token_acc": 0.9640581187866428, "train_speed(iter/s)": 0.235296 }, { "epoch": 2.776507355743578, "grad_norm": 0.4943867325782776, "learning_rate": 4.135689901872117e-05, "loss": 0.06862449645996094, "memory(GiB)": 122.96, "step": 36425, "token_acc": 0.9699594544784371, "train_speed(iter/s)": 0.235303 }, { "epoch": 2.776888482353838, "grad_norm": 1.251592755317688, "learning_rate": 4.1345106068712554e-05, "loss": 0.0933029294013977, "memory(GiB)": 122.96, "step": 36430, "token_acc": 0.9696132596685083, "train_speed(iter/s)": 0.235313 }, { "epoch": 2.777269608964098, "grad_norm": 0.7031018137931824, "learning_rate": 4.133331361502194e-05, "loss": 0.09444934725761414, "memory(GiB)": 122.96, "step": 36435, "token_acc": 0.9645120405576679, "train_speed(iter/s)": 0.235314 }, { "epoch": 2.777650735574358, "grad_norm": 0.025030970573425293, "learning_rate": 4.1321521658325565e-05, "loss": 0.06731322407722473, "memory(GiB)": 122.96, "step": 36440, "token_acc": 0.9681245366938473, "train_speed(iter/s)": 0.235322 }, { "epoch": 2.778031862184618, "grad_norm": 0.8849080204963684, "learning_rate": 4.130973019929965e-05, "loss": 0.11034375429153442, "memory(GiB)": 122.96, "step": 36445, "token_acc": 0.9536423841059603, "train_speed(iter/s)": 0.235332 }, { "epoch": 2.7784129887948774, "grad_norm": 0.6636297106742859, "learning_rate": 4.1297939238620386e-05, "loss": 0.07208907604217529, "memory(GiB)": 122.96, "step": 36450, "token_acc": 0.9758194519075766, "train_speed(iter/s)": 0.23534 }, { "epoch": 2.7787941154051374, "grad_norm": 1.6018611192703247, "learning_rate": 4.128614877696393e-05, "loss": 0.06330386400222779, "memory(GiB)": 122.96, "step": 36455, "token_acc": 0.9770869623475814, "train_speed(iter/s)": 0.23534 }, { "epoch": 2.7791752420153975, "grad_norm": 1.1094274520874023, "learning_rate": 4.1274358815006385e-05, "loss": 0.06758404970169067, "memory(GiB)": 122.96, "step": 36460, "token_acc": 0.9698403311649911, "train_speed(iter/s)": 0.23535 }, { "epoch": 2.7795563686256575, "grad_norm": 1.913881540298462, "learning_rate": 4.126256935342388e-05, "loss": 0.1271100401878357, "memory(GiB)": 122.96, "step": 36465, "token_acc": 0.9585654596100278, "train_speed(iter/s)": 0.235359 }, { "epoch": 2.7799374952359175, "grad_norm": 1.7787593603134155, "learning_rate": 4.1250780392892485e-05, "loss": 0.10310649871826172, "memory(GiB)": 122.96, "step": 36470, "token_acc": 0.9540332147093713, "train_speed(iter/s)": 0.235367 }, { "epoch": 2.7803186218461775, "grad_norm": 0.8732700347900391, "learning_rate": 4.123899193408822e-05, "loss": 0.08938018083572388, "memory(GiB)": 122.96, "step": 36475, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.235378 }, { "epoch": 2.780699748456437, "grad_norm": 0.7307310104370117, "learning_rate": 4.122720397768712e-05, "loss": 0.05103349685668945, "memory(GiB)": 122.96, "step": 36480, "token_acc": 0.973466641030571, "train_speed(iter/s)": 0.235382 }, { "epoch": 2.781080875066697, "grad_norm": 1.193358302116394, "learning_rate": 4.121541652436516e-05, "loss": 0.07958240509033203, "memory(GiB)": 122.96, "step": 36485, "token_acc": 0.9742044812652985, "train_speed(iter/s)": 0.235388 }, { "epoch": 2.781462001676957, "grad_norm": 1.086230993270874, "learning_rate": 4.1203629574798285e-05, "loss": 0.11204242706298828, "memory(GiB)": 122.96, "step": 36490, "token_acc": 0.9503424657534246, "train_speed(iter/s)": 0.235397 }, { "epoch": 2.781843128287217, "grad_norm": 1.2544385194778442, "learning_rate": 4.119184312966245e-05, "loss": 0.08108473420143128, "memory(GiB)": 122.96, "step": 36495, "token_acc": 0.9720930232558139, "train_speed(iter/s)": 0.235401 }, { "epoch": 2.7822242548974767, "grad_norm": 0.704272985458374, "learning_rate": 4.118005718963353e-05, "loss": 0.06436741352081299, "memory(GiB)": 122.96, "step": 36500, "token_acc": 0.9684512428298279, "train_speed(iter/s)": 0.235413 }, { "epoch": 2.7826053815077367, "grad_norm": 0.9944061636924744, "learning_rate": 4.116827175538741e-05, "loss": 0.08787984251976014, "memory(GiB)": 122.96, "step": 36505, "token_acc": 0.9662212323682257, "train_speed(iter/s)": 0.235418 }, { "epoch": 2.7829865081179967, "grad_norm": 1.3454316854476929, "learning_rate": 4.11564868275999e-05, "loss": 0.11848341226577759, "memory(GiB)": 122.96, "step": 36510, "token_acc": 0.9532424158085165, "train_speed(iter/s)": 0.235426 }, { "epoch": 2.7833676347282568, "grad_norm": 1.0767661333084106, "learning_rate": 4.114470240694685e-05, "loss": 0.09040989875793456, "memory(GiB)": 122.96, "step": 36515, "token_acc": 0.9662540274715957, "train_speed(iter/s)": 0.235431 }, { "epoch": 2.783748761338517, "grad_norm": 1.2659990787506104, "learning_rate": 4.1132918494104015e-05, "loss": 0.10079787969589234, "memory(GiB)": 122.96, "step": 36520, "token_acc": 0.9621165328392012, "train_speed(iter/s)": 0.235437 }, { "epoch": 2.784129887948777, "grad_norm": 0.74942946434021, "learning_rate": 4.1121135089747156e-05, "loss": 0.11501826047897339, "memory(GiB)": 122.96, "step": 36525, "token_acc": 0.9582917912927883, "train_speed(iter/s)": 0.235444 }, { "epoch": 2.7845110145590364, "grad_norm": 1.0585417747497559, "learning_rate": 4.1109352194552e-05, "loss": 0.08875986337661743, "memory(GiB)": 122.96, "step": 36530, "token_acc": 0.9589930978481527, "train_speed(iter/s)": 0.23545 }, { "epoch": 2.7848921411692964, "grad_norm": 1.0165746212005615, "learning_rate": 4.109756980919424e-05, "loss": 0.060883831977844236, "memory(GiB)": 122.96, "step": 36535, "token_acc": 0.9772380291464261, "train_speed(iter/s)": 0.235453 }, { "epoch": 2.7852732677795564, "grad_norm": 0.9741947650909424, "learning_rate": 4.108578793434951e-05, "loss": 0.050427043437957765, "memory(GiB)": 122.96, "step": 36540, "token_acc": 0.9752377828796327, "train_speed(iter/s)": 0.235457 }, { "epoch": 2.7856543943898164, "grad_norm": 0.7384887933731079, "learning_rate": 4.1074006570693507e-05, "loss": 0.11738067865371704, "memory(GiB)": 122.96, "step": 36545, "token_acc": 0.9503311258278145, "train_speed(iter/s)": 0.235464 }, { "epoch": 2.786035521000076, "grad_norm": 0.7410151958465576, "learning_rate": 4.1062225718901795e-05, "loss": 0.0851446270942688, "memory(GiB)": 122.96, "step": 36550, "token_acc": 0.9665173308182032, "train_speed(iter/s)": 0.235472 }, { "epoch": 2.786416647610336, "grad_norm": 0.6020153164863586, "learning_rate": 4.105044537964996e-05, "loss": 0.07792996168136597, "memory(GiB)": 122.96, "step": 36555, "token_acc": 0.9788799240626483, "train_speed(iter/s)": 0.235479 }, { "epoch": 2.786797774220596, "grad_norm": 0.8210570812225342, "learning_rate": 4.1038665553613567e-05, "loss": 0.09622241258621216, "memory(GiB)": 122.96, "step": 36560, "token_acc": 0.9623644917160974, "train_speed(iter/s)": 0.235486 }, { "epoch": 2.787178900830856, "grad_norm": 0.9296842217445374, "learning_rate": 4.10268862414681e-05, "loss": 0.09591569900512695, "memory(GiB)": 122.96, "step": 36565, "token_acc": 0.9644038431249016, "train_speed(iter/s)": 0.23549 }, { "epoch": 2.787560027441116, "grad_norm": 1.170676350593567, "learning_rate": 4.101510744388908e-05, "loss": 0.09500133991241455, "memory(GiB)": 122.96, "step": 36570, "token_acc": 0.9573371805441055, "train_speed(iter/s)": 0.235497 }, { "epoch": 2.787941154051376, "grad_norm": 1.4319919347763062, "learning_rate": 4.100332916155195e-05, "loss": 0.10960228443145752, "memory(GiB)": 122.96, "step": 36575, "token_acc": 0.9546148810822606, "train_speed(iter/s)": 0.235503 }, { "epoch": 2.7883222806616357, "grad_norm": 0.7024170160293579, "learning_rate": 4.099155139513213e-05, "loss": 0.05863608121871948, "memory(GiB)": 122.96, "step": 36580, "token_acc": 0.9613722312263642, "train_speed(iter/s)": 0.235513 }, { "epoch": 2.7887034072718957, "grad_norm": 1.4978535175323486, "learning_rate": 4.097977414530505e-05, "loss": 0.09685714244842529, "memory(GiB)": 122.96, "step": 36585, "token_acc": 0.9598082595870207, "train_speed(iter/s)": 0.235518 }, { "epoch": 2.7890845338821557, "grad_norm": 1.4491987228393555, "learning_rate": 4.096799741274606e-05, "loss": 0.0978753924369812, "memory(GiB)": 122.96, "step": 36590, "token_acc": 0.9628174123337364, "train_speed(iter/s)": 0.235526 }, { "epoch": 2.7894656604924157, "grad_norm": 1.3727492094039917, "learning_rate": 4.09562211981305e-05, "loss": 0.0904083013534546, "memory(GiB)": 122.96, "step": 36595, "token_acc": 0.9655629139072848, "train_speed(iter/s)": 0.235535 }, { "epoch": 2.7898467871026753, "grad_norm": 1.0467239618301392, "learning_rate": 4.094444550213369e-05, "loss": 0.1301613688468933, "memory(GiB)": 122.96, "step": 36600, "token_acc": 0.9525022747952684, "train_speed(iter/s)": 0.235541 }, { "epoch": 2.7898467871026753, "eval_loss": 0.07642733305692673, "eval_runtime": 216.2575, "eval_samples_per_second": 2.451, "eval_steps_per_second": 2.451, "eval_token_acc": 0.9678332630564423, "step": 36600 }, { "epoch": 2.7902279137129353, "grad_norm": 0.40056517720222473, "learning_rate": 4.093267032543091e-05, "loss": 0.06338745951652527, "memory(GiB)": 122.96, "step": 36605, "token_acc": 0.9677159707892464, "train_speed(iter/s)": 0.235222 }, { "epoch": 2.7906090403231953, "grad_norm": 1.1515675783157349, "learning_rate": 4.0920895668697414e-05, "loss": 0.04842616319656372, "memory(GiB)": 122.96, "step": 36610, "token_acc": 0.9807930607187113, "train_speed(iter/s)": 0.235229 }, { "epoch": 2.7909901669334554, "grad_norm": 1.4294439554214478, "learning_rate": 4.09091215326084e-05, "loss": 0.06429333686828613, "memory(GiB)": 122.96, "step": 36615, "token_acc": 0.966078431372549, "train_speed(iter/s)": 0.235237 }, { "epoch": 2.7913712935437154, "grad_norm": 0.6767603754997253, "learning_rate": 4.089734791783909e-05, "loss": 0.06859900951385497, "memory(GiB)": 122.96, "step": 36620, "token_acc": 0.9770878299850573, "train_speed(iter/s)": 0.235242 }, { "epoch": 2.7917524201539754, "grad_norm": 0.6418783664703369, "learning_rate": 4.088557482506464e-05, "loss": 0.08225930333137513, "memory(GiB)": 122.96, "step": 36625, "token_acc": 0.9716098334655036, "train_speed(iter/s)": 0.235246 }, { "epoch": 2.792133546764235, "grad_norm": 1.0925025939941406, "learning_rate": 4.087380225496017e-05, "loss": 0.07174615859985352, "memory(GiB)": 122.96, "step": 36630, "token_acc": 0.967371229222245, "train_speed(iter/s)": 0.235252 }, { "epoch": 2.792514673374495, "grad_norm": 0.9911214709281921, "learning_rate": 4.08620302082008e-05, "loss": 0.06293715238571167, "memory(GiB)": 122.96, "step": 36635, "token_acc": 0.9766718506998445, "train_speed(iter/s)": 0.235261 }, { "epoch": 2.792895799984755, "grad_norm": 0.5798222422599792, "learning_rate": 4.0850258685461585e-05, "loss": 0.07545194625854493, "memory(GiB)": 122.96, "step": 36640, "token_acc": 0.9535617673579802, "train_speed(iter/s)": 0.235271 }, { "epoch": 2.7932769265950146, "grad_norm": 0.656215488910675, "learning_rate": 4.083848768741757e-05, "loss": 0.09164856076240539, "memory(GiB)": 122.96, "step": 36645, "token_acc": 0.9734885224700938, "train_speed(iter/s)": 0.23528 }, { "epoch": 2.7936580532052746, "grad_norm": 0.8668796420097351, "learning_rate": 4.082671721474378e-05, "loss": 0.09737058877944946, "memory(GiB)": 122.96, "step": 36650, "token_acc": 0.9666367252662647, "train_speed(iter/s)": 0.235283 }, { "epoch": 2.7940391798155346, "grad_norm": 0.9103873372077942, "learning_rate": 4.0814947268115184e-05, "loss": 0.12800090312957763, "memory(GiB)": 122.96, "step": 36655, "token_acc": 0.9596987315010571, "train_speed(iter/s)": 0.235289 }, { "epoch": 2.7944203064257946, "grad_norm": 1.0717148780822754, "learning_rate": 4.080317784820673e-05, "loss": 0.09216012358665467, "memory(GiB)": 122.96, "step": 36660, "token_acc": 0.9637705467963771, "train_speed(iter/s)": 0.235299 }, { "epoch": 2.7948014330360547, "grad_norm": 0.7615875005722046, "learning_rate": 4.079140895569337e-05, "loss": 0.04761860370635986, "memory(GiB)": 122.96, "step": 36665, "token_acc": 0.9757240204429302, "train_speed(iter/s)": 0.23531 }, { "epoch": 2.7951825596463147, "grad_norm": 1.024062991142273, "learning_rate": 4.077964059124996e-05, "loss": 0.08474367260932922, "memory(GiB)": 122.96, "step": 36670, "token_acc": 0.9658048373644704, "train_speed(iter/s)": 0.23532 }, { "epoch": 2.7955636862565747, "grad_norm": 1.5493580102920532, "learning_rate": 4.076787275555135e-05, "loss": 0.12462332248687744, "memory(GiB)": 122.96, "step": 36675, "token_acc": 0.9592261904761905, "train_speed(iter/s)": 0.235326 }, { "epoch": 2.7959448128668343, "grad_norm": 0.5942776799201965, "learning_rate": 4.075610544927243e-05, "loss": 0.0643521249294281, "memory(GiB)": 122.96, "step": 36680, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.235331 }, { "epoch": 2.7963259394770943, "grad_norm": 0.9140352010726929, "learning_rate": 4.074433867308795e-05, "loss": 0.07752467393875122, "memory(GiB)": 122.96, "step": 36685, "token_acc": 0.9726590854101137, "train_speed(iter/s)": 0.23534 }, { "epoch": 2.7967070660873543, "grad_norm": 2.738765239715576, "learning_rate": 4.07325724276727e-05, "loss": 0.10140366554260254, "memory(GiB)": 122.96, "step": 36690, "token_acc": 0.9670273055126224, "train_speed(iter/s)": 0.235349 }, { "epoch": 2.797088192697614, "grad_norm": 1.4033223390579224, "learning_rate": 4.072080671370142e-05, "loss": 0.10834966897964478, "memory(GiB)": 122.96, "step": 36695, "token_acc": 0.9661234991423671, "train_speed(iter/s)": 0.235358 }, { "epoch": 2.797469319307874, "grad_norm": 0.9291077256202698, "learning_rate": 4.07090415318488e-05, "loss": 0.08157490491867066, "memory(GiB)": 122.96, "step": 36700, "token_acc": 0.9655453065653825, "train_speed(iter/s)": 0.235361 }, { "epoch": 2.797850445918134, "grad_norm": 0.6391225457191467, "learning_rate": 4.069727688278955e-05, "loss": 0.08174354434013367, "memory(GiB)": 122.96, "step": 36705, "token_acc": 0.9724358974358974, "train_speed(iter/s)": 0.23537 }, { "epoch": 2.798231572528394, "grad_norm": 1.0302814245224, "learning_rate": 4.06855127671983e-05, "loss": 0.06498481631278992, "memory(GiB)": 122.96, "step": 36710, "token_acc": 0.9729326115363076, "train_speed(iter/s)": 0.235375 }, { "epoch": 2.798612699138654, "grad_norm": 1.0147652626037598, "learning_rate": 4.0673749185749654e-05, "loss": 0.09835391640663146, "memory(GiB)": 122.96, "step": 36715, "token_acc": 0.9571192963166575, "train_speed(iter/s)": 0.235387 }, { "epoch": 2.798993825748914, "grad_norm": 1.1075419187545776, "learning_rate": 4.066198613911822e-05, "loss": 0.08342958688735962, "memory(GiB)": 122.96, "step": 36720, "token_acc": 0.9617607070020394, "train_speed(iter/s)": 0.235393 }, { "epoch": 2.7993749523591736, "grad_norm": 1.3226616382598877, "learning_rate": 4.0650223627978554e-05, "loss": 0.11905139684677124, "memory(GiB)": 122.96, "step": 36725, "token_acc": 0.9503163815326927, "train_speed(iter/s)": 0.235401 }, { "epoch": 2.7997560789694336, "grad_norm": 1.494226098060608, "learning_rate": 4.063846165300518e-05, "loss": 0.08005784749984741, "memory(GiB)": 122.96, "step": 36730, "token_acc": 0.9805961754780652, "train_speed(iter/s)": 0.23541 }, { "epoch": 2.8001372055796936, "grad_norm": 1.0193595886230469, "learning_rate": 4.062670021487256e-05, "loss": 0.09067997932434083, "memory(GiB)": 122.96, "step": 36735, "token_acc": 0.9654282765737874, "train_speed(iter/s)": 0.235416 }, { "epoch": 2.8005183321899536, "grad_norm": 0.9789173603057861, "learning_rate": 4.061493931425521e-05, "loss": 0.11200917959213257, "memory(GiB)": 122.96, "step": 36740, "token_acc": 0.9526081424936387, "train_speed(iter/s)": 0.235425 }, { "epoch": 2.800899458800213, "grad_norm": 1.67096745967865, "learning_rate": 4.0603178951827536e-05, "loss": 0.11819676160812378, "memory(GiB)": 122.96, "step": 36745, "token_acc": 0.9545560475413657, "train_speed(iter/s)": 0.235432 }, { "epoch": 2.801280585410473, "grad_norm": 1.0002846717834473, "learning_rate": 4.059141912826393e-05, "loss": 0.11247262954711915, "memory(GiB)": 122.96, "step": 36750, "token_acc": 0.9625, "train_speed(iter/s)": 0.235435 }, { "epoch": 2.801661712020733, "grad_norm": 1.2062668800354004, "learning_rate": 4.057965984423879e-05, "loss": 0.09238088726997376, "memory(GiB)": 122.96, "step": 36755, "token_acc": 0.957391713747646, "train_speed(iter/s)": 0.235443 }, { "epoch": 2.8020428386309932, "grad_norm": 1.239045262336731, "learning_rate": 4.056790110042644e-05, "loss": 0.08172243237495422, "memory(GiB)": 122.96, "step": 36760, "token_acc": 0.966756294304571, "train_speed(iter/s)": 0.235449 }, { "epoch": 2.8024239652412533, "grad_norm": 0.6657172441482544, "learning_rate": 4.0556142897501174e-05, "loss": 0.0768383264541626, "memory(GiB)": 122.96, "step": 36765, "token_acc": 0.9704312114989733, "train_speed(iter/s)": 0.235456 }, { "epoch": 2.8028050918515133, "grad_norm": 1.4484046697616577, "learning_rate": 4.0544385236137305e-05, "loss": 0.055221033096313474, "memory(GiB)": 122.96, "step": 36770, "token_acc": 0.9761222540592168, "train_speed(iter/s)": 0.235465 }, { "epoch": 2.803186218461773, "grad_norm": 0.8007888197898865, "learning_rate": 4.0532628117009066e-05, "loss": 0.08206046223640442, "memory(GiB)": 122.96, "step": 36775, "token_acc": 0.9619120654396728, "train_speed(iter/s)": 0.235474 }, { "epoch": 2.803567345072033, "grad_norm": 1.5043879747390747, "learning_rate": 4.052087154079066e-05, "loss": 0.06869337558746338, "memory(GiB)": 122.96, "step": 36780, "token_acc": 0.9816939890710382, "train_speed(iter/s)": 0.235482 }, { "epoch": 2.803948471682293, "grad_norm": 1.84316086769104, "learning_rate": 4.0509115508156284e-05, "loss": 0.06745745539665222, "memory(GiB)": 122.96, "step": 36785, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.235489 }, { "epoch": 2.804329598292553, "grad_norm": 0.8834179043769836, "learning_rate": 4.0497360019780096e-05, "loss": 0.08040390610694885, "memory(GiB)": 122.96, "step": 36790, "token_acc": 0.9733291983253217, "train_speed(iter/s)": 0.235494 }, { "epoch": 2.8047107249028125, "grad_norm": 0.7222166061401367, "learning_rate": 4.048560507633621e-05, "loss": 0.06423368453979492, "memory(GiB)": 122.96, "step": 36795, "token_acc": 0.9733741978737669, "train_speed(iter/s)": 0.235495 }, { "epoch": 2.8050918515130725, "grad_norm": 0.9137911200523376, "learning_rate": 4.0473850678498716e-05, "loss": 0.09731810092926026, "memory(GiB)": 122.96, "step": 36800, "token_acc": 0.9520016767973172, "train_speed(iter/s)": 0.235501 }, { "epoch": 2.8050918515130725, "eval_loss": 0.07635236531496048, "eval_runtime": 215.1752, "eval_samples_per_second": 2.463, "eval_steps_per_second": 2.463, "eval_token_acc": 0.9681419793988314, "step": 36800 }, { "epoch": 2.8054729781233325, "grad_norm": 1.1574220657348633, "learning_rate": 4.046209682694169e-05, "loss": 0.08730719089508057, "memory(GiB)": 122.96, "step": 36805, "token_acc": 0.9681948612358443, "train_speed(iter/s)": 0.235185 }, { "epoch": 2.8058541047335925, "grad_norm": 0.4871918261051178, "learning_rate": 4.045034352233912e-05, "loss": 0.04637258052825928, "memory(GiB)": 122.96, "step": 36810, "token_acc": 0.9785867237687366, "train_speed(iter/s)": 0.235196 }, { "epoch": 2.8062352313438526, "grad_norm": 1.2497634887695312, "learning_rate": 4.043859076536506e-05, "loss": 0.0716752290725708, "memory(GiB)": 122.96, "step": 36815, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.2352 }, { "epoch": 2.8066163579541126, "grad_norm": 0.7433101534843445, "learning_rate": 4.042683855669344e-05, "loss": 0.08512197136878967, "memory(GiB)": 122.96, "step": 36820, "token_acc": 0.9691931540342298, "train_speed(iter/s)": 0.235205 }, { "epoch": 2.806997484564372, "grad_norm": 0.9254250526428223, "learning_rate": 4.041508689699821e-05, "loss": 0.06816704273223877, "memory(GiB)": 122.96, "step": 36825, "token_acc": 0.9718354936789887, "train_speed(iter/s)": 0.235212 }, { "epoch": 2.807378611174632, "grad_norm": 1.375670075416565, "learning_rate": 4.0403335786953275e-05, "loss": 0.09206663966178893, "memory(GiB)": 122.96, "step": 36830, "token_acc": 0.9578049846961084, "train_speed(iter/s)": 0.235218 }, { "epoch": 2.807759737784892, "grad_norm": 0.5542064309120178, "learning_rate": 4.039158522723249e-05, "loss": 0.08337479829788208, "memory(GiB)": 122.96, "step": 36835, "token_acc": 0.9745938472174214, "train_speed(iter/s)": 0.235223 }, { "epoch": 2.808140864395152, "grad_norm": 2.0370168685913086, "learning_rate": 4.03798352185097e-05, "loss": 0.10555912256240844, "memory(GiB)": 122.96, "step": 36840, "token_acc": 0.9663899574812715, "train_speed(iter/s)": 0.235231 }, { "epoch": 2.8085219910054118, "grad_norm": 2.373049736022949, "learning_rate": 4.036808576145872e-05, "loss": 0.10478427410125732, "memory(GiB)": 122.96, "step": 36845, "token_acc": 0.9606548719302879, "train_speed(iter/s)": 0.23524 }, { "epoch": 2.808903117615672, "grad_norm": 1.202152132987976, "learning_rate": 4.035633685675333e-05, "loss": 0.09056482315063477, "memory(GiB)": 122.96, "step": 36850, "token_acc": 0.9699723429474516, "train_speed(iter/s)": 0.235247 }, { "epoch": 2.809284244225932, "grad_norm": 0.754456639289856, "learning_rate": 4.034458850506726e-05, "loss": 0.042677664756774904, "memory(GiB)": 122.96, "step": 36855, "token_acc": 0.9773396782234308, "train_speed(iter/s)": 0.235255 }, { "epoch": 2.809665370836192, "grad_norm": 1.0163607597351074, "learning_rate": 4.0332840707074235e-05, "loss": 0.10294344425201415, "memory(GiB)": 122.96, "step": 36860, "token_acc": 0.963991527418216, "train_speed(iter/s)": 0.235263 }, { "epoch": 2.810046497446452, "grad_norm": 1.2148113250732422, "learning_rate": 4.032109346344795e-05, "loss": 0.04769116342067718, "memory(GiB)": 122.96, "step": 36865, "token_acc": 0.9779563719862228, "train_speed(iter/s)": 0.23527 }, { "epoch": 2.810427624056712, "grad_norm": 0.7905786633491516, "learning_rate": 4.030934677486201e-05, "loss": 0.08967609405517578, "memory(GiB)": 122.96, "step": 36870, "token_acc": 0.9637497892429607, "train_speed(iter/s)": 0.235277 }, { "epoch": 2.8108087506669714, "grad_norm": 0.6032323837280273, "learning_rate": 4.029760064199009e-05, "loss": 0.06143359541893005, "memory(GiB)": 122.96, "step": 36875, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.235283 }, { "epoch": 2.8111898772772315, "grad_norm": 1.2043129205703735, "learning_rate": 4.028585506550574e-05, "loss": 0.0717646598815918, "memory(GiB)": 122.96, "step": 36880, "token_acc": 0.974146185764443, "train_speed(iter/s)": 0.235291 }, { "epoch": 2.8115710038874915, "grad_norm": 0.6496508121490479, "learning_rate": 4.027411004608251e-05, "loss": 0.10287641286849976, "memory(GiB)": 122.96, "step": 36885, "token_acc": 0.9690311418685121, "train_speed(iter/s)": 0.235296 }, { "epoch": 2.8119521304977515, "grad_norm": 1.0407556295394897, "learning_rate": 4.026236558439394e-05, "loss": 0.12701424360275268, "memory(GiB)": 122.96, "step": 36890, "token_acc": 0.944979794839913, "train_speed(iter/s)": 0.235305 }, { "epoch": 2.812333257108011, "grad_norm": 0.8468184471130371, "learning_rate": 4.025062168111353e-05, "loss": 0.09326013326644897, "memory(GiB)": 122.96, "step": 36895, "token_acc": 0.9709443099273608, "train_speed(iter/s)": 0.235311 }, { "epoch": 2.812714383718271, "grad_norm": 0.7214844822883606, "learning_rate": 4.023887833691471e-05, "loss": 0.09739107489585877, "memory(GiB)": 122.96, "step": 36900, "token_acc": 0.9736919029523531, "train_speed(iter/s)": 0.235321 }, { "epoch": 2.813095510328531, "grad_norm": 1.2149076461791992, "learning_rate": 4.0227135552470927e-05, "loss": 0.06845536231994628, "memory(GiB)": 122.96, "step": 36905, "token_acc": 0.9745293466223699, "train_speed(iter/s)": 0.235328 }, { "epoch": 2.813476636938791, "grad_norm": 0.8331543207168579, "learning_rate": 4.021539332845557e-05, "loss": 0.09929978847503662, "memory(GiB)": 122.96, "step": 36910, "token_acc": 0.9630504072718687, "train_speed(iter/s)": 0.235326 }, { "epoch": 2.813857763549051, "grad_norm": 1.249104619026184, "learning_rate": 4.0203651665542e-05, "loss": 0.06204451322555542, "memory(GiB)": 122.96, "step": 36915, "token_acc": 0.9702026221692491, "train_speed(iter/s)": 0.235335 }, { "epoch": 2.814238890159311, "grad_norm": 0.7015763521194458, "learning_rate": 4.019191056440353e-05, "loss": 0.05535479784011841, "memory(GiB)": 122.96, "step": 36920, "token_acc": 0.9789674952198852, "train_speed(iter/s)": 0.235345 }, { "epoch": 2.8146200167695707, "grad_norm": 0.8901872634887695, "learning_rate": 4.0180170025713484e-05, "loss": 0.08181743025779724, "memory(GiB)": 122.96, "step": 36925, "token_acc": 0.9627031019202363, "train_speed(iter/s)": 0.235354 }, { "epoch": 2.8150011433798308, "grad_norm": 0.7533461451530457, "learning_rate": 4.0168430050145125e-05, "loss": 0.06646875143051148, "memory(GiB)": 122.96, "step": 36930, "token_acc": 0.9758750247182124, "train_speed(iter/s)": 0.23536 }, { "epoch": 2.815382269990091, "grad_norm": 1.0599215030670166, "learning_rate": 4.015669063837167e-05, "loss": 0.08506077527999878, "memory(GiB)": 122.96, "step": 36935, "token_acc": 0.9669811320754716, "train_speed(iter/s)": 0.235368 }, { "epoch": 2.815763396600351, "grad_norm": 1.1567624807357788, "learning_rate": 4.014495179106632e-05, "loss": 0.10604052543640137, "memory(GiB)": 122.96, "step": 36940, "token_acc": 0.9700194873332334, "train_speed(iter/s)": 0.235373 }, { "epoch": 2.8161445232106104, "grad_norm": 1.0826857089996338, "learning_rate": 4.013321350890226e-05, "loss": 0.0680406391620636, "memory(GiB)": 122.96, "step": 36945, "token_acc": 0.9750912604681126, "train_speed(iter/s)": 0.235378 }, { "epoch": 2.8165256498208704, "grad_norm": 0.6640536785125732, "learning_rate": 4.012147579255262e-05, "loss": 0.06095672845840454, "memory(GiB)": 122.96, "step": 36950, "token_acc": 0.968032647508927, "train_speed(iter/s)": 0.235383 }, { "epoch": 2.8169067764311304, "grad_norm": 0.6961215138435364, "learning_rate": 4.010973864269051e-05, "loss": 0.12544753551483154, "memory(GiB)": 122.96, "step": 36955, "token_acc": 0.9538718929254302, "train_speed(iter/s)": 0.235392 }, { "epoch": 2.8172879030413904, "grad_norm": 0.5102221965789795, "learning_rate": 4.009800205998897e-05, "loss": 0.0820692539215088, "memory(GiB)": 122.96, "step": 36960, "token_acc": 0.9663127059685097, "train_speed(iter/s)": 0.235397 }, { "epoch": 2.8176690296516504, "grad_norm": 0.045260608196258545, "learning_rate": 4.008626604512108e-05, "loss": 0.04682015180587769, "memory(GiB)": 122.96, "step": 36965, "token_acc": 0.9781560283687943, "train_speed(iter/s)": 0.235406 }, { "epoch": 2.8180501562619105, "grad_norm": 1.443069577217102, "learning_rate": 4.007453059875983e-05, "loss": 0.10248687267303466, "memory(GiB)": 122.96, "step": 36970, "token_acc": 0.9698197285801093, "train_speed(iter/s)": 0.235411 }, { "epoch": 2.81843128287217, "grad_norm": 1.5521090030670166, "learning_rate": 4.006279572157817e-05, "loss": 0.07119889259338379, "memory(GiB)": 122.96, "step": 36975, "token_acc": 0.9754290171606864, "train_speed(iter/s)": 0.235422 }, { "epoch": 2.81881240948243, "grad_norm": 0.5529496669769287, "learning_rate": 4.005106141424908e-05, "loss": 0.07963992357254028, "memory(GiB)": 122.96, "step": 36980, "token_acc": 0.969797958758592, "train_speed(iter/s)": 0.235429 }, { "epoch": 2.81919353609269, "grad_norm": 0.8216714262962341, "learning_rate": 4.003932767744545e-05, "loss": 0.07668164968490601, "memory(GiB)": 122.96, "step": 36985, "token_acc": 0.9701456310679611, "train_speed(iter/s)": 0.235436 }, { "epoch": 2.8195746627029497, "grad_norm": 0.9717664122581482, "learning_rate": 4.0027594511840136e-05, "loss": 0.1049383044242859, "memory(GiB)": 122.96, "step": 36990, "token_acc": 0.9604819277108434, "train_speed(iter/s)": 0.235442 }, { "epoch": 2.8199557893132097, "grad_norm": 1.0794439315795898, "learning_rate": 4.0015861918106016e-05, "loss": 0.0885585069656372, "memory(GiB)": 122.96, "step": 36995, "token_acc": 0.9661538461538461, "train_speed(iter/s)": 0.235451 }, { "epoch": 2.8203369159234697, "grad_norm": 3.250413656234741, "learning_rate": 4.000412989691588e-05, "loss": 0.12776585817337036, "memory(GiB)": 122.96, "step": 37000, "token_acc": 0.9549242424242425, "train_speed(iter/s)": 0.235461 }, { "epoch": 2.8203369159234697, "eval_loss": 0.07391706854104996, "eval_runtime": 219.3961, "eval_samples_per_second": 2.416, "eval_steps_per_second": 2.416, "eval_token_acc": 0.9683302210710198, "step": 37000 }, { "epoch": 2.8207180425337297, "grad_norm": 1.1751999855041504, "learning_rate": 3.999239844894251e-05, "loss": 0.13115190267562865, "memory(GiB)": 122.96, "step": 37005, "token_acc": 0.9679376083188909, "train_speed(iter/s)": 0.235137 }, { "epoch": 2.8210991691439897, "grad_norm": 0.5355262756347656, "learning_rate": 3.998066757485863e-05, "loss": 0.06140064001083374, "memory(GiB)": 122.96, "step": 37010, "token_acc": 0.9669980119284294, "train_speed(iter/s)": 0.235147 }, { "epoch": 2.8214802957542497, "grad_norm": 0.9067244529724121, "learning_rate": 3.9968937275336995e-05, "loss": 0.05486927032470703, "memory(GiB)": 122.96, "step": 37015, "token_acc": 0.9738219895287958, "train_speed(iter/s)": 0.235151 }, { "epoch": 2.8218614223645098, "grad_norm": 1.1822378635406494, "learning_rate": 3.9957207551050243e-05, "loss": 0.08770846128463745, "memory(GiB)": 122.96, "step": 37020, "token_acc": 0.9645661750622486, "train_speed(iter/s)": 0.235157 }, { "epoch": 2.8222425489747693, "grad_norm": 1.6147971153259277, "learning_rate": 3.994547840267103e-05, "loss": 0.06765681505203247, "memory(GiB)": 122.96, "step": 37025, "token_acc": 0.9709281328999639, "train_speed(iter/s)": 0.235162 }, { "epoch": 2.8226236755850294, "grad_norm": 2.579986810684204, "learning_rate": 3.993374983087198e-05, "loss": 0.08962969183921814, "memory(GiB)": 122.96, "step": 37030, "token_acc": 0.9693816884661117, "train_speed(iter/s)": 0.235168 }, { "epoch": 2.8230048021952894, "grad_norm": 0.4994626045227051, "learning_rate": 3.992202183632567e-05, "loss": 0.07182464599609376, "memory(GiB)": 122.96, "step": 37035, "token_acc": 0.9739983646770237, "train_speed(iter/s)": 0.235171 }, { "epoch": 2.823385928805549, "grad_norm": 1.139796495437622, "learning_rate": 3.991029441970462e-05, "loss": 0.07686096429824829, "memory(GiB)": 122.96, "step": 37040, "token_acc": 0.9675938428301377, "train_speed(iter/s)": 0.23518 }, { "epoch": 2.823767055415809, "grad_norm": 1.5872453451156616, "learning_rate": 3.989856758168138e-05, "loss": 0.08141154050827026, "memory(GiB)": 122.96, "step": 37045, "token_acc": 0.9714334296888284, "train_speed(iter/s)": 0.235183 }, { "epoch": 2.824148182026069, "grad_norm": 0.8142287731170654, "learning_rate": 3.9886841322928415e-05, "loss": 0.0812134325504303, "memory(GiB)": 122.96, "step": 37050, "token_acc": 0.9753887966178469, "train_speed(iter/s)": 0.235188 }, { "epoch": 2.824529308636329, "grad_norm": 0.7422438859939575, "learning_rate": 3.9875115644118156e-05, "loss": 0.11091704368591308, "memory(GiB)": 122.96, "step": 37055, "token_acc": 0.9615530303030303, "train_speed(iter/s)": 0.235194 }, { "epoch": 2.824910435246589, "grad_norm": 1.0799150466918945, "learning_rate": 3.9863390545923036e-05, "loss": 0.08091414570808411, "memory(GiB)": 122.96, "step": 37060, "token_acc": 0.970703125, "train_speed(iter/s)": 0.235201 }, { "epoch": 2.825291561856849, "grad_norm": 1.383588194847107, "learning_rate": 3.9851666029015436e-05, "loss": 0.13060142993927001, "memory(GiB)": 122.96, "step": 37065, "token_acc": 0.9403354412977729, "train_speed(iter/s)": 0.235211 }, { "epoch": 2.8256726884671086, "grad_norm": 1.1318490505218506, "learning_rate": 3.9839942094067684e-05, "loss": 0.09031330943107604, "memory(GiB)": 122.96, "step": 37070, "token_acc": 0.9721867823424343, "train_speed(iter/s)": 0.235219 }, { "epoch": 2.8260538150773686, "grad_norm": 0.8360273838043213, "learning_rate": 3.9828218741752093e-05, "loss": 0.08069726228713989, "memory(GiB)": 122.96, "step": 37075, "token_acc": 0.9722499169159189, "train_speed(iter/s)": 0.235222 }, { "epoch": 2.8264349416876287, "grad_norm": 0.8736184239387512, "learning_rate": 3.9816495972740965e-05, "loss": 0.10121493339538574, "memory(GiB)": 122.96, "step": 37080, "token_acc": 0.9612969183584051, "train_speed(iter/s)": 0.235227 }, { "epoch": 2.8268160682978887, "grad_norm": 1.1784236431121826, "learning_rate": 3.980477378770654e-05, "loss": 0.08203986883163453, "memory(GiB)": 122.96, "step": 37085, "token_acc": 0.9728803882386526, "train_speed(iter/s)": 0.235234 }, { "epoch": 2.8271971949081482, "grad_norm": 1.543839454650879, "learning_rate": 3.979305218732104e-05, "loss": 0.08631514310836792, "memory(GiB)": 122.96, "step": 37090, "token_acc": 0.967235494880546, "train_speed(iter/s)": 0.235246 }, { "epoch": 2.8275783215184083, "grad_norm": 1.9943255186080933, "learning_rate": 3.97813311722566e-05, "loss": 0.07424243688583373, "memory(GiB)": 122.96, "step": 37095, "token_acc": 0.9763934426229508, "train_speed(iter/s)": 0.235253 }, { "epoch": 2.8279594481286683, "grad_norm": 1.2927846908569336, "learning_rate": 3.976961074318542e-05, "loss": 0.11411020755767823, "memory(GiB)": 122.96, "step": 37100, "token_acc": 0.9557297671201291, "train_speed(iter/s)": 0.235261 }, { "epoch": 2.8283405747389283, "grad_norm": 1.0905365943908691, "learning_rate": 3.975789090077959e-05, "loss": 0.10129848718643189, "memory(GiB)": 122.96, "step": 37105, "token_acc": 0.9709421112372304, "train_speed(iter/s)": 0.235267 }, { "epoch": 2.8287217013491883, "grad_norm": 0.5565839409828186, "learning_rate": 3.974617164571118e-05, "loss": 0.08270695805549622, "memory(GiB)": 122.96, "step": 37110, "token_acc": 0.9666121112929623, "train_speed(iter/s)": 0.23527 }, { "epoch": 2.8291028279594483, "grad_norm": 0.7808406352996826, "learning_rate": 3.973445297865224e-05, "loss": 0.11617907285690307, "memory(GiB)": 122.96, "step": 37115, "token_acc": 0.9561986703167775, "train_speed(iter/s)": 0.235278 }, { "epoch": 2.829483954569708, "grad_norm": 0.46499618887901306, "learning_rate": 3.972273490027479e-05, "loss": 0.05223644375801086, "memory(GiB)": 122.96, "step": 37120, "token_acc": 0.9763727121464226, "train_speed(iter/s)": 0.235287 }, { "epoch": 2.829865081179968, "grad_norm": 0.7660902738571167, "learning_rate": 3.97110174112508e-05, "loss": 0.07091000080108642, "memory(GiB)": 122.96, "step": 37125, "token_acc": 0.967443091582848, "train_speed(iter/s)": 0.235294 }, { "epoch": 2.830246207790228, "grad_norm": 1.2747206687927246, "learning_rate": 3.969930051225221e-05, "loss": 0.10268213748931884, "memory(GiB)": 122.96, "step": 37130, "token_acc": 0.9605168700646087, "train_speed(iter/s)": 0.235303 }, { "epoch": 2.830627334400488, "grad_norm": 1.050632119178772, "learning_rate": 3.9687584203950936e-05, "loss": 0.07852541804313659, "memory(GiB)": 122.96, "step": 37135, "token_acc": 0.9627055825804958, "train_speed(iter/s)": 0.23531 }, { "epoch": 2.8310084610107475, "grad_norm": 0.6592636108398438, "learning_rate": 3.967586848701886e-05, "loss": 0.07245814800262451, "memory(GiB)": 122.96, "step": 37140, "token_acc": 0.9733201581027668, "train_speed(iter/s)": 0.235316 }, { "epoch": 2.8313895876210076, "grad_norm": 1.0048855543136597, "learning_rate": 3.9664153362127805e-05, "loss": 0.09650664329528809, "memory(GiB)": 122.96, "step": 37145, "token_acc": 0.9565306963329213, "train_speed(iter/s)": 0.235323 }, { "epoch": 2.8317707142312676, "grad_norm": 1.3832238912582397, "learning_rate": 3.96524388299496e-05, "loss": 0.11208385229110718, "memory(GiB)": 122.96, "step": 37150, "token_acc": 0.9655730129390019, "train_speed(iter/s)": 0.235329 }, { "epoch": 2.8321518408415276, "grad_norm": 0.8861879706382751, "learning_rate": 3.964072489115601e-05, "loss": 0.07781385183334351, "memory(GiB)": 122.96, "step": 37155, "token_acc": 0.9601722282023681, "train_speed(iter/s)": 0.235339 }, { "epoch": 2.8325329674517876, "grad_norm": 1.2923848628997803, "learning_rate": 3.9629011546418765e-05, "loss": 0.04949742555618286, "memory(GiB)": 122.96, "step": 37160, "token_acc": 0.9698568198944989, "train_speed(iter/s)": 0.235347 }, { "epoch": 2.8329140940620476, "grad_norm": 0.5005874037742615, "learning_rate": 3.961729879640959e-05, "loss": 0.06599857211112976, "memory(GiB)": 122.96, "step": 37165, "token_acc": 0.979084519115378, "train_speed(iter/s)": 0.235352 }, { "epoch": 2.833295220672307, "grad_norm": 0.47089096903800964, "learning_rate": 3.9605586641800145e-05, "loss": 0.04505498707294464, "memory(GiB)": 122.96, "step": 37170, "token_acc": 0.9806001818732949, "train_speed(iter/s)": 0.235356 }, { "epoch": 2.8336763472825672, "grad_norm": 0.8457223773002625, "learning_rate": 3.959387508326207e-05, "loss": 0.09503658413887024, "memory(GiB)": 122.96, "step": 37175, "token_acc": 0.9631517960602549, "train_speed(iter/s)": 0.235364 }, { "epoch": 2.8340574738928273, "grad_norm": 1.1616133451461792, "learning_rate": 3.958216412146696e-05, "loss": 0.09132510423660278, "memory(GiB)": 122.96, "step": 37180, "token_acc": 0.9707643536456498, "train_speed(iter/s)": 0.23537 }, { "epoch": 2.8344386005030873, "grad_norm": 1.6323531866073608, "learning_rate": 3.9570453757086405e-05, "loss": 0.10978183746337891, "memory(GiB)": 122.96, "step": 37185, "token_acc": 0.9627534181989628, "train_speed(iter/s)": 0.235377 }, { "epoch": 2.834819727113347, "grad_norm": 0.37861520051956177, "learning_rate": 3.955874399079193e-05, "loss": 0.09096361398696899, "memory(GiB)": 122.96, "step": 37190, "token_acc": 0.9619116061803809, "train_speed(iter/s)": 0.235387 }, { "epoch": 2.835200853723607, "grad_norm": 1.0373079776763916, "learning_rate": 3.954703482325502e-05, "loss": 0.0527302622795105, "memory(GiB)": 122.96, "step": 37195, "token_acc": 0.9728301886792453, "train_speed(iter/s)": 0.235396 }, { "epoch": 2.835581980333867, "grad_norm": 1.0132797956466675, "learning_rate": 3.9535326255147166e-05, "loss": 0.08107391595840455, "memory(GiB)": 122.96, "step": 37200, "token_acc": 0.9711229946524064, "train_speed(iter/s)": 0.235402 }, { "epoch": 2.835581980333867, "eval_loss": 0.07531843334436417, "eval_runtime": 220.8145, "eval_samples_per_second": 2.4, "eval_steps_per_second": 2.4, "eval_token_acc": 0.9681118607312812, "step": 37200 }, { "epoch": 2.835963106944127, "grad_norm": 0.7924807667732239, "learning_rate": 3.952361828713978e-05, "loss": 0.11442897319793702, "memory(GiB)": 122.96, "step": 37205, "token_acc": 0.9679359303183754, "train_speed(iter/s)": 0.235081 }, { "epoch": 2.836344233554387, "grad_norm": 0.6689233779907227, "learning_rate": 3.951191091990426e-05, "loss": 0.0818480372428894, "memory(GiB)": 122.96, "step": 37210, "token_acc": 0.9716363636363636, "train_speed(iter/s)": 0.235085 }, { "epoch": 2.836725360164647, "grad_norm": 0.6019874811172485, "learning_rate": 3.950020415411199e-05, "loss": 0.07830572724342347, "memory(GiB)": 122.96, "step": 37215, "token_acc": 0.9694656488549618, "train_speed(iter/s)": 0.23509 }, { "epoch": 2.8371064867749065, "grad_norm": 0.6677398085594177, "learning_rate": 3.9488497990434295e-05, "loss": 0.07112939953804016, "memory(GiB)": 122.96, "step": 37220, "token_acc": 0.9757971571263926, "train_speed(iter/s)": 0.2351 }, { "epoch": 2.8374876133851665, "grad_norm": 0.6930105686187744, "learning_rate": 3.9476792429542455e-05, "loss": 0.0926108717918396, "memory(GiB)": 122.96, "step": 37225, "token_acc": 0.9727996461742592, "train_speed(iter/s)": 0.235107 }, { "epoch": 2.8378687399954265, "grad_norm": 1.8544495105743408, "learning_rate": 3.9465087472107746e-05, "loss": 0.0855736792087555, "memory(GiB)": 122.96, "step": 37230, "token_acc": 0.9608900876601484, "train_speed(iter/s)": 0.235114 }, { "epoch": 2.8382498666056866, "grad_norm": 0.7679210305213928, "learning_rate": 3.9453383118801356e-05, "loss": 0.05204768180847168, "memory(GiB)": 122.96, "step": 37235, "token_acc": 0.9761336515513126, "train_speed(iter/s)": 0.235123 }, { "epoch": 2.838630993215946, "grad_norm": 0.8050784468650818, "learning_rate": 3.944167937029453e-05, "loss": 0.05148593187332153, "memory(GiB)": 122.96, "step": 37240, "token_acc": 0.9753818098928653, "train_speed(iter/s)": 0.23513 }, { "epoch": 2.839012119826206, "grad_norm": 1.5313812494277954, "learning_rate": 3.942997622725838e-05, "loss": 0.07024246454238892, "memory(GiB)": 122.96, "step": 37245, "token_acc": 0.9783154530929011, "train_speed(iter/s)": 0.23513 }, { "epoch": 2.839393246436466, "grad_norm": 1.621543288230896, "learning_rate": 3.941827369036404e-05, "loss": 0.06141210794448852, "memory(GiB)": 122.96, "step": 37250, "token_acc": 0.967756381549485, "train_speed(iter/s)": 0.235141 }, { "epoch": 2.839774373046726, "grad_norm": 1.3409210443496704, "learning_rate": 3.940657176028261e-05, "loss": 0.11061840057373047, "memory(GiB)": 122.96, "step": 37255, "token_acc": 0.9607021220854074, "train_speed(iter/s)": 0.235149 }, { "epoch": 2.840155499656986, "grad_norm": 0.7905392646789551, "learning_rate": 3.939487043768513e-05, "loss": 0.08258526921272277, "memory(GiB)": 122.96, "step": 37260, "token_acc": 0.9642663779101245, "train_speed(iter/s)": 0.235156 }, { "epoch": 2.8405366262672462, "grad_norm": 0.476076602935791, "learning_rate": 3.93831697232426e-05, "loss": 0.11182751655578613, "memory(GiB)": 122.96, "step": 37265, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.235168 }, { "epoch": 2.840917752877506, "grad_norm": 2.4273130893707275, "learning_rate": 3.9371469617626036e-05, "loss": 0.10456565618515015, "memory(GiB)": 122.96, "step": 37270, "token_acc": 0.9511568123393316, "train_speed(iter/s)": 0.235176 }, { "epoch": 2.841298879487766, "grad_norm": 1.2171694040298462, "learning_rate": 3.935977012150636e-05, "loss": 0.08522037267684937, "memory(GiB)": 122.96, "step": 37275, "token_acc": 0.9686330422384601, "train_speed(iter/s)": 0.235182 }, { "epoch": 2.841680006098026, "grad_norm": 1.3108835220336914, "learning_rate": 3.934807123555449e-05, "loss": 0.08187426924705506, "memory(GiB)": 122.96, "step": 37280, "token_acc": 0.9641818181818181, "train_speed(iter/s)": 0.235187 }, { "epoch": 2.8420611327082854, "grad_norm": 1.1976078748703003, "learning_rate": 3.93363729604413e-05, "loss": 0.08537745475769043, "memory(GiB)": 122.96, "step": 37285, "token_acc": 0.9546848381601363, "train_speed(iter/s)": 0.235196 }, { "epoch": 2.8424422593185454, "grad_norm": 1.2470980882644653, "learning_rate": 3.9324675296837646e-05, "loss": 0.09837604761123657, "memory(GiB)": 122.96, "step": 37290, "token_acc": 0.9540463603090688, "train_speed(iter/s)": 0.235205 }, { "epoch": 2.8428233859288055, "grad_norm": 1.489729881286621, "learning_rate": 3.931297824541432e-05, "loss": 0.10673336982727051, "memory(GiB)": 122.96, "step": 37295, "token_acc": 0.9590491410307631, "train_speed(iter/s)": 0.235212 }, { "epoch": 2.8432045125390655, "grad_norm": 1.4418636560440063, "learning_rate": 3.930128180684209e-05, "loss": 0.1026721715927124, "memory(GiB)": 122.96, "step": 37300, "token_acc": 0.9604190919674039, "train_speed(iter/s)": 0.235221 }, { "epoch": 2.8435856391493255, "grad_norm": 0.8947908878326416, "learning_rate": 3.928958598179172e-05, "loss": 0.0649897038936615, "memory(GiB)": 122.96, "step": 37305, "token_acc": 0.9703912583715192, "train_speed(iter/s)": 0.235231 }, { "epoch": 2.8439667657595855, "grad_norm": 0.9955778121948242, "learning_rate": 3.927789077093388e-05, "loss": 0.0812071442604065, "memory(GiB)": 122.96, "step": 37310, "token_acc": 0.9669649002064694, "train_speed(iter/s)": 0.235234 }, { "epoch": 2.8443478923698455, "grad_norm": 1.2245725393295288, "learning_rate": 3.9266196174939235e-05, "loss": 0.07952761650085449, "memory(GiB)": 122.96, "step": 37315, "token_acc": 0.9605695509309967, "train_speed(iter/s)": 0.235242 }, { "epoch": 2.844729018980105, "grad_norm": 0.8701687455177307, "learning_rate": 3.925450219447844e-05, "loss": 0.09536066055297851, "memory(GiB)": 122.96, "step": 37320, "token_acc": 0.9637937124690922, "train_speed(iter/s)": 0.235249 }, { "epoch": 2.845110145590365, "grad_norm": 1.5913084745407104, "learning_rate": 3.9242808830222086e-05, "loss": 0.06593762636184693, "memory(GiB)": 122.96, "step": 37325, "token_acc": 0.9756036843415484, "train_speed(iter/s)": 0.235256 }, { "epoch": 2.845491272200625, "grad_norm": 0.4090445637702942, "learning_rate": 3.923111608284071e-05, "loss": 0.06542204022407531, "memory(GiB)": 122.96, "step": 37330, "token_acc": 0.9696691176470589, "train_speed(iter/s)": 0.235263 }, { "epoch": 2.8458723988108847, "grad_norm": 0.7415654063224792, "learning_rate": 3.921942395300486e-05, "loss": 0.09544023275375366, "memory(GiB)": 122.96, "step": 37335, "token_acc": 0.9683487271908088, "train_speed(iter/s)": 0.235266 }, { "epoch": 2.8462535254211447, "grad_norm": 1.7409909963607788, "learning_rate": 3.9207732441385e-05, "loss": 0.1308335542678833, "memory(GiB)": 122.96, "step": 37340, "token_acc": 0.9447531873161163, "train_speed(iter/s)": 0.235274 }, { "epoch": 2.8466346520314048, "grad_norm": 0.8360950350761414, "learning_rate": 3.919604154865163e-05, "loss": 0.05408849120140076, "memory(GiB)": 122.96, "step": 37345, "token_acc": 0.9752776031845799, "train_speed(iter/s)": 0.235279 }, { "epoch": 2.8470157786416648, "grad_norm": 2.114609956741333, "learning_rate": 3.918435127547514e-05, "loss": 0.09584521055221558, "memory(GiB)": 122.96, "step": 37350, "token_acc": 0.9658065869519614, "train_speed(iter/s)": 0.235284 }, { "epoch": 2.847396905251925, "grad_norm": 0.7424201369285583, "learning_rate": 3.9172661622525894e-05, "loss": 0.07180578112602234, "memory(GiB)": 122.96, "step": 37355, "token_acc": 0.9726893997839839, "train_speed(iter/s)": 0.23529 }, { "epoch": 2.847778031862185, "grad_norm": 0.5499105453491211, "learning_rate": 3.9160972590474274e-05, "loss": 0.07142413258552552, "memory(GiB)": 122.96, "step": 37360, "token_acc": 0.9751449875724938, "train_speed(iter/s)": 0.235295 }, { "epoch": 2.8481591584724444, "grad_norm": 1.567277193069458, "learning_rate": 3.914928417999058e-05, "loss": 0.08360521793365479, "memory(GiB)": 122.96, "step": 37365, "token_acc": 0.9642493823572155, "train_speed(iter/s)": 0.235299 }, { "epoch": 2.8485402850827044, "grad_norm": 0.5081021189689636, "learning_rate": 3.913759639174507e-05, "loss": 0.06911305785179138, "memory(GiB)": 122.96, "step": 37370, "token_acc": 0.9762390941154632, "train_speed(iter/s)": 0.235304 }, { "epoch": 2.8489214116929644, "grad_norm": 1.2233604192733765, "learning_rate": 3.912590922640801e-05, "loss": 0.0903830885887146, "memory(GiB)": 122.96, "step": 37375, "token_acc": 0.9642622231966674, "train_speed(iter/s)": 0.235309 }, { "epoch": 2.8493025383032244, "grad_norm": 0.7939669489860535, "learning_rate": 3.91142226846496e-05, "loss": 0.0876248836517334, "memory(GiB)": 122.96, "step": 37380, "token_acc": 0.961027713625866, "train_speed(iter/s)": 0.235318 }, { "epoch": 2.849683664913484, "grad_norm": 1.6148709058761597, "learning_rate": 3.9102536767139985e-05, "loss": 0.08891031742095948, "memory(GiB)": 122.96, "step": 37385, "token_acc": 0.9639936881365657, "train_speed(iter/s)": 0.235324 }, { "epoch": 2.850064791523744, "grad_norm": 0.599860429763794, "learning_rate": 3.909085147454933e-05, "loss": 0.07568166255950928, "memory(GiB)": 122.96, "step": 37390, "token_acc": 0.9690112130479103, "train_speed(iter/s)": 0.23533 }, { "epoch": 2.850445918134004, "grad_norm": 1.0100765228271484, "learning_rate": 3.907916680754772e-05, "loss": 0.08821362257003784, "memory(GiB)": 122.96, "step": 37395, "token_acc": 0.9614352783366867, "train_speed(iter/s)": 0.23534 }, { "epoch": 2.850827044744264, "grad_norm": 1.6458839178085327, "learning_rate": 3.9067482766805214e-05, "loss": 0.1252490758895874, "memory(GiB)": 122.96, "step": 37400, "token_acc": 0.9526128488480989, "train_speed(iter/s)": 0.235346 }, { "epoch": 2.850827044744264, "eval_loss": 0.07392237335443497, "eval_runtime": 221.274, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.9681269200650563, "step": 37400 }, { "epoch": 2.851208171354524, "grad_norm": 1.0230239629745483, "learning_rate": 3.905579935299182e-05, "loss": 0.09801877737045288, "memory(GiB)": 122.96, "step": 37405, "token_acc": 0.9681560924045894, "train_speed(iter/s)": 0.235026 }, { "epoch": 2.851589297964784, "grad_norm": 0.8388431072235107, "learning_rate": 3.9044116566777567e-05, "loss": 0.08635572195053101, "memory(GiB)": 122.96, "step": 37410, "token_acc": 0.9720351092059604, "train_speed(iter/s)": 0.235033 }, { "epoch": 2.8519704245750437, "grad_norm": 1.2618399858474731, "learning_rate": 3.903243440883238e-05, "loss": 0.09397611618041993, "memory(GiB)": 122.96, "step": 37415, "token_acc": 0.9611407082419304, "train_speed(iter/s)": 0.235041 }, { "epoch": 2.8523515511853037, "grad_norm": 0.7472879886627197, "learning_rate": 3.902075287982618e-05, "loss": 0.08769038915634156, "memory(GiB)": 122.96, "step": 37420, "token_acc": 0.9668793679732605, "train_speed(iter/s)": 0.235046 }, { "epoch": 2.8527326777955637, "grad_norm": 1.8295495510101318, "learning_rate": 3.900907198042886e-05, "loss": 0.07200249433517455, "memory(GiB)": 122.96, "step": 37425, "token_acc": 0.9733162100456622, "train_speed(iter/s)": 0.235049 }, { "epoch": 2.8531138044058237, "grad_norm": 0.7457391619682312, "learning_rate": 3.899739171131025e-05, "loss": 0.06193675994873047, "memory(GiB)": 122.96, "step": 37430, "token_acc": 0.9752563530985288, "train_speed(iter/s)": 0.235056 }, { "epoch": 2.8534949310160833, "grad_norm": 0.5031586289405823, "learning_rate": 3.898571207314017e-05, "loss": 0.11662843227386474, "memory(GiB)": 122.96, "step": 37435, "token_acc": 0.9628567331134376, "train_speed(iter/s)": 0.235062 }, { "epoch": 2.8538760576263433, "grad_norm": 0.7260444164276123, "learning_rate": 3.897403306658839e-05, "loss": 0.0766907811164856, "memory(GiB)": 122.96, "step": 37440, "token_acc": 0.9683707976924607, "train_speed(iter/s)": 0.235066 }, { "epoch": 2.8542571842366034, "grad_norm": 0.570349931716919, "learning_rate": 3.8962354692324655e-05, "loss": 0.061160147190093994, "memory(GiB)": 122.96, "step": 37445, "token_acc": 0.9760383386581469, "train_speed(iter/s)": 0.23507 }, { "epoch": 2.8546383108468634, "grad_norm": 1.2957910299301147, "learning_rate": 3.8950676951018636e-05, "loss": 0.06857154369354249, "memory(GiB)": 122.96, "step": 37450, "token_acc": 0.9689621726479146, "train_speed(iter/s)": 0.23508 }, { "epoch": 2.8550194374571234, "grad_norm": 1.0401804447174072, "learning_rate": 3.893899984334004e-05, "loss": 0.10757466554641723, "memory(GiB)": 122.96, "step": 37455, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.235085 }, { "epoch": 2.8554005640673834, "grad_norm": 1.5904053449630737, "learning_rate": 3.892732336995847e-05, "loss": 0.09515454173088074, "memory(GiB)": 122.96, "step": 37460, "token_acc": 0.9640628362384334, "train_speed(iter/s)": 0.235091 }, { "epoch": 2.855781690677643, "grad_norm": 0.9020172357559204, "learning_rate": 3.891564753154352e-05, "loss": 0.07712588906288147, "memory(GiB)": 122.96, "step": 37465, "token_acc": 0.9707806367204536, "train_speed(iter/s)": 0.2351 }, { "epoch": 2.856162817287903, "grad_norm": 1.318202257156372, "learning_rate": 3.8903972328764735e-05, "loss": 0.07396081686019898, "memory(GiB)": 122.96, "step": 37470, "token_acc": 0.9662203286670724, "train_speed(iter/s)": 0.235108 }, { "epoch": 2.856543943898163, "grad_norm": 1.7637637853622437, "learning_rate": 3.8892297762291636e-05, "loss": 0.06605539321899415, "memory(GiB)": 122.96, "step": 37475, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.235119 }, { "epoch": 2.856925070508423, "grad_norm": 0.8949657678604126, "learning_rate": 3.888062383279374e-05, "loss": 0.08866016864776612, "memory(GiB)": 122.96, "step": 37480, "token_acc": 0.9618261140542225, "train_speed(iter/s)": 0.235124 }, { "epoch": 2.8573061971186826, "grad_norm": 0.7615159153938293, "learning_rate": 3.8868950540940455e-05, "loss": 0.07980450987815857, "memory(GiB)": 122.96, "step": 37485, "token_acc": 0.9752947078003511, "train_speed(iter/s)": 0.235128 }, { "epoch": 2.8576873237289426, "grad_norm": 0.5338117480278015, "learning_rate": 3.88572778874012e-05, "loss": 0.09254343509674072, "memory(GiB)": 122.96, "step": 37490, "token_acc": 0.9654071993872861, "train_speed(iter/s)": 0.235131 }, { "epoch": 2.8580684503392026, "grad_norm": 1.243766188621521, "learning_rate": 3.884560587284536e-05, "loss": 0.06830713748931885, "memory(GiB)": 122.96, "step": 37495, "token_acc": 0.974052812858783, "train_speed(iter/s)": 0.235137 }, { "epoch": 2.8584495769494627, "grad_norm": 0.678043782711029, "learning_rate": 3.883393449794226e-05, "loss": 0.055156415700912474, "memory(GiB)": 122.96, "step": 37500, "token_acc": 0.9775222164140094, "train_speed(iter/s)": 0.235142 }, { "epoch": 2.8588307035597227, "grad_norm": 0.9878964424133301, "learning_rate": 3.88222637633612e-05, "loss": 0.11557214260101319, "memory(GiB)": 122.96, "step": 37505, "token_acc": 0.9654387417218543, "train_speed(iter/s)": 0.235146 }, { "epoch": 2.8592118301699827, "grad_norm": 1.6379841566085815, "learning_rate": 3.881059366977143e-05, "loss": 0.09238015413284302, "memory(GiB)": 122.96, "step": 37510, "token_acc": 0.9714389446932107, "train_speed(iter/s)": 0.235148 }, { "epoch": 2.8595929567802423, "grad_norm": 3.083301544189453, "learning_rate": 3.87989242178422e-05, "loss": 0.09083805084228516, "memory(GiB)": 122.96, "step": 37515, "token_acc": 0.9593633592956315, "train_speed(iter/s)": 0.235156 }, { "epoch": 2.8599740833905023, "grad_norm": 1.5604013204574585, "learning_rate": 3.878725540824269e-05, "loss": 0.08880417943000793, "memory(GiB)": 122.96, "step": 37520, "token_acc": 0.9768244575936884, "train_speed(iter/s)": 0.235162 }, { "epoch": 2.8603552100007623, "grad_norm": 0.8298394083976746, "learning_rate": 3.877558724164203e-05, "loss": 0.09186557531356812, "memory(GiB)": 122.96, "step": 37525, "token_acc": 0.9643271942923511, "train_speed(iter/s)": 0.235167 }, { "epoch": 2.8607363366110223, "grad_norm": 0.7213504314422607, "learning_rate": 3.8763919718709375e-05, "loss": 0.0660668134689331, "memory(GiB)": 122.96, "step": 37530, "token_acc": 0.9750692520775623, "train_speed(iter/s)": 0.235174 }, { "epoch": 2.861117463221282, "grad_norm": 1.0971808433532715, "learning_rate": 3.875225284011377e-05, "loss": 0.10126533508300781, "memory(GiB)": 122.96, "step": 37535, "token_acc": 0.9610876699484294, "train_speed(iter/s)": 0.235181 }, { "epoch": 2.861498589831542, "grad_norm": 0.4884926676750183, "learning_rate": 3.8740586606524266e-05, "loss": 0.08379967212677002, "memory(GiB)": 122.96, "step": 37540, "token_acc": 0.9711077466565733, "train_speed(iter/s)": 0.235181 }, { "epoch": 2.861879716441802, "grad_norm": 1.4050750732421875, "learning_rate": 3.872892101860988e-05, "loss": 0.08777062892913819, "memory(GiB)": 122.96, "step": 37545, "token_acc": 0.9650195694716243, "train_speed(iter/s)": 0.235188 }, { "epoch": 2.862260843052062, "grad_norm": 0.8504447340965271, "learning_rate": 3.8717256077039573e-05, "loss": 0.13713667392730713, "memory(GiB)": 122.96, "step": 37550, "token_acc": 0.9591652566271855, "train_speed(iter/s)": 0.235192 }, { "epoch": 2.862641969662322, "grad_norm": 0.9192399382591248, "learning_rate": 3.870559178248225e-05, "loss": 0.08787782788276673, "memory(GiB)": 122.96, "step": 37555, "token_acc": 0.9728244728244728, "train_speed(iter/s)": 0.235195 }, { "epoch": 2.863023096272582, "grad_norm": 0.6913599967956543, "learning_rate": 3.8693928135606846e-05, "loss": 0.09883487224578857, "memory(GiB)": 122.96, "step": 37560, "token_acc": 0.9587291188994431, "train_speed(iter/s)": 0.235204 }, { "epoch": 2.8634042228828416, "grad_norm": 2.4948015213012695, "learning_rate": 3.86822651370822e-05, "loss": 0.08358967304229736, "memory(GiB)": 122.96, "step": 37565, "token_acc": 0.9637082285303629, "train_speed(iter/s)": 0.235213 }, { "epoch": 2.8637853494931016, "grad_norm": 0.7621044516563416, "learning_rate": 3.867060278757712e-05, "loss": 0.07112233638763428, "memory(GiB)": 122.96, "step": 37570, "token_acc": 0.9705693148922483, "train_speed(iter/s)": 0.23522 }, { "epoch": 2.8641664761033616, "grad_norm": 0.9077858924865723, "learning_rate": 3.865894108776038e-05, "loss": 0.06059027314186096, "memory(GiB)": 122.96, "step": 37575, "token_acc": 0.9741909601497727, "train_speed(iter/s)": 0.235221 }, { "epoch": 2.864547602713621, "grad_norm": 1.4839637279510498, "learning_rate": 3.864728003830076e-05, "loss": 0.0761684775352478, "memory(GiB)": 122.96, "step": 37580, "token_acc": 0.9644988066825776, "train_speed(iter/s)": 0.23523 }, { "epoch": 2.864928729323881, "grad_norm": 0.6079143285751343, "learning_rate": 3.863561963986694e-05, "loss": 0.06423194408416748, "memory(GiB)": 122.96, "step": 37585, "token_acc": 0.9746809172979143, "train_speed(iter/s)": 0.235231 }, { "epoch": 2.8653098559341412, "grad_norm": 1.6962138414382935, "learning_rate": 3.8623959893127595e-05, "loss": 0.058178645372390744, "memory(GiB)": 122.96, "step": 37590, "token_acc": 0.9761102603369066, "train_speed(iter/s)": 0.235241 }, { "epoch": 2.8656909825444012, "grad_norm": 0.7918026447296143, "learning_rate": 3.861230079875136e-05, "loss": 0.08919512033462525, "memory(GiB)": 122.96, "step": 37595, "token_acc": 0.9668085106382979, "train_speed(iter/s)": 0.235249 }, { "epoch": 2.8660721091546613, "grad_norm": 1.1996636390686035, "learning_rate": 3.860064235740683e-05, "loss": 0.0779586374759674, "memory(GiB)": 122.96, "step": 37600, "token_acc": 0.969147005444646, "train_speed(iter/s)": 0.235257 }, { "epoch": 2.8660721091546613, "eval_loss": 0.07338440418243408, "eval_runtime": 221.5313, "eval_samples_per_second": 2.392, "eval_steps_per_second": 2.392, "eval_token_acc": 0.9686765857478465, "step": 37600 }, { "epoch": 2.8664532357649213, "grad_norm": 0.6825397610664368, "learning_rate": 3.8588984569762555e-05, "loss": 0.07049527168273925, "memory(GiB)": 122.96, "step": 37605, "token_acc": 0.9685343222887254, "train_speed(iter/s)": 0.234937 }, { "epoch": 2.8668343623751813, "grad_norm": 0.8457134962081909, "learning_rate": 3.8577327436487057e-05, "loss": 0.044300153851509094, "memory(GiB)": 122.96, "step": 37610, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.234947 }, { "epoch": 2.867215488985441, "grad_norm": 1.4401220083236694, "learning_rate": 3.856567095824883e-05, "loss": 0.08750039339065552, "memory(GiB)": 122.96, "step": 37615, "token_acc": 0.9628764278296988, "train_speed(iter/s)": 0.234956 }, { "epoch": 2.867596615595701, "grad_norm": 0.7018383741378784, "learning_rate": 3.855401513571631e-05, "loss": 0.06733075380325318, "memory(GiB)": 122.96, "step": 37620, "token_acc": 0.9710383800329645, "train_speed(iter/s)": 0.234964 }, { "epoch": 2.867977742205961, "grad_norm": 1.0767743587493896, "learning_rate": 3.8542359969557916e-05, "loss": 0.10761547088623047, "memory(GiB)": 122.96, "step": 37625, "token_acc": 0.9591100420926038, "train_speed(iter/s)": 0.234971 }, { "epoch": 2.8683588688162205, "grad_norm": 1.5876203775405884, "learning_rate": 3.8530705460442e-05, "loss": 0.07416890859603882, "memory(GiB)": 122.96, "step": 37630, "token_acc": 0.9742801230081074, "train_speed(iter/s)": 0.234979 }, { "epoch": 2.8687399954264805, "grad_norm": 2.0851857662200928, "learning_rate": 3.8519051609036904e-05, "loss": 0.16404304504394532, "memory(GiB)": 122.96, "step": 37635, "token_acc": 0.9439270469538222, "train_speed(iter/s)": 0.234983 }, { "epoch": 2.8691211220367405, "grad_norm": 0.5806670188903809, "learning_rate": 3.8507398416010926e-05, "loss": 0.05104566216468811, "memory(GiB)": 122.96, "step": 37640, "token_acc": 0.977751448130595, "train_speed(iter/s)": 0.234983 }, { "epoch": 2.8695022486470005, "grad_norm": 0.5802030563354492, "learning_rate": 3.849574588203231e-05, "loss": 0.06266066431999207, "memory(GiB)": 122.96, "step": 37645, "token_acc": 0.978594564145895, "train_speed(iter/s)": 0.234987 }, { "epoch": 2.8698833752572606, "grad_norm": 0.8798251152038574, "learning_rate": 3.848409400776928e-05, "loss": 0.06567035913467408, "memory(GiB)": 122.96, "step": 37650, "token_acc": 0.9727592267135325, "train_speed(iter/s)": 0.23499 }, { "epoch": 2.8702645018675206, "grad_norm": 1.1115680932998657, "learning_rate": 3.847244279389002e-05, "loss": 0.08284960985183716, "memory(GiB)": 122.96, "step": 37655, "token_acc": 0.9699845508717723, "train_speed(iter/s)": 0.234998 }, { "epoch": 2.87064562847778, "grad_norm": 0.8585509657859802, "learning_rate": 3.846079224106267e-05, "loss": 0.07910059690475464, "memory(GiB)": 122.96, "step": 37660, "token_acc": 0.9604117181314331, "train_speed(iter/s)": 0.235004 }, { "epoch": 2.87102675508804, "grad_norm": 1.0320909023284912, "learning_rate": 3.844914234995534e-05, "loss": 0.042608022689819336, "memory(GiB)": 122.96, "step": 37665, "token_acc": 0.9814704124327556, "train_speed(iter/s)": 0.235013 }, { "epoch": 2.8714078816983, "grad_norm": 0.9491744041442871, "learning_rate": 3.84374931212361e-05, "loss": 0.0982345998287201, "memory(GiB)": 122.96, "step": 37670, "token_acc": 0.969661610268378, "train_speed(iter/s)": 0.235017 }, { "epoch": 2.87178900830856, "grad_norm": 1.1583856344223022, "learning_rate": 3.842584455557296e-05, "loss": 0.06122907400131226, "memory(GiB)": 122.96, "step": 37675, "token_acc": 0.9773917838433968, "train_speed(iter/s)": 0.235026 }, { "epoch": 2.87217013491882, "grad_norm": 0.9622520804405212, "learning_rate": 3.8414196653633924e-05, "loss": 0.058839929103851316, "memory(GiB)": 122.96, "step": 37680, "token_acc": 0.9758982035928143, "train_speed(iter/s)": 0.235027 }, { "epoch": 2.87255126152908, "grad_norm": 1.3225998878479004, "learning_rate": 3.8402549416086956e-05, "loss": 0.0706447958946228, "memory(GiB)": 122.96, "step": 37685, "token_acc": 0.9626615605552896, "train_speed(iter/s)": 0.235034 }, { "epoch": 2.87293238813934, "grad_norm": 0.6204521656036377, "learning_rate": 3.8390902843599954e-05, "loss": 0.10583727359771729, "memory(GiB)": 122.96, "step": 37690, "token_acc": 0.9636386671734448, "train_speed(iter/s)": 0.235036 }, { "epoch": 2.8733135147496, "grad_norm": 0.4902087152004242, "learning_rate": 3.83792569368408e-05, "loss": 0.0976695716381073, "memory(GiB)": 122.96, "step": 37695, "token_acc": 0.9702998436833878, "train_speed(iter/s)": 0.235038 }, { "epoch": 2.87369464135986, "grad_norm": 1.1833827495574951, "learning_rate": 3.836761169647734e-05, "loss": 0.08696631193161011, "memory(GiB)": 122.96, "step": 37700, "token_acc": 0.9682090022033365, "train_speed(iter/s)": 0.235042 }, { "epoch": 2.87407576797012, "grad_norm": 1.4736696481704712, "learning_rate": 3.835596712317737e-05, "loss": 0.09079868197441102, "memory(GiB)": 122.96, "step": 37705, "token_acc": 0.9642497482376636, "train_speed(iter/s)": 0.235047 }, { "epoch": 2.8744568945803795, "grad_norm": 0.604415237903595, "learning_rate": 3.8344323217608633e-05, "loss": 0.08161352872848511, "memory(GiB)": 122.96, "step": 37710, "token_acc": 0.971889400921659, "train_speed(iter/s)": 0.235054 }, { "epoch": 2.8748380211906395, "grad_norm": 1.3426930904388428, "learning_rate": 3.8332679980438884e-05, "loss": 0.08416850566864013, "memory(GiB)": 122.96, "step": 37715, "token_acc": 0.9679309281529448, "train_speed(iter/s)": 0.235058 }, { "epoch": 2.8752191478008995, "grad_norm": 2.5330376625061035, "learning_rate": 3.83210374123358e-05, "loss": 0.08411678075790405, "memory(GiB)": 122.96, "step": 37720, "token_acc": 0.9755792110206637, "train_speed(iter/s)": 0.235064 }, { "epoch": 2.8756002744111595, "grad_norm": 0.03720680996775627, "learning_rate": 3.8309395513967005e-05, "loss": 0.08168486952781677, "memory(GiB)": 122.96, "step": 37725, "token_acc": 0.9668043445005354, "train_speed(iter/s)": 0.23507 }, { "epoch": 2.875981401021419, "grad_norm": 1.2487587928771973, "learning_rate": 3.8297754286000145e-05, "loss": 0.09484231472015381, "memory(GiB)": 122.96, "step": 37730, "token_acc": 0.9589237668161436, "train_speed(iter/s)": 0.235075 }, { "epoch": 2.876362527631679, "grad_norm": 2.021404981613159, "learning_rate": 3.828611372910277e-05, "loss": 0.07753746509552002, "memory(GiB)": 122.96, "step": 37735, "token_acc": 0.9726694915254237, "train_speed(iter/s)": 0.235081 }, { "epoch": 2.876743654241939, "grad_norm": 1.106571912765503, "learning_rate": 3.827447384394241e-05, "loss": 0.10608422756195068, "memory(GiB)": 122.96, "step": 37740, "token_acc": 0.9591307168342523, "train_speed(iter/s)": 0.235091 }, { "epoch": 2.877124780852199, "grad_norm": 0.7597334384918213, "learning_rate": 3.826283463118654e-05, "loss": 0.07758030295372009, "memory(GiB)": 122.96, "step": 37745, "token_acc": 0.9703839122486289, "train_speed(iter/s)": 0.235098 }, { "epoch": 2.877505907462459, "grad_norm": 0.6588160991668701, "learning_rate": 3.825119609150265e-05, "loss": 0.08207584619522094, "memory(GiB)": 122.96, "step": 37750, "token_acc": 0.9663372969351868, "train_speed(iter/s)": 0.235102 }, { "epoch": 2.877887034072719, "grad_norm": 0.48011189699172974, "learning_rate": 3.8239558225558156e-05, "loss": 0.08710498213768006, "memory(GiB)": 122.96, "step": 37755, "token_acc": 0.9696752381928081, "train_speed(iter/s)": 0.235104 }, { "epoch": 2.8782681606829787, "grad_norm": 1.2707535028457642, "learning_rate": 3.822792103402042e-05, "loss": 0.04682544767856598, "memory(GiB)": 122.96, "step": 37760, "token_acc": 0.9755374351371386, "train_speed(iter/s)": 0.235112 }, { "epoch": 2.8786492872932388, "grad_norm": 0.9917944669723511, "learning_rate": 3.821628451755677e-05, "loss": 0.11355810165405274, "memory(GiB)": 122.96, "step": 37765, "token_acc": 0.9554342883128695, "train_speed(iter/s)": 0.235117 }, { "epoch": 2.879030413903499, "grad_norm": 0.7982029914855957, "learning_rate": 3.820464867683453e-05, "loss": 0.08655939698219299, "memory(GiB)": 122.96, "step": 37770, "token_acc": 0.970108695652174, "train_speed(iter/s)": 0.235121 }, { "epoch": 2.879411540513759, "grad_norm": 1.5520362854003906, "learning_rate": 3.819301351252096e-05, "loss": 0.08331148624420166, "memory(GiB)": 122.96, "step": 37775, "token_acc": 0.9647741400745959, "train_speed(iter/s)": 0.235131 }, { "epoch": 2.8797926671240184, "grad_norm": 0.5555896759033203, "learning_rate": 3.8181379025283265e-05, "loss": 0.037147268652915955, "memory(GiB)": 122.96, "step": 37780, "token_acc": 0.9859131859131859, "train_speed(iter/s)": 0.235135 }, { "epoch": 2.8801737937342784, "grad_norm": 2.0533390045166016, "learning_rate": 3.8169745215788634e-05, "loss": 0.11949472427368164, "memory(GiB)": 122.96, "step": 37785, "token_acc": 0.9586832381592207, "train_speed(iter/s)": 0.235143 }, { "epoch": 2.8805549203445384, "grad_norm": 0.8549385666847229, "learning_rate": 3.8158112084704226e-05, "loss": 0.08377432227134704, "memory(GiB)": 122.96, "step": 37790, "token_acc": 0.9685997171145686, "train_speed(iter/s)": 0.235147 }, { "epoch": 2.8809360469547984, "grad_norm": 1.0679277181625366, "learning_rate": 3.814647963269713e-05, "loss": 0.09465956687927246, "memory(GiB)": 122.96, "step": 37795, "token_acc": 0.9710660500415036, "train_speed(iter/s)": 0.235152 }, { "epoch": 2.8813171735650585, "grad_norm": 0.7813785076141357, "learning_rate": 3.813484786043441e-05, "loss": 0.0633561372756958, "memory(GiB)": 122.96, "step": 37800, "token_acc": 0.9688281135127187, "train_speed(iter/s)": 0.235155 }, { "epoch": 2.8813171735650585, "eval_loss": 0.07429222017526627, "eval_runtime": 221.252, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.9686088187458587, "step": 37800 }, { "epoch": 2.8816983001753185, "grad_norm": 0.7490967512130737, "learning_rate": 3.812321676858312e-05, "loss": 0.06352437138557435, "memory(GiB)": 122.96, "step": 37805, "token_acc": 0.9687987067121927, "train_speed(iter/s)": 0.234835 }, { "epoch": 2.882079426785578, "grad_norm": 1.4641709327697754, "learning_rate": 3.811158635781023e-05, "loss": 0.05955098867416382, "memory(GiB)": 122.96, "step": 37810, "token_acc": 0.9770384254920338, "train_speed(iter/s)": 0.234845 }, { "epoch": 2.882460553395838, "grad_norm": 0.7408734560012817, "learning_rate": 3.809995662878269e-05, "loss": 0.0508830189704895, "memory(GiB)": 122.96, "step": 37815, "token_acc": 0.9802779480891068, "train_speed(iter/s)": 0.234844 }, { "epoch": 2.882841680006098, "grad_norm": 0.569401204586029, "learning_rate": 3.808832758216742e-05, "loss": 0.06532365679740906, "memory(GiB)": 122.96, "step": 37820, "token_acc": 0.9747447608812466, "train_speed(iter/s)": 0.234847 }, { "epoch": 2.883222806616358, "grad_norm": 2.13411545753479, "learning_rate": 3.8076699218631284e-05, "loss": 0.07660976052284241, "memory(GiB)": 122.96, "step": 37825, "token_acc": 0.9699540473665607, "train_speed(iter/s)": 0.234856 }, { "epoch": 2.8836039332266177, "grad_norm": 0.34800365567207336, "learning_rate": 3.806507153884111e-05, "loss": 0.07094260454177856, "memory(GiB)": 122.96, "step": 37830, "token_acc": 0.9706905116741182, "train_speed(iter/s)": 0.234864 }, { "epoch": 2.8839850598368777, "grad_norm": 0.9863859415054321, "learning_rate": 3.80534445434637e-05, "loss": 0.08238838911056519, "memory(GiB)": 122.96, "step": 37835, "token_acc": 0.970679330993186, "train_speed(iter/s)": 0.234868 }, { "epoch": 2.8843661864471377, "grad_norm": 1.201694130897522, "learning_rate": 3.804181823316582e-05, "loss": 0.09837762117385865, "memory(GiB)": 122.96, "step": 37840, "token_acc": 0.9665847665847666, "train_speed(iter/s)": 0.234874 }, { "epoch": 2.8847473130573977, "grad_norm": 0.7310218214988708, "learning_rate": 3.8030192608614164e-05, "loss": 0.09818058609962463, "memory(GiB)": 122.96, "step": 37845, "token_acc": 0.9620743034055728, "train_speed(iter/s)": 0.23488 }, { "epoch": 2.8851284396676578, "grad_norm": 1.334642767906189, "learning_rate": 3.80185676704754e-05, "loss": 0.0840908408164978, "memory(GiB)": 122.96, "step": 37850, "token_acc": 0.9657759882869692, "train_speed(iter/s)": 0.234885 }, { "epoch": 2.8855095662779178, "grad_norm": 1.497881531715393, "learning_rate": 3.80069434194162e-05, "loss": 0.09427948594093323, "memory(GiB)": 122.96, "step": 37855, "token_acc": 0.963962783383567, "train_speed(iter/s)": 0.234889 }, { "epoch": 2.8858906928881773, "grad_norm": 0.281019926071167, "learning_rate": 3.7995319856103136e-05, "loss": 0.056127339601516724, "memory(GiB)": 122.96, "step": 37860, "token_acc": 0.9784591991890522, "train_speed(iter/s)": 0.234897 }, { "epoch": 2.8862718194984374, "grad_norm": 1.6171692609786987, "learning_rate": 3.798369698120275e-05, "loss": 0.06277437210083008, "memory(GiB)": 122.96, "step": 37865, "token_acc": 0.9742194213692352, "train_speed(iter/s)": 0.234905 }, { "epoch": 2.8866529461086974, "grad_norm": 1.5683726072311401, "learning_rate": 3.797207479538161e-05, "loss": 0.09869595766067504, "memory(GiB)": 122.96, "step": 37870, "token_acc": 0.9654059040590406, "train_speed(iter/s)": 0.234913 }, { "epoch": 2.8870340727189574, "grad_norm": 0.5380122065544128, "learning_rate": 3.7960453299306134e-05, "loss": 0.09572099447250366, "memory(GiB)": 122.96, "step": 37875, "token_acc": 0.9628637059724349, "train_speed(iter/s)": 0.234914 }, { "epoch": 2.887415199329217, "grad_norm": 0.8102245926856995, "learning_rate": 3.7948832493642805e-05, "loss": 0.0830041527748108, "memory(GiB)": 122.96, "step": 37880, "token_acc": 0.9679862798342147, "train_speed(iter/s)": 0.23492 }, { "epoch": 2.887796325939477, "grad_norm": 1.0901880264282227, "learning_rate": 3.7937212379058004e-05, "loss": 0.09912623763084412, "memory(GiB)": 122.96, "step": 37885, "token_acc": 0.9691656590084643, "train_speed(iter/s)": 0.234928 }, { "epoch": 2.888177452549737, "grad_norm": 0.9431098103523254, "learning_rate": 3.79255929562181e-05, "loss": 0.06892263293266296, "memory(GiB)": 122.96, "step": 37890, "token_acc": 0.9726509612780937, "train_speed(iter/s)": 0.234929 }, { "epoch": 2.888558579159997, "grad_norm": 1.363510012626648, "learning_rate": 3.791397422578942e-05, "loss": 0.0828850269317627, "memory(GiB)": 122.96, "step": 37895, "token_acc": 0.9619520264681555, "train_speed(iter/s)": 0.23494 }, { "epoch": 2.888939705770257, "grad_norm": 0.7722975611686707, "learning_rate": 3.790235618843822e-05, "loss": 0.12587124109268188, "memory(GiB)": 122.96, "step": 37900, "token_acc": 0.9546460176991151, "train_speed(iter/s)": 0.234948 }, { "epoch": 2.889320832380517, "grad_norm": 2.1903038024902344, "learning_rate": 3.7890738844830747e-05, "loss": 0.11206810474395752, "memory(GiB)": 122.96, "step": 37905, "token_acc": 0.9662680706764234, "train_speed(iter/s)": 0.234953 }, { "epoch": 2.8897019589907766, "grad_norm": 1.013375163078308, "learning_rate": 3.787912219563322e-05, "loss": 0.10055527687072754, "memory(GiB)": 122.96, "step": 37910, "token_acc": 0.9522704673516739, "train_speed(iter/s)": 0.234962 }, { "epoch": 2.8900830856010367, "grad_norm": 0.9082911610603333, "learning_rate": 3.7867506241511786e-05, "loss": 0.081793874502182, "memory(GiB)": 122.96, "step": 37915, "token_acc": 0.964797706275884, "train_speed(iter/s)": 0.234966 }, { "epoch": 2.8904642122112967, "grad_norm": 0.9624173045158386, "learning_rate": 3.785589098313255e-05, "loss": 0.05942943096160889, "memory(GiB)": 122.96, "step": 37920, "token_acc": 0.977042431918936, "train_speed(iter/s)": 0.234969 }, { "epoch": 2.8908453388215563, "grad_norm": 1.369828224182129, "learning_rate": 3.784427642116163e-05, "loss": 0.09542831778526306, "memory(GiB)": 122.96, "step": 37925, "token_acc": 0.9759882869692533, "train_speed(iter/s)": 0.234978 }, { "epoch": 2.8912264654318163, "grad_norm": 0.9761334657669067, "learning_rate": 3.7832662556265046e-05, "loss": 0.049045336246490476, "memory(GiB)": 122.96, "step": 37930, "token_acc": 0.9715944432925565, "train_speed(iter/s)": 0.234985 }, { "epoch": 2.8916075920420763, "grad_norm": 1.1205757856369019, "learning_rate": 3.7821049389108786e-05, "loss": 0.105331289768219, "memory(GiB)": 122.96, "step": 37935, "token_acc": 0.9600924175593363, "train_speed(iter/s)": 0.234991 }, { "epoch": 2.8919887186523363, "grad_norm": 0.7117846012115479, "learning_rate": 3.7809436920358844e-05, "loss": 0.08248488903045655, "memory(GiB)": 122.96, "step": 37940, "token_acc": 0.9712509712509713, "train_speed(iter/s)": 0.234994 }, { "epoch": 2.8923698452625963, "grad_norm": 0.5552998781204224, "learning_rate": 3.779782515068112e-05, "loss": 0.06780540943145752, "memory(GiB)": 122.96, "step": 37945, "token_acc": 0.9767510300176574, "train_speed(iter/s)": 0.234996 }, { "epoch": 2.8927509718728563, "grad_norm": 0.7938038110733032, "learning_rate": 3.778621408074149e-05, "loss": 0.10284035205841065, "memory(GiB)": 122.96, "step": 37950, "token_acc": 0.9682257244534824, "train_speed(iter/s)": 0.235004 }, { "epoch": 2.8931320984831164, "grad_norm": 0.4910276532173157, "learning_rate": 3.777460371120581e-05, "loss": 0.06375249624252319, "memory(GiB)": 122.96, "step": 37955, "token_acc": 0.973495130129331, "train_speed(iter/s)": 0.235012 }, { "epoch": 2.893513225093376, "grad_norm": 1.298025369644165, "learning_rate": 3.7762994042739874e-05, "loss": 0.0954779028892517, "memory(GiB)": 122.96, "step": 37960, "token_acc": 0.9666374012291484, "train_speed(iter/s)": 0.235013 }, { "epoch": 2.893894351703636, "grad_norm": 1.5332584381103516, "learning_rate": 3.775138507600945e-05, "loss": 0.08412294387817383, "memory(GiB)": 122.96, "step": 37965, "token_acc": 0.9705375333019903, "train_speed(iter/s)": 0.235019 }, { "epoch": 2.894275478313896, "grad_norm": 1.3342256546020508, "learning_rate": 3.773977681168023e-05, "loss": 0.058449959754943846, "memory(GiB)": 122.96, "step": 37970, "token_acc": 0.9772232699840417, "train_speed(iter/s)": 0.235021 }, { "epoch": 2.8946566049241556, "grad_norm": 2.3797428607940674, "learning_rate": 3.7728169250417936e-05, "loss": 0.08247337937355041, "memory(GiB)": 122.96, "step": 37975, "token_acc": 0.9659227261819054, "train_speed(iter/s)": 0.23503 }, { "epoch": 2.8950377315344156, "grad_norm": 2.223917245864868, "learning_rate": 3.771656239288818e-05, "loss": 0.0862675130367279, "memory(GiB)": 122.96, "step": 37980, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.235039 }, { "epoch": 2.8954188581446756, "grad_norm": 0.49933919310569763, "learning_rate": 3.7704956239756564e-05, "loss": 0.0782415509223938, "memory(GiB)": 122.96, "step": 37985, "token_acc": 0.9711737868104521, "train_speed(iter/s)": 0.235045 }, { "epoch": 2.8957999847549356, "grad_norm": 0.8481409549713135, "learning_rate": 3.769335079168866e-05, "loss": 0.0573919951915741, "memory(GiB)": 122.96, "step": 37990, "token_acc": 0.9771302102545186, "train_speed(iter/s)": 0.235053 }, { "epoch": 2.8961811113651956, "grad_norm": 1.4453394412994385, "learning_rate": 3.768174604934998e-05, "loss": 0.08061132431030274, "memory(GiB)": 122.96, "step": 37995, "token_acc": 0.9704731412308787, "train_speed(iter/s)": 0.235054 }, { "epoch": 2.8965622379754556, "grad_norm": 1.8469257354736328, "learning_rate": 3.767014201340598e-05, "loss": 0.07005182504653931, "memory(GiB)": 122.96, "step": 38000, "token_acc": 0.9769951718261858, "train_speed(iter/s)": 0.23506 }, { "epoch": 2.8965622379754556, "eval_loss": 0.07313370704650879, "eval_runtime": 221.3149, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.9689100054213602, "step": 38000 }, { "epoch": 2.896943364585715, "grad_norm": 0.8806771039962769, "learning_rate": 3.765853868452214e-05, "loss": 0.05372971296310425, "memory(GiB)": 122.96, "step": 38005, "token_acc": 0.9690916752297241, "train_speed(iter/s)": 0.234746 }, { "epoch": 2.8973244911959752, "grad_norm": 0.8762566447257996, "learning_rate": 3.7646936063363816e-05, "loss": 0.06655730605125428, "memory(GiB)": 122.96, "step": 38010, "token_acc": 0.9746633188769688, "train_speed(iter/s)": 0.234753 }, { "epoch": 2.8977056178062353, "grad_norm": 1.786808967590332, "learning_rate": 3.763533415059639e-05, "loss": 0.09971969723701476, "memory(GiB)": 122.96, "step": 38015, "token_acc": 0.9562343286984272, "train_speed(iter/s)": 0.234761 }, { "epoch": 2.8980867444164953, "grad_norm": 0.9162140488624573, "learning_rate": 3.762373294688518e-05, "loss": 0.0608712911605835, "memory(GiB)": 122.96, "step": 38020, "token_acc": 0.9793564055859137, "train_speed(iter/s)": 0.234767 }, { "epoch": 2.898467871026755, "grad_norm": 0.8191885352134705, "learning_rate": 3.7612132452895445e-05, "loss": 0.05766780376434326, "memory(GiB)": 122.96, "step": 38025, "token_acc": 0.9684962650211107, "train_speed(iter/s)": 0.234772 }, { "epoch": 2.898848997637015, "grad_norm": 0.5313200354576111, "learning_rate": 3.7600532669292436e-05, "loss": 0.07932933568954467, "memory(GiB)": 122.96, "step": 38030, "token_acc": 0.9643281807372176, "train_speed(iter/s)": 0.234776 }, { "epoch": 2.899230124247275, "grad_norm": 0.7560880184173584, "learning_rate": 3.758893359674134e-05, "loss": 0.05417177677154541, "memory(GiB)": 122.96, "step": 38035, "token_acc": 0.9776735156123605, "train_speed(iter/s)": 0.234778 }, { "epoch": 2.899611250857535, "grad_norm": 0.6784417629241943, "learning_rate": 3.757733523590729e-05, "loss": 0.07130923271179199, "memory(GiB)": 122.96, "step": 38040, "token_acc": 0.9614260666277031, "train_speed(iter/s)": 0.234787 }, { "epoch": 2.899992377467795, "grad_norm": 0.38170069456100464, "learning_rate": 3.756573758745543e-05, "loss": 0.06712146401405335, "memory(GiB)": 122.96, "step": 38045, "token_acc": 0.9715037264357738, "train_speed(iter/s)": 0.234795 }, { "epoch": 2.900373504078055, "grad_norm": 1.3735333681106567, "learning_rate": 3.755414065205082e-05, "loss": 0.0863541841506958, "memory(GiB)": 122.96, "step": 38050, "token_acc": 0.9649730561970746, "train_speed(iter/s)": 0.2348 }, { "epoch": 2.9007546306883145, "grad_norm": 1.0491634607315063, "learning_rate": 3.7542544430358476e-05, "loss": 0.09077118039131164, "memory(GiB)": 122.96, "step": 38055, "token_acc": 0.9594221372424049, "train_speed(iter/s)": 0.234807 }, { "epoch": 2.9011357572985745, "grad_norm": 0.8341416716575623, "learning_rate": 3.753094892304341e-05, "loss": 0.07302770018577576, "memory(GiB)": 122.96, "step": 38060, "token_acc": 0.9641174338528452, "train_speed(iter/s)": 0.234813 }, { "epoch": 2.9015168839088346, "grad_norm": 1.4460233449935913, "learning_rate": 3.7519354130770557e-05, "loss": 0.07721441984176636, "memory(GiB)": 122.96, "step": 38065, "token_acc": 0.9708791208791209, "train_speed(iter/s)": 0.234824 }, { "epoch": 2.9018980105190946, "grad_norm": 0.6992290019989014, "learning_rate": 3.7507760054204834e-05, "loss": 0.10300500392913818, "memory(GiB)": 122.96, "step": 38070, "token_acc": 0.957725321888412, "train_speed(iter/s)": 0.234831 }, { "epoch": 2.902279137129354, "grad_norm": 0.4452054798603058, "learning_rate": 3.749616669401108e-05, "loss": 0.0781505823135376, "memory(GiB)": 122.96, "step": 38075, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.234838 }, { "epoch": 2.902660263739614, "grad_norm": 1.9531822204589844, "learning_rate": 3.748457405085416e-05, "loss": 0.0851179838180542, "memory(GiB)": 122.96, "step": 38080, "token_acc": 0.9637902656948036, "train_speed(iter/s)": 0.234843 }, { "epoch": 2.903041390349874, "grad_norm": 0.6668861508369446, "learning_rate": 3.747298212539884e-05, "loss": 0.08823171257972717, "memory(GiB)": 122.96, "step": 38085, "token_acc": 0.9631540162122328, "train_speed(iter/s)": 0.234851 }, { "epoch": 2.903422516960134, "grad_norm": 0.7634332180023193, "learning_rate": 3.746139091830985e-05, "loss": 0.0831906259059906, "memory(GiB)": 122.96, "step": 38090, "token_acc": 0.9728544933400776, "train_speed(iter/s)": 0.234856 }, { "epoch": 2.903803643570394, "grad_norm": 1.020965337753296, "learning_rate": 3.7449800430251905e-05, "loss": 0.07657724618911743, "memory(GiB)": 122.96, "step": 38095, "token_acc": 0.9720232002729444, "train_speed(iter/s)": 0.234863 }, { "epoch": 2.9041847701806542, "grad_norm": 1.3418511152267456, "learning_rate": 3.7438210661889676e-05, "loss": 0.07149125933647156, "memory(GiB)": 122.96, "step": 38100, "token_acc": 0.9710982658959537, "train_speed(iter/s)": 0.234874 }, { "epoch": 2.904565896790914, "grad_norm": 1.805721640586853, "learning_rate": 3.7426621613887755e-05, "loss": 0.08186355233192444, "memory(GiB)": 122.96, "step": 38105, "token_acc": 0.9721092388146426, "train_speed(iter/s)": 0.234877 }, { "epoch": 2.904947023401174, "grad_norm": 0.8951708078384399, "learning_rate": 3.7415033286910764e-05, "loss": 0.07275183200836181, "memory(GiB)": 122.96, "step": 38110, "token_acc": 0.9777702358730698, "train_speed(iter/s)": 0.234881 }, { "epoch": 2.905328150011434, "grad_norm": 1.2457143068313599, "learning_rate": 3.740344568162319e-05, "loss": 0.10987311601638794, "memory(GiB)": 122.96, "step": 38115, "token_acc": 0.9675937739219188, "train_speed(iter/s)": 0.234888 }, { "epoch": 2.905709276621694, "grad_norm": 0.9420811533927917, "learning_rate": 3.7391858798689553e-05, "loss": 0.08784855604171753, "memory(GiB)": 122.96, "step": 38120, "token_acc": 0.9552196235025671, "train_speed(iter/s)": 0.234895 }, { "epoch": 2.9060904032319534, "grad_norm": 1.6861162185668945, "learning_rate": 3.7380272638774314e-05, "loss": 0.08441098928451538, "memory(GiB)": 122.96, "step": 38125, "token_acc": 0.9646556977452773, "train_speed(iter/s)": 0.234902 }, { "epoch": 2.9064715298422135, "grad_norm": 0.6586142182350159, "learning_rate": 3.736868720254187e-05, "loss": 0.09057918190956116, "memory(GiB)": 122.96, "step": 38130, "token_acc": 0.9689813182939725, "train_speed(iter/s)": 0.234902 }, { "epoch": 2.9068526564524735, "grad_norm": 0.416040301322937, "learning_rate": 3.7357102490656615e-05, "loss": 0.06134576797485351, "memory(GiB)": 122.96, "step": 38135, "token_acc": 0.9737916502041354, "train_speed(iter/s)": 0.234905 }, { "epoch": 2.9072337830627335, "grad_norm": 1.3224214315414429, "learning_rate": 3.734551850378284e-05, "loss": 0.10116394758224487, "memory(GiB)": 122.96, "step": 38140, "token_acc": 0.9541561712846348, "train_speed(iter/s)": 0.234913 }, { "epoch": 2.9076149096729935, "grad_norm": 0.8738381862640381, "learning_rate": 3.733393524258484e-05, "loss": 0.08961164951324463, "memory(GiB)": 122.96, "step": 38145, "token_acc": 0.9733024503230525, "train_speed(iter/s)": 0.234918 }, { "epoch": 2.9079960362832535, "grad_norm": 1.3854632377624512, "learning_rate": 3.73223527077269e-05, "loss": 0.08586803674697877, "memory(GiB)": 122.96, "step": 38150, "token_acc": 0.9706287287746673, "train_speed(iter/s)": 0.234924 }, { "epoch": 2.908377162893513, "grad_norm": 1.6323603391647339, "learning_rate": 3.731077089987321e-05, "loss": 0.07355605959892272, "memory(GiB)": 122.96, "step": 38155, "token_acc": 0.9698329593617552, "train_speed(iter/s)": 0.234932 }, { "epoch": 2.908758289503773, "grad_norm": 1.809833288192749, "learning_rate": 3.7299189819687905e-05, "loss": 0.0869312822818756, "memory(GiB)": 122.96, "step": 38160, "token_acc": 0.9763011152416357, "train_speed(iter/s)": 0.234937 }, { "epoch": 2.909139416114033, "grad_norm": 0.912880539894104, "learning_rate": 3.728760946783514e-05, "loss": 0.08074188828468323, "memory(GiB)": 122.96, "step": 38165, "token_acc": 0.9697234352256187, "train_speed(iter/s)": 0.234946 }, { "epoch": 2.909520542724293, "grad_norm": 1.3774921894073486, "learning_rate": 3.727602984497898e-05, "loss": 0.08652944564819336, "memory(GiB)": 122.96, "step": 38170, "token_acc": 0.9642248722316865, "train_speed(iter/s)": 0.234955 }, { "epoch": 2.9099016693345527, "grad_norm": 1.0564051866531372, "learning_rate": 3.726445095178345e-05, "loss": 0.08608655333518982, "memory(GiB)": 122.96, "step": 38175, "token_acc": 0.9667654257374726, "train_speed(iter/s)": 0.23496 }, { "epoch": 2.9102827959448128, "grad_norm": 1.6787174940109253, "learning_rate": 3.725287278891255e-05, "loss": 0.11052887439727783, "memory(GiB)": 122.96, "step": 38180, "token_acc": 0.9603426395939086, "train_speed(iter/s)": 0.234969 }, { "epoch": 2.910663922555073, "grad_norm": 2.1762208938598633, "learning_rate": 3.724129535703026e-05, "loss": 0.1554656982421875, "memory(GiB)": 122.96, "step": 38185, "token_acc": 0.9487563679952052, "train_speed(iter/s)": 0.234977 }, { "epoch": 2.911045049165333, "grad_norm": 0.9532738327980042, "learning_rate": 3.722971865680047e-05, "loss": 0.08318414092063904, "memory(GiB)": 122.96, "step": 38190, "token_acc": 0.9679385865883947, "train_speed(iter/s)": 0.234984 }, { "epoch": 2.911426175775593, "grad_norm": 0.5648732781410217, "learning_rate": 3.721814268888705e-05, "loss": 0.10271081924438477, "memory(GiB)": 122.96, "step": 38195, "token_acc": 0.9646910466582598, "train_speed(iter/s)": 0.234988 }, { "epoch": 2.911807302385853, "grad_norm": 1.3330557346343994, "learning_rate": 3.720656745395383e-05, "loss": 0.11269645690917969, "memory(GiB)": 122.96, "step": 38200, "token_acc": 0.9583544946673438, "train_speed(iter/s)": 0.234991 }, { "epoch": 2.911807302385853, "eval_loss": 0.07321149110794067, "eval_runtime": 220.7733, "eval_samples_per_second": 2.401, "eval_steps_per_second": 2.401, "eval_token_acc": 0.9693090777663996, "step": 38200 }, { "epoch": 2.9121884289961124, "grad_norm": 2.2213964462280273, "learning_rate": 3.719499295266461e-05, "loss": 0.09346674680709839, "memory(GiB)": 122.96, "step": 38205, "token_acc": 0.9693592704101021, "train_speed(iter/s)": 0.23468 }, { "epoch": 2.9125695556063724, "grad_norm": 2.366450071334839, "learning_rate": 3.7183419185683106e-05, "loss": 0.1362439751625061, "memory(GiB)": 122.96, "step": 38210, "token_acc": 0.9478672985781991, "train_speed(iter/s)": 0.234689 }, { "epoch": 2.9129506822166324, "grad_norm": 1.0613170862197876, "learning_rate": 3.717184615367304e-05, "loss": 0.07193232774734497, "memory(GiB)": 122.96, "step": 38215, "token_acc": 0.9681225419167874, "train_speed(iter/s)": 0.234695 }, { "epoch": 2.913331808826892, "grad_norm": 0.9518450498580933, "learning_rate": 3.7160273857298075e-05, "loss": 0.07799142003059387, "memory(GiB)": 122.96, "step": 38220, "token_acc": 0.9663244353182752, "train_speed(iter/s)": 0.234701 }, { "epoch": 2.913712935437152, "grad_norm": 0.7388240694999695, "learning_rate": 3.7148702297221806e-05, "loss": 0.08137394785881043, "memory(GiB)": 122.96, "step": 38225, "token_acc": 0.9690576652601969, "train_speed(iter/s)": 0.23471 }, { "epoch": 2.914094062047412, "grad_norm": 0.8096959590911865, "learning_rate": 3.713713147410783e-05, "loss": 0.0820425033569336, "memory(GiB)": 122.96, "step": 38230, "token_acc": 0.9662307241850479, "train_speed(iter/s)": 0.234718 }, { "epoch": 2.914475188657672, "grad_norm": 1.100929617881775, "learning_rate": 3.712556138861969e-05, "loss": 0.126302170753479, "memory(GiB)": 122.96, "step": 38235, "token_acc": 0.9516225066984222, "train_speed(iter/s)": 0.234723 }, { "epoch": 2.914856315267932, "grad_norm": 0.881531298160553, "learning_rate": 3.711399204142084e-05, "loss": 0.08991179466247559, "memory(GiB)": 122.96, "step": 38240, "token_acc": 0.9619482496194824, "train_speed(iter/s)": 0.234732 }, { "epoch": 2.915237441878192, "grad_norm": 0.8039235472679138, "learning_rate": 3.710242343317475e-05, "loss": 0.08665409088134765, "memory(GiB)": 122.96, "step": 38245, "token_acc": 0.9708754208754209, "train_speed(iter/s)": 0.234736 }, { "epoch": 2.915618568488452, "grad_norm": 0.881889283657074, "learning_rate": 3.709085556454483e-05, "loss": 0.09391170740127563, "memory(GiB)": 122.96, "step": 38250, "token_acc": 0.9597757995384109, "train_speed(iter/s)": 0.234745 }, { "epoch": 2.9159996950987117, "grad_norm": 0.5480813384056091, "learning_rate": 3.707928843619444e-05, "loss": 0.08970773220062256, "memory(GiB)": 122.96, "step": 38255, "token_acc": 0.9638418079096045, "train_speed(iter/s)": 0.23475 }, { "epoch": 2.9163808217089717, "grad_norm": 1.083185076713562, "learning_rate": 3.706772204878688e-05, "loss": 0.1206515908241272, "memory(GiB)": 122.96, "step": 38260, "token_acc": 0.946528555431131, "train_speed(iter/s)": 0.234759 }, { "epoch": 2.9167619483192317, "grad_norm": 1.3724526166915894, "learning_rate": 3.7056156402985465e-05, "loss": 0.09910249710083008, "memory(GiB)": 122.96, "step": 38265, "token_acc": 0.9621676891615542, "train_speed(iter/s)": 0.234766 }, { "epoch": 2.9171430749294913, "grad_norm": 1.3133405447006226, "learning_rate": 3.7044591499453414e-05, "loss": 0.11622238159179688, "memory(GiB)": 122.96, "step": 38270, "token_acc": 0.9516971279373369, "train_speed(iter/s)": 0.234774 }, { "epoch": 2.9175242015397513, "grad_norm": 1.0902714729309082, "learning_rate": 3.70330273388539e-05, "loss": 0.07822607755661011, "memory(GiB)": 122.96, "step": 38275, "token_acc": 0.966792656587473, "train_speed(iter/s)": 0.234781 }, { "epoch": 2.9179053281500114, "grad_norm": 1.2709585428237915, "learning_rate": 3.702146392185011e-05, "loss": 0.06858866810798644, "memory(GiB)": 122.96, "step": 38280, "token_acc": 0.9683333333333334, "train_speed(iter/s)": 0.234789 }, { "epoch": 2.9182864547602714, "grad_norm": 0.7964215278625488, "learning_rate": 3.700990124910513e-05, "loss": 0.06464377045631409, "memory(GiB)": 122.96, "step": 38285, "token_acc": 0.9720496894409938, "train_speed(iter/s)": 0.234797 }, { "epoch": 2.9186675813705314, "grad_norm": 0.707655131816864, "learning_rate": 3.699833932128204e-05, "loss": 0.06996945142745972, "memory(GiB)": 122.96, "step": 38290, "token_acc": 0.9778398510242086, "train_speed(iter/s)": 0.234801 }, { "epoch": 2.9190487079807914, "grad_norm": 1.1067827939987183, "learning_rate": 3.6986778139043856e-05, "loss": 0.10109896659851074, "memory(GiB)": 122.96, "step": 38295, "token_acc": 0.9610522719508029, "train_speed(iter/s)": 0.234809 }, { "epoch": 2.919429834591051, "grad_norm": 1.1973488330841064, "learning_rate": 3.6975217703053546e-05, "loss": 0.09507867097854614, "memory(GiB)": 122.96, "step": 38300, "token_acc": 0.9627064464571125, "train_speed(iter/s)": 0.234818 }, { "epoch": 2.919810961201311, "grad_norm": 1.3434109687805176, "learning_rate": 3.696365801397407e-05, "loss": 0.09561291337013245, "memory(GiB)": 122.96, "step": 38305, "token_acc": 0.9680630443799253, "train_speed(iter/s)": 0.234822 }, { "epoch": 2.920192087811571, "grad_norm": 1.5486390590667725, "learning_rate": 3.69520990724683e-05, "loss": 0.11114062070846557, "memory(GiB)": 122.96, "step": 38310, "token_acc": 0.9540067720090294, "train_speed(iter/s)": 0.234829 }, { "epoch": 2.920573214421831, "grad_norm": 0.9452990889549255, "learning_rate": 3.6940540879199104e-05, "loss": 0.07182212471961975, "memory(GiB)": 122.96, "step": 38315, "token_acc": 0.9655017921146953, "train_speed(iter/s)": 0.234836 }, { "epoch": 2.9209543410320906, "grad_norm": 0.818956732749939, "learning_rate": 3.692898343482929e-05, "loss": 0.08025586605072021, "memory(GiB)": 122.96, "step": 38320, "token_acc": 0.9713283828382838, "train_speed(iter/s)": 0.234842 }, { "epoch": 2.9213354676423506, "grad_norm": 1.9653021097183228, "learning_rate": 3.6917426740021615e-05, "loss": 0.09171321988105774, "memory(GiB)": 122.96, "step": 38325, "token_acc": 0.9673389524382902, "train_speed(iter/s)": 0.234847 }, { "epoch": 2.9217165942526107, "grad_norm": 1.2570027112960815, "learning_rate": 3.69058707954388e-05, "loss": 0.07154630422592163, "memory(GiB)": 122.96, "step": 38330, "token_acc": 0.975499303297724, "train_speed(iter/s)": 0.234848 }, { "epoch": 2.9220977208628707, "grad_norm": 1.9504152536392212, "learning_rate": 3.6894315601743533e-05, "loss": 0.06225066184997559, "memory(GiB)": 122.96, "step": 38335, "token_acc": 0.9746300211416491, "train_speed(iter/s)": 0.234858 }, { "epoch": 2.9224788474731307, "grad_norm": 1.1779415607452393, "learning_rate": 3.688276115959846e-05, "loss": 0.09836920499801635, "memory(GiB)": 122.96, "step": 38340, "token_acc": 0.9556471158507028, "train_speed(iter/s)": 0.234865 }, { "epoch": 2.9228599740833907, "grad_norm": 0.6319994330406189, "learning_rate": 3.6871207469666155e-05, "loss": 0.08379368782043457, "memory(GiB)": 122.96, "step": 38345, "token_acc": 0.968944099378882, "train_speed(iter/s)": 0.234868 }, { "epoch": 2.9232411006936503, "grad_norm": 0.5886424779891968, "learning_rate": 3.685965453260918e-05, "loss": 0.06321191787719727, "memory(GiB)": 122.96, "step": 38350, "token_acc": 0.9731661588938364, "train_speed(iter/s)": 0.234871 }, { "epoch": 2.9236222273039103, "grad_norm": 0.8608803749084473, "learning_rate": 3.684810234909003e-05, "loss": 0.0843707799911499, "memory(GiB)": 122.96, "step": 38355, "token_acc": 0.963355408388521, "train_speed(iter/s)": 0.234879 }, { "epoch": 2.9240033539141703, "grad_norm": 0.6467999815940857, "learning_rate": 3.683655091977119e-05, "loss": 0.048241302371025085, "memory(GiB)": 122.96, "step": 38360, "token_acc": 0.9788000847996609, "train_speed(iter/s)": 0.234885 }, { "epoch": 2.9243844805244303, "grad_norm": 0.7474616169929504, "learning_rate": 3.6825000245315054e-05, "loss": 0.08278734683990478, "memory(GiB)": 122.96, "step": 38365, "token_acc": 0.9606966345022359, "train_speed(iter/s)": 0.234892 }, { "epoch": 2.92476560713469, "grad_norm": 0.8409563302993774, "learning_rate": 3.6813450326384027e-05, "loss": 0.06328598260879517, "memory(GiB)": 122.96, "step": 38370, "token_acc": 0.9767402086540106, "train_speed(iter/s)": 0.234901 }, { "epoch": 2.92514673374495, "grad_norm": 0.9744145274162292, "learning_rate": 3.680190116364043e-05, "loss": 0.11127490997314453, "memory(GiB)": 122.96, "step": 38375, "token_acc": 0.9584920794064568, "train_speed(iter/s)": 0.234906 }, { "epoch": 2.92552786035521, "grad_norm": 1.0472900867462158, "learning_rate": 3.679035275774655e-05, "loss": 0.08497668504714966, "memory(GiB)": 122.96, "step": 38380, "token_acc": 0.9680242943908539, "train_speed(iter/s)": 0.234912 }, { "epoch": 2.92590898696547, "grad_norm": 0.8046517372131348, "learning_rate": 3.677880510936464e-05, "loss": 0.06863973140716553, "memory(GiB)": 122.96, "step": 38385, "token_acc": 0.9768115942028985, "train_speed(iter/s)": 0.23492 }, { "epoch": 2.92629011357573, "grad_norm": 1.1033340692520142, "learning_rate": 3.676725821915691e-05, "loss": 0.0629566490650177, "memory(GiB)": 122.96, "step": 38390, "token_acc": 0.9812910596972294, "train_speed(iter/s)": 0.234924 }, { "epoch": 2.92667124018599, "grad_norm": 1.1933232545852661, "learning_rate": 3.6755712087785496e-05, "loss": 0.12579107284545898, "memory(GiB)": 122.96, "step": 38395, "token_acc": 0.9615198451113263, "train_speed(iter/s)": 0.234926 }, { "epoch": 2.9270523667962496, "grad_norm": 0.6069520115852356, "learning_rate": 3.6744166715912545e-05, "loss": 0.07103090286254883, "memory(GiB)": 122.96, "step": 38400, "token_acc": 0.9753711790393013, "train_speed(iter/s)": 0.234931 }, { "epoch": 2.9270523667962496, "eval_loss": 0.07321581244468689, "eval_runtime": 221.6204, "eval_samples_per_second": 2.391, "eval_steps_per_second": 2.391, "eval_token_acc": 0.9690455394253358, "step": 38400 }, { "epoch": 2.9274334934065096, "grad_norm": 0.6016127467155457, "learning_rate": 3.673262210420012e-05, "loss": 0.0906354010105133, "memory(GiB)": 122.96, "step": 38405, "token_acc": 0.9692234207130584, "train_speed(iter/s)": 0.234619 }, { "epoch": 2.9278146200167696, "grad_norm": 0.8554596304893494, "learning_rate": 3.6721078253310234e-05, "loss": 0.0751638650894165, "memory(GiB)": 122.96, "step": 38410, "token_acc": 0.9739504299443601, "train_speed(iter/s)": 0.234625 }, { "epoch": 2.9281957466270296, "grad_norm": 0.6402720212936401, "learning_rate": 3.67095351639049e-05, "loss": 0.06845004558563232, "memory(GiB)": 122.96, "step": 38415, "token_acc": 0.9777312523791397, "train_speed(iter/s)": 0.234629 }, { "epoch": 2.928576873237289, "grad_norm": 1.703059434890747, "learning_rate": 3.6697992836646045e-05, "loss": 0.10728366374969482, "memory(GiB)": 122.96, "step": 38420, "token_acc": 0.9575926322553009, "train_speed(iter/s)": 0.234636 }, { "epoch": 2.9289579998475492, "grad_norm": 1.3567802906036377, "learning_rate": 3.668645127219558e-05, "loss": 0.11867262125015259, "memory(GiB)": 122.96, "step": 38425, "token_acc": 0.9516327788046827, "train_speed(iter/s)": 0.234645 }, { "epoch": 2.9293391264578093, "grad_norm": 0.8658754825592041, "learning_rate": 3.667491047121535e-05, "loss": 0.07756887674331665, "memory(GiB)": 122.96, "step": 38430, "token_acc": 0.9672786766042538, "train_speed(iter/s)": 0.234651 }, { "epoch": 2.9297202530680693, "grad_norm": 0.7964979410171509, "learning_rate": 3.666337043436716e-05, "loss": 0.08433717489242554, "memory(GiB)": 122.96, "step": 38435, "token_acc": 0.9644800483675937, "train_speed(iter/s)": 0.234657 }, { "epoch": 2.9301013796783293, "grad_norm": 1.0031805038452148, "learning_rate": 3.66518311623128e-05, "loss": 0.0697148859500885, "memory(GiB)": 122.96, "step": 38440, "token_acc": 0.9696132596685083, "train_speed(iter/s)": 0.234664 }, { "epoch": 2.9304825062885893, "grad_norm": 0.6281558871269226, "learning_rate": 3.664029265571398e-05, "loss": 0.07812336683273316, "memory(GiB)": 122.96, "step": 38445, "token_acc": 0.9638462706972505, "train_speed(iter/s)": 0.234669 }, { "epoch": 2.930863632898849, "grad_norm": 1.6045385599136353, "learning_rate": 3.6628754915232366e-05, "loss": 0.07443418502807617, "memory(GiB)": 122.96, "step": 38450, "token_acc": 0.9743187610272496, "train_speed(iter/s)": 0.234674 }, { "epoch": 2.931244759509109, "grad_norm": 1.8853859901428223, "learning_rate": 3.6617217941529615e-05, "loss": 0.12446836233139039, "memory(GiB)": 122.96, "step": 38455, "token_acc": 0.940631808278867, "train_speed(iter/s)": 0.234685 }, { "epoch": 2.931625886119369, "grad_norm": 0.6618401408195496, "learning_rate": 3.660568173526732e-05, "loss": 0.1187375545501709, "memory(GiB)": 122.96, "step": 38460, "token_acc": 0.9626112759643917, "train_speed(iter/s)": 0.234693 }, { "epoch": 2.932007012729629, "grad_norm": 0.35117802023887634, "learning_rate": 3.659414629710701e-05, "loss": 0.09195100069046021, "memory(GiB)": 122.96, "step": 38465, "token_acc": 0.9641532756489494, "train_speed(iter/s)": 0.234701 }, { "epoch": 2.9323881393398885, "grad_norm": 1.0934264659881592, "learning_rate": 3.658261162771019e-05, "loss": 0.07881757020950317, "memory(GiB)": 122.96, "step": 38470, "token_acc": 0.965500124100273, "train_speed(iter/s)": 0.234708 }, { "epoch": 2.9327692659501485, "grad_norm": 1.2915143966674805, "learning_rate": 3.657107772773835e-05, "loss": 0.08198112845420838, "memory(GiB)": 122.96, "step": 38475, "token_acc": 0.968937875751503, "train_speed(iter/s)": 0.234713 }, { "epoch": 2.9331503925604085, "grad_norm": 0.772872805595398, "learning_rate": 3.655954459785287e-05, "loss": 0.06403120160102845, "memory(GiB)": 122.96, "step": 38480, "token_acc": 0.9774681292617847, "train_speed(iter/s)": 0.234721 }, { "epoch": 2.9335315191706686, "grad_norm": 0.8432132005691528, "learning_rate": 3.654801223871514e-05, "loss": 0.1185457706451416, "memory(GiB)": 122.96, "step": 38485, "token_acc": 0.9476028294472099, "train_speed(iter/s)": 0.23473 }, { "epoch": 2.9339126457809286, "grad_norm": 1.3068593740463257, "learning_rate": 3.6536480650986484e-05, "loss": 0.09074968099594116, "memory(GiB)": 122.96, "step": 38490, "token_acc": 0.9561119293078056, "train_speed(iter/s)": 0.234738 }, { "epoch": 2.9342937723911886, "grad_norm": 0.7391459941864014, "learning_rate": 3.652494983532818e-05, "loss": 0.0646716058254242, "memory(GiB)": 122.96, "step": 38495, "token_acc": 0.9782316581564096, "train_speed(iter/s)": 0.234746 }, { "epoch": 2.934674899001448, "grad_norm": 1.7009575366973877, "learning_rate": 3.651341979240147e-05, "loss": 0.10729162693023682, "memory(GiB)": 122.96, "step": 38500, "token_acc": 0.9554945054945055, "train_speed(iter/s)": 0.234752 }, { "epoch": 2.935056025611708, "grad_norm": 0.5439935922622681, "learning_rate": 3.650189052286755e-05, "loss": 0.08538492918014526, "memory(GiB)": 122.96, "step": 38505, "token_acc": 0.968746328281048, "train_speed(iter/s)": 0.234752 }, { "epoch": 2.935437152221968, "grad_norm": 1.173685073852539, "learning_rate": 3.649036202738758e-05, "loss": 0.10654065608978272, "memory(GiB)": 122.96, "step": 38510, "token_acc": 0.9497784342688331, "train_speed(iter/s)": 0.234761 }, { "epoch": 2.935818278832228, "grad_norm": 0.7812029719352722, "learning_rate": 3.647883430662265e-05, "loss": 0.06769155263900757, "memory(GiB)": 122.96, "step": 38515, "token_acc": 0.9711945778028805, "train_speed(iter/s)": 0.23477 }, { "epoch": 2.936199405442488, "grad_norm": 0.8242233991622925, "learning_rate": 3.6467307361233824e-05, "loss": 0.0734384000301361, "memory(GiB)": 122.96, "step": 38520, "token_acc": 0.9703894827374371, "train_speed(iter/s)": 0.234772 }, { "epoch": 2.936580532052748, "grad_norm": 2.0323479175567627, "learning_rate": 3.6455781191882126e-05, "loss": 0.08858414888381957, "memory(GiB)": 122.96, "step": 38525, "token_acc": 0.9570491803278689, "train_speed(iter/s)": 0.234781 }, { "epoch": 2.936961658663008, "grad_norm": 1.0969643592834473, "learning_rate": 3.6444255799228525e-05, "loss": 0.07953286767005921, "memory(GiB)": 122.96, "step": 38530, "token_acc": 0.9744349352644284, "train_speed(iter/s)": 0.234782 }, { "epoch": 2.937342785273268, "grad_norm": 0.5396181344985962, "learning_rate": 3.6432731183933935e-05, "loss": 0.05935894846916199, "memory(GiB)": 122.96, "step": 38535, "token_acc": 0.9720479178551056, "train_speed(iter/s)": 0.234791 }, { "epoch": 2.937723911883528, "grad_norm": 1.0247985124588013, "learning_rate": 3.642120734665926e-05, "loss": 0.08878918290138245, "memory(GiB)": 122.96, "step": 38540, "token_acc": 0.9690380935814924, "train_speed(iter/s)": 0.234796 }, { "epoch": 2.938105038493788, "grad_norm": 0.732018768787384, "learning_rate": 3.6409684288065315e-05, "loss": 0.05096613168716431, "memory(GiB)": 122.96, "step": 38545, "token_acc": 0.9787900698963605, "train_speed(iter/s)": 0.234804 }, { "epoch": 2.9384861651040475, "grad_norm": 0.6222997903823853, "learning_rate": 3.639816200881293e-05, "loss": 0.08514778017997741, "memory(GiB)": 122.96, "step": 38550, "token_acc": 0.971967380224261, "train_speed(iter/s)": 0.234808 }, { "epoch": 2.9388672917143075, "grad_norm": 1.0242666006088257, "learning_rate": 3.638664050956282e-05, "loss": 0.07821987867355347, "memory(GiB)": 122.96, "step": 38555, "token_acc": 0.9678551216751556, "train_speed(iter/s)": 0.234806 }, { "epoch": 2.9392484183245675, "grad_norm": 0.6417696475982666, "learning_rate": 3.637511979097571e-05, "loss": 0.08289740681648254, "memory(GiB)": 122.96, "step": 38560, "token_acc": 0.9704701834862385, "train_speed(iter/s)": 0.234812 }, { "epoch": 2.939629544934827, "grad_norm": 0.8166964650154114, "learning_rate": 3.636359985371226e-05, "loss": 0.04867103099822998, "memory(GiB)": 122.96, "step": 38565, "token_acc": 0.9802669762042948, "train_speed(iter/s)": 0.23482 }, { "epoch": 2.940010671545087, "grad_norm": 1.5745680332183838, "learning_rate": 3.635208069843308e-05, "loss": 0.07524790167808533, "memory(GiB)": 122.96, "step": 38570, "token_acc": 0.9742238946378174, "train_speed(iter/s)": 0.234827 }, { "epoch": 2.940391798155347, "grad_norm": 0.5692694783210754, "learning_rate": 3.634056232579872e-05, "loss": 0.061606526374816895, "memory(GiB)": 122.96, "step": 38575, "token_acc": 0.979950289975145, "train_speed(iter/s)": 0.234834 }, { "epoch": 2.940772924765607, "grad_norm": 1.2897921800613403, "learning_rate": 3.6329044736469734e-05, "loss": 0.09089514017105102, "memory(GiB)": 122.96, "step": 38580, "token_acc": 0.9681077250177179, "train_speed(iter/s)": 0.234833 }, { "epoch": 2.941154051375867, "grad_norm": 2.204230546951294, "learning_rate": 3.63175279311066e-05, "loss": 0.07411049604415894, "memory(GiB)": 122.96, "step": 38585, "token_acc": 0.9743104380746349, "train_speed(iter/s)": 0.234836 }, { "epoch": 2.941535177986127, "grad_norm": 0.5812010169029236, "learning_rate": 3.630601191036972e-05, "loss": 0.07833275794982911, "memory(GiB)": 122.96, "step": 38590, "token_acc": 0.9748407643312101, "train_speed(iter/s)": 0.234845 }, { "epoch": 2.9419163045963868, "grad_norm": 0.9122231602668762, "learning_rate": 3.629449667491953e-05, "loss": 0.1112443208694458, "memory(GiB)": 122.96, "step": 38595, "token_acc": 0.9608006672226855, "train_speed(iter/s)": 0.234852 }, { "epoch": 2.9422974312066468, "grad_norm": 0.13897180557250977, "learning_rate": 3.6282982225416354e-05, "loss": 0.04806002974510193, "memory(GiB)": 122.96, "step": 38600, "token_acc": 0.9785202863961814, "train_speed(iter/s)": 0.234858 }, { "epoch": 2.9422974312066468, "eval_loss": 0.07308615744113922, "eval_runtime": 221.3702, "eval_samples_per_second": 2.394, "eval_steps_per_second": 2.394, "eval_token_acc": 0.9692789590988494, "step": 38600 }, { "epoch": 2.942678557816907, "grad_norm": 0.8170351386070251, "learning_rate": 3.6271468562520475e-05, "loss": 0.07586143016815186, "memory(GiB)": 122.96, "step": 38605, "token_acc": 0.969287369853942, "train_speed(iter/s)": 0.23455 }, { "epoch": 2.943059684427167, "grad_norm": 0.5808900594711304, "learning_rate": 3.6259955686892185e-05, "loss": 0.1036845326423645, "memory(GiB)": 122.96, "step": 38610, "token_acc": 0.9602260833158887, "train_speed(iter/s)": 0.234556 }, { "epoch": 2.9434408110374264, "grad_norm": 1.0807719230651855, "learning_rate": 3.624844359919167e-05, "loss": 0.06232611536979675, "memory(GiB)": 122.96, "step": 38615, "token_acc": 0.974917491749175, "train_speed(iter/s)": 0.234561 }, { "epoch": 2.9438219376476864, "grad_norm": 0.6244029998779297, "learning_rate": 3.6236932300079094e-05, "loss": 0.05861049294471741, "memory(GiB)": 122.96, "step": 38620, "token_acc": 0.9799201369081575, "train_speed(iter/s)": 0.23456 }, { "epoch": 2.9442030642579464, "grad_norm": 0.6895312070846558, "learning_rate": 3.6225421790214585e-05, "loss": 0.06806414127349854, "memory(GiB)": 122.96, "step": 38625, "token_acc": 0.9653069004956157, "train_speed(iter/s)": 0.234567 }, { "epoch": 2.9445841908682064, "grad_norm": 0.9582532644271851, "learning_rate": 3.6213912070258214e-05, "loss": 0.07100957036018371, "memory(GiB)": 122.96, "step": 38630, "token_acc": 0.9711479435236341, "train_speed(iter/s)": 0.234574 }, { "epoch": 2.9449653174784665, "grad_norm": 0.7546040415763855, "learning_rate": 3.620240314087002e-05, "loss": 0.07467643022537232, "memory(GiB)": 122.96, "step": 38635, "token_acc": 0.9588568612958857, "train_speed(iter/s)": 0.234582 }, { "epoch": 2.9453464440887265, "grad_norm": 1.0474066734313965, "learning_rate": 3.6190895002709956e-05, "loss": 0.08365092277526856, "memory(GiB)": 122.96, "step": 38640, "token_acc": 0.9657686212361332, "train_speed(iter/s)": 0.234591 }, { "epoch": 2.945727570698986, "grad_norm": 0.04862583801150322, "learning_rate": 3.6179387656438004e-05, "loss": 0.06738827228546143, "memory(GiB)": 122.96, "step": 38645, "token_acc": 0.9675973802137194, "train_speed(iter/s)": 0.2346 }, { "epoch": 2.946108697309246, "grad_norm": 0.9068000912666321, "learning_rate": 3.6167881102714025e-05, "loss": 0.07072598934173584, "memory(GiB)": 122.96, "step": 38650, "token_acc": 0.9730597431602457, "train_speed(iter/s)": 0.234605 }, { "epoch": 2.946489823919506, "grad_norm": 0.9595414400100708, "learning_rate": 3.6156375342197865e-05, "loss": 0.09277918338775634, "memory(GiB)": 122.96, "step": 38655, "token_acc": 0.9611125418490858, "train_speed(iter/s)": 0.234612 }, { "epoch": 2.946870950529766, "grad_norm": 0.7440850734710693, "learning_rate": 3.6144870375549355e-05, "loss": 0.06875663995742798, "memory(GiB)": 122.96, "step": 38660, "token_acc": 0.9728470111448835, "train_speed(iter/s)": 0.234618 }, { "epoch": 2.9472520771400257, "grad_norm": 0.9217122793197632, "learning_rate": 3.6133366203428226e-05, "loss": 0.07529481053352356, "memory(GiB)": 122.96, "step": 38665, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.234624 }, { "epoch": 2.9476332037502857, "grad_norm": 0.6697252988815308, "learning_rate": 3.612186282649419e-05, "loss": 0.0612426221370697, "memory(GiB)": 122.96, "step": 38670, "token_acc": 0.9694519317160827, "train_speed(iter/s)": 0.234632 }, { "epoch": 2.9480143303605457, "grad_norm": 0.7084183692932129, "learning_rate": 3.611036024540693e-05, "loss": 0.08065415024757386, "memory(GiB)": 122.96, "step": 38675, "token_acc": 0.9647006912781292, "train_speed(iter/s)": 0.234636 }, { "epoch": 2.9483954569708057, "grad_norm": 1.5974667072296143, "learning_rate": 3.6098858460826025e-05, "loss": 0.10098352432250976, "memory(GiB)": 122.96, "step": 38680, "token_acc": 0.9640241961158866, "train_speed(iter/s)": 0.234641 }, { "epoch": 2.9487765835810658, "grad_norm": 1.3520749807357788, "learning_rate": 3.608735747341111e-05, "loss": 0.10012969970703126, "memory(GiB)": 122.96, "step": 38685, "token_acc": 0.9670473694064782, "train_speed(iter/s)": 0.234648 }, { "epoch": 2.9491577101913258, "grad_norm": 0.03705350309610367, "learning_rate": 3.607585728382167e-05, "loss": 0.08925758004188537, "memory(GiB)": 122.96, "step": 38690, "token_acc": 0.9591280653950953, "train_speed(iter/s)": 0.234658 }, { "epoch": 2.9495388368015854, "grad_norm": 0.741936445236206, "learning_rate": 3.60643578927172e-05, "loss": 0.09105641841888427, "memory(GiB)": 122.96, "step": 38695, "token_acc": 0.9669636737491433, "train_speed(iter/s)": 0.23466 }, { "epoch": 2.9499199634118454, "grad_norm": 0.5934513807296753, "learning_rate": 3.605285930075714e-05, "loss": 0.0751001477241516, "memory(GiB)": 122.96, "step": 38700, "token_acc": 0.9744670600987595, "train_speed(iter/s)": 0.234661 }, { "epoch": 2.9503010900221054, "grad_norm": 0.8418030738830566, "learning_rate": 3.604136150860088e-05, "loss": 0.05762844681739807, "memory(GiB)": 122.96, "step": 38705, "token_acc": 0.980089485458613, "train_speed(iter/s)": 0.234668 }, { "epoch": 2.9506822166323654, "grad_norm": 0.8436858654022217, "learning_rate": 3.602986451690774e-05, "loss": 0.08942219614982605, "memory(GiB)": 122.96, "step": 38710, "token_acc": 0.9645300684191573, "train_speed(iter/s)": 0.234671 }, { "epoch": 2.951063343242625, "grad_norm": 1.2350512742996216, "learning_rate": 3.601836832633706e-05, "loss": 0.09446380138397217, "memory(GiB)": 122.96, "step": 38715, "token_acc": 0.9627543604651163, "train_speed(iter/s)": 0.234677 }, { "epoch": 2.951444469852885, "grad_norm": 1.440083384513855, "learning_rate": 3.600687293754807e-05, "loss": 0.09501760601997375, "memory(GiB)": 122.96, "step": 38720, "token_acc": 0.9616455304670585, "train_speed(iter/s)": 0.234687 }, { "epoch": 2.951825596463145, "grad_norm": 2.0481960773468018, "learning_rate": 3.599537835119998e-05, "loss": 0.07629832029342651, "memory(GiB)": 122.96, "step": 38725, "token_acc": 0.9656978266561927, "train_speed(iter/s)": 0.234697 }, { "epoch": 2.952206723073405, "grad_norm": 1.0453717708587646, "learning_rate": 3.598388456795195e-05, "loss": 0.08238788843154907, "memory(GiB)": 122.96, "step": 38730, "token_acc": 0.964804896710023, "train_speed(iter/s)": 0.234704 }, { "epoch": 2.952587849683665, "grad_norm": 2.314896583557129, "learning_rate": 3.5972391588463106e-05, "loss": 0.12514473199844361, "memory(GiB)": 122.96, "step": 38735, "token_acc": 0.9527698079894064, "train_speed(iter/s)": 0.234712 }, { "epoch": 2.952968976293925, "grad_norm": 1.4636012315750122, "learning_rate": 3.5960899413392506e-05, "loss": 0.06875689029693603, "memory(GiB)": 122.96, "step": 38740, "token_acc": 0.9711103376261747, "train_speed(iter/s)": 0.234721 }, { "epoch": 2.9533501029041846, "grad_norm": 0.9940330386161804, "learning_rate": 3.594940804339917e-05, "loss": 0.04716563820838928, "memory(GiB)": 122.96, "step": 38745, "token_acc": 0.9827771797631862, "train_speed(iter/s)": 0.234725 }, { "epoch": 2.9537312295144447, "grad_norm": 1.0505119562149048, "learning_rate": 3.593791747914208e-05, "loss": 0.0678349792957306, "memory(GiB)": 122.96, "step": 38750, "token_acc": 0.9659483467675605, "train_speed(iter/s)": 0.234731 }, { "epoch": 2.9541123561247047, "grad_norm": 1.5109341144561768, "learning_rate": 3.5926427721280175e-05, "loss": 0.11112428903579712, "memory(GiB)": 122.96, "step": 38755, "token_acc": 0.9585043319653442, "train_speed(iter/s)": 0.234741 }, { "epoch": 2.9544934827349647, "grad_norm": 1.2430680990219116, "learning_rate": 3.591493877047232e-05, "loss": 0.08223534226417542, "memory(GiB)": 122.96, "step": 38760, "token_acc": 0.9670622426737709, "train_speed(iter/s)": 0.234748 }, { "epoch": 2.9548746093452243, "grad_norm": 0.8281294703483582, "learning_rate": 3.5903450627377364e-05, "loss": 0.05034450888633728, "memory(GiB)": 122.96, "step": 38765, "token_acc": 0.9804169298799748, "train_speed(iter/s)": 0.234758 }, { "epoch": 2.9552557359554843, "grad_norm": 1.1660783290863037, "learning_rate": 3.5891963292654105e-05, "loss": 0.06878971457481384, "memory(GiB)": 122.96, "step": 38770, "token_acc": 0.970767004341534, "train_speed(iter/s)": 0.234761 }, { "epoch": 2.9556368625657443, "grad_norm": 0.7993292808532715, "learning_rate": 3.5880476766961274e-05, "loss": 0.07143346667289734, "memory(GiB)": 122.96, "step": 38775, "token_acc": 0.9773267797685112, "train_speed(iter/s)": 0.234765 }, { "epoch": 2.9560179891760043, "grad_norm": 1.074449896812439, "learning_rate": 3.586899105095759e-05, "loss": 0.09173145890235901, "memory(GiB)": 122.96, "step": 38780, "token_acc": 0.962091230993543, "train_speed(iter/s)": 0.234771 }, { "epoch": 2.9563991157862644, "grad_norm": 0.8507541418075562, "learning_rate": 3.585750614530169e-05, "loss": 0.07905872464179993, "memory(GiB)": 122.96, "step": 38785, "token_acc": 0.9693670418896475, "train_speed(iter/s)": 0.234774 }, { "epoch": 2.9567802423965244, "grad_norm": 0.7372616529464722, "learning_rate": 3.584602205065217e-05, "loss": 0.0719455599784851, "memory(GiB)": 122.96, "step": 38790, "token_acc": 0.9683986630203586, "train_speed(iter/s)": 0.234784 }, { "epoch": 2.957161369006784, "grad_norm": 0.7369358539581299, "learning_rate": 3.5834538767667615e-05, "loss": 0.087141352891922, "memory(GiB)": 122.96, "step": 38795, "token_acc": 0.9696888412017167, "train_speed(iter/s)": 0.234791 }, { "epoch": 2.957542495617044, "grad_norm": 0.711685061454773, "learning_rate": 3.582305629700653e-05, "loss": 0.05474860668182373, "memory(GiB)": 122.96, "step": 38800, "token_acc": 0.9761499148211243, "train_speed(iter/s)": 0.234797 }, { "epoch": 2.957542495617044, "eval_loss": 0.07327984273433685, "eval_runtime": 221.0396, "eval_samples_per_second": 2.398, "eval_steps_per_second": 2.398, "eval_token_acc": 0.9689551834226854, "step": 38800 }, { "epoch": 2.957923622227304, "grad_norm": 1.4097583293914795, "learning_rate": 3.5811574639327373e-05, "loss": 0.04817003905773163, "memory(GiB)": 122.96, "step": 38805, "token_acc": 0.9695198121221206, "train_speed(iter/s)": 0.234486 }, { "epoch": 2.958304748837564, "grad_norm": 0.7329922318458557, "learning_rate": 3.580009379528855e-05, "loss": 0.0671981155872345, "memory(GiB)": 122.96, "step": 38810, "token_acc": 0.9743421052631579, "train_speed(iter/s)": 0.234493 }, { "epoch": 2.9586858754478236, "grad_norm": 0.6524449586868286, "learning_rate": 3.578861376554846e-05, "loss": 0.10910841226577758, "memory(GiB)": 122.96, "step": 38815, "token_acc": 0.9643954918032787, "train_speed(iter/s)": 0.234501 }, { "epoch": 2.9590670020580836, "grad_norm": 1.0033230781555176, "learning_rate": 3.5777134550765433e-05, "loss": 0.08297204375267028, "memory(GiB)": 122.96, "step": 38820, "token_acc": 0.9671444677081655, "train_speed(iter/s)": 0.234507 }, { "epoch": 2.9594481286683436, "grad_norm": 0.85258948802948, "learning_rate": 3.576565615159774e-05, "loss": 0.10434581041336059, "memory(GiB)": 122.96, "step": 38825, "token_acc": 0.9579470198675497, "train_speed(iter/s)": 0.23451 }, { "epoch": 2.9598292552786036, "grad_norm": 0.5201262831687927, "learning_rate": 3.575417856870359e-05, "loss": 0.046606266498565675, "memory(GiB)": 122.96, "step": 38830, "token_acc": 0.980043044414009, "train_speed(iter/s)": 0.234516 }, { "epoch": 2.9602103818888637, "grad_norm": 0.597226083278656, "learning_rate": 3.574270180274121e-05, "loss": 0.0701273500919342, "memory(GiB)": 122.96, "step": 38835, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.234525 }, { "epoch": 2.9605915084991237, "grad_norm": 0.7599177360534668, "learning_rate": 3.573122585436872e-05, "loss": 0.0834250271320343, "memory(GiB)": 122.96, "step": 38840, "token_acc": 0.9686804451510334, "train_speed(iter/s)": 0.234528 }, { "epoch": 2.9609726351093832, "grad_norm": 0.4462575316429138, "learning_rate": 3.571975072424421e-05, "loss": 0.08164891004562377, "memory(GiB)": 122.96, "step": 38845, "token_acc": 0.967345669413777, "train_speed(iter/s)": 0.234534 }, { "epoch": 2.9613537617196433, "grad_norm": 0.9207409620285034, "learning_rate": 3.570827641302572e-05, "loss": 0.07861298322677612, "memory(GiB)": 122.96, "step": 38850, "token_acc": 0.9653276955602537, "train_speed(iter/s)": 0.234541 }, { "epoch": 2.9617348883299033, "grad_norm": 0.5712072849273682, "learning_rate": 3.569680292137126e-05, "loss": 0.04996066689491272, "memory(GiB)": 122.96, "step": 38855, "token_acc": 0.9789692627686619, "train_speed(iter/s)": 0.234549 }, { "epoch": 2.962116014940163, "grad_norm": 1.096729040145874, "learning_rate": 3.5685330249938786e-05, "loss": 0.07683677673339843, "memory(GiB)": 122.96, "step": 38860, "token_acc": 0.973630831643002, "train_speed(iter/s)": 0.234555 }, { "epoch": 2.962497141550423, "grad_norm": 1.3038557767868042, "learning_rate": 3.567385839938618e-05, "loss": 0.08227510452270508, "memory(GiB)": 122.96, "step": 38865, "token_acc": 0.9726130653266332, "train_speed(iter/s)": 0.234562 }, { "epoch": 2.962878268160683, "grad_norm": 1.8853020668029785, "learning_rate": 3.566238737037132e-05, "loss": 0.12806181907653807, "memory(GiB)": 122.96, "step": 38870, "token_acc": 0.9570082449941107, "train_speed(iter/s)": 0.23457 }, { "epoch": 2.963259394770943, "grad_norm": 1.2064908742904663, "learning_rate": 3.5650917163552014e-05, "loss": 0.12048227787017822, "memory(GiB)": 122.96, "step": 38875, "token_acc": 0.9596619285439877, "train_speed(iter/s)": 0.234579 }, { "epoch": 2.963640521381203, "grad_norm": 0.6385561227798462, "learning_rate": 3.563944777958601e-05, "loss": 0.07434581518173218, "memory(GiB)": 122.96, "step": 38880, "token_acc": 0.9744516751024344, "train_speed(iter/s)": 0.234585 }, { "epoch": 2.964021647991463, "grad_norm": 1.2863200902938843, "learning_rate": 3.562797921913105e-05, "loss": 0.07883182764053345, "memory(GiB)": 122.96, "step": 38885, "token_acc": 0.9683831881862931, "train_speed(iter/s)": 0.234591 }, { "epoch": 2.964402774601723, "grad_norm": 1.0779120922088623, "learning_rate": 3.561651148284478e-05, "loss": 0.06824736595153809, "memory(GiB)": 122.96, "step": 38890, "token_acc": 0.9684579439252337, "train_speed(iter/s)": 0.234601 }, { "epoch": 2.9647839012119825, "grad_norm": 0.9973769187927246, "learning_rate": 3.5605044571384813e-05, "loss": 0.06643468141555786, "memory(GiB)": 122.96, "step": 38895, "token_acc": 0.9749496112870717, "train_speed(iter/s)": 0.234605 }, { "epoch": 2.9651650278222426, "grad_norm": 0.7081709504127502, "learning_rate": 3.559357848540876e-05, "loss": 0.09061163067817687, "memory(GiB)": 122.96, "step": 38900, "token_acc": 0.965295208992688, "train_speed(iter/s)": 0.234605 }, { "epoch": 2.9655461544325026, "grad_norm": 0.8721177577972412, "learning_rate": 3.5582113225574125e-05, "loss": 0.0790637731552124, "memory(GiB)": 122.96, "step": 38905, "token_acc": 0.9694481573419896, "train_speed(iter/s)": 0.234609 }, { "epoch": 2.965927281042762, "grad_norm": 0.988643229007721, "learning_rate": 3.557064879253839e-05, "loss": 0.06460950970649719, "memory(GiB)": 122.96, "step": 38910, "token_acc": 0.9669551534225019, "train_speed(iter/s)": 0.234616 }, { "epoch": 2.966308407653022, "grad_norm": 0.6945236921310425, "learning_rate": 3.555918518695898e-05, "loss": 0.08273127675056458, "memory(GiB)": 122.96, "step": 38915, "token_acc": 0.9742160278745644, "train_speed(iter/s)": 0.234621 }, { "epoch": 2.966689534263282, "grad_norm": 0.8074384331703186, "learning_rate": 3.5547722409493286e-05, "loss": 0.08158943057060242, "memory(GiB)": 122.96, "step": 38920, "token_acc": 0.9754915930464519, "train_speed(iter/s)": 0.234625 }, { "epoch": 2.967070660873542, "grad_norm": 0.8986004590988159, "learning_rate": 3.553626046079865e-05, "loss": 0.08944888114929199, "memory(GiB)": 122.96, "step": 38925, "token_acc": 0.970175920835624, "train_speed(iter/s)": 0.234627 }, { "epoch": 2.9674517874838022, "grad_norm": 0.6507866978645325, "learning_rate": 3.5524799341532344e-05, "loss": 0.06634051203727723, "memory(GiB)": 122.96, "step": 38930, "token_acc": 0.9756327621443169, "train_speed(iter/s)": 0.234632 }, { "epoch": 2.9678329140940622, "grad_norm": 0.7415527701377869, "learning_rate": 3.5513339052351635e-05, "loss": 0.03285872340202332, "memory(GiB)": 122.96, "step": 38935, "token_acc": 0.9839857651245552, "train_speed(iter/s)": 0.234641 }, { "epoch": 2.968214040704322, "grad_norm": 0.9399220943450928, "learning_rate": 3.550187959391371e-05, "loss": 0.09051662683486938, "memory(GiB)": 122.96, "step": 38940, "token_acc": 0.9735621521335807, "train_speed(iter/s)": 0.234647 }, { "epoch": 2.968595167314582, "grad_norm": 2.7175989151000977, "learning_rate": 3.5490420966875685e-05, "loss": 0.056359076499938966, "memory(GiB)": 122.96, "step": 38945, "token_acc": 0.983362143474503, "train_speed(iter/s)": 0.234652 }, { "epoch": 2.968976293924842, "grad_norm": 1.0740936994552612, "learning_rate": 3.547896317189469e-05, "loss": 0.10753805637359619, "memory(GiB)": 122.96, "step": 38950, "token_acc": 0.9630479687858619, "train_speed(iter/s)": 0.23466 }, { "epoch": 2.969357420535102, "grad_norm": 1.4621716737747192, "learning_rate": 3.546750620962779e-05, "loss": 0.06662009358406067, "memory(GiB)": 122.96, "step": 38955, "token_acc": 0.9734897172236504, "train_speed(iter/s)": 0.234661 }, { "epoch": 2.9697385471453615, "grad_norm": 1.0432528257369995, "learning_rate": 3.545605008073196e-05, "loss": 0.0946004033088684, "memory(GiB)": 122.96, "step": 38960, "token_acc": 0.9650285010013865, "train_speed(iter/s)": 0.234666 }, { "epoch": 2.9701196737556215, "grad_norm": 1.3881069421768188, "learning_rate": 3.544459478586418e-05, "loss": 0.07019053101539612, "memory(GiB)": 122.96, "step": 38965, "token_acc": 0.9731757943477567, "train_speed(iter/s)": 0.23467 }, { "epoch": 2.9705008003658815, "grad_norm": 0.5398285388946533, "learning_rate": 3.543314032568132e-05, "loss": 0.05511276125907898, "memory(GiB)": 122.96, "step": 38970, "token_acc": 0.9749857873791927, "train_speed(iter/s)": 0.234677 }, { "epoch": 2.9708819269761415, "grad_norm": 1.0859532356262207, "learning_rate": 3.542168670084029e-05, "loss": 0.08305079936981201, "memory(GiB)": 122.96, "step": 38975, "token_acc": 0.9641754169240272, "train_speed(iter/s)": 0.234682 }, { "epoch": 2.9712630535864015, "grad_norm": 1.6940295696258545, "learning_rate": 3.541023391199786e-05, "loss": 0.04208917617797851, "memory(GiB)": 122.96, "step": 38980, "token_acc": 0.9803262955854126, "train_speed(iter/s)": 0.234688 }, { "epoch": 2.9716441801966615, "grad_norm": 0.4786969721317291, "learning_rate": 3.53987819598108e-05, "loss": 0.07224465608596801, "memory(GiB)": 122.96, "step": 38985, "token_acc": 0.9735202492211839, "train_speed(iter/s)": 0.234687 }, { "epoch": 2.972025306806921, "grad_norm": 0.6289643049240112, "learning_rate": 3.538733084493586e-05, "loss": 0.04666823148727417, "memory(GiB)": 122.96, "step": 38990, "token_acc": 0.9830172657797905, "train_speed(iter/s)": 0.234695 }, { "epoch": 2.972406433417181, "grad_norm": 0.9665145874023438, "learning_rate": 3.537588056802967e-05, "loss": 0.0947305679321289, "memory(GiB)": 122.96, "step": 38995, "token_acc": 0.9647089129314786, "train_speed(iter/s)": 0.234699 }, { "epoch": 2.972787560027441, "grad_norm": 0.6485277414321899, "learning_rate": 3.536443112974887e-05, "loss": 0.07951099276542664, "memory(GiB)": 122.96, "step": 39000, "token_acc": 0.970199043700656, "train_speed(iter/s)": 0.234699 }, { "epoch": 2.972787560027441, "eval_loss": 0.07230938971042633, "eval_runtime": 219.4103, "eval_samples_per_second": 2.416, "eval_steps_per_second": 2.416, "eval_token_acc": 0.9695801457743509, "step": 39000 }, { "epoch": 2.973168686637701, "grad_norm": 1.0107721090316772, "learning_rate": 3.535298253075003e-05, "loss": 0.07749907374382019, "memory(GiB)": 122.96, "step": 39005, "token_acc": 0.9697628515032755, "train_speed(iter/s)": 0.234392 }, { "epoch": 2.9735498132479607, "grad_norm": 0.8344621658325195, "learning_rate": 3.5341534771689665e-05, "loss": 0.09255664348602295, "memory(GiB)": 122.96, "step": 39010, "token_acc": 0.9687725198174394, "train_speed(iter/s)": 0.234398 }, { "epoch": 2.9739309398582208, "grad_norm": 1.3069581985473633, "learning_rate": 3.533008785322426e-05, "loss": 0.0548406183719635, "memory(GiB)": 122.96, "step": 39015, "token_acc": 0.9786253143336128, "train_speed(iter/s)": 0.234407 }, { "epoch": 2.974312066468481, "grad_norm": 1.726629376411438, "learning_rate": 3.531864177601024e-05, "loss": 0.12056159973144531, "memory(GiB)": 122.96, "step": 39020, "token_acc": 0.9548164417947913, "train_speed(iter/s)": 0.234415 }, { "epoch": 2.974693193078741, "grad_norm": 1.141485571861267, "learning_rate": 3.530719654070399e-05, "loss": 0.08566503524780274, "memory(GiB)": 122.96, "step": 39025, "token_acc": 0.9618699399320971, "train_speed(iter/s)": 0.234422 }, { "epoch": 2.975074319689001, "grad_norm": 0.9991254210472107, "learning_rate": 3.529575214796183e-05, "loss": 0.09134765863418579, "memory(GiB)": 122.96, "step": 39030, "token_acc": 0.9635815356065878, "train_speed(iter/s)": 0.234431 }, { "epoch": 2.975455446299261, "grad_norm": 1.125738263130188, "learning_rate": 3.5284308598440045e-05, "loss": 0.061217236518859866, "memory(GiB)": 122.96, "step": 39035, "token_acc": 0.9719756309834638, "train_speed(iter/s)": 0.234437 }, { "epoch": 2.9758365729095204, "grad_norm": 1.1593971252441406, "learning_rate": 3.527286589279488e-05, "loss": 0.0695447325706482, "memory(GiB)": 122.96, "step": 39040, "token_acc": 0.9760574620909817, "train_speed(iter/s)": 0.234443 }, { "epoch": 2.9762176995197804, "grad_norm": 1.6628764867782593, "learning_rate": 3.5261424031682515e-05, "loss": 0.06359245777130126, "memory(GiB)": 122.96, "step": 39045, "token_acc": 0.9759274992919853, "train_speed(iter/s)": 0.234451 }, { "epoch": 2.9765988261300405, "grad_norm": 1.304595947265625, "learning_rate": 3.524998301575908e-05, "loss": 0.10594902038574219, "memory(GiB)": 122.96, "step": 39050, "token_acc": 0.9626704953338119, "train_speed(iter/s)": 0.234458 }, { "epoch": 2.9769799527403005, "grad_norm": 1.1797105073928833, "learning_rate": 3.523854284568067e-05, "loss": 0.06611074805259705, "memory(GiB)": 122.96, "step": 39055, "token_acc": 0.9717643880032424, "train_speed(iter/s)": 0.234462 }, { "epoch": 2.97736107935056, "grad_norm": 0.8876939415931702, "learning_rate": 3.522710352210333e-05, "loss": 0.08016995191574097, "memory(GiB)": 122.96, "step": 39060, "token_acc": 0.9657501205981669, "train_speed(iter/s)": 0.234471 }, { "epoch": 2.97774220596082, "grad_norm": 1.6086838245391846, "learning_rate": 3.521566504568304e-05, "loss": 0.0716000735759735, "memory(GiB)": 122.96, "step": 39065, "token_acc": 0.9764219234746639, "train_speed(iter/s)": 0.234476 }, { "epoch": 2.97812333257108, "grad_norm": 1.3499586582183838, "learning_rate": 3.520422741707576e-05, "loss": 0.09208372831344605, "memory(GiB)": 122.96, "step": 39070, "token_acc": 0.9618613576397218, "train_speed(iter/s)": 0.234483 }, { "epoch": 2.97850445918134, "grad_norm": 1.6736934185028076, "learning_rate": 3.519279063693738e-05, "loss": 0.07049931287765503, "memory(GiB)": 122.96, "step": 39075, "token_acc": 0.9705389221556886, "train_speed(iter/s)": 0.234491 }, { "epoch": 2.9788855857916, "grad_norm": 0.6909894347190857, "learning_rate": 3.5181354705923725e-05, "loss": 0.07858411073684693, "memory(GiB)": 122.96, "step": 39080, "token_acc": 0.9705352758225568, "train_speed(iter/s)": 0.234496 }, { "epoch": 2.97926671240186, "grad_norm": 1.3095768690109253, "learning_rate": 3.516991962469063e-05, "loss": 0.07957220673561097, "memory(GiB)": 122.96, "step": 39085, "token_acc": 0.9701573521432447, "train_speed(iter/s)": 0.2345 }, { "epoch": 2.9796478390121197, "grad_norm": 0.6547259092330933, "learning_rate": 3.515848539389381e-05, "loss": 0.08474056720733643, "memory(GiB)": 122.96, "step": 39090, "token_acc": 0.9676173869722257, "train_speed(iter/s)": 0.2345 }, { "epoch": 2.9800289656223797, "grad_norm": 0.7898026704788208, "learning_rate": 3.5147052014189e-05, "loss": 0.07835097312927246, "memory(GiB)": 122.96, "step": 39095, "token_acc": 0.9681749296384499, "train_speed(iter/s)": 0.234507 }, { "epoch": 2.9804100922326398, "grad_norm": 0.6175777912139893, "learning_rate": 3.513561948623182e-05, "loss": 0.06380345821380615, "memory(GiB)": 122.96, "step": 39100, "token_acc": 0.9775366457262517, "train_speed(iter/s)": 0.234513 }, { "epoch": 2.9807912188428998, "grad_norm": 0.6688360571861267, "learning_rate": 3.512418781067787e-05, "loss": 0.07493141889572144, "memory(GiB)": 122.96, "step": 39105, "token_acc": 0.9667104164180885, "train_speed(iter/s)": 0.234515 }, { "epoch": 2.9811723454531593, "grad_norm": 1.4546802043914795, "learning_rate": 3.511275698818274e-05, "loss": 0.07798659205436706, "memory(GiB)": 122.96, "step": 39110, "token_acc": 0.9759970457902511, "train_speed(iter/s)": 0.234522 }, { "epoch": 2.9815534720634194, "grad_norm": 0.9583495855331421, "learning_rate": 3.5101327019401895e-05, "loss": 0.0606040358543396, "memory(GiB)": 122.96, "step": 39115, "token_acc": 0.9743554006968641, "train_speed(iter/s)": 0.234525 }, { "epoch": 2.9819345986736794, "grad_norm": 1.1618422269821167, "learning_rate": 3.508989790499081e-05, "loss": 0.08515780568122863, "memory(GiB)": 122.96, "step": 39120, "token_acc": 0.9697956766360675, "train_speed(iter/s)": 0.234529 }, { "epoch": 2.9823157252839394, "grad_norm": 1.3012700080871582, "learning_rate": 3.5078469645604895e-05, "loss": 0.07220359444618225, "memory(GiB)": 122.96, "step": 39125, "token_acc": 0.9740634005763689, "train_speed(iter/s)": 0.234538 }, { "epoch": 2.9826968518941994, "grad_norm": 1.383684754371643, "learning_rate": 3.506704224189951e-05, "loss": 0.08890300393104553, "memory(GiB)": 122.96, "step": 39130, "token_acc": 0.9625262477790341, "train_speed(iter/s)": 0.234541 }, { "epoch": 2.9830779785044594, "grad_norm": 0.5972448587417603, "learning_rate": 3.505561569452994e-05, "loss": 0.0651609718799591, "memory(GiB)": 122.96, "step": 39135, "token_acc": 0.9680672268907563, "train_speed(iter/s)": 0.234551 }, { "epoch": 2.983459105114719, "grad_norm": 0.8712660670280457, "learning_rate": 3.5044190004151456e-05, "loss": 0.06103957891464233, "memory(GiB)": 122.96, "step": 39140, "token_acc": 0.9747768723321691, "train_speed(iter/s)": 0.23456 }, { "epoch": 2.983840231724979, "grad_norm": 0.9281178116798401, "learning_rate": 3.503276517141929e-05, "loss": 0.05538129210472107, "memory(GiB)": 122.96, "step": 39145, "token_acc": 0.9797435897435898, "train_speed(iter/s)": 0.234567 }, { "epoch": 2.984221358335239, "grad_norm": 1.9713696241378784, "learning_rate": 3.502134119698857e-05, "loss": 0.09805760383605958, "memory(GiB)": 122.96, "step": 39150, "token_acc": 0.9677329881656804, "train_speed(iter/s)": 0.234567 }, { "epoch": 2.9846024849454986, "grad_norm": 0.7472063899040222, "learning_rate": 3.500991808151443e-05, "loss": 0.07481737732887268, "memory(GiB)": 122.96, "step": 39155, "token_acc": 0.9755026186855887, "train_speed(iter/s)": 0.234572 }, { "epoch": 2.9849836115557586, "grad_norm": 0.7177664637565613, "learning_rate": 3.4998495825651925e-05, "loss": 0.06893886923789978, "memory(GiB)": 122.96, "step": 39160, "token_acc": 0.9766891276634493, "train_speed(iter/s)": 0.234578 }, { "epoch": 2.9853647381660187, "grad_norm": 0.6451066732406616, "learning_rate": 3.498707443005607e-05, "loss": 0.10198855400085449, "memory(GiB)": 122.96, "step": 39165, "token_acc": 0.961433868974042, "train_speed(iter/s)": 0.234586 }, { "epoch": 2.9857458647762787, "grad_norm": 1.8478902578353882, "learning_rate": 3.497565389538181e-05, "loss": 0.09753260612487794, "memory(GiB)": 122.96, "step": 39170, "token_acc": 0.9615212527964205, "train_speed(iter/s)": 0.234592 }, { "epoch": 2.9861269913865387, "grad_norm": 1.1897917985916138, "learning_rate": 3.49642342222841e-05, "loss": 0.06039793491363525, "memory(GiB)": 122.96, "step": 39175, "token_acc": 0.9799808035102153, "train_speed(iter/s)": 0.234595 }, { "epoch": 2.9865081179967987, "grad_norm": 0.7621636986732483, "learning_rate": 3.495281541141777e-05, "loss": 0.07277787327766419, "memory(GiB)": 122.96, "step": 39180, "token_acc": 0.976927570093458, "train_speed(iter/s)": 0.234602 }, { "epoch": 2.9868892446070587, "grad_norm": 0.9261408448219299, "learning_rate": 3.4941397463437654e-05, "loss": 0.07517488598823548, "memory(GiB)": 122.96, "step": 39185, "token_acc": 0.973026130935656, "train_speed(iter/s)": 0.23461 }, { "epoch": 2.9872703712173183, "grad_norm": 0.6630003452301025, "learning_rate": 3.4929980378998514e-05, "loss": 0.10181840658187866, "memory(GiB)": 122.96, "step": 39190, "token_acc": 0.9600223651104277, "train_speed(iter/s)": 0.234618 }, { "epoch": 2.9876514978275783, "grad_norm": 1.2734942436218262, "learning_rate": 3.4918564158755065e-05, "loss": 0.0656619906425476, "memory(GiB)": 122.96, "step": 39195, "token_acc": 0.9673601147776184, "train_speed(iter/s)": 0.234627 }, { "epoch": 2.9880326244378383, "grad_norm": 1.3208194971084595, "learning_rate": 3.4907148803361976e-05, "loss": 0.10563346147537231, "memory(GiB)": 122.96, "step": 39200, "token_acc": 0.9695896433805569, "train_speed(iter/s)": 0.234628 }, { "epoch": 2.9880326244378383, "eval_loss": 0.07265514135360718, "eval_runtime": 221.3628, "eval_samples_per_second": 2.394, "eval_steps_per_second": 2.394, "eval_token_acc": 0.9695349677730257, "step": 39200 }, { "epoch": 2.988413751048098, "grad_norm": 1.0566213130950928, "learning_rate": 3.489573431347386e-05, "loss": 0.06479206085205078, "memory(GiB)": 122.96, "step": 39205, "token_acc": 0.9695677838263265, "train_speed(iter/s)": 0.234324 }, { "epoch": 2.988794877658358, "grad_norm": 0.761085569858551, "learning_rate": 3.48843206897453e-05, "loss": 0.05413353443145752, "memory(GiB)": 122.96, "step": 39210, "token_acc": 0.976957585644372, "train_speed(iter/s)": 0.23433 }, { "epoch": 2.989176004268618, "grad_norm": 0.9931512475013733, "learning_rate": 3.4872907932830766e-05, "loss": 0.062159568071365356, "memory(GiB)": 122.96, "step": 39215, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.234333 }, { "epoch": 2.989557130878878, "grad_norm": 0.9465264678001404, "learning_rate": 3.48614960433848e-05, "loss": 0.0627434492111206, "memory(GiB)": 122.96, "step": 39220, "token_acc": 0.9781634938409854, "train_speed(iter/s)": 0.234338 }, { "epoch": 2.989938257489138, "grad_norm": 0.714407742023468, "learning_rate": 3.485008502206177e-05, "loss": 0.05273017883300781, "memory(GiB)": 122.96, "step": 39225, "token_acc": 0.9824719680371182, "train_speed(iter/s)": 0.234337 }, { "epoch": 2.990319384099398, "grad_norm": 2.22300386428833, "learning_rate": 3.4838674869516066e-05, "loss": 0.07220150828361512, "memory(GiB)": 122.96, "step": 39230, "token_acc": 0.9760528488852188, "train_speed(iter/s)": 0.234341 }, { "epoch": 2.9907005107096576, "grad_norm": 1.0791500806808472, "learning_rate": 3.482726558640201e-05, "loss": 0.07949203252792358, "memory(GiB)": 122.96, "step": 39235, "token_acc": 0.9729129662522202, "train_speed(iter/s)": 0.234342 }, { "epoch": 2.9910816373199176, "grad_norm": 1.728495478630066, "learning_rate": 3.481585717337386e-05, "loss": 0.07493292689323425, "memory(GiB)": 122.96, "step": 39240, "token_acc": 0.974762726488352, "train_speed(iter/s)": 0.234349 }, { "epoch": 2.9914627639301776, "grad_norm": 0.6211915016174316, "learning_rate": 3.480444963108582e-05, "loss": 0.10468072891235351, "memory(GiB)": 122.96, "step": 39245, "token_acc": 0.9638118214716526, "train_speed(iter/s)": 0.234353 }, { "epoch": 2.9918438905404376, "grad_norm": 0.6416475176811218, "learning_rate": 3.479304296019209e-05, "loss": 0.06595179438591003, "memory(GiB)": 122.96, "step": 39250, "token_acc": 0.9686671487105326, "train_speed(iter/s)": 0.234361 }, { "epoch": 2.992225017150697, "grad_norm": 0.8585364818572998, "learning_rate": 3.478163716134679e-05, "loss": 0.09969189167022705, "memory(GiB)": 122.96, "step": 39255, "token_acc": 0.9653804543815362, "train_speed(iter/s)": 0.234371 }, { "epoch": 2.9926061437609572, "grad_norm": 1.2608102560043335, "learning_rate": 3.4770232235203955e-05, "loss": 0.0885477066040039, "memory(GiB)": 122.96, "step": 39260, "token_acc": 0.9696274059685679, "train_speed(iter/s)": 0.234375 }, { "epoch": 2.9929872703712173, "grad_norm": 1.7529563903808594, "learning_rate": 3.475882818241763e-05, "loss": 0.08256800174713134, "memory(GiB)": 122.96, "step": 39265, "token_acc": 0.9718351615772309, "train_speed(iter/s)": 0.234383 }, { "epoch": 2.9933683969814773, "grad_norm": 1.0772836208343506, "learning_rate": 3.4747425003641795e-05, "loss": 0.0831078052520752, "memory(GiB)": 122.96, "step": 39270, "token_acc": 0.966217870257038, "train_speed(iter/s)": 0.234386 }, { "epoch": 2.9937495235917373, "grad_norm": 1.9200010299682617, "learning_rate": 3.473602269953034e-05, "loss": 0.088853257894516, "memory(GiB)": 122.96, "step": 39275, "token_acc": 0.9597104186339314, "train_speed(iter/s)": 0.234394 }, { "epoch": 2.9941306502019973, "grad_norm": 1.0271961688995361, "learning_rate": 3.472462127073716e-05, "loss": 0.0703099250793457, "memory(GiB)": 122.96, "step": 39280, "token_acc": 0.9734564339296018, "train_speed(iter/s)": 0.234399 }, { "epoch": 2.994511776812257, "grad_norm": 0.8118994832038879, "learning_rate": 3.471322071791606e-05, "loss": 0.0784456729888916, "memory(GiB)": 122.96, "step": 39285, "token_acc": 0.9710327455919395, "train_speed(iter/s)": 0.234406 }, { "epoch": 2.994892903422517, "grad_norm": 1.3292436599731445, "learning_rate": 3.470182104172081e-05, "loss": 0.0917759358882904, "memory(GiB)": 122.96, "step": 39290, "token_acc": 0.963262554769127, "train_speed(iter/s)": 0.234411 }, { "epoch": 2.995274030032777, "grad_norm": 0.6280410885810852, "learning_rate": 3.469042224280514e-05, "loss": 0.09355546236038208, "memory(GiB)": 122.96, "step": 39295, "token_acc": 0.9644194756554307, "train_speed(iter/s)": 0.234419 }, { "epoch": 2.995655156643037, "grad_norm": 1.3419440984725952, "learning_rate": 3.4679024321822704e-05, "loss": 0.12040141820907593, "memory(GiB)": 122.96, "step": 39300, "token_acc": 0.9491954438618695, "train_speed(iter/s)": 0.234425 }, { "epoch": 2.9960362832532965, "grad_norm": 1.0595062971115112, "learning_rate": 3.466762727942713e-05, "loss": 0.12129837274551392, "memory(GiB)": 122.96, "step": 39305, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.234433 }, { "epoch": 2.9964174098635565, "grad_norm": 0.9315109848976135, "learning_rate": 3.465623111627196e-05, "loss": 0.08626197576522827, "memory(GiB)": 122.96, "step": 39310, "token_acc": 0.9662247474747475, "train_speed(iter/s)": 0.234438 }, { "epoch": 2.9967985364738166, "grad_norm": 0.9964389204978943, "learning_rate": 3.464483583301076e-05, "loss": 0.08997769355773926, "memory(GiB)": 122.96, "step": 39315, "token_acc": 0.9750992421508481, "train_speed(iter/s)": 0.234447 }, { "epoch": 2.9971796630840766, "grad_norm": 0.6324106454849243, "learning_rate": 3.463344143029695e-05, "loss": 0.054768145084381104, "memory(GiB)": 122.96, "step": 39320, "token_acc": 0.9806002928257687, "train_speed(iter/s)": 0.234455 }, { "epoch": 2.9975607896943366, "grad_norm": 1.1516257524490356, "learning_rate": 3.462204790878397e-05, "loss": 0.09918960332870483, "memory(GiB)": 122.96, "step": 39325, "token_acc": 0.9645056139080044, "train_speed(iter/s)": 0.23446 }, { "epoch": 2.9979419163045966, "grad_norm": 1.0922255516052246, "learning_rate": 3.4610655269125184e-05, "loss": 0.07015914916992187, "memory(GiB)": 122.96, "step": 39330, "token_acc": 0.9744, "train_speed(iter/s)": 0.234467 }, { "epoch": 2.998323042914856, "grad_norm": 0.7641240358352661, "learning_rate": 3.45992635119739e-05, "loss": 0.06108701229095459, "memory(GiB)": 122.96, "step": 39335, "token_acc": 0.9768477292965272, "train_speed(iter/s)": 0.234472 }, { "epoch": 2.998704169525116, "grad_norm": 1.30362868309021, "learning_rate": 3.458787263798338e-05, "loss": 0.0817350447177887, "memory(GiB)": 122.96, "step": 39340, "token_acc": 0.96390760346487, "train_speed(iter/s)": 0.234479 }, { "epoch": 2.999085296135376, "grad_norm": 2.031428813934326, "learning_rate": 3.457648264780684e-05, "loss": 0.12164015769958496, "memory(GiB)": 122.96, "step": 39345, "token_acc": 0.9583106267029973, "train_speed(iter/s)": 0.234486 }, { "epoch": 2.9994664227456362, "grad_norm": 1.254213809967041, "learning_rate": 3.456509354209744e-05, "loss": 0.07482206225395202, "memory(GiB)": 122.96, "step": 39350, "token_acc": 0.9732609149780655, "train_speed(iter/s)": 0.234493 }, { "epoch": 2.999847549355896, "grad_norm": 1.2808424234390259, "learning_rate": 3.4553705321508324e-05, "loss": 0.10698556900024414, "memory(GiB)": 122.96, "step": 39355, "token_acc": 0.9667471934062811, "train_speed(iter/s)": 0.234499 }, { "epoch": 3.000228675966156, "grad_norm": 0.6597931385040283, "learning_rate": 3.454231798669252e-05, "loss": 0.08201568126678467, "memory(GiB)": 122.96, "step": 39360, "token_acc": 0.9722695207978594, "train_speed(iter/s)": 0.234506 }, { "epoch": 3.000609802576416, "grad_norm": 0.8968120217323303, "learning_rate": 3.4530931538303034e-05, "loss": 0.0778917670249939, "memory(GiB)": 122.96, "step": 39365, "token_acc": 0.973754100921731, "train_speed(iter/s)": 0.234511 }, { "epoch": 3.000990929186676, "grad_norm": 1.5238381624221802, "learning_rate": 3.451954597699285e-05, "loss": 0.10934780836105347, "memory(GiB)": 122.96, "step": 39370, "token_acc": 0.9615581752947207, "train_speed(iter/s)": 0.234518 }, { "epoch": 3.001372055796936, "grad_norm": 0.7919889688491821, "learning_rate": 3.450816130341487e-05, "loss": 0.0701261043548584, "memory(GiB)": 122.96, "step": 39375, "token_acc": 0.9709870703248187, "train_speed(iter/s)": 0.234526 }, { "epoch": 3.0017531824071955, "grad_norm": 0.9384965300559998, "learning_rate": 3.4496777518221935e-05, "loss": 0.06657161116600037, "memory(GiB)": 122.96, "step": 39380, "token_acc": 0.974376731301939, "train_speed(iter/s)": 0.234536 }, { "epoch": 3.0021343090174555, "grad_norm": 0.9657983183860779, "learning_rate": 3.4485394622066875e-05, "loss": 0.05467992424964905, "memory(GiB)": 122.96, "step": 39385, "token_acc": 0.9806991207377225, "train_speed(iter/s)": 0.234543 }, { "epoch": 3.0025154356277155, "grad_norm": 0.9216153025627136, "learning_rate": 3.447401261560243e-05, "loss": 0.08483916521072388, "memory(GiB)": 122.96, "step": 39390, "token_acc": 0.9669893273765202, "train_speed(iter/s)": 0.23455 }, { "epoch": 3.0028965622379755, "grad_norm": 0.9467447400093079, "learning_rate": 3.4462631499481325e-05, "loss": 0.05708732604980469, "memory(GiB)": 122.96, "step": 39395, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.234556 }, { "epoch": 3.0032776888482355, "grad_norm": 1.3860000371932983, "learning_rate": 3.445125127435619e-05, "loss": 0.07783139944076538, "memory(GiB)": 122.96, "step": 39400, "token_acc": 0.9675440838231536, "train_speed(iter/s)": 0.234563 }, { "epoch": 3.0032776888482355, "eval_loss": 0.07260935008525848, "eval_runtime": 221.2813, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.9694446117703753, "step": 39400 }, { "epoch": 3.003658815458495, "grad_norm": 1.5388834476470947, "learning_rate": 3.4439871940879654e-05, "loss": 0.08239893317222595, "memory(GiB)": 122.96, "step": 39405, "token_acc": 0.9694291488489251, "train_speed(iter/s)": 0.234263 }, { "epoch": 3.004039942068755, "grad_norm": 1.2024186849594116, "learning_rate": 3.442849349970424e-05, "loss": 0.06583164930343628, "memory(GiB)": 122.96, "step": 39410, "token_acc": 0.9773193059161087, "train_speed(iter/s)": 0.234263 }, { "epoch": 3.004421068679015, "grad_norm": 0.9287282228469849, "learning_rate": 3.441711595148246e-05, "loss": 0.05454905033111572, "memory(GiB)": 122.96, "step": 39415, "token_acc": 0.9800934741215164, "train_speed(iter/s)": 0.234269 }, { "epoch": 3.004802195289275, "grad_norm": 1.6532049179077148, "learning_rate": 3.440573929686678e-05, "loss": 0.06142799854278565, "memory(GiB)": 122.96, "step": 39420, "token_acc": 0.9798748980146859, "train_speed(iter/s)": 0.234276 }, { "epoch": 3.005183321899535, "grad_norm": 0.5667494535446167, "learning_rate": 3.439436353650958e-05, "loss": 0.07788003087043763, "memory(GiB)": 122.96, "step": 39425, "token_acc": 0.9742558326629123, "train_speed(iter/s)": 0.234281 }, { "epoch": 3.0055644485097948, "grad_norm": 1.404039978981018, "learning_rate": 3.438298867106321e-05, "loss": 0.0836568832397461, "memory(GiB)": 122.96, "step": 39430, "token_acc": 0.9653952025167125, "train_speed(iter/s)": 0.234288 }, { "epoch": 3.005945575120055, "grad_norm": 1.1280783414840698, "learning_rate": 3.4371614701179965e-05, "loss": 0.0565992534160614, "memory(GiB)": 122.96, "step": 39435, "token_acc": 0.973175965665236, "train_speed(iter/s)": 0.234293 }, { "epoch": 3.006326701730315, "grad_norm": 0.5237492322921753, "learning_rate": 3.43602416275121e-05, "loss": 0.08752325177192688, "memory(GiB)": 122.96, "step": 39440, "token_acc": 0.9662790697674418, "train_speed(iter/s)": 0.234299 }, { "epoch": 3.006707828340575, "grad_norm": 0.921880304813385, "learning_rate": 3.434886945071179e-05, "loss": 0.060970187187194824, "memory(GiB)": 122.96, "step": 39445, "token_acc": 0.9769276276868468, "train_speed(iter/s)": 0.234306 }, { "epoch": 3.007088954950835, "grad_norm": 1.2174757719039917, "learning_rate": 3.43374981714312e-05, "loss": 0.05646783709526062, "memory(GiB)": 122.96, "step": 39450, "token_acc": 0.975, "train_speed(iter/s)": 0.234315 }, { "epoch": 3.0074700815610944, "grad_norm": 0.6392403841018677, "learning_rate": 3.43261277903224e-05, "loss": 0.06315624117851257, "memory(GiB)": 122.96, "step": 39455, "token_acc": 0.9746132054913924, "train_speed(iter/s)": 0.234316 }, { "epoch": 3.0078512081713544, "grad_norm": 0.6718295812606812, "learning_rate": 3.4314758308037433e-05, "loss": 0.03826099634170532, "memory(GiB)": 122.96, "step": 39460, "token_acc": 0.9845814977973568, "train_speed(iter/s)": 0.234316 }, { "epoch": 3.0082323347816144, "grad_norm": 0.9157667756080627, "learning_rate": 3.43033897252283e-05, "loss": 0.07007969617843628, "memory(GiB)": 122.96, "step": 39465, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.234321 }, { "epoch": 3.0086134613918745, "grad_norm": 0.9105504155158997, "learning_rate": 3.429202204254691e-05, "loss": 0.07612475156784057, "memory(GiB)": 122.96, "step": 39470, "token_acc": 0.9777103512661921, "train_speed(iter/s)": 0.234322 }, { "epoch": 3.0089945880021345, "grad_norm": 1.0837302207946777, "learning_rate": 3.428065526064518e-05, "loss": 0.05890547037124634, "memory(GiB)": 122.96, "step": 39475, "token_acc": 0.9707034576733095, "train_speed(iter/s)": 0.234329 }, { "epoch": 3.009375714612394, "grad_norm": 0.6902436017990112, "learning_rate": 3.426928938017489e-05, "loss": 0.05693358778953552, "memory(GiB)": 122.96, "step": 39480, "token_acc": 0.9768064228367529, "train_speed(iter/s)": 0.234336 }, { "epoch": 3.009756841222654, "grad_norm": 1.31316339969635, "learning_rate": 3.4257924401787864e-05, "loss": 0.05612483024597168, "memory(GiB)": 122.96, "step": 39485, "token_acc": 0.9750554323725056, "train_speed(iter/s)": 0.234343 }, { "epoch": 3.010137967832914, "grad_norm": 0.7805521488189697, "learning_rate": 3.424656032613585e-05, "loss": 0.037118139863014224, "memory(GiB)": 122.96, "step": 39490, "token_acc": 0.9869626497533475, "train_speed(iter/s)": 0.234352 }, { "epoch": 3.010519094443174, "grad_norm": 1.3148024082183838, "learning_rate": 3.423519715387048e-05, "loss": 0.07974357008934022, "memory(GiB)": 122.96, "step": 39495, "token_acc": 0.977651710503696, "train_speed(iter/s)": 0.234357 }, { "epoch": 3.010900221053434, "grad_norm": 1.2969496250152588, "learning_rate": 3.422383488564339e-05, "loss": 0.07749972939491272, "memory(GiB)": 122.96, "step": 39500, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.234366 }, { "epoch": 3.0112813476636937, "grad_norm": 1.3388887643814087, "learning_rate": 3.421247352210617e-05, "loss": 0.06747925281524658, "memory(GiB)": 122.96, "step": 39505, "token_acc": 0.9760919540229885, "train_speed(iter/s)": 0.234369 }, { "epoch": 3.0116624742739537, "grad_norm": 1.1166969537734985, "learning_rate": 3.4201113063910326e-05, "loss": 0.06820608973503113, "memory(GiB)": 122.96, "step": 39510, "token_acc": 0.9800099950024987, "train_speed(iter/s)": 0.234379 }, { "epoch": 3.0120436008842137, "grad_norm": 1.6922825574874878, "learning_rate": 3.418975351170732e-05, "loss": 0.0918799340724945, "memory(GiB)": 122.96, "step": 39515, "token_acc": 0.9587393738337134, "train_speed(iter/s)": 0.234385 }, { "epoch": 3.0124247274944738, "grad_norm": 0.9951008558273315, "learning_rate": 3.41783948661486e-05, "loss": 0.04682411551475525, "memory(GiB)": 122.96, "step": 39520, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.234391 }, { "epoch": 3.012805854104734, "grad_norm": 1.0013409852981567, "learning_rate": 3.416703712788552e-05, "loss": 0.07305909991264344, "memory(GiB)": 122.96, "step": 39525, "token_acc": 0.9755525787006029, "train_speed(iter/s)": 0.2344 }, { "epoch": 3.0131869807149934, "grad_norm": 1.9884096384048462, "learning_rate": 3.415568029756937e-05, "loss": 0.07594524025917053, "memory(GiB)": 122.96, "step": 39530, "token_acc": 0.9721879709549254, "train_speed(iter/s)": 0.234405 }, { "epoch": 3.0135681073252534, "grad_norm": 1.562718391418457, "learning_rate": 3.414432437585142e-05, "loss": 0.07913765907287598, "memory(GiB)": 122.96, "step": 39535, "token_acc": 0.9642582036666153, "train_speed(iter/s)": 0.23441 }, { "epoch": 3.0139492339355134, "grad_norm": 1.7101428508758545, "learning_rate": 3.41329693633829e-05, "loss": 0.06119677424430847, "memory(GiB)": 122.96, "step": 39540, "token_acc": 0.9757975797579758, "train_speed(iter/s)": 0.234417 }, { "epoch": 3.0143303605457734, "grad_norm": 1.1923686265945435, "learning_rate": 3.4121615260814966e-05, "loss": 0.08163259029388428, "memory(GiB)": 122.96, "step": 39545, "token_acc": 0.9683116883116883, "train_speed(iter/s)": 0.234425 }, { "epoch": 3.0147114871560334, "grad_norm": 1.3107987642288208, "learning_rate": 3.411026206879869e-05, "loss": 0.05776560306549072, "memory(GiB)": 122.96, "step": 39550, "token_acc": 0.9667070217917676, "train_speed(iter/s)": 0.234432 }, { "epoch": 3.015092613766293, "grad_norm": 0.5181317925453186, "learning_rate": 3.409890978798517e-05, "loss": 0.06792814135551453, "memory(GiB)": 122.96, "step": 39555, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.234441 }, { "epoch": 3.015473740376553, "grad_norm": 0.5551623702049255, "learning_rate": 3.408755841902537e-05, "loss": 0.044603532552719115, "memory(GiB)": 122.96, "step": 39560, "token_acc": 0.9845708775313404, "train_speed(iter/s)": 0.234447 }, { "epoch": 3.015854866986813, "grad_norm": 0.6403271555900574, "learning_rate": 3.407620796257025e-05, "loss": 0.09473435878753662, "memory(GiB)": 122.96, "step": 39565, "token_acc": 0.9591103122893732, "train_speed(iter/s)": 0.234454 }, { "epoch": 3.016235993597073, "grad_norm": 1.9262216091156006, "learning_rate": 3.406485841927071e-05, "loss": 0.0753549337387085, "memory(GiB)": 122.96, "step": 39570, "token_acc": 0.9758210499094906, "train_speed(iter/s)": 0.234453 }, { "epoch": 3.016617120207333, "grad_norm": 1.3632495403289795, "learning_rate": 3.40535097897776e-05, "loss": 0.0568355917930603, "memory(GiB)": 122.96, "step": 39575, "token_acc": 0.9780536466415429, "train_speed(iter/s)": 0.23446 }, { "epoch": 3.0169982468175927, "grad_norm": 1.514310598373413, "learning_rate": 3.404216207474169e-05, "loss": 0.07645566463470459, "memory(GiB)": 122.96, "step": 39580, "token_acc": 0.9647302904564315, "train_speed(iter/s)": 0.234468 }, { "epoch": 3.0173793734278527, "grad_norm": 1.2851296663284302, "learning_rate": 3.4030815274813735e-05, "loss": 0.04687686562538147, "memory(GiB)": 122.96, "step": 39585, "token_acc": 0.9825680272108843, "train_speed(iter/s)": 0.234472 }, { "epoch": 3.0177605000381127, "grad_norm": 0.5882842540740967, "learning_rate": 3.401946939064442e-05, "loss": 0.05242878198623657, "memory(GiB)": 122.96, "step": 39590, "token_acc": 0.9815969130305728, "train_speed(iter/s)": 0.234476 }, { "epoch": 3.0181416266483727, "grad_norm": 0.954261839389801, "learning_rate": 3.400812442288439e-05, "loss": 0.049172204732894895, "memory(GiB)": 122.96, "step": 39595, "token_acc": 0.975355969331873, "train_speed(iter/s)": 0.234483 }, { "epoch": 3.0185227532586327, "grad_norm": 0.4422500729560852, "learning_rate": 3.399678037218419e-05, "loss": 0.03304167687892914, "memory(GiB)": 122.96, "step": 39600, "token_acc": 0.9881831610044313, "train_speed(iter/s)": 0.234489 }, { "epoch": 3.0185227532586327, "eval_loss": 0.07239662855863571, "eval_runtime": 221.0759, "eval_samples_per_second": 2.397, "eval_steps_per_second": 2.397, "eval_token_acc": 0.9696328534425637, "step": 39600 }, { "epoch": 3.0189038798688923, "grad_norm": 1.5617244243621826, "learning_rate": 3.398543723919438e-05, "loss": 0.07254239916801453, "memory(GiB)": 122.96, "step": 39605, "token_acc": 0.9698526192908216, "train_speed(iter/s)": 0.234188 }, { "epoch": 3.0192850064791523, "grad_norm": 1.3559612035751343, "learning_rate": 3.3974095024565436e-05, "loss": 0.07168787121772766, "memory(GiB)": 122.96, "step": 39610, "token_acc": 0.9704785581106277, "train_speed(iter/s)": 0.234193 }, { "epoch": 3.0196661330894123, "grad_norm": 1.290589451789856, "learning_rate": 3.396275372894775e-05, "loss": 0.0632422149181366, "memory(GiB)": 122.96, "step": 39615, "token_acc": 0.9732480372201221, "train_speed(iter/s)": 0.234201 }, { "epoch": 3.0200472596996724, "grad_norm": 0.8012451529502869, "learning_rate": 3.395141335299172e-05, "loss": 0.07209588289260864, "memory(GiB)": 122.96, "step": 39620, "token_acc": 0.9724151234567902, "train_speed(iter/s)": 0.234206 }, { "epoch": 3.0204283863099324, "grad_norm": 1.111770510673523, "learning_rate": 3.394007389734768e-05, "loss": 0.07420333027839661, "memory(GiB)": 122.96, "step": 39625, "token_acc": 0.9684531278006812, "train_speed(iter/s)": 0.234211 }, { "epoch": 3.020809512920192, "grad_norm": 1.039993166923523, "learning_rate": 3.3928735362665866e-05, "loss": 0.08121557831764221, "memory(GiB)": 122.96, "step": 39630, "token_acc": 0.9733105443071259, "train_speed(iter/s)": 0.234211 }, { "epoch": 3.021190639530452, "grad_norm": 1.17144775390625, "learning_rate": 3.3917397749596504e-05, "loss": 0.05303125381469727, "memory(GiB)": 122.96, "step": 39635, "token_acc": 0.9711445976274447, "train_speed(iter/s)": 0.234221 }, { "epoch": 3.021571766140712, "grad_norm": 0.7270400524139404, "learning_rate": 3.390606105878974e-05, "loss": 0.06085293292999268, "memory(GiB)": 122.96, "step": 39640, "token_acc": 0.9799918334013883, "train_speed(iter/s)": 0.234229 }, { "epoch": 3.021952892750972, "grad_norm": 0.7814453840255737, "learning_rate": 3.389472529089569e-05, "loss": 0.06372824907302857, "memory(GiB)": 122.96, "step": 39645, "token_acc": 0.9755749688001426, "train_speed(iter/s)": 0.234233 }, { "epoch": 3.0223340193612316, "grad_norm": 0.7135666012763977, "learning_rate": 3.3883390446564424e-05, "loss": 0.03833313584327698, "memory(GiB)": 122.96, "step": 39650, "token_acc": 0.9837832940750136, "train_speed(iter/s)": 0.234232 }, { "epoch": 3.0227151459714916, "grad_norm": 1.035219669342041, "learning_rate": 3.38720565264459e-05, "loss": 0.0752606749534607, "memory(GiB)": 122.96, "step": 39655, "token_acc": 0.9673590504451038, "train_speed(iter/s)": 0.234239 }, { "epoch": 3.0230962725817516, "grad_norm": 0.9009223580360413, "learning_rate": 3.386072353119011e-05, "loss": 0.06705437898635865, "memory(GiB)": 122.96, "step": 39660, "token_acc": 0.9721503879053113, "train_speed(iter/s)": 0.234247 }, { "epoch": 3.0234773991920116, "grad_norm": 1.4092539548873901, "learning_rate": 3.3849391461446924e-05, "loss": 0.06422204971313476, "memory(GiB)": 122.96, "step": 39665, "token_acc": 0.9728220402084885, "train_speed(iter/s)": 0.234255 }, { "epoch": 3.0238585258022717, "grad_norm": 0.5667319297790527, "learning_rate": 3.3838060317866184e-05, "loss": 0.048988950252532956, "memory(GiB)": 122.96, "step": 39670, "token_acc": 0.9826860084230229, "train_speed(iter/s)": 0.234263 }, { "epoch": 3.0242396524125312, "grad_norm": 1.8842958211898804, "learning_rate": 3.382673010109769e-05, "loss": 0.08209805488586426, "memory(GiB)": 122.96, "step": 39675, "token_acc": 0.9717462632154575, "train_speed(iter/s)": 0.234268 }, { "epoch": 3.0246207790227913, "grad_norm": 0.3077614903450012, "learning_rate": 3.3815400811791174e-05, "loss": 0.05828458070755005, "memory(GiB)": 122.96, "step": 39680, "token_acc": 0.9707687810581701, "train_speed(iter/s)": 0.234277 }, { "epoch": 3.0250019056330513, "grad_norm": 1.7499667406082153, "learning_rate": 3.380407245059631e-05, "loss": 0.06342191696166992, "memory(GiB)": 122.96, "step": 39685, "token_acc": 0.9785, "train_speed(iter/s)": 0.234284 }, { "epoch": 3.0253830322433113, "grad_norm": 0.6214100122451782, "learning_rate": 3.379274501816274e-05, "loss": 0.04329520165920257, "memory(GiB)": 122.96, "step": 39690, "token_acc": 0.9824884792626728, "train_speed(iter/s)": 0.234288 }, { "epoch": 3.0257641588535713, "grad_norm": 1.757106900215149, "learning_rate": 3.378141851514003e-05, "loss": 0.08212856054306031, "memory(GiB)": 122.96, "step": 39695, "token_acc": 0.9602080624187256, "train_speed(iter/s)": 0.234296 }, { "epoch": 3.026145285463831, "grad_norm": 0.8971019387245178, "learning_rate": 3.377009294217771e-05, "loss": 0.046687576174736026, "memory(GiB)": 122.96, "step": 39700, "token_acc": 0.9769115442278861, "train_speed(iter/s)": 0.234304 }, { "epoch": 3.026526412074091, "grad_norm": 0.575691282749176, "learning_rate": 3.3758768299925224e-05, "loss": 0.05405145883560181, "memory(GiB)": 122.96, "step": 39705, "token_acc": 0.9819048783469861, "train_speed(iter/s)": 0.234306 }, { "epoch": 3.026907538684351, "grad_norm": 1.5340170860290527, "learning_rate": 3.3747444589032026e-05, "loss": 0.049078524112701416, "memory(GiB)": 122.96, "step": 39710, "token_acc": 0.9803767660910518, "train_speed(iter/s)": 0.234312 }, { "epoch": 3.027288665294611, "grad_norm": 2.0387609004974365, "learning_rate": 3.3736121810147455e-05, "loss": 0.060889029502868654, "memory(GiB)": 122.96, "step": 39715, "token_acc": 0.9732246798603027, "train_speed(iter/s)": 0.234321 }, { "epoch": 3.027669791904871, "grad_norm": 0.6770581603050232, "learning_rate": 3.3724799963920814e-05, "loss": 0.07171829938888549, "memory(GiB)": 122.96, "step": 39720, "token_acc": 0.9773263433813892, "train_speed(iter/s)": 0.234326 }, { "epoch": 3.0280509185151305, "grad_norm": 1.4031425714492798, "learning_rate": 3.371347905100138e-05, "loss": 0.042297637462615965, "memory(GiB)": 122.96, "step": 39725, "token_acc": 0.9801324503311258, "train_speed(iter/s)": 0.234337 }, { "epoch": 3.0284320451253905, "grad_norm": 1.0513629913330078, "learning_rate": 3.3702159072038355e-05, "loss": 0.05656819343566895, "memory(GiB)": 122.96, "step": 39730, "token_acc": 0.9798817913162082, "train_speed(iter/s)": 0.234339 }, { "epoch": 3.0288131717356506, "grad_norm": 1.9232016801834106, "learning_rate": 3.369084002768085e-05, "loss": 0.0476178914308548, "memory(GiB)": 122.96, "step": 39735, "token_acc": 0.9830618892508143, "train_speed(iter/s)": 0.234343 }, { "epoch": 3.0291942983459106, "grad_norm": 1.2645000219345093, "learning_rate": 3.3679521918578004e-05, "loss": 0.07205018997192383, "memory(GiB)": 122.96, "step": 39740, "token_acc": 0.9824644549763033, "train_speed(iter/s)": 0.234349 }, { "epoch": 3.0295754249561706, "grad_norm": 0.6505276560783386, "learning_rate": 3.366820474537882e-05, "loss": 0.04790619909763336, "memory(GiB)": 122.96, "step": 39745, "token_acc": 0.9834645669291339, "train_speed(iter/s)": 0.234356 }, { "epoch": 3.02995655156643, "grad_norm": 0.8190963268280029, "learning_rate": 3.365688850873232e-05, "loss": 0.07938244938850403, "memory(GiB)": 122.96, "step": 39750, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.234362 }, { "epoch": 3.03033767817669, "grad_norm": 0.7670210599899292, "learning_rate": 3.364557320928742e-05, "loss": 0.07830352187156678, "memory(GiB)": 122.96, "step": 39755, "token_acc": 0.9695389681668496, "train_speed(iter/s)": 0.234371 }, { "epoch": 3.03071880478695, "grad_norm": 1.638051986694336, "learning_rate": 3.3634258847693e-05, "loss": 0.06114376187324524, "memory(GiB)": 122.96, "step": 39760, "token_acc": 0.9744458930899609, "train_speed(iter/s)": 0.23438 }, { "epoch": 3.0310999313972102, "grad_norm": 1.4499859809875488, "learning_rate": 3.3622945424597893e-05, "loss": 0.08804128170013428, "memory(GiB)": 122.96, "step": 39765, "token_acc": 0.965499306518724, "train_speed(iter/s)": 0.234384 }, { "epoch": 3.0314810580074703, "grad_norm": 1.9644252061843872, "learning_rate": 3.3611632940650874e-05, "loss": 0.07820584177970887, "memory(GiB)": 122.96, "step": 39770, "token_acc": 0.9695682944090588, "train_speed(iter/s)": 0.234393 }, { "epoch": 3.03186218461773, "grad_norm": 1.825711965560913, "learning_rate": 3.3600321396500644e-05, "loss": 0.06245817542076111, "memory(GiB)": 122.96, "step": 39775, "token_acc": 0.9756023177798109, "train_speed(iter/s)": 0.234397 }, { "epoch": 3.03224331122799, "grad_norm": 0.6751904487609863, "learning_rate": 3.358901079279588e-05, "loss": 0.06365445852279664, "memory(GiB)": 122.96, "step": 39780, "token_acc": 0.9783503310073358, "train_speed(iter/s)": 0.234404 }, { "epoch": 3.03262443783825, "grad_norm": 0.6121198534965515, "learning_rate": 3.35777011301852e-05, "loss": 0.07140827775001526, "memory(GiB)": 122.96, "step": 39785, "token_acc": 0.9787631668365613, "train_speed(iter/s)": 0.234408 }, { "epoch": 3.03300556444851, "grad_norm": 1.6158921718597412, "learning_rate": 3.356639240931714e-05, "loss": 0.0793246567249298, "memory(GiB)": 122.96, "step": 39790, "token_acc": 0.9714604591836735, "train_speed(iter/s)": 0.234413 }, { "epoch": 3.03338669105877, "grad_norm": 1.299395203590393, "learning_rate": 3.355508463084022e-05, "loss": 0.0688313364982605, "memory(GiB)": 122.96, "step": 39795, "token_acc": 0.9733986699334967, "train_speed(iter/s)": 0.234414 }, { "epoch": 3.0337678176690295, "grad_norm": 0.825907826423645, "learning_rate": 3.354377779540289e-05, "loss": 0.0452489823102951, "memory(GiB)": 122.96, "step": 39800, "token_acc": 0.9830300552034349, "train_speed(iter/s)": 0.23442 }, { "epoch": 3.0337678176690295, "eval_loss": 0.07258981466293335, "eval_runtime": 221.2895, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.9702879344617794, "step": 39800 }, { "epoch": 3.0341489442792895, "grad_norm": 1.7762625217437744, "learning_rate": 3.353247190365353e-05, "loss": 0.0694502592086792, "memory(GiB)": 122.96, "step": 39805, "token_acc": 0.9704177304500258, "train_speed(iter/s)": 0.234121 }, { "epoch": 3.0345300708895495, "grad_norm": 0.9209100604057312, "learning_rate": 3.352116695624047e-05, "loss": 0.06532583236694336, "memory(GiB)": 122.96, "step": 39810, "token_acc": 0.9730240549828179, "train_speed(iter/s)": 0.234127 }, { "epoch": 3.0349111974998095, "grad_norm": 0.6756927371025085, "learning_rate": 3.350986295381204e-05, "loss": 0.0552116334438324, "memory(GiB)": 122.96, "step": 39815, "token_acc": 0.9807490374518726, "train_speed(iter/s)": 0.234133 }, { "epoch": 3.0352923241100696, "grad_norm": 1.176149845123291, "learning_rate": 3.3498559897016435e-05, "loss": 0.051213139295578004, "memory(GiB)": 122.96, "step": 39820, "token_acc": 0.9861141245389455, "train_speed(iter/s)": 0.234138 }, { "epoch": 3.035673450720329, "grad_norm": 0.7002317905426025, "learning_rate": 3.348725778650184e-05, "loss": 0.052402842044830325, "memory(GiB)": 122.96, "step": 39825, "token_acc": 0.9786460166608002, "train_speed(iter/s)": 0.234138 }, { "epoch": 3.036054577330589, "grad_norm": 0.7214280962944031, "learning_rate": 3.3475956622916394e-05, "loss": 0.054445904493331906, "memory(GiB)": 122.96, "step": 39830, "token_acc": 0.9815515610217597, "train_speed(iter/s)": 0.234147 }, { "epoch": 3.036435703940849, "grad_norm": 0.8075754642486572, "learning_rate": 3.346465640690815e-05, "loss": 0.0523833692073822, "memory(GiB)": 122.96, "step": 39835, "token_acc": 0.9773510585918267, "train_speed(iter/s)": 0.234153 }, { "epoch": 3.036816830551109, "grad_norm": 0.5790780186653137, "learning_rate": 3.345335713912512e-05, "loss": 0.0834058403968811, "memory(GiB)": 122.96, "step": 39840, "token_acc": 0.9723600283486888, "train_speed(iter/s)": 0.234159 }, { "epoch": 3.037197957161369, "grad_norm": 0.5245856046676636, "learning_rate": 3.344205882021528e-05, "loss": 0.04336692094802856, "memory(GiB)": 122.96, "step": 39845, "token_acc": 0.9859688195991091, "train_speed(iter/s)": 0.234165 }, { "epoch": 3.0375790837716288, "grad_norm": 0.8140866756439209, "learning_rate": 3.343076145082653e-05, "loss": 0.08442255854606628, "memory(GiB)": 122.96, "step": 39850, "token_acc": 0.9719264278799613, "train_speed(iter/s)": 0.234171 }, { "epoch": 3.037960210381889, "grad_norm": 1.099595546722412, "learning_rate": 3.3419465031606714e-05, "loss": 0.05792512893676758, "memory(GiB)": 122.96, "step": 39855, "token_acc": 0.97934493951018, "train_speed(iter/s)": 0.234181 }, { "epoch": 3.038341336992149, "grad_norm": 1.8679360151290894, "learning_rate": 3.3408169563203636e-05, "loss": 0.09112204313278198, "memory(GiB)": 122.96, "step": 39860, "token_acc": 0.9667904741096788, "train_speed(iter/s)": 0.234188 }, { "epoch": 3.038722463602409, "grad_norm": 1.1673696041107178, "learning_rate": 3.339687504626504e-05, "loss": 0.0587047815322876, "memory(GiB)": 122.96, "step": 39865, "token_acc": 0.9763857002295835, "train_speed(iter/s)": 0.234194 }, { "epoch": 3.039103590212669, "grad_norm": 1.9940006732940674, "learning_rate": 3.338558148143862e-05, "loss": 0.06191736459732056, "memory(GiB)": 122.96, "step": 39870, "token_acc": 0.9801849405548216, "train_speed(iter/s)": 0.234201 }, { "epoch": 3.0394847168229284, "grad_norm": 0.6273790597915649, "learning_rate": 3.337428886937198e-05, "loss": 0.044448471069335936, "memory(GiB)": 122.96, "step": 39875, "token_acc": 0.9809741248097412, "train_speed(iter/s)": 0.234201 }, { "epoch": 3.0398658434331884, "grad_norm": 1.2552859783172607, "learning_rate": 3.336299721071272e-05, "loss": 0.07316847443580628, "memory(GiB)": 122.96, "step": 39880, "token_acc": 0.9782012415186949, "train_speed(iter/s)": 0.234206 }, { "epoch": 3.0402469700434485, "grad_norm": 3.1244125366210938, "learning_rate": 3.3351706506108384e-05, "loss": 0.09549397230148315, "memory(GiB)": 122.96, "step": 39885, "token_acc": 0.970942299709423, "train_speed(iter/s)": 0.234214 }, { "epoch": 3.0406280966537085, "grad_norm": 1.0998260974884033, "learning_rate": 3.3340416756206425e-05, "loss": 0.06104463338851929, "memory(GiB)": 122.96, "step": 39890, "token_acc": 0.9679334916864608, "train_speed(iter/s)": 0.23422 }, { "epoch": 3.0410092232639685, "grad_norm": 0.6428917050361633, "learning_rate": 3.332912796165424e-05, "loss": 0.11186854839324951, "memory(GiB)": 122.96, "step": 39895, "token_acc": 0.9716098864395457, "train_speed(iter/s)": 0.234226 }, { "epoch": 3.041390349874228, "grad_norm": 0.9653410315513611, "learning_rate": 3.3317840123099214e-05, "loss": 0.060212457180023195, "memory(GiB)": 122.96, "step": 39900, "token_acc": 0.970788830266375, "train_speed(iter/s)": 0.234228 }, { "epoch": 3.041771476484488, "grad_norm": 0.7813236117362976, "learning_rate": 3.330655324118864e-05, "loss": 0.07211906909942627, "memory(GiB)": 122.96, "step": 39905, "token_acc": 0.9762075134168158, "train_speed(iter/s)": 0.234234 }, { "epoch": 3.042152603094748, "grad_norm": 2.0067830085754395, "learning_rate": 3.329526731656978e-05, "loss": 0.0581806480884552, "memory(GiB)": 122.96, "step": 39910, "token_acc": 0.9784964160693449, "train_speed(iter/s)": 0.23424 }, { "epoch": 3.042533729705008, "grad_norm": 0.6909515857696533, "learning_rate": 3.3283982349889794e-05, "loss": 0.07647414207458496, "memory(GiB)": 122.96, "step": 39915, "token_acc": 0.9739640130861504, "train_speed(iter/s)": 0.234246 }, { "epoch": 3.042914856315268, "grad_norm": 0.7479629516601562, "learning_rate": 3.327269834179588e-05, "loss": 0.054646795988082884, "memory(GiB)": 122.96, "step": 39920, "token_acc": 0.9785932721712538, "train_speed(iter/s)": 0.234253 }, { "epoch": 3.0432959829255277, "grad_norm": 0.5888261198997498, "learning_rate": 3.326141529293508e-05, "loss": 0.060467648506164554, "memory(GiB)": 122.96, "step": 39925, "token_acc": 0.9750441696113075, "train_speed(iter/s)": 0.234259 }, { "epoch": 3.0436771095357877, "grad_norm": 1.0729564428329468, "learning_rate": 3.3250133203954426e-05, "loss": 0.07042725086212158, "memory(GiB)": 122.96, "step": 39930, "token_acc": 0.9685880829015544, "train_speed(iter/s)": 0.234266 }, { "epoch": 3.0440582361460478, "grad_norm": 1.3364334106445312, "learning_rate": 3.323885207550091e-05, "loss": 0.054604601860046384, "memory(GiB)": 122.96, "step": 39935, "token_acc": 0.9825174825174825, "train_speed(iter/s)": 0.234275 }, { "epoch": 3.0444393627563078, "grad_norm": 0.7985712289810181, "learning_rate": 3.3227571908221456e-05, "loss": 0.05734869241714478, "memory(GiB)": 122.96, "step": 39940, "token_acc": 0.974441754102771, "train_speed(iter/s)": 0.234278 }, { "epoch": 3.0448204893665674, "grad_norm": 1.1172962188720703, "learning_rate": 3.3216292702762895e-05, "loss": 0.050196290016174316, "memory(GiB)": 122.96, "step": 39945, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.234279 }, { "epoch": 3.0452016159768274, "grad_norm": 1.2852576971054077, "learning_rate": 3.320501445977209e-05, "loss": 0.07328345775604247, "memory(GiB)": 122.96, "step": 39950, "token_acc": 0.9681616832779624, "train_speed(iter/s)": 0.234287 }, { "epoch": 3.0455827425870874, "grad_norm": 0.7059462070465088, "learning_rate": 3.319373717989576e-05, "loss": 0.06209991574287414, "memory(GiB)": 122.96, "step": 39955, "token_acc": 0.9738219895287958, "train_speed(iter/s)": 0.234294 }, { "epoch": 3.0459638691973474, "grad_norm": 1.0393316745758057, "learning_rate": 3.31824608637806e-05, "loss": 0.060081905126571654, "memory(GiB)": 122.96, "step": 39960, "token_acc": 0.9764831640833779, "train_speed(iter/s)": 0.234299 }, { "epoch": 3.0463449958076074, "grad_norm": 1.7570558786392212, "learning_rate": 3.317118551207328e-05, "loss": 0.06531715393066406, "memory(GiB)": 122.96, "step": 39965, "token_acc": 0.9728997289972899, "train_speed(iter/s)": 0.23431 }, { "epoch": 3.046726122417867, "grad_norm": 0.2915506064891815, "learning_rate": 3.315991112542036e-05, "loss": 0.0696272611618042, "memory(GiB)": 122.96, "step": 39970, "token_acc": 0.9625668449197861, "train_speed(iter/s)": 0.234319 }, { "epoch": 3.047107249028127, "grad_norm": 1.1175479888916016, "learning_rate": 3.314863770446841e-05, "loss": 0.06097676157951355, "memory(GiB)": 122.96, "step": 39975, "token_acc": 0.9790040376850606, "train_speed(iter/s)": 0.234328 }, { "epoch": 3.047488375638387, "grad_norm": 1.1273046731948853, "learning_rate": 3.3137365249863874e-05, "loss": 0.05729676485061645, "memory(GiB)": 122.96, "step": 39980, "token_acc": 0.9761985145172181, "train_speed(iter/s)": 0.234333 }, { "epoch": 3.047869502248647, "grad_norm": 0.5920534729957581, "learning_rate": 3.3126093762253184e-05, "loss": 0.09189997315406799, "memory(GiB)": 122.96, "step": 39985, "token_acc": 0.9680147624173459, "train_speed(iter/s)": 0.234336 }, { "epoch": 3.048250628858907, "grad_norm": 0.6231995820999146, "learning_rate": 3.311482324228273e-05, "loss": 0.053027182817459106, "memory(GiB)": 122.96, "step": 39990, "token_acc": 0.9831127339114559, "train_speed(iter/s)": 0.234341 }, { "epoch": 3.0486317554691666, "grad_norm": 1.3645877838134766, "learning_rate": 3.310355369059879e-05, "loss": 0.07021919488906861, "memory(GiB)": 122.96, "step": 39995, "token_acc": 0.9713704403429734, "train_speed(iter/s)": 0.234343 }, { "epoch": 3.0490128820794267, "grad_norm": 0.9373550415039062, "learning_rate": 3.3092285107847644e-05, "loss": 0.0505083441734314, "memory(GiB)": 122.96, "step": 40000, "token_acc": 0.9814914887868144, "train_speed(iter/s)": 0.234345 }, { "epoch": 3.0490128820794267, "eval_loss": 0.0704260915517807, "eval_runtime": 221.5179, "eval_samples_per_second": 2.393, "eval_steps_per_second": 2.393, "eval_token_acc": 0.9705364134690682, "step": 40000 }, { "epoch": 3.0493940086896867, "grad_norm": 0.5062888860702515, "learning_rate": 3.3081017494675485e-05, "loss": 0.042469573020935056, "memory(GiB)": 122.96, "step": 40005, "token_acc": 0.9710289198427744, "train_speed(iter/s)": 0.234046 }, { "epoch": 3.0497751352999467, "grad_norm": 1.2162554264068604, "learning_rate": 3.3069750851728454e-05, "loss": 0.054365295171737674, "memory(GiB)": 122.96, "step": 40010, "token_acc": 0.9778200253485425, "train_speed(iter/s)": 0.234049 }, { "epoch": 3.0501562619102067, "grad_norm": 0.9594593644142151, "learning_rate": 3.305848517965263e-05, "loss": 0.0965767741203308, "memory(GiB)": 122.96, "step": 40015, "token_acc": 0.9670050761421319, "train_speed(iter/s)": 0.234056 }, { "epoch": 3.0505373885204663, "grad_norm": 0.7352628111839294, "learning_rate": 3.3047220479094085e-05, "loss": 0.05550400614738464, "memory(GiB)": 122.96, "step": 40020, "token_acc": 0.9802836388792805, "train_speed(iter/s)": 0.234061 }, { "epoch": 3.0509185151307263, "grad_norm": 0.5569922924041748, "learning_rate": 3.3035956750698785e-05, "loss": 0.03913738429546356, "memory(GiB)": 122.96, "step": 40025, "token_acc": 0.9756450446105619, "train_speed(iter/s)": 0.234066 }, { "epoch": 3.0512996417409863, "grad_norm": 1.095737338066101, "learning_rate": 3.302469399511263e-05, "loss": 0.050398558378219604, "memory(GiB)": 122.96, "step": 40030, "token_acc": 0.9785542168674699, "train_speed(iter/s)": 0.234075 }, { "epoch": 3.0516807683512464, "grad_norm": 0.5388192534446716, "learning_rate": 3.301343221298149e-05, "loss": 0.04310442209243774, "memory(GiB)": 122.96, "step": 40035, "token_acc": 0.9767576318223867, "train_speed(iter/s)": 0.234077 }, { "epoch": 3.0520618949615064, "grad_norm": 0.725538432598114, "learning_rate": 3.30021714049512e-05, "loss": 0.05553781986236572, "memory(GiB)": 122.96, "step": 40040, "token_acc": 0.9734822051639916, "train_speed(iter/s)": 0.234082 }, { "epoch": 3.052443021571766, "grad_norm": 1.3483809232711792, "learning_rate": 3.2990911571667496e-05, "loss": 0.06187505722045898, "memory(GiB)": 122.96, "step": 40045, "token_acc": 0.9808680248007086, "train_speed(iter/s)": 0.234087 }, { "epoch": 3.052824148182026, "grad_norm": 0.2374332994222641, "learning_rate": 3.297965271377608e-05, "loss": 0.046360284090042114, "memory(GiB)": 122.96, "step": 40050, "token_acc": 0.971241570805236, "train_speed(iter/s)": 0.234094 }, { "epoch": 3.053205274792286, "grad_norm": 1.6220355033874512, "learning_rate": 3.29683948319226e-05, "loss": 0.06030192971229553, "memory(GiB)": 122.96, "step": 40055, "token_acc": 0.9785258270458502, "train_speed(iter/s)": 0.234103 }, { "epoch": 3.053586401402546, "grad_norm": 1.0841282606124878, "learning_rate": 3.295713792675264e-05, "loss": 0.058750379085540774, "memory(GiB)": 122.96, "step": 40060, "token_acc": 0.9776741086304566, "train_speed(iter/s)": 0.23411 }, { "epoch": 3.053967528012806, "grad_norm": 0.6131526827812195, "learning_rate": 3.294588199891172e-05, "loss": 0.04483100175857544, "memory(GiB)": 122.96, "step": 40065, "token_acc": 0.9756151629350477, "train_speed(iter/s)": 0.234118 }, { "epoch": 3.0543486546230656, "grad_norm": 0.16994024813175201, "learning_rate": 3.2934627049045344e-05, "loss": 0.062362265586853025, "memory(GiB)": 122.96, "step": 40070, "token_acc": 0.9750223015165032, "train_speed(iter/s)": 0.234123 }, { "epoch": 3.0547297812333256, "grad_norm": 1.1065882444381714, "learning_rate": 3.29233730777989e-05, "loss": 0.07814797759056091, "memory(GiB)": 122.96, "step": 40075, "token_acc": 0.9659350307287093, "train_speed(iter/s)": 0.234128 }, { "epoch": 3.0551109078435856, "grad_norm": 1.3531697988510132, "learning_rate": 3.2912120085817774e-05, "loss": 0.04784272313117981, "memory(GiB)": 122.96, "step": 40080, "token_acc": 0.9788318306546452, "train_speed(iter/s)": 0.234137 }, { "epoch": 3.0554920344538457, "grad_norm": 2.436333179473877, "learning_rate": 3.290086807374726e-05, "loss": 0.08367094993591309, "memory(GiB)": 122.96, "step": 40085, "token_acc": 0.9533715925394548, "train_speed(iter/s)": 0.234145 }, { "epoch": 3.0558731610641057, "grad_norm": 1.683335542678833, "learning_rate": 3.288961704223261e-05, "loss": 0.06922162771224975, "memory(GiB)": 122.96, "step": 40090, "token_acc": 0.974415666456096, "train_speed(iter/s)": 0.23415 }, { "epoch": 3.0562542876743652, "grad_norm": 0.9991024136543274, "learning_rate": 3.287836699191903e-05, "loss": 0.06491246223449706, "memory(GiB)": 122.96, "step": 40095, "token_acc": 0.9761490683229813, "train_speed(iter/s)": 0.234152 }, { "epoch": 3.0566354142846253, "grad_norm": 0.993813157081604, "learning_rate": 3.286711792345163e-05, "loss": 0.06463882923126221, "memory(GiB)": 122.96, "step": 40100, "token_acc": 0.9753946806997582, "train_speed(iter/s)": 0.234156 }, { "epoch": 3.0570165408948853, "grad_norm": 1.272670865058899, "learning_rate": 3.285586983747553e-05, "loss": 0.06782611012458802, "memory(GiB)": 122.96, "step": 40105, "token_acc": 0.975024975024975, "train_speed(iter/s)": 0.234165 }, { "epoch": 3.0573976675051453, "grad_norm": 0.9784106016159058, "learning_rate": 3.2844622734635735e-05, "loss": 0.07725061178207397, "memory(GiB)": 122.96, "step": 40110, "token_acc": 0.9709322935129386, "train_speed(iter/s)": 0.234169 }, { "epoch": 3.0577787941154053, "grad_norm": 1.2272591590881348, "learning_rate": 3.28333766155772e-05, "loss": 0.07126542329788207, "memory(GiB)": 122.96, "step": 40115, "token_acc": 0.9605831533477321, "train_speed(iter/s)": 0.234179 }, { "epoch": 3.058159920725665, "grad_norm": 0.5836849212646484, "learning_rate": 3.282213148094487e-05, "loss": 0.06153574585914612, "memory(GiB)": 122.96, "step": 40120, "token_acc": 0.9773896561061468, "train_speed(iter/s)": 0.234182 }, { "epoch": 3.058541047335925, "grad_norm": 1.0087261199951172, "learning_rate": 3.2810887331383574e-05, "loss": 0.07264306545257568, "memory(GiB)": 122.96, "step": 40125, "token_acc": 0.9711174242424242, "train_speed(iter/s)": 0.23419 }, { "epoch": 3.058922173946185, "grad_norm": 0.6476016640663147, "learning_rate": 3.279964416753813e-05, "loss": 0.07261946201324462, "memory(GiB)": 122.96, "step": 40130, "token_acc": 0.9716722509899482, "train_speed(iter/s)": 0.234198 }, { "epoch": 3.059303300556445, "grad_norm": 1.2144378423690796, "learning_rate": 3.278840199005326e-05, "loss": 0.053327149152755736, "memory(GiB)": 122.96, "step": 40135, "token_acc": 0.9835882133532264, "train_speed(iter/s)": 0.234201 }, { "epoch": 3.059684427166705, "grad_norm": 0.8927167057991028, "learning_rate": 3.2777160799573684e-05, "loss": 0.07927498817443848, "memory(GiB)": 122.96, "step": 40140, "token_acc": 0.9711495783399912, "train_speed(iter/s)": 0.234211 }, { "epoch": 3.0600655537769645, "grad_norm": 0.5923752188682556, "learning_rate": 3.276592059674401e-05, "loss": 0.04439226984977722, "memory(GiB)": 122.96, "step": 40145, "token_acc": 0.9818741450068399, "train_speed(iter/s)": 0.234215 }, { "epoch": 3.0604466803872246, "grad_norm": 0.9738351106643677, "learning_rate": 3.2754681382208786e-05, "loss": 0.04353592991828918, "memory(GiB)": 122.96, "step": 40150, "token_acc": 0.977364001460387, "train_speed(iter/s)": 0.234224 }, { "epoch": 3.0608278069974846, "grad_norm": 1.0424336194992065, "learning_rate": 3.274344315661256e-05, "loss": 0.05864318609237671, "memory(GiB)": 122.96, "step": 40155, "token_acc": 0.9776191130028834, "train_speed(iter/s)": 0.234226 }, { "epoch": 3.0612089336077446, "grad_norm": 0.714299201965332, "learning_rate": 3.273220592059981e-05, "loss": 0.06144073009490967, "memory(GiB)": 122.96, "step": 40160, "token_acc": 0.9762606671838635, "train_speed(iter/s)": 0.234229 }, { "epoch": 3.0615900602180046, "grad_norm": 1.3669302463531494, "learning_rate": 3.2720969674814916e-05, "loss": 0.06509497165679931, "memory(GiB)": 122.96, "step": 40165, "token_acc": 0.9656453110492108, "train_speed(iter/s)": 0.234237 }, { "epoch": 3.061971186828264, "grad_norm": 3.169528007507324, "learning_rate": 3.270973441990222e-05, "loss": 0.06185004711151123, "memory(GiB)": 122.96, "step": 40170, "token_acc": 0.9769775678866588, "train_speed(iter/s)": 0.234246 }, { "epoch": 3.062352313438524, "grad_norm": 1.0040183067321777, "learning_rate": 3.2698500156506026e-05, "loss": 0.07717486023902893, "memory(GiB)": 122.96, "step": 40175, "token_acc": 0.9712918660287081, "train_speed(iter/s)": 0.234253 }, { "epoch": 3.0627334400487842, "grad_norm": 1.4344481229782104, "learning_rate": 3.2687266885270564e-05, "loss": 0.07158398628234863, "memory(GiB)": 122.96, "step": 40180, "token_acc": 0.974511672224869, "train_speed(iter/s)": 0.23426 }, { "epoch": 3.0631145666590442, "grad_norm": 0.7907947897911072, "learning_rate": 3.267603460683999e-05, "loss": 0.041824734210968016, "memory(GiB)": 122.96, "step": 40185, "token_acc": 0.9811220420101037, "train_speed(iter/s)": 0.234265 }, { "epoch": 3.0634956932693043, "grad_norm": 1.1564909219741821, "learning_rate": 3.2664803321858447e-05, "loss": 0.08315824270248413, "memory(GiB)": 122.96, "step": 40190, "token_acc": 0.9667545104398946, "train_speed(iter/s)": 0.234272 }, { "epoch": 3.063876819879564, "grad_norm": 0.744450032711029, "learning_rate": 3.2653573030969986e-05, "loss": 0.06905866861343384, "memory(GiB)": 122.96, "step": 40195, "token_acc": 0.9726212607740409, "train_speed(iter/s)": 0.234277 }, { "epoch": 3.064257946489824, "grad_norm": 0.6399838328361511, "learning_rate": 3.264234373481862e-05, "loss": 0.037275031208992004, "memory(GiB)": 122.96, "step": 40200, "token_acc": 0.9863058901171424, "train_speed(iter/s)": 0.23428 }, { "epoch": 3.064257946489824, "eval_loss": 0.07084307074546814, "eval_runtime": 220.8269, "eval_samples_per_second": 2.4, "eval_steps_per_second": 2.4, "eval_token_acc": 0.9702578157942293, "step": 40200 }, { "epoch": 3.064639073100084, "grad_norm": 1.5782195329666138, "learning_rate": 3.263111543404828e-05, "loss": 0.07506142258644104, "memory(GiB)": 122.96, "step": 40205, "token_acc": 0.9702562672459344, "train_speed(iter/s)": 0.233987 }, { "epoch": 3.065020199710344, "grad_norm": 0.6643892526626587, "learning_rate": 3.2619888129302876e-05, "loss": 0.04253174662590027, "memory(GiB)": 122.96, "step": 40210, "token_acc": 0.9804151322397054, "train_speed(iter/s)": 0.23399 }, { "epoch": 3.065401326320604, "grad_norm": 1.3351500034332275, "learning_rate": 3.260866182122624e-05, "loss": 0.08559784889221192, "memory(GiB)": 122.96, "step": 40215, "token_acc": 0.9617728531855956, "train_speed(iter/s)": 0.234 }, { "epoch": 3.0657824529308635, "grad_norm": 0.9973918795585632, "learning_rate": 3.259743651046213e-05, "loss": 0.09768767952919007, "memory(GiB)": 122.96, "step": 40220, "token_acc": 0.961509000382995, "train_speed(iter/s)": 0.234006 }, { "epoch": 3.0661635795411235, "grad_norm": 2.3457822799682617, "learning_rate": 3.258621219765429e-05, "loss": 0.061995089054107666, "memory(GiB)": 122.96, "step": 40225, "token_acc": 0.9801687763713081, "train_speed(iter/s)": 0.234015 }, { "epoch": 3.0665447061513835, "grad_norm": 1.0093181133270264, "learning_rate": 3.2574988883446365e-05, "loss": 0.060358178615570066, "memory(GiB)": 122.96, "step": 40230, "token_acc": 0.9797461132506062, "train_speed(iter/s)": 0.23402 }, { "epoch": 3.0669258327616435, "grad_norm": 0.5886979103088379, "learning_rate": 3.2563766568481956e-05, "loss": 0.04835548996925354, "memory(GiB)": 122.96, "step": 40235, "token_acc": 0.9849119865884325, "train_speed(iter/s)": 0.234026 }, { "epoch": 3.067306959371903, "grad_norm": 0.0003671533486340195, "learning_rate": 3.255254525340463e-05, "loss": 0.08188050985336304, "memory(GiB)": 122.96, "step": 40240, "token_acc": 0.9692307692307692, "train_speed(iter/s)": 0.234035 }, { "epoch": 3.067688085982163, "grad_norm": 0.6623596549034119, "learning_rate": 3.254132493885788e-05, "loss": 0.05557176470756531, "memory(GiB)": 122.96, "step": 40245, "token_acc": 0.9762985375693394, "train_speed(iter/s)": 0.234041 }, { "epoch": 3.068069212592423, "grad_norm": 0.9340169429779053, "learning_rate": 3.25301056254851e-05, "loss": 0.06265113353729249, "memory(GiB)": 122.96, "step": 40250, "token_acc": 0.9719016557952835, "train_speed(iter/s)": 0.234048 }, { "epoch": 3.068450339202683, "grad_norm": 0.6245326399803162, "learning_rate": 3.251888731392971e-05, "loss": 0.05410281419754028, "memory(GiB)": 122.96, "step": 40255, "token_acc": 0.9805287319930347, "train_speed(iter/s)": 0.234052 }, { "epoch": 3.068831465812943, "grad_norm": 3.942952871322632, "learning_rate": 3.250767000483501e-05, "loss": 0.05541174411773682, "memory(GiB)": 122.96, "step": 40260, "token_acc": 0.9811215991116047, "train_speed(iter/s)": 0.234062 }, { "epoch": 3.069212592423203, "grad_norm": 1.4570636749267578, "learning_rate": 3.2496453698844256e-05, "loss": 0.049525362253189084, "memory(GiB)": 122.96, "step": 40265, "token_acc": 0.9845482028888143, "train_speed(iter/s)": 0.234067 }, { "epoch": 3.069593719033463, "grad_norm": 1.3044590950012207, "learning_rate": 3.2485238396600656e-05, "loss": 0.05081298351287842, "memory(GiB)": 122.96, "step": 40270, "token_acc": 0.9797696856520386, "train_speed(iter/s)": 0.234072 }, { "epoch": 3.069974845643723, "grad_norm": 0.5210493206977844, "learning_rate": 3.247402409874736e-05, "loss": 0.06445257663726807, "memory(GiB)": 122.96, "step": 40275, "token_acc": 0.9750085005100306, "train_speed(iter/s)": 0.234079 }, { "epoch": 3.070355972253983, "grad_norm": 1.5920966863632202, "learning_rate": 3.246281080592743e-05, "loss": 0.09065452814102173, "memory(GiB)": 122.96, "step": 40280, "token_acc": 0.9685430463576159, "train_speed(iter/s)": 0.234087 }, { "epoch": 3.070737098864243, "grad_norm": 1.4825024604797363, "learning_rate": 3.2451598518783944e-05, "loss": 0.07094126343727111, "memory(GiB)": 122.96, "step": 40285, "token_acc": 0.9754885155982174, "train_speed(iter/s)": 0.234092 }, { "epoch": 3.0711182254745024, "grad_norm": 1.0272026062011719, "learning_rate": 3.244038723795983e-05, "loss": 0.06890535354614258, "memory(GiB)": 122.96, "step": 40290, "token_acc": 0.9746415294742432, "train_speed(iter/s)": 0.234093 }, { "epoch": 3.0714993520847624, "grad_norm": 1.1526970863342285, "learning_rate": 3.2429176964098036e-05, "loss": 0.07823984622955323, "memory(GiB)": 122.96, "step": 40295, "token_acc": 0.9719231888274659, "train_speed(iter/s)": 0.234097 }, { "epoch": 3.0718804786950225, "grad_norm": 1.0353963375091553, "learning_rate": 3.24179676978414e-05, "loss": 0.0589880108833313, "memory(GiB)": 122.96, "step": 40300, "token_acc": 0.9778441623101817, "train_speed(iter/s)": 0.234104 }, { "epoch": 3.0722616053052825, "grad_norm": 0.1328011453151703, "learning_rate": 3.240675943983274e-05, "loss": 0.07748907804489136, "memory(GiB)": 122.96, "step": 40305, "token_acc": 0.9681059862610403, "train_speed(iter/s)": 0.234113 }, { "epoch": 3.0726427319155425, "grad_norm": 1.0474193096160889, "learning_rate": 3.239555219071475e-05, "loss": 0.0744431495666504, "memory(GiB)": 122.96, "step": 40310, "token_acc": 0.9693803159173755, "train_speed(iter/s)": 0.234121 }, { "epoch": 3.073023858525802, "grad_norm": 2.983072280883789, "learning_rate": 3.238434595113018e-05, "loss": 0.09595105648040772, "memory(GiB)": 122.96, "step": 40315, "token_acc": 0.9769835596854897, "train_speed(iter/s)": 0.234124 }, { "epoch": 3.073404985136062, "grad_norm": 0.6839165687561035, "learning_rate": 3.2373140721721605e-05, "loss": 0.059659868478775024, "memory(GiB)": 122.96, "step": 40320, "token_acc": 0.9770526002086127, "train_speed(iter/s)": 0.234128 }, { "epoch": 3.073786111746322, "grad_norm": 1.813169002532959, "learning_rate": 3.236193650313161e-05, "loss": 0.059799933433532716, "memory(GiB)": 122.96, "step": 40325, "token_acc": 0.9764932562620424, "train_speed(iter/s)": 0.234137 }, { "epoch": 3.074167238356582, "grad_norm": 0.9068735837936401, "learning_rate": 3.235073329600272e-05, "loss": 0.04233308136463165, "memory(GiB)": 122.96, "step": 40330, "token_acc": 0.9818552959854843, "train_speed(iter/s)": 0.234138 }, { "epoch": 3.074548364966842, "grad_norm": 0.8088947534561157, "learning_rate": 3.233953110097737e-05, "loss": 0.05729679465293884, "memory(GiB)": 122.96, "step": 40335, "token_acc": 0.9750254841997962, "train_speed(iter/s)": 0.234144 }, { "epoch": 3.0749294915771017, "grad_norm": 0.9844661355018616, "learning_rate": 3.2328329918697945e-05, "loss": 0.03824906051158905, "memory(GiB)": 122.96, "step": 40340, "token_acc": 0.980584666298952, "train_speed(iter/s)": 0.234147 }, { "epoch": 3.0753106181873617, "grad_norm": 0.971795916557312, "learning_rate": 3.2317129749806794e-05, "loss": 0.0664734423160553, "memory(GiB)": 122.96, "step": 40345, "token_acc": 0.9739560912613, "train_speed(iter/s)": 0.234154 }, { "epoch": 3.0756917447976218, "grad_norm": 0.8621265292167664, "learning_rate": 3.230593059494621e-05, "loss": 0.04170198142528534, "memory(GiB)": 122.96, "step": 40350, "token_acc": 0.9786289203441576, "train_speed(iter/s)": 0.234162 }, { "epoch": 3.0760728714078818, "grad_norm": 1.2580446004867554, "learning_rate": 3.229473245475838e-05, "loss": 0.05285307765007019, "memory(GiB)": 122.96, "step": 40355, "token_acc": 0.9781906300484653, "train_speed(iter/s)": 0.234169 }, { "epoch": 3.076453998018142, "grad_norm": 1.1658308506011963, "learning_rate": 3.2283535329885485e-05, "loss": 0.0348027378320694, "memory(GiB)": 122.96, "step": 40360, "token_acc": 0.9894825410180901, "train_speed(iter/s)": 0.234171 }, { "epoch": 3.0768351246284014, "grad_norm": 0.861822783946991, "learning_rate": 3.2272339220969625e-05, "loss": 0.044558069109916686, "memory(GiB)": 122.96, "step": 40365, "token_acc": 0.9791483757682178, "train_speed(iter/s)": 0.234178 }, { "epoch": 3.0772162512386614, "grad_norm": 1.1759922504425049, "learning_rate": 3.2261144128652855e-05, "loss": 0.041172435879707335, "memory(GiB)": 122.96, "step": 40370, "token_acc": 0.9737747205503009, "train_speed(iter/s)": 0.234188 }, { "epoch": 3.0775973778489214, "grad_norm": 0.7100277543067932, "learning_rate": 3.2249950053577125e-05, "loss": 0.05116206407546997, "memory(GiB)": 122.96, "step": 40375, "token_acc": 0.981438127090301, "train_speed(iter/s)": 0.234193 }, { "epoch": 3.0779785044591814, "grad_norm": 1.4321022033691406, "learning_rate": 3.223875699638441e-05, "loss": 0.06808379888534546, "memory(GiB)": 122.96, "step": 40380, "token_acc": 0.9647769204964777, "train_speed(iter/s)": 0.234201 }, { "epoch": 3.0783596310694414, "grad_norm": 0.8641403317451477, "learning_rate": 3.222756495771656e-05, "loss": 0.0887679100036621, "memory(GiB)": 122.96, "step": 40385, "token_acc": 0.9718786616326, "train_speed(iter/s)": 0.234204 }, { "epoch": 3.078740757679701, "grad_norm": 1.9083986282348633, "learning_rate": 3.221637393821537e-05, "loss": 0.04914510250091553, "memory(GiB)": 122.96, "step": 40390, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.234208 }, { "epoch": 3.079121884289961, "grad_norm": 1.0334970951080322, "learning_rate": 3.2205183938522624e-05, "loss": 0.07798879146575928, "memory(GiB)": 122.96, "step": 40395, "token_acc": 0.9715189873417721, "train_speed(iter/s)": 0.234216 }, { "epoch": 3.079503010900221, "grad_norm": 0.48327475786209106, "learning_rate": 3.219399495927999e-05, "loss": 0.06794584393501282, "memory(GiB)": 122.96, "step": 40400, "token_acc": 0.9675456389452333, "train_speed(iter/s)": 0.234223 }, { "epoch": 3.079503010900221, "eval_loss": 0.06965147703886032, "eval_runtime": 220.8708, "eval_samples_per_second": 2.4, "eval_steps_per_second": 2.4, "eval_token_acc": 0.9702427564604542, "step": 40400 }, { "epoch": 3.079884137510481, "grad_norm": 0.6358887553215027, "learning_rate": 3.2182807001129114e-05, "loss": 0.046459048986434937, "memory(GiB)": 122.96, "step": 40405, "token_acc": 0.9707811525530649, "train_speed(iter/s)": 0.233929 }, { "epoch": 3.080265264120741, "grad_norm": 2.55283522605896, "learning_rate": 3.2171620064711586e-05, "loss": 0.07580753564834594, "memory(GiB)": 122.96, "step": 40410, "token_acc": 0.9847176079734219, "train_speed(iter/s)": 0.233936 }, { "epoch": 3.0806463907310007, "grad_norm": 0.792969286441803, "learning_rate": 3.21604341506689e-05, "loss": 0.0788317859172821, "memory(GiB)": 122.96, "step": 40415, "token_acc": 0.9716453674121406, "train_speed(iter/s)": 0.233946 }, { "epoch": 3.0810275173412607, "grad_norm": 1.3298557996749878, "learning_rate": 3.2149249259642535e-05, "loss": 0.07411404848098754, "memory(GiB)": 122.96, "step": 40420, "token_acc": 0.9724727100142383, "train_speed(iter/s)": 0.233949 }, { "epoch": 3.0814086439515207, "grad_norm": 0.89655601978302, "learning_rate": 3.2138065392273895e-05, "loss": 0.05589293241500855, "memory(GiB)": 122.96, "step": 40425, "token_acc": 0.9786386099155109, "train_speed(iter/s)": 0.233952 }, { "epoch": 3.0817897705617807, "grad_norm": 0.9705274701118469, "learning_rate": 3.2126882549204294e-05, "loss": 0.10183897018432617, "memory(GiB)": 122.96, "step": 40430, "token_acc": 0.9670977246685218, "train_speed(iter/s)": 0.233956 }, { "epoch": 3.0821708971720407, "grad_norm": 1.3217989206314087, "learning_rate": 3.211570073107506e-05, "loss": 0.0580452561378479, "memory(GiB)": 122.96, "step": 40435, "token_acc": 0.9766726480389644, "train_speed(iter/s)": 0.233957 }, { "epoch": 3.0825520237823003, "grad_norm": 0.8320159316062927, "learning_rate": 3.2104519938527396e-05, "loss": 0.11575267314910889, "memory(GiB)": 122.96, "step": 40440, "token_acc": 0.9642295597484277, "train_speed(iter/s)": 0.233962 }, { "epoch": 3.0829331503925603, "grad_norm": 2.169710397720337, "learning_rate": 3.209334017220246e-05, "loss": 0.09004729986190796, "memory(GiB)": 122.96, "step": 40445, "token_acc": 0.9726174496644295, "train_speed(iter/s)": 0.23397 }, { "epoch": 3.0833142770028203, "grad_norm": 1.1367822885513306, "learning_rate": 3.208216143274136e-05, "loss": 0.04818301796913147, "memory(GiB)": 122.96, "step": 40450, "token_acc": 0.977510222626079, "train_speed(iter/s)": 0.233975 }, { "epoch": 3.0836954036130804, "grad_norm": 0.9515669941902161, "learning_rate": 3.207098372078517e-05, "loss": 0.06039578914642334, "memory(GiB)": 122.96, "step": 40455, "token_acc": 0.9791019486020898, "train_speed(iter/s)": 0.233978 }, { "epoch": 3.0840765302233404, "grad_norm": 0.8431556224822998, "learning_rate": 3.205980703697485e-05, "loss": 0.04501516819000244, "memory(GiB)": 122.96, "step": 40460, "token_acc": 0.9778427075724373, "train_speed(iter/s)": 0.233986 }, { "epoch": 3.0844576568336, "grad_norm": 0.6239203810691833, "learning_rate": 3.2048631381951356e-05, "loss": 0.050639814138412474, "memory(GiB)": 122.96, "step": 40465, "token_acc": 0.9773281675993686, "train_speed(iter/s)": 0.233988 }, { "epoch": 3.08483878344386, "grad_norm": 1.677498698234558, "learning_rate": 3.203745675635554e-05, "loss": 0.03927751183509827, "memory(GiB)": 122.96, "step": 40470, "token_acc": 0.9807401812688822, "train_speed(iter/s)": 0.233996 }, { "epoch": 3.08521991005412, "grad_norm": 1.430440902709961, "learning_rate": 3.202628316082823e-05, "loss": 0.07997772693634034, "memory(GiB)": 122.96, "step": 40475, "token_acc": 0.9724930362116991, "train_speed(iter/s)": 0.234004 }, { "epoch": 3.08560103666438, "grad_norm": 0.9831953048706055, "learning_rate": 3.201511059601016e-05, "loss": 0.053567242622375486, "memory(GiB)": 122.96, "step": 40480, "token_acc": 0.9800078400627205, "train_speed(iter/s)": 0.234009 }, { "epoch": 3.08598216327464, "grad_norm": 1.0131467580795288, "learning_rate": 3.200393906254204e-05, "loss": 0.06834284663200378, "memory(GiB)": 122.96, "step": 40485, "token_acc": 0.9740879645414252, "train_speed(iter/s)": 0.234014 }, { "epoch": 3.0863632898848996, "grad_norm": 1.2827280759811401, "learning_rate": 3.199276856106451e-05, "loss": 0.10797680616378784, "memory(GiB)": 122.96, "step": 40490, "token_acc": 0.9565786602662012, "train_speed(iter/s)": 0.234021 }, { "epoch": 3.0867444164951596, "grad_norm": 0.8202342987060547, "learning_rate": 3.198159909221813e-05, "loss": 0.08195743560791016, "memory(GiB)": 122.96, "step": 40495, "token_acc": 0.9787391841779975, "train_speed(iter/s)": 0.234028 }, { "epoch": 3.0871255431054196, "grad_norm": 1.3255513906478882, "learning_rate": 3.197043065664344e-05, "loss": 0.07763301730155944, "memory(GiB)": 122.96, "step": 40500, "token_acc": 0.9719379333113238, "train_speed(iter/s)": 0.234033 }, { "epoch": 3.0875066697156797, "grad_norm": 0.503959059715271, "learning_rate": 3.1959263254980874e-05, "loss": 0.051429980993270875, "memory(GiB)": 122.96, "step": 40505, "token_acc": 0.9790405216581276, "train_speed(iter/s)": 0.234038 }, { "epoch": 3.0878877963259397, "grad_norm": 0.9061885476112366, "learning_rate": 3.194809688787084e-05, "loss": 0.06830111742019654, "memory(GiB)": 122.96, "step": 40510, "token_acc": 0.9728997289972899, "train_speed(iter/s)": 0.234043 }, { "epoch": 3.0882689229361993, "grad_norm": 0.7196916341781616, "learning_rate": 3.193693155595369e-05, "loss": 0.04603073298931122, "memory(GiB)": 122.96, "step": 40515, "token_acc": 0.9819277108433735, "train_speed(iter/s)": 0.234048 }, { "epoch": 3.0886500495464593, "grad_norm": 1.4720286130905151, "learning_rate": 3.192576725986969e-05, "loss": 0.08834622502326965, "memory(GiB)": 122.96, "step": 40520, "token_acc": 0.9582493521451195, "train_speed(iter/s)": 0.234056 }, { "epoch": 3.0890311761567193, "grad_norm": 1.7539218664169312, "learning_rate": 3.191460400025904e-05, "loss": 0.09731671810150147, "memory(GiB)": 122.96, "step": 40525, "token_acc": 0.9633328577543159, "train_speed(iter/s)": 0.23406 }, { "epoch": 3.0894123027669793, "grad_norm": 2.0427074432373047, "learning_rate": 3.190344177776195e-05, "loss": 0.1167303204536438, "memory(GiB)": 122.96, "step": 40530, "token_acc": 0.9605118829981718, "train_speed(iter/s)": 0.234069 }, { "epoch": 3.0897934293772393, "grad_norm": 0.1697077453136444, "learning_rate": 3.1892280593018485e-05, "loss": 0.05711507201194763, "memory(GiB)": 122.96, "step": 40535, "token_acc": 0.9794776119402985, "train_speed(iter/s)": 0.234077 }, { "epoch": 3.090174555987499, "grad_norm": 0.5123584270477295, "learning_rate": 3.188112044666871e-05, "loss": 0.04919119477272034, "memory(GiB)": 122.96, "step": 40540, "token_acc": 0.9790374331550802, "train_speed(iter/s)": 0.234084 }, { "epoch": 3.090555682597759, "grad_norm": 1.7662264108657837, "learning_rate": 3.1869961339352574e-05, "loss": 0.05811744332313538, "memory(GiB)": 122.96, "step": 40545, "token_acc": 0.974184120798831, "train_speed(iter/s)": 0.234093 }, { "epoch": 3.090936809208019, "grad_norm": 1.1987863779067993, "learning_rate": 3.185880327171002e-05, "loss": 0.05687412023544312, "memory(GiB)": 122.96, "step": 40550, "token_acc": 0.9745185185185186, "train_speed(iter/s)": 0.234101 }, { "epoch": 3.091317935818279, "grad_norm": 2.2116799354553223, "learning_rate": 3.184764624438093e-05, "loss": 0.0777698814868927, "memory(GiB)": 122.96, "step": 40555, "token_acc": 0.9731318219983207, "train_speed(iter/s)": 0.234109 }, { "epoch": 3.091699062428539, "grad_norm": 1.1024830341339111, "learning_rate": 3.183649025800509e-05, "loss": 0.05249854326248169, "memory(GiB)": 122.96, "step": 40560, "token_acc": 0.9728910591008587, "train_speed(iter/s)": 0.234114 }, { "epoch": 3.0920801890387986, "grad_norm": 0.9509339928627014, "learning_rate": 3.182533531322223e-05, "loss": 0.049197572469711306, "memory(GiB)": 122.96, "step": 40565, "token_acc": 0.9823835784313726, "train_speed(iter/s)": 0.234115 }, { "epoch": 3.0924613156490586, "grad_norm": 0.6877323985099792, "learning_rate": 3.1814181410672065e-05, "loss": 0.07038125395774841, "memory(GiB)": 122.96, "step": 40570, "token_acc": 0.97037158291046, "train_speed(iter/s)": 0.23412 }, { "epoch": 3.0928424422593186, "grad_norm": 6.658283233642578, "learning_rate": 3.1803028550994204e-05, "loss": 0.08202228546142579, "memory(GiB)": 122.96, "step": 40575, "token_acc": 0.9733484547774313, "train_speed(iter/s)": 0.234127 }, { "epoch": 3.0932235688695786, "grad_norm": 0.6767612099647522, "learning_rate": 3.1791876734828204e-05, "loss": 0.0516794741153717, "memory(GiB)": 122.96, "step": 40580, "token_acc": 0.9788335617481346, "train_speed(iter/s)": 0.234129 }, { "epoch": 3.093604695479838, "grad_norm": 1.534783124923706, "learning_rate": 3.1780725962813576e-05, "loss": 0.07141729593276977, "memory(GiB)": 122.96, "step": 40585, "token_acc": 0.9720876138786586, "train_speed(iter/s)": 0.234135 }, { "epoch": 3.093985822090098, "grad_norm": 0.6500986814498901, "learning_rate": 3.176957623558977e-05, "loss": 0.08700929284095764, "memory(GiB)": 122.96, "step": 40590, "token_acc": 0.9772360348724572, "train_speed(iter/s)": 0.234139 }, { "epoch": 3.094366948700358, "grad_norm": 1.1677542924880981, "learning_rate": 3.1758427553796176e-05, "loss": 0.06951172351837158, "memory(GiB)": 122.96, "step": 40595, "token_acc": 0.9764018185754493, "train_speed(iter/s)": 0.234144 }, { "epoch": 3.0947480753106182, "grad_norm": 1.2128084897994995, "learning_rate": 3.174727991807209e-05, "loss": 0.06538564562797547, "memory(GiB)": 122.96, "step": 40600, "token_acc": 0.9707668090847763, "train_speed(iter/s)": 0.23415 }, { "epoch": 3.0947480753106182, "eval_loss": 0.07009056955575943, "eval_runtime": 219.7654, "eval_samples_per_second": 2.412, "eval_steps_per_second": 2.412, "eval_token_acc": 0.9701448707909162, "step": 40600 }, { "epoch": 3.0951292019208783, "grad_norm": 1.1937199831008911, "learning_rate": 3.1736133329056816e-05, "loss": 0.07111892104148865, "memory(GiB)": 122.96, "step": 40605, "token_acc": 0.970181348481179, "train_speed(iter/s)": 0.233862 }, { "epoch": 3.095510328531138, "grad_norm": 1.3945248126983643, "learning_rate": 3.172498778738954e-05, "loss": 0.05958819389343262, "memory(GiB)": 122.96, "step": 40610, "token_acc": 0.9791054521710741, "train_speed(iter/s)": 0.233866 }, { "epoch": 3.095891455141398, "grad_norm": 0.8373230695724487, "learning_rate": 3.171384329370939e-05, "loss": 0.03982595503330231, "memory(GiB)": 122.96, "step": 40615, "token_acc": 0.9793935825728584, "train_speed(iter/s)": 0.233874 }, { "epoch": 3.096272581751658, "grad_norm": 2.2430405616760254, "learning_rate": 3.170269984865549e-05, "loss": 0.07809516787528992, "memory(GiB)": 122.96, "step": 40620, "token_acc": 0.9684265010351967, "train_speed(iter/s)": 0.233883 }, { "epoch": 3.096653708361918, "grad_norm": 1.1688148975372314, "learning_rate": 3.169155745286684e-05, "loss": 0.07574228644371032, "memory(GiB)": 122.96, "step": 40625, "token_acc": 0.9685621027314895, "train_speed(iter/s)": 0.233887 }, { "epoch": 3.097034834972178, "grad_norm": 1.1869450807571411, "learning_rate": 3.168041610698239e-05, "loss": 0.07978163957595825, "memory(GiB)": 122.96, "step": 40630, "token_acc": 0.9692507579038545, "train_speed(iter/s)": 0.233894 }, { "epoch": 3.0974159615824375, "grad_norm": 2.0459418296813965, "learning_rate": 3.166927581164109e-05, "loss": 0.04399622082710266, "memory(GiB)": 122.96, "step": 40635, "token_acc": 0.9863541975674874, "train_speed(iter/s)": 0.233902 }, { "epoch": 3.0977970881926975, "grad_norm": 1.891729712486267, "learning_rate": 3.1658136567481744e-05, "loss": 0.08150098323822022, "memory(GiB)": 122.96, "step": 40640, "token_acc": 0.9695021881838074, "train_speed(iter/s)": 0.233906 }, { "epoch": 3.0981782148029575, "grad_norm": 1.25875985622406, "learning_rate": 3.164699837514315e-05, "loss": 0.0846285104751587, "memory(GiB)": 122.96, "step": 40645, "token_acc": 0.9732718894009217, "train_speed(iter/s)": 0.233914 }, { "epoch": 3.0985593414132175, "grad_norm": 2.322939157485962, "learning_rate": 3.163586123526402e-05, "loss": 0.09692569971084594, "memory(GiB)": 122.96, "step": 40650, "token_acc": 0.9661167287564307, "train_speed(iter/s)": 0.23392 }, { "epoch": 3.0989404680234776, "grad_norm": 0.7375257015228271, "learning_rate": 3.162472514848305e-05, "loss": 0.05152239799499512, "memory(GiB)": 122.96, "step": 40655, "token_acc": 0.980706961683756, "train_speed(iter/s)": 0.233922 }, { "epoch": 3.099321594633737, "grad_norm": 0.9018100500106812, "learning_rate": 3.1613590115438804e-05, "loss": 0.04377131760120392, "memory(GiB)": 122.96, "step": 40660, "token_acc": 0.981260979894593, "train_speed(iter/s)": 0.233927 }, { "epoch": 3.099702721243997, "grad_norm": 1.0182322263717651, "learning_rate": 3.160245613676984e-05, "loss": 0.08011202812194824, "memory(GiB)": 122.96, "step": 40665, "token_acc": 0.9599922690374951, "train_speed(iter/s)": 0.233934 }, { "epoch": 3.100083847854257, "grad_norm": 0.5748077034950256, "learning_rate": 3.1591323213114655e-05, "loss": 0.044398444890975955, "memory(GiB)": 122.96, "step": 40670, "token_acc": 0.9803671596124426, "train_speed(iter/s)": 0.233941 }, { "epoch": 3.100464974464517, "grad_norm": 1.0145440101623535, "learning_rate": 3.158019134511166e-05, "loss": 0.07262259721755981, "memory(GiB)": 122.96, "step": 40675, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.233946 }, { "epoch": 3.100846101074777, "grad_norm": 0.6751973032951355, "learning_rate": 3.156906053339918e-05, "loss": 0.05023207664489746, "memory(GiB)": 122.96, "step": 40680, "token_acc": 0.9838670840356369, "train_speed(iter/s)": 0.233951 }, { "epoch": 3.101227227685037, "grad_norm": 0.9415655136108398, "learning_rate": 3.155793077861556e-05, "loss": 0.07174451351165771, "memory(GiB)": 122.96, "step": 40685, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.233957 }, { "epoch": 3.101608354295297, "grad_norm": 1.2594071626663208, "learning_rate": 3.154680208139905e-05, "loss": 0.06050864458084106, "memory(GiB)": 122.96, "step": 40690, "token_acc": 0.9767928476317291, "train_speed(iter/s)": 0.233963 }, { "epoch": 3.101989480905557, "grad_norm": 1.5684599876403809, "learning_rate": 3.15356744423878e-05, "loss": 0.04644063115119934, "memory(GiB)": 122.96, "step": 40695, "token_acc": 0.9813031161473088, "train_speed(iter/s)": 0.233969 }, { "epoch": 3.102370607515817, "grad_norm": 0.7719554305076599, "learning_rate": 3.152454786221993e-05, "loss": 0.06398332715034485, "memory(GiB)": 122.96, "step": 40700, "token_acc": 0.9720388349514563, "train_speed(iter/s)": 0.233978 }, { "epoch": 3.102751734126077, "grad_norm": 0.9517346024513245, "learning_rate": 3.1513422341533506e-05, "loss": 0.049184536933898924, "memory(GiB)": 122.96, "step": 40705, "token_acc": 0.9792440463185492, "train_speed(iter/s)": 0.233982 }, { "epoch": 3.1031328607363364, "grad_norm": 0.8249827027320862, "learning_rate": 3.150229788096653e-05, "loss": 0.06018694639205933, "memory(GiB)": 122.96, "step": 40710, "token_acc": 0.9771352706888922, "train_speed(iter/s)": 0.233986 }, { "epoch": 3.1035139873465964, "grad_norm": 0.6713043451309204, "learning_rate": 3.149117448115692e-05, "loss": 0.06072781682014465, "memory(GiB)": 122.96, "step": 40715, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.233994 }, { "epoch": 3.1038951139568565, "grad_norm": 1.2094985246658325, "learning_rate": 3.148005214274256e-05, "loss": 0.06459769010543823, "memory(GiB)": 122.96, "step": 40720, "token_acc": 0.9647577092511013, "train_speed(iter/s)": 0.234001 }, { "epoch": 3.1042762405671165, "grad_norm": 0.5747514963150024, "learning_rate": 3.146893086636128e-05, "loss": 0.050724643468856814, "memory(GiB)": 122.96, "step": 40725, "token_acc": 0.9845605700712589, "train_speed(iter/s)": 0.234005 }, { "epoch": 3.1046573671773765, "grad_norm": 0.8817791938781738, "learning_rate": 3.145781065265081e-05, "loss": 0.058845806121826175, "memory(GiB)": 122.96, "step": 40730, "token_acc": 0.9828049435787212, "train_speed(iter/s)": 0.234011 }, { "epoch": 3.105038493787636, "grad_norm": 0.7612600326538086, "learning_rate": 3.144669150224885e-05, "loss": 0.08230015635490417, "memory(GiB)": 122.96, "step": 40735, "token_acc": 0.9734725312339552, "train_speed(iter/s)": 0.234016 }, { "epoch": 3.105419620397896, "grad_norm": 1.0903247594833374, "learning_rate": 3.143557341579304e-05, "loss": 0.06947977542877197, "memory(GiB)": 122.96, "step": 40740, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.234024 }, { "epoch": 3.105800747008156, "grad_norm": 0.9312857389450073, "learning_rate": 3.1424456393920956e-05, "loss": 0.04508732557296753, "memory(GiB)": 122.96, "step": 40745, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.234031 }, { "epoch": 3.106181873618416, "grad_norm": 0.7610155940055847, "learning_rate": 3.1413340437270075e-05, "loss": 0.09056665301322937, "memory(GiB)": 122.96, "step": 40750, "token_acc": 0.9660924750679963, "train_speed(iter/s)": 0.234039 }, { "epoch": 3.106563000228676, "grad_norm": 0.6402648687362671, "learning_rate": 3.140222554647788e-05, "loss": 0.09071980714797974, "memory(GiB)": 122.96, "step": 40755, "token_acc": 0.9714606381273048, "train_speed(iter/s)": 0.234043 }, { "epoch": 3.1069441268389357, "grad_norm": 1.206046223640442, "learning_rate": 3.139111172218175e-05, "loss": 0.03416937589645386, "memory(GiB)": 122.96, "step": 40760, "token_acc": 0.982690794649882, "train_speed(iter/s)": 0.234052 }, { "epoch": 3.1073252534491957, "grad_norm": 0.7276979088783264, "learning_rate": 3.1379998965019e-05, "loss": 0.04066526591777801, "memory(GiB)": 122.96, "step": 40765, "token_acc": 0.97756487994179, "train_speed(iter/s)": 0.234054 }, { "epoch": 3.1077063800594558, "grad_norm": 1.0220636129379272, "learning_rate": 3.13688872756269e-05, "loss": 0.06596564054489136, "memory(GiB)": 122.96, "step": 40770, "token_acc": 0.971147748890298, "train_speed(iter/s)": 0.234061 }, { "epoch": 3.108087506669716, "grad_norm": 1.1848812103271484, "learning_rate": 3.1357776654642655e-05, "loss": 0.05785113573074341, "memory(GiB)": 122.96, "step": 40775, "token_acc": 0.9758935993349959, "train_speed(iter/s)": 0.234072 }, { "epoch": 3.108468633279976, "grad_norm": 1.1563011407852173, "learning_rate": 3.134666710270342e-05, "loss": 0.07946353554725646, "memory(GiB)": 122.96, "step": 40780, "token_acc": 0.9743491577335375, "train_speed(iter/s)": 0.234074 }, { "epoch": 3.1088497598902354, "grad_norm": 0.6480773091316223, "learning_rate": 3.133555862044625e-05, "loss": 0.05938659310340881, "memory(GiB)": 122.96, "step": 40785, "token_acc": 0.9735085074305406, "train_speed(iter/s)": 0.23408 }, { "epoch": 3.1092308865004954, "grad_norm": 0.6247876882553101, "learning_rate": 3.1324451208508186e-05, "loss": 0.03987505733966827, "memory(GiB)": 122.96, "step": 40790, "token_acc": 0.9840704647676162, "train_speed(iter/s)": 0.234085 }, { "epoch": 3.1096120131107554, "grad_norm": 1.1854546070098877, "learning_rate": 3.131334486752618e-05, "loss": 0.051394641399383545, "memory(GiB)": 122.96, "step": 40795, "token_acc": 0.9741824440619621, "train_speed(iter/s)": 0.234095 }, { "epoch": 3.1099931397210154, "grad_norm": 1.30801260471344, "learning_rate": 3.130223959813713e-05, "loss": 0.04961842894554138, "memory(GiB)": 122.96, "step": 40800, "token_acc": 0.9765721331689272, "train_speed(iter/s)": 0.234101 }, { "epoch": 3.1099931397210154, "eval_loss": 0.07041678577661514, "eval_runtime": 221.6091, "eval_samples_per_second": 2.392, "eval_steps_per_second": 2.392, "eval_token_acc": 0.970815011143907, "step": 40800 }, { "epoch": 3.1103742663312755, "grad_norm": 1.104972004890442, "learning_rate": 3.1291135400977874e-05, "loss": 0.07187273502349853, "memory(GiB)": 122.96, "step": 40805, "token_acc": 0.970935585601914, "train_speed(iter/s)": 0.233809 }, { "epoch": 3.110755392941535, "grad_norm": 1.6173006296157837, "learning_rate": 3.1280032276685175e-05, "loss": 0.08159438967704773, "memory(GiB)": 122.96, "step": 40810, "token_acc": 0.971715755025713, "train_speed(iter/s)": 0.233815 }, { "epoch": 3.111136519551795, "grad_norm": 0.8091890215873718, "learning_rate": 3.126893022589574e-05, "loss": 0.046999281644821166, "memory(GiB)": 122.96, "step": 40815, "token_acc": 0.9812717770034843, "train_speed(iter/s)": 0.233822 }, { "epoch": 3.111517646162055, "grad_norm": 0.9577683210372925, "learning_rate": 3.1257829249246265e-05, "loss": 0.0512359619140625, "memory(GiB)": 122.96, "step": 40820, "token_acc": 0.9715726730857405, "train_speed(iter/s)": 0.23383 }, { "epoch": 3.111898772772315, "grad_norm": 0.9569116234779358, "learning_rate": 3.124672934737328e-05, "loss": 0.09420149326324463, "memory(GiB)": 122.96, "step": 40825, "token_acc": 0.9642381586456154, "train_speed(iter/s)": 0.233836 }, { "epoch": 3.112279899382575, "grad_norm": 3.6409029960632324, "learning_rate": 3.123563052091336e-05, "loss": 0.06678684949874877, "memory(GiB)": 122.96, "step": 40830, "token_acc": 0.9768247202983484, "train_speed(iter/s)": 0.233844 }, { "epoch": 3.1126610259928347, "grad_norm": 0.6608148813247681, "learning_rate": 3.122453277050296e-05, "loss": 0.04060501456260681, "memory(GiB)": 122.96, "step": 40835, "token_acc": 0.9817677368212445, "train_speed(iter/s)": 0.23385 }, { "epoch": 3.1130421526030947, "grad_norm": 0.848366379737854, "learning_rate": 3.1213436096778454e-05, "loss": 0.07134703993797302, "memory(GiB)": 122.96, "step": 40840, "token_acc": 0.9766364162965824, "train_speed(iter/s)": 0.233856 }, { "epoch": 3.1134232792133547, "grad_norm": 0.8616316318511963, "learning_rate": 3.1202340500376223e-05, "loss": 0.05601824522018432, "memory(GiB)": 122.96, "step": 40845, "token_acc": 0.9796385848816492, "train_speed(iter/s)": 0.233863 }, { "epoch": 3.1138044058236147, "grad_norm": 1.1332385540008545, "learning_rate": 3.119124598193253e-05, "loss": 0.053549349308013916, "memory(GiB)": 122.96, "step": 40850, "token_acc": 0.9769181789239095, "train_speed(iter/s)": 0.233867 }, { "epoch": 3.1141855324338747, "grad_norm": 0.8757167458534241, "learning_rate": 3.118015254208358e-05, "loss": 0.054135262966156006, "memory(GiB)": 122.96, "step": 40855, "token_acc": 0.9764862466725821, "train_speed(iter/s)": 0.233877 }, { "epoch": 3.1145666590441343, "grad_norm": 1.084897518157959, "learning_rate": 3.116906018146557e-05, "loss": 0.04940584897994995, "memory(GiB)": 122.96, "step": 40860, "token_acc": 0.977402668118704, "train_speed(iter/s)": 0.233883 }, { "epoch": 3.1149477856543943, "grad_norm": 1.1350955963134766, "learning_rate": 3.115796890071457e-05, "loss": 0.07470813989639283, "memory(GiB)": 122.96, "step": 40865, "token_acc": 0.9701393497013935, "train_speed(iter/s)": 0.233893 }, { "epoch": 3.1153289122646544, "grad_norm": 1.2562497854232788, "learning_rate": 3.1146878700466606e-05, "loss": 0.07117047309875488, "memory(GiB)": 122.96, "step": 40870, "token_acc": 0.9696342305037957, "train_speed(iter/s)": 0.233899 }, { "epoch": 3.1157100388749144, "grad_norm": 0.5867315530776978, "learning_rate": 3.1135789581357666e-05, "loss": 0.06778972148895264, "memory(GiB)": 122.96, "step": 40875, "token_acc": 0.9759068760151597, "train_speed(iter/s)": 0.233901 }, { "epoch": 3.116091165485174, "grad_norm": 0.000580324383918196, "learning_rate": 3.112470154402365e-05, "loss": 0.0697576105594635, "memory(GiB)": 122.96, "step": 40880, "token_acc": 0.9703055515707932, "train_speed(iter/s)": 0.233904 }, { "epoch": 3.116472292095434, "grad_norm": 1.7832475900650024, "learning_rate": 3.1113614589100415e-05, "loss": 0.10597015619277954, "memory(GiB)": 122.96, "step": 40885, "token_acc": 0.9693295292439372, "train_speed(iter/s)": 0.233914 }, { "epoch": 3.116853418705694, "grad_norm": 0.9005212187767029, "learning_rate": 3.1102528717223724e-05, "loss": 0.05830283164978027, "memory(GiB)": 122.96, "step": 40890, "token_acc": 0.9774624373956594, "train_speed(iter/s)": 0.233914 }, { "epoch": 3.117234545315954, "grad_norm": 1.7553929090499878, "learning_rate": 3.109144392902933e-05, "loss": 0.06409925818443299, "memory(GiB)": 122.96, "step": 40895, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.233923 }, { "epoch": 3.117615671926214, "grad_norm": 1.6358855962753296, "learning_rate": 3.1080360225152876e-05, "loss": 0.06992838382720948, "memory(GiB)": 122.96, "step": 40900, "token_acc": 0.9708228857989515, "train_speed(iter/s)": 0.233929 }, { "epoch": 3.1179967985364736, "grad_norm": 0.5959569811820984, "learning_rate": 3.1069277606229965e-05, "loss": 0.05138227343559265, "memory(GiB)": 122.96, "step": 40905, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.233935 }, { "epoch": 3.1183779251467336, "grad_norm": 0.7105233073234558, "learning_rate": 3.1058196072896126e-05, "loss": 0.08401997685432434, "memory(GiB)": 122.96, "step": 40910, "token_acc": 0.962173425228402, "train_speed(iter/s)": 0.233942 }, { "epoch": 3.1187590517569936, "grad_norm": 0.3886406123638153, "learning_rate": 3.104711562578686e-05, "loss": 0.04868377149105072, "memory(GiB)": 122.96, "step": 40915, "token_acc": 0.9782729805013928, "train_speed(iter/s)": 0.233949 }, { "epoch": 3.1191401783672537, "grad_norm": 0.5179718136787415, "learning_rate": 3.1036036265537535e-05, "loss": 0.05837680101394653, "memory(GiB)": 122.96, "step": 40920, "token_acc": 0.9818893783651493, "train_speed(iter/s)": 0.233955 }, { "epoch": 3.1195213049775137, "grad_norm": 0.6908483505249023, "learning_rate": 3.102495799278356e-05, "loss": 0.08188855051994323, "memory(GiB)": 122.96, "step": 40925, "token_acc": 0.9704840613931524, "train_speed(iter/s)": 0.233964 }, { "epoch": 3.1199024315877733, "grad_norm": 1.1881844997406006, "learning_rate": 3.101388080816017e-05, "loss": 0.07655901312828065, "memory(GiB)": 122.96, "step": 40930, "token_acc": 0.9707668090847763, "train_speed(iter/s)": 0.23397 }, { "epoch": 3.1202835581980333, "grad_norm": 1.0043611526489258, "learning_rate": 3.100280471230261e-05, "loss": 0.048185303807258606, "memory(GiB)": 122.96, "step": 40935, "token_acc": 0.9854473589651456, "train_speed(iter/s)": 0.233974 }, { "epoch": 3.1206646848082933, "grad_norm": 2.980212926864624, "learning_rate": 3.0991729705846036e-05, "loss": 0.07683063745498657, "memory(GiB)": 122.96, "step": 40940, "token_acc": 0.9735261401557286, "train_speed(iter/s)": 0.233979 }, { "epoch": 3.1210458114185533, "grad_norm": 1.590779423713684, "learning_rate": 3.098065578942556e-05, "loss": 0.054518884420394896, "memory(GiB)": 122.96, "step": 40945, "token_acc": 0.979075691411936, "train_speed(iter/s)": 0.233983 }, { "epoch": 3.1214269380288133, "grad_norm": 0.6149834990501404, "learning_rate": 3.09695829636762e-05, "loss": 0.0773003876209259, "memory(GiB)": 122.96, "step": 40950, "token_acc": 0.9744451916610626, "train_speed(iter/s)": 0.233989 }, { "epoch": 3.121808064639073, "grad_norm": 0.4793623089790344, "learning_rate": 3.095851122923296e-05, "loss": 0.058064723014831544, "memory(GiB)": 122.96, "step": 40955, "token_acc": 0.9802167555479099, "train_speed(iter/s)": 0.233997 }, { "epoch": 3.122189191249333, "grad_norm": 1.0694605112075806, "learning_rate": 3.0947440586730734e-05, "loss": 0.04200972318649292, "memory(GiB)": 122.96, "step": 40960, "token_acc": 0.9771241830065359, "train_speed(iter/s)": 0.234003 }, { "epoch": 3.122570317859593, "grad_norm": 1.7235950231552124, "learning_rate": 3.093637103680438e-05, "loss": 0.08496562242507935, "memory(GiB)": 122.96, "step": 40965, "token_acc": 0.9660460021905805, "train_speed(iter/s)": 0.234011 }, { "epoch": 3.122951444469853, "grad_norm": 1.1172890663146973, "learning_rate": 3.092530258008868e-05, "loss": 0.08178526163101196, "memory(GiB)": 122.96, "step": 40970, "token_acc": 0.9742236961109436, "train_speed(iter/s)": 0.234015 }, { "epoch": 3.123332571080113, "grad_norm": 1.2570968866348267, "learning_rate": 3.0914235217218366e-05, "loss": 0.05247173309326172, "memory(GiB)": 122.96, "step": 40975, "token_acc": 0.9787669957161482, "train_speed(iter/s)": 0.23402 }, { "epoch": 3.1237136976903725, "grad_norm": 1.2520493268966675, "learning_rate": 3.090316894882808e-05, "loss": 0.07378085255622864, "memory(GiB)": 122.96, "step": 40980, "token_acc": 0.9713763702801461, "train_speed(iter/s)": 0.234026 }, { "epoch": 3.1240948243006326, "grad_norm": 0.8613846302032471, "learning_rate": 3.0892103775552443e-05, "loss": 0.08498408198356629, "memory(GiB)": 122.96, "step": 40985, "token_acc": 0.9690824468085106, "train_speed(iter/s)": 0.23403 }, { "epoch": 3.1244759509108926, "grad_norm": 2.482330799102783, "learning_rate": 3.0881039698025986e-05, "loss": 0.11203494071960449, "memory(GiB)": 122.96, "step": 40990, "token_acc": 0.9577006507592191, "train_speed(iter/s)": 0.234037 }, { "epoch": 3.1248570775211526, "grad_norm": 1.1744680404663086, "learning_rate": 3.086997671688317e-05, "loss": 0.0530243992805481, "memory(GiB)": 122.96, "step": 40995, "token_acc": 0.9784244856999498, "train_speed(iter/s)": 0.234044 }, { "epoch": 3.1252382041314126, "grad_norm": 1.5945223569869995, "learning_rate": 3.0858914832758425e-05, "loss": 0.0846670150756836, "memory(GiB)": 122.96, "step": 41000, "token_acc": 0.9722479185938946, "train_speed(iter/s)": 0.23405 }, { "epoch": 3.1252382041314126, "eval_loss": 0.0689433142542839, "eval_runtime": 221.6041, "eval_samples_per_second": 2.392, "eval_steps_per_second": 2.392, "eval_token_acc": 0.970912896813445, "step": 41000 }, { "epoch": 3.125619330741672, "grad_norm": 1.2955076694488525, "learning_rate": 3.084785404628608e-05, "loss": 0.057130742073059085, "memory(GiB)": 122.96, "step": 41005, "token_acc": 0.9711582634661766, "train_speed(iter/s)": 0.233757 }, { "epoch": 3.126000457351932, "grad_norm": 0.6317728757858276, "learning_rate": 3.083679435810043e-05, "loss": 0.04344964027404785, "memory(GiB)": 122.96, "step": 41010, "token_acc": 0.9838129496402878, "train_speed(iter/s)": 0.233765 }, { "epoch": 3.1263815839621922, "grad_norm": 0.40359827876091003, "learning_rate": 3.082573576883571e-05, "loss": 0.056244826316833495, "memory(GiB)": 122.96, "step": 41015, "token_acc": 0.9750857142857143, "train_speed(iter/s)": 0.233771 }, { "epoch": 3.1267627105724523, "grad_norm": 1.3705552816390991, "learning_rate": 3.0814678279126055e-05, "loss": 0.06375553607940673, "memory(GiB)": 122.96, "step": 41020, "token_acc": 0.9740619491312013, "train_speed(iter/s)": 0.233778 }, { "epoch": 3.1271438371827123, "grad_norm": 1.5133610963821411, "learning_rate": 3.080362188960556e-05, "loss": 0.06079742908477783, "memory(GiB)": 122.96, "step": 41025, "token_acc": 0.9711243088265411, "train_speed(iter/s)": 0.233785 }, { "epoch": 3.127524963792972, "grad_norm": 5.296872615814209, "learning_rate": 3.079256660090827e-05, "loss": 0.06026759147644043, "memory(GiB)": 122.96, "step": 41030, "token_acc": 0.9785488958990536, "train_speed(iter/s)": 0.233793 }, { "epoch": 3.127906090403232, "grad_norm": 2.393446683883667, "learning_rate": 3.078151241366816e-05, "loss": 0.08776463270187378, "memory(GiB)": 122.96, "step": 41035, "token_acc": 0.9666076957098629, "train_speed(iter/s)": 0.233799 }, { "epoch": 3.128287217013492, "grad_norm": 1.828258752822876, "learning_rate": 3.077045932851913e-05, "loss": 0.06124182939529419, "memory(GiB)": 122.96, "step": 41040, "token_acc": 0.976, "train_speed(iter/s)": 0.233806 }, { "epoch": 3.128668343623752, "grad_norm": 2.193493604660034, "learning_rate": 3.0759407346095014e-05, "loss": 0.09893736243247986, "memory(GiB)": 122.96, "step": 41045, "token_acc": 0.9706264199935086, "train_speed(iter/s)": 0.233809 }, { "epoch": 3.129049470234012, "grad_norm": 1.6973962783813477, "learning_rate": 3.0748356467029605e-05, "loss": 0.10050392150878906, "memory(GiB)": 122.96, "step": 41050, "token_acc": 0.9626126126126127, "train_speed(iter/s)": 0.233814 }, { "epoch": 3.1294305968442715, "grad_norm": 2.2159523963928223, "learning_rate": 3.0737306691956615e-05, "loss": 0.08656326532363892, "memory(GiB)": 122.96, "step": 41055, "token_acc": 0.9706867671691792, "train_speed(iter/s)": 0.233823 }, { "epoch": 3.1298117234545315, "grad_norm": 0.8030562400817871, "learning_rate": 3.072625802150968e-05, "loss": 0.052810651063919065, "memory(GiB)": 122.96, "step": 41060, "token_acc": 0.9780252859723059, "train_speed(iter/s)": 0.23383 }, { "epoch": 3.1301928500647915, "grad_norm": 0.8524441123008728, "learning_rate": 3.0715210456322427e-05, "loss": 0.03762415945529938, "memory(GiB)": 122.96, "step": 41065, "token_acc": 0.9838568935427574, "train_speed(iter/s)": 0.233837 }, { "epoch": 3.1305739766750516, "grad_norm": 2.027395725250244, "learning_rate": 3.0704163997028356e-05, "loss": 0.04519851207733154, "memory(GiB)": 122.96, "step": 41070, "token_acc": 0.982678338610542, "train_speed(iter/s)": 0.233842 }, { "epoch": 3.1309551032853116, "grad_norm": 2.054636001586914, "learning_rate": 3.0693118644260926e-05, "loss": 0.06948146224021912, "memory(GiB)": 122.96, "step": 41075, "token_acc": 0.9768955418158152, "train_speed(iter/s)": 0.233846 }, { "epoch": 3.131336229895571, "grad_norm": 4.334886074066162, "learning_rate": 3.068207439865356e-05, "loss": 0.07127683758735656, "memory(GiB)": 122.96, "step": 41080, "token_acc": 0.9658119658119658, "train_speed(iter/s)": 0.233856 }, { "epoch": 3.131717356505831, "grad_norm": 0.8018355965614319, "learning_rate": 3.067103126083956e-05, "loss": 0.04780671000480652, "memory(GiB)": 122.96, "step": 41085, "token_acc": 0.9824504737295435, "train_speed(iter/s)": 0.233858 }, { "epoch": 3.132098483116091, "grad_norm": 1.9356883764266968, "learning_rate": 3.065998923145224e-05, "loss": 0.07857391238212585, "memory(GiB)": 122.96, "step": 41090, "token_acc": 0.9696609161213563, "train_speed(iter/s)": 0.233862 }, { "epoch": 3.132479609726351, "grad_norm": 0.7646064758300781, "learning_rate": 3.0648948311124785e-05, "loss": 0.10466885566711426, "memory(GiB)": 122.96, "step": 41095, "token_acc": 0.9604834068674468, "train_speed(iter/s)": 0.233866 }, { "epoch": 3.132860736336611, "grad_norm": 1.8019771575927734, "learning_rate": 3.0637908500490344e-05, "loss": 0.08575916290283203, "memory(GiB)": 122.96, "step": 41100, "token_acc": 0.9697986577181208, "train_speed(iter/s)": 0.233874 }, { "epoch": 3.133241862946871, "grad_norm": 0.9144595861434937, "learning_rate": 3.0626869800182e-05, "loss": 0.05940815806388855, "memory(GiB)": 122.96, "step": 41105, "token_acc": 0.9796019900497512, "train_speed(iter/s)": 0.23388 }, { "epoch": 3.133622989557131, "grad_norm": 0.8346425890922546, "learning_rate": 3.0615832210832775e-05, "loss": 0.04400671124458313, "memory(GiB)": 122.96, "step": 41110, "token_acc": 0.9765124555160143, "train_speed(iter/s)": 0.233888 }, { "epoch": 3.134004116167391, "grad_norm": 0.7869294285774231, "learning_rate": 3.060479573307561e-05, "loss": 0.09113326668739319, "memory(GiB)": 122.96, "step": 41115, "token_acc": 0.9632518374081296, "train_speed(iter/s)": 0.233894 }, { "epoch": 3.134385242777651, "grad_norm": 1.6671888828277588, "learning_rate": 3.0593760367543414e-05, "loss": 0.09717616438865662, "memory(GiB)": 122.96, "step": 41120, "token_acc": 0.9565001242853591, "train_speed(iter/s)": 0.233901 }, { "epoch": 3.134766369387911, "grad_norm": 0.938633382320404, "learning_rate": 3.0582726114868996e-05, "loss": 0.053297394514083864, "memory(GiB)": 122.96, "step": 41125, "token_acc": 0.9678751720972923, "train_speed(iter/s)": 0.233909 }, { "epoch": 3.1351474959981704, "grad_norm": 1.7748560905456543, "learning_rate": 3.057169297568513e-05, "loss": 0.10676794052124024, "memory(GiB)": 122.96, "step": 41130, "token_acc": 0.9568640876412188, "train_speed(iter/s)": 0.233917 }, { "epoch": 3.1355286226084305, "grad_norm": 1.1242393255233765, "learning_rate": 3.056066095062452e-05, "loss": 0.05804198980331421, "memory(GiB)": 122.96, "step": 41135, "token_acc": 0.9754750331418471, "train_speed(iter/s)": 0.233923 }, { "epoch": 3.1359097492186905, "grad_norm": 1.2128175497055054, "learning_rate": 3.054963004031979e-05, "loss": 0.05337294936180115, "memory(GiB)": 122.96, "step": 41140, "token_acc": 0.9807971014492753, "train_speed(iter/s)": 0.233927 }, { "epoch": 3.1362908758289505, "grad_norm": 0.21051447093486786, "learning_rate": 3.053860024540352e-05, "loss": 0.0401511013507843, "memory(GiB)": 122.96, "step": 41145, "token_acc": 0.9878603945371776, "train_speed(iter/s)": 0.233933 }, { "epoch": 3.1366720024392105, "grad_norm": 0.9152271151542664, "learning_rate": 3.052757156650821e-05, "loss": 0.044893139600753786, "memory(GiB)": 122.96, "step": 41150, "token_acc": 0.9838065194532072, "train_speed(iter/s)": 0.233938 }, { "epoch": 3.13705312904947, "grad_norm": 1.4421234130859375, "learning_rate": 3.051654400426631e-05, "loss": 0.051659798622131346, "memory(GiB)": 122.96, "step": 41155, "token_acc": 0.9764499121265378, "train_speed(iter/s)": 0.233946 }, { "epoch": 3.13743425565973, "grad_norm": 1.3432954549789429, "learning_rate": 3.0505517559310205e-05, "loss": 0.04382171332836151, "memory(GiB)": 122.96, "step": 41160, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.233955 }, { "epoch": 3.13781538226999, "grad_norm": 1.2426396608352661, "learning_rate": 3.0494492232272188e-05, "loss": 0.05401742458343506, "memory(GiB)": 122.96, "step": 41165, "token_acc": 0.9803803199517054, "train_speed(iter/s)": 0.233957 }, { "epoch": 3.13819650888025, "grad_norm": 2.2157418727874756, "learning_rate": 3.0483468023784532e-05, "loss": 0.06951776742935181, "memory(GiB)": 122.96, "step": 41170, "token_acc": 0.9693273542600896, "train_speed(iter/s)": 0.233963 }, { "epoch": 3.1385776354905097, "grad_norm": 1.059841275215149, "learning_rate": 3.0472444934479416e-05, "loss": 0.08610001802444459, "memory(GiB)": 122.96, "step": 41175, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.233969 }, { "epoch": 3.1389587621007697, "grad_norm": 0.5356501340866089, "learning_rate": 3.0461422964988963e-05, "loss": 0.06430829763412475, "memory(GiB)": 122.96, "step": 41180, "token_acc": 0.9760335530257639, "train_speed(iter/s)": 0.233969 }, { "epoch": 3.1393398887110298, "grad_norm": 1.2409876585006714, "learning_rate": 3.0450402115945232e-05, "loss": 0.09535632133483887, "memory(GiB)": 122.96, "step": 41185, "token_acc": 0.962272396212673, "train_speed(iter/s)": 0.233974 }, { "epoch": 3.1397210153212898, "grad_norm": 0.9350152611732483, "learning_rate": 3.0439382387980226e-05, "loss": 0.0920930802822113, "memory(GiB)": 122.96, "step": 41190, "token_acc": 0.9685185185185186, "train_speed(iter/s)": 0.23398 }, { "epoch": 3.14010214193155, "grad_norm": 1.5308011770248413, "learning_rate": 3.0428363781725854e-05, "loss": 0.07395251989364623, "memory(GiB)": 122.96, "step": 41195, "token_acc": 0.9721254355400697, "train_speed(iter/s)": 0.233988 }, { "epoch": 3.14048326854181, "grad_norm": 1.7124793529510498, "learning_rate": 3.041734629781401e-05, "loss": 0.08258944153785705, "memory(GiB)": 122.96, "step": 41200, "token_acc": 0.9590407470288624, "train_speed(iter/s)": 0.233994 }, { "epoch": 3.14048326854181, "eval_loss": 0.06769044697284698, "eval_runtime": 220.0738, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "eval_token_acc": 0.9711538461538461, "step": 41200 }, { "epoch": 3.1408643951520694, "grad_norm": 0.6931485533714294, "learning_rate": 3.0406329936876475e-05, "loss": 0.06759366989135743, "memory(GiB)": 122.96, "step": 41205, "token_acc": 0.9713930699500155, "train_speed(iter/s)": 0.233703 }, { "epoch": 3.1412455217623294, "grad_norm": 0.7603834271430969, "learning_rate": 3.0395314699544997e-05, "loss": 0.10035171508789062, "memory(GiB)": 122.96, "step": 41210, "token_acc": 0.9664608710161855, "train_speed(iter/s)": 0.233707 }, { "epoch": 3.1416266483725894, "grad_norm": 0.7873270511627197, "learning_rate": 3.038430058645122e-05, "loss": 0.04833589196205139, "memory(GiB)": 122.96, "step": 41215, "token_acc": 0.9786969643174153, "train_speed(iter/s)": 0.233713 }, { "epoch": 3.1420077749828494, "grad_norm": 0.9613159894943237, "learning_rate": 3.0373287598226784e-05, "loss": 0.06658474802970886, "memory(GiB)": 122.96, "step": 41220, "token_acc": 0.9721208582688659, "train_speed(iter/s)": 0.233716 }, { "epoch": 3.142388901593109, "grad_norm": 0.5019503235816956, "learning_rate": 3.0362275735503242e-05, "loss": 0.05706588625907898, "memory(GiB)": 122.96, "step": 41225, "token_acc": 0.976203551162365, "train_speed(iter/s)": 0.233722 }, { "epoch": 3.142770028203369, "grad_norm": 0.9879562258720398, "learning_rate": 3.0351264998912053e-05, "loss": 0.06776690483093262, "memory(GiB)": 122.96, "step": 41230, "token_acc": 0.9668774966711052, "train_speed(iter/s)": 0.233725 }, { "epoch": 3.143151154813629, "grad_norm": 0.5349322557449341, "learning_rate": 3.0340255389084634e-05, "loss": 0.06462484002113342, "memory(GiB)": 122.96, "step": 41235, "token_acc": 0.9790996784565916, "train_speed(iter/s)": 0.233732 }, { "epoch": 3.143532281423889, "grad_norm": 1.471511960029602, "learning_rate": 3.0329246906652337e-05, "loss": 0.06500527858734131, "memory(GiB)": 122.96, "step": 41240, "token_acc": 0.971702418986764, "train_speed(iter/s)": 0.233739 }, { "epoch": 3.143913408034149, "grad_norm": 0.8499729633331299, "learning_rate": 3.0318239552246448e-05, "loss": 0.04652838110923767, "memory(GiB)": 122.96, "step": 41245, "token_acc": 0.9805529075309819, "train_speed(iter/s)": 0.233743 }, { "epoch": 3.1442945346444087, "grad_norm": 1.6324365139007568, "learning_rate": 3.0307233326498174e-05, "loss": 0.06890680789947509, "memory(GiB)": 122.96, "step": 41250, "token_acc": 0.9740932642487047, "train_speed(iter/s)": 0.233749 }, { "epoch": 3.1446756612546687, "grad_norm": 1.3680241107940674, "learning_rate": 3.029622823003869e-05, "loss": 0.07449865341186523, "memory(GiB)": 122.96, "step": 41255, "token_acc": 0.9691166321601105, "train_speed(iter/s)": 0.233753 }, { "epoch": 3.1450567878649287, "grad_norm": 1.0384217500686646, "learning_rate": 3.028522426349909e-05, "loss": 0.08250809311866761, "memory(GiB)": 122.96, "step": 41260, "token_acc": 0.9618287373004354, "train_speed(iter/s)": 0.233757 }, { "epoch": 3.1454379144751887, "grad_norm": 0.41949376463890076, "learning_rate": 3.0274221427510386e-05, "loss": 0.05253629088401794, "memory(GiB)": 122.96, "step": 41265, "token_acc": 0.9800214056368177, "train_speed(iter/s)": 0.233761 }, { "epoch": 3.1458190410854487, "grad_norm": 0.8782514333724976, "learning_rate": 3.026321972270354e-05, "loss": 0.07180822491645814, "memory(GiB)": 122.96, "step": 41270, "token_acc": 0.9712936046511628, "train_speed(iter/s)": 0.233766 }, { "epoch": 3.1462001676957083, "grad_norm": 1.048801302909851, "learning_rate": 3.0252219149709455e-05, "loss": 0.06300634145736694, "memory(GiB)": 122.96, "step": 41275, "token_acc": 0.9719656992084432, "train_speed(iter/s)": 0.233771 }, { "epoch": 3.1465812943059683, "grad_norm": 1.2616082429885864, "learning_rate": 3.0241219709158965e-05, "loss": 0.06545564532279968, "memory(GiB)": 122.96, "step": 41280, "token_acc": 0.9743235236026071, "train_speed(iter/s)": 0.233776 }, { "epoch": 3.1469624209162284, "grad_norm": 0.8236479163169861, "learning_rate": 3.0230221401682822e-05, "loss": 0.07151327729225158, "memory(GiB)": 122.96, "step": 41285, "token_acc": 0.9748693704901716, "train_speed(iter/s)": 0.233782 }, { "epoch": 3.1473435475264884, "grad_norm": 0.8467496633529663, "learning_rate": 3.0219224227911747e-05, "loss": 0.05326000452041626, "memory(GiB)": 122.96, "step": 41290, "token_acc": 0.9806903991370011, "train_speed(iter/s)": 0.233783 }, { "epoch": 3.1477246741367484, "grad_norm": 1.935588002204895, "learning_rate": 3.0208228188476374e-05, "loss": 0.09105112552642822, "memory(GiB)": 122.96, "step": 41295, "token_acc": 0.9649357601713062, "train_speed(iter/s)": 0.233789 }, { "epoch": 3.148105800747008, "grad_norm": 0.9834100008010864, "learning_rate": 3.0197233284007254e-05, "loss": 0.049700969457626344, "memory(GiB)": 122.96, "step": 41300, "token_acc": 0.9791364821790786, "train_speed(iter/s)": 0.233796 }, { "epoch": 3.148486927357268, "grad_norm": 1.1245007514953613, "learning_rate": 3.0186239515134917e-05, "loss": 0.07985422015190125, "memory(GiB)": 122.96, "step": 41305, "token_acc": 0.9684287812041116, "train_speed(iter/s)": 0.233804 }, { "epoch": 3.148868053967528, "grad_norm": 1.973496437072754, "learning_rate": 3.01752468824898e-05, "loss": 0.07230067253112793, "memory(GiB)": 122.96, "step": 41310, "token_acc": 0.967677440853049, "train_speed(iter/s)": 0.233812 }, { "epoch": 3.149249180577788, "grad_norm": 1.0982078313827515, "learning_rate": 3.0164255386702266e-05, "loss": 0.05986018180847168, "memory(GiB)": 122.96, "step": 41315, "token_acc": 0.9770057485628593, "train_speed(iter/s)": 0.233813 }, { "epoch": 3.149630307188048, "grad_norm": 0.5968378782272339, "learning_rate": 3.0153265028402643e-05, "loss": 0.05105769634246826, "memory(GiB)": 122.96, "step": 41320, "token_acc": 0.9772988978450403, "train_speed(iter/s)": 0.233819 }, { "epoch": 3.1500114337983076, "grad_norm": 1.1003705263137817, "learning_rate": 3.0142275808221175e-05, "loss": 0.05596068501472473, "memory(GiB)": 122.96, "step": 41325, "token_acc": 0.9763434579439252, "train_speed(iter/s)": 0.233823 }, { "epoch": 3.1503925604085676, "grad_norm": 1.6825363636016846, "learning_rate": 3.0131287726788037e-05, "loss": 0.049200701713562014, "memory(GiB)": 122.96, "step": 41330, "token_acc": 0.9781965425946114, "train_speed(iter/s)": 0.233826 }, { "epoch": 3.1507736870188277, "grad_norm": 0.8921375870704651, "learning_rate": 3.0120300784733335e-05, "loss": 0.08356298208236694, "memory(GiB)": 122.96, "step": 41335, "token_acc": 0.9677571193221934, "train_speed(iter/s)": 0.233834 }, { "epoch": 3.1511548136290877, "grad_norm": 0.6563031077384949, "learning_rate": 3.0109314982687142e-05, "loss": 0.04998570680618286, "memory(GiB)": 122.96, "step": 41340, "token_acc": 0.9752130131680867, "train_speed(iter/s)": 0.233843 }, { "epoch": 3.1515359402393477, "grad_norm": 1.4205410480499268, "learning_rate": 3.0098330321279432e-05, "loss": 0.06513535380363464, "memory(GiB)": 122.96, "step": 41345, "token_acc": 0.9752332485156913, "train_speed(iter/s)": 0.233849 }, { "epoch": 3.1519170668496073, "grad_norm": 1.3406301736831665, "learning_rate": 3.0087346801140104e-05, "loss": 0.09504803419113159, "memory(GiB)": 122.96, "step": 41350, "token_acc": 0.9653831194087904, "train_speed(iter/s)": 0.233858 }, { "epoch": 3.1522981934598673, "grad_norm": 0.8556622862815857, "learning_rate": 3.0076364422899034e-05, "loss": 0.05497164726257324, "memory(GiB)": 122.96, "step": 41355, "token_acc": 0.9800890138205669, "train_speed(iter/s)": 0.233859 }, { "epoch": 3.1526793200701273, "grad_norm": 0.37950411438941956, "learning_rate": 3.0065383187186023e-05, "loss": 0.05001155138015747, "memory(GiB)": 122.96, "step": 41360, "token_acc": 0.9802559912854031, "train_speed(iter/s)": 0.233861 }, { "epoch": 3.1530604466803873, "grad_norm": 0.839168906211853, "learning_rate": 3.0054403094630778e-05, "loss": 0.046772506833076474, "memory(GiB)": 122.96, "step": 41365, "token_acc": 0.9765400115141047, "train_speed(iter/s)": 0.233866 }, { "epoch": 3.1534415732906473, "grad_norm": 1.3182730674743652, "learning_rate": 3.0043424145862953e-05, "loss": 0.05238469243049622, "memory(GiB)": 122.96, "step": 41370, "token_acc": 0.9783412572636028, "train_speed(iter/s)": 0.233873 }, { "epoch": 3.153822699900907, "grad_norm": 0.9910061359405518, "learning_rate": 3.0032446341512134e-05, "loss": 0.06720551252365112, "memory(GiB)": 122.96, "step": 41375, "token_acc": 0.9741091314031181, "train_speed(iter/s)": 0.233881 }, { "epoch": 3.154203826511167, "grad_norm": 1.1701642274856567, "learning_rate": 3.002146968220787e-05, "loss": 0.05850543975830078, "memory(GiB)": 122.96, "step": 41380, "token_acc": 0.9786673058485139, "train_speed(iter/s)": 0.233884 }, { "epoch": 3.154584953121427, "grad_norm": 1.1980109214782715, "learning_rate": 3.0010494168579604e-05, "loss": 0.057343071699142455, "memory(GiB)": 122.96, "step": 41385, "token_acc": 0.9680067950169875, "train_speed(iter/s)": 0.233892 }, { "epoch": 3.154966079731687, "grad_norm": 1.472232460975647, "learning_rate": 2.9999519801256727e-05, "loss": 0.09713571071624756, "memory(GiB)": 122.96, "step": 41390, "token_acc": 0.9656340755082284, "train_speed(iter/s)": 0.233899 }, { "epoch": 3.155347206341947, "grad_norm": 0.47783902287483215, "learning_rate": 2.9988546580868583e-05, "loss": 0.06012204885482788, "memory(GiB)": 122.96, "step": 41395, "token_acc": 0.9800693240901213, "train_speed(iter/s)": 0.233901 }, { "epoch": 3.1557283329522066, "grad_norm": 0.5391678214073181, "learning_rate": 2.9977574508044437e-05, "loss": 0.051166027784347534, "memory(GiB)": 122.96, "step": 41400, "token_acc": 0.9823733862959285, "train_speed(iter/s)": 0.233909 }, { "epoch": 3.1557283329522066, "eval_loss": 0.06848479807376862, "eval_runtime": 220.5993, "eval_samples_per_second": 2.403, "eval_steps_per_second": 2.403, "eval_token_acc": 0.9718013975061743, "step": 41400 }, { "epoch": 3.1561094595624666, "grad_norm": 1.6586476564407349, "learning_rate": 2.9966603583413455e-05, "loss": 0.07701042890548707, "memory(GiB)": 122.96, "step": 41405, "token_acc": 0.9715322396649813, "train_speed(iter/s)": 0.233625 }, { "epoch": 3.1564905861727266, "grad_norm": 2.020293951034546, "learning_rate": 2.995563380760481e-05, "loss": 0.06349784135818481, "memory(GiB)": 122.96, "step": 41410, "token_acc": 0.970162124870645, "train_speed(iter/s)": 0.233632 }, { "epoch": 3.1568717127829866, "grad_norm": 1.1042976379394531, "learning_rate": 2.9944665181247543e-05, "loss": 0.06132028102874756, "memory(GiB)": 122.96, "step": 41415, "token_acc": 0.9740932642487047, "train_speed(iter/s)": 0.233637 }, { "epoch": 3.1572528393932466, "grad_norm": 1.6652374267578125, "learning_rate": 2.9933697704970654e-05, "loss": 0.08311035037040711, "memory(GiB)": 122.96, "step": 41420, "token_acc": 0.9758149316508938, "train_speed(iter/s)": 0.233644 }, { "epoch": 3.157633966003506, "grad_norm": 1.0442605018615723, "learning_rate": 2.992273137940309e-05, "loss": 0.05910235643386841, "memory(GiB)": 122.96, "step": 41425, "token_acc": 0.9742698191933241, "train_speed(iter/s)": 0.233649 }, { "epoch": 3.1580150926137662, "grad_norm": 2.1014113426208496, "learning_rate": 2.991176620517372e-05, "loss": 0.0698774516582489, "memory(GiB)": 122.96, "step": 41430, "token_acc": 0.9770612421553776, "train_speed(iter/s)": 0.233655 }, { "epoch": 3.1583962192240262, "grad_norm": 1.0567227602005005, "learning_rate": 2.9900802182911326e-05, "loss": 0.0701134204864502, "memory(GiB)": 122.96, "step": 41435, "token_acc": 0.9774258760107817, "train_speed(iter/s)": 0.233664 }, { "epoch": 3.1587773458342863, "grad_norm": 0.7786219716072083, "learning_rate": 2.988983931324465e-05, "loss": 0.07259177565574645, "memory(GiB)": 122.96, "step": 41440, "token_acc": 0.9715693707354056, "train_speed(iter/s)": 0.233673 }, { "epoch": 3.1591584724445463, "grad_norm": 0.5490391254425049, "learning_rate": 2.987887759680238e-05, "loss": 0.04783483147621155, "memory(GiB)": 122.96, "step": 41445, "token_acc": 0.9842549439476005, "train_speed(iter/s)": 0.233675 }, { "epoch": 3.159539599054806, "grad_norm": 0.5900206565856934, "learning_rate": 2.98679170342131e-05, "loss": 0.060641562938690184, "memory(GiB)": 122.96, "step": 41450, "token_acc": 0.9774127310061602, "train_speed(iter/s)": 0.233679 }, { "epoch": 3.159920725665066, "grad_norm": 1.441440463066101, "learning_rate": 2.9856957626105346e-05, "loss": 0.07627586126327515, "memory(GiB)": 122.96, "step": 41455, "token_acc": 0.9694505494505494, "train_speed(iter/s)": 0.233685 }, { "epoch": 3.160301852275326, "grad_norm": 0.2709362208843231, "learning_rate": 2.9845999373107614e-05, "loss": 0.05195193886756897, "memory(GiB)": 122.96, "step": 41460, "token_acc": 0.9784145887606996, "train_speed(iter/s)": 0.233694 }, { "epoch": 3.160682978885586, "grad_norm": 1.6669529676437378, "learning_rate": 2.983504227584828e-05, "loss": 0.10714271068572997, "memory(GiB)": 122.96, "step": 41465, "token_acc": 0.9558270676691729, "train_speed(iter/s)": 0.233703 }, { "epoch": 3.1610641054958455, "grad_norm": 0.9114015698432922, "learning_rate": 2.9824086334955692e-05, "loss": 0.06858885288238525, "memory(GiB)": 122.96, "step": 41470, "token_acc": 0.9707773232028054, "train_speed(iter/s)": 0.233709 }, { "epoch": 3.1614452321061055, "grad_norm": 0.7932776808738708, "learning_rate": 2.9813131551058133e-05, "loss": 0.038591507077217105, "memory(GiB)": 122.96, "step": 41475, "token_acc": 0.9846519276448018, "train_speed(iter/s)": 0.233714 }, { "epoch": 3.1618263587163655, "grad_norm": 1.2704881429672241, "learning_rate": 2.9802177924783803e-05, "loss": 0.06561845541000366, "memory(GiB)": 122.96, "step": 41480, "token_acc": 0.9748110831234257, "train_speed(iter/s)": 0.233721 }, { "epoch": 3.1622074853266255, "grad_norm": 0.8512484431266785, "learning_rate": 2.9791225456760818e-05, "loss": 0.07014963626861573, "memory(GiB)": 122.96, "step": 41485, "token_acc": 0.971597874948917, "train_speed(iter/s)": 0.233726 }, { "epoch": 3.1625886119368856, "grad_norm": 1.2230534553527832, "learning_rate": 2.9780274147617293e-05, "loss": 0.08682125210762023, "memory(GiB)": 122.96, "step": 41490, "token_acc": 0.9709202219246221, "train_speed(iter/s)": 0.233731 }, { "epoch": 3.1629697385471456, "grad_norm": 1.1891520023345947, "learning_rate": 2.97693239979812e-05, "loss": 0.07740952968597412, "memory(GiB)": 122.96, "step": 41495, "token_acc": 0.9677900387712496, "train_speed(iter/s)": 0.233739 }, { "epoch": 3.163350865157405, "grad_norm": 1.4557856321334839, "learning_rate": 2.975837500848051e-05, "loss": 0.04073510468006134, "memory(GiB)": 122.96, "step": 41500, "token_acc": 0.9851280120481928, "train_speed(iter/s)": 0.233745 }, { "epoch": 3.163731991767665, "grad_norm": 0.6580830216407776, "learning_rate": 2.974742717974308e-05, "loss": 0.05300735235214234, "memory(GiB)": 122.96, "step": 41505, "token_acc": 0.9713842058562555, "train_speed(iter/s)": 0.23375 }, { "epoch": 3.164113118377925, "grad_norm": 0.9973852038383484, "learning_rate": 2.973648051239671e-05, "loss": 0.06453022360801697, "memory(GiB)": 122.96, "step": 41510, "token_acc": 0.9722010080953108, "train_speed(iter/s)": 0.233753 }, { "epoch": 3.164494244988185, "grad_norm": 1.146131992340088, "learning_rate": 2.9725535007069148e-05, "loss": 0.06635384559631348, "memory(GiB)": 122.96, "step": 41515, "token_acc": 0.9619283065512979, "train_speed(iter/s)": 0.233761 }, { "epoch": 3.164875371598445, "grad_norm": 1.5216649770736694, "learning_rate": 2.971459066438808e-05, "loss": 0.0736556589603424, "memory(GiB)": 122.96, "step": 41520, "token_acc": 0.9651094027202839, "train_speed(iter/s)": 0.23377 }, { "epoch": 3.165256498208705, "grad_norm": 1.0645077228546143, "learning_rate": 2.970364748498109e-05, "loss": 0.07407851815223694, "memory(GiB)": 122.96, "step": 41525, "token_acc": 0.9780263683579704, "train_speed(iter/s)": 0.233772 }, { "epoch": 3.165637624818965, "grad_norm": 0.8308607339859009, "learning_rate": 2.9692705469475734e-05, "loss": 0.06713278889656067, "memory(GiB)": 122.96, "step": 41530, "token_acc": 0.9723786066150598, "train_speed(iter/s)": 0.233777 }, { "epoch": 3.166018751429225, "grad_norm": 1.037408471107483, "learning_rate": 2.9681764618499486e-05, "loss": 0.04068241715431213, "memory(GiB)": 122.96, "step": 41535, "token_acc": 0.9786076186901858, "train_speed(iter/s)": 0.233782 }, { "epoch": 3.166399878039485, "grad_norm": 0.5918450951576233, "learning_rate": 2.967082493267975e-05, "loss": 0.06180088520050049, "memory(GiB)": 122.96, "step": 41540, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.233783 }, { "epoch": 3.166781004649745, "grad_norm": 0.7628582119941711, "learning_rate": 2.9659886412643856e-05, "loss": 0.05979889035224915, "memory(GiB)": 122.96, "step": 41545, "token_acc": 0.9741100323624595, "train_speed(iter/s)": 0.233789 }, { "epoch": 3.1671621312600045, "grad_norm": 1.070630431175232, "learning_rate": 2.9648949059019095e-05, "loss": 0.06691375374794006, "memory(GiB)": 122.96, "step": 41550, "token_acc": 0.9726292507602986, "train_speed(iter/s)": 0.233797 }, { "epoch": 3.1675432578702645, "grad_norm": 0.851128101348877, "learning_rate": 2.9638012872432663e-05, "loss": 0.0429253876209259, "memory(GiB)": 122.96, "step": 41555, "token_acc": 0.9775474956822107, "train_speed(iter/s)": 0.233806 }, { "epoch": 3.1679243844805245, "grad_norm": 0.3952985405921936, "learning_rate": 2.9627077853511692e-05, "loss": 0.03765738904476166, "memory(GiB)": 122.96, "step": 41560, "token_acc": 0.9830451000339098, "train_speed(iter/s)": 0.233814 }, { "epoch": 3.1683055110907845, "grad_norm": 0.6665393710136414, "learning_rate": 2.9616144002883273e-05, "loss": 0.05728347301483154, "memory(GiB)": 122.96, "step": 41565, "token_acc": 0.9820014398848093, "train_speed(iter/s)": 0.233824 }, { "epoch": 3.168686637701044, "grad_norm": 1.4150234460830688, "learning_rate": 2.9605211321174408e-05, "loss": 0.0659214735031128, "memory(GiB)": 122.96, "step": 41570, "token_acc": 0.9751257685858021, "train_speed(iter/s)": 0.233831 }, { "epoch": 3.169067764311304, "grad_norm": 1.4562711715698242, "learning_rate": 2.9594279809012015e-05, "loss": 0.0645095944404602, "memory(GiB)": 122.96, "step": 41575, "token_acc": 0.9799242424242425, "train_speed(iter/s)": 0.23384 }, { "epoch": 3.169448890921564, "grad_norm": 0.15288378298282623, "learning_rate": 2.9583349467022992e-05, "loss": 0.05517424345016479, "memory(GiB)": 122.96, "step": 41580, "token_acc": 0.9730476848652384, "train_speed(iter/s)": 0.233851 }, { "epoch": 3.169830017531824, "grad_norm": 0.6692416667938232, "learning_rate": 2.9572420295834137e-05, "loss": 0.06157075166702271, "memory(GiB)": 122.96, "step": 41585, "token_acc": 0.9735305566368236, "train_speed(iter/s)": 0.233856 }, { "epoch": 3.170211144142084, "grad_norm": 1.4360891580581665, "learning_rate": 2.9561492296072167e-05, "loss": 0.05766244530677796, "memory(GiB)": 122.96, "step": 41590, "token_acc": 0.9764007728401877, "train_speed(iter/s)": 0.233862 }, { "epoch": 3.1705922707523437, "grad_norm": 2.032360553741455, "learning_rate": 2.9550565468363777e-05, "loss": 0.08509247303009033, "memory(GiB)": 122.96, "step": 41595, "token_acc": 0.9676682481110526, "train_speed(iter/s)": 0.233868 }, { "epoch": 3.1709733973626038, "grad_norm": 0.606160044670105, "learning_rate": 2.9539639813335562e-05, "loss": 0.07540404200553893, "memory(GiB)": 122.96, "step": 41600, "token_acc": 0.9760080936551525, "train_speed(iter/s)": 0.233872 }, { "epoch": 3.1709733973626038, "eval_loss": 0.06945876777172089, "eval_runtime": 221.3335, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.9715002108306728, "step": 41600 }, { "epoch": 3.1713545239728638, "grad_norm": 1.0155447721481323, "learning_rate": 2.952871533161406e-05, "loss": 0.07248306274414062, "memory(GiB)": 122.96, "step": 41605, "token_acc": 0.9717095136021637, "train_speed(iter/s)": 0.233584 }, { "epoch": 3.171735650583124, "grad_norm": 1.2093439102172852, "learning_rate": 2.9517792023825717e-05, "loss": 0.10081007480621337, "memory(GiB)": 122.96, "step": 41610, "token_acc": 0.9633977900552486, "train_speed(iter/s)": 0.233591 }, { "epoch": 3.172116777193384, "grad_norm": 0.9452424645423889, "learning_rate": 2.9506869890596955e-05, "loss": 0.0590671181678772, "memory(GiB)": 122.96, "step": 41615, "token_acc": 0.9772357723577236, "train_speed(iter/s)": 0.233597 }, { "epoch": 3.1724979038036434, "grad_norm": 1.1566349267959595, "learning_rate": 2.9495948932554118e-05, "loss": 0.0628302276134491, "memory(GiB)": 122.96, "step": 41620, "token_acc": 0.9743388834476004, "train_speed(iter/s)": 0.233603 }, { "epoch": 3.1728790304139034, "grad_norm": 1.1924704313278198, "learning_rate": 2.9485029150323458e-05, "loss": 0.04935285747051239, "memory(GiB)": 122.96, "step": 41625, "token_acc": 0.9838214212574237, "train_speed(iter/s)": 0.23361 }, { "epoch": 3.1732601570241634, "grad_norm": 0.8630245327949524, "learning_rate": 2.9474110544531163e-05, "loss": 0.06599261164665222, "memory(GiB)": 122.96, "step": 41630, "token_acc": 0.9724605867092396, "train_speed(iter/s)": 0.233614 }, { "epoch": 3.1736412836344234, "grad_norm": 0.36059150099754333, "learning_rate": 2.946319311580339e-05, "loss": 0.04272204637527466, "memory(GiB)": 122.96, "step": 41635, "token_acc": 0.9777361631294216, "train_speed(iter/s)": 0.233621 }, { "epoch": 3.1740224102446835, "grad_norm": 0.9269136190414429, "learning_rate": 2.9452276864766192e-05, "loss": 0.05727453231811523, "memory(GiB)": 122.96, "step": 41640, "token_acc": 0.975008799718409, "train_speed(iter/s)": 0.23363 }, { "epoch": 3.174403536854943, "grad_norm": 1.1380795240402222, "learning_rate": 2.9441361792045556e-05, "loss": 0.06212894320487976, "memory(GiB)": 122.96, "step": 41645, "token_acc": 0.9760191846522782, "train_speed(iter/s)": 0.233631 }, { "epoch": 3.174784663465203, "grad_norm": 0.6806704998016357, "learning_rate": 2.943044789826741e-05, "loss": 0.07609431743621826, "memory(GiB)": 122.96, "step": 41650, "token_acc": 0.9750432419075858, "train_speed(iter/s)": 0.233638 }, { "epoch": 3.175165790075463, "grad_norm": 0.645335853099823, "learning_rate": 2.9419535184057635e-05, "loss": 0.030690330266952514, "memory(GiB)": 122.96, "step": 41655, "token_acc": 0.9865523289807056, "train_speed(iter/s)": 0.233644 }, { "epoch": 3.175546916685723, "grad_norm": 0.6478248834609985, "learning_rate": 2.940862365004201e-05, "loss": 0.07085552215576171, "memory(GiB)": 122.96, "step": 41660, "token_acc": 0.9721142470846713, "train_speed(iter/s)": 0.233647 }, { "epoch": 3.175928043295983, "grad_norm": 1.6494673490524292, "learning_rate": 2.939771329684625e-05, "loss": 0.06327407956123351, "memory(GiB)": 122.96, "step": 41665, "token_acc": 0.9809932556713673, "train_speed(iter/s)": 0.233654 }, { "epoch": 3.1763091699062427, "grad_norm": 2.364428758621216, "learning_rate": 2.9386804125096045e-05, "loss": 0.0631700575351715, "memory(GiB)": 122.96, "step": 41670, "token_acc": 0.9826294277929155, "train_speed(iter/s)": 0.233659 }, { "epoch": 3.1766902965165027, "grad_norm": 0.6007527709007263, "learning_rate": 2.9375896135416957e-05, "loss": 0.04597944617271423, "memory(GiB)": 122.96, "step": 41675, "token_acc": 0.9844736842105263, "train_speed(iter/s)": 0.233666 }, { "epoch": 3.1770714231267627, "grad_norm": 1.3521523475646973, "learning_rate": 2.9364989328434516e-05, "loss": 0.05808382630348206, "memory(GiB)": 122.96, "step": 41680, "token_acc": 0.9709812905689195, "train_speed(iter/s)": 0.233672 }, { "epoch": 3.1774525497370227, "grad_norm": 1.880145788192749, "learning_rate": 2.9354083704774188e-05, "loss": 0.07504408359527588, "memory(GiB)": 122.96, "step": 41685, "token_acc": 0.9720730397422127, "train_speed(iter/s)": 0.233679 }, { "epoch": 3.1778336763472828, "grad_norm": 0.563736617565155, "learning_rate": 2.934317926506135e-05, "loss": 0.053441751003265384, "memory(GiB)": 122.96, "step": 41690, "token_acc": 0.9758378799688231, "train_speed(iter/s)": 0.233682 }, { "epoch": 3.1782148029575423, "grad_norm": 0.7579193711280823, "learning_rate": 2.9332276009921312e-05, "loss": 0.05543901920318604, "memory(GiB)": 122.96, "step": 41695, "token_acc": 0.9733870967741935, "train_speed(iter/s)": 0.233691 }, { "epoch": 3.1785959295678023, "grad_norm": 0.9986939430236816, "learning_rate": 2.9321373939979336e-05, "loss": 0.06089034080505371, "memory(GiB)": 122.96, "step": 41700, "token_acc": 0.9736320380650277, "train_speed(iter/s)": 0.233696 }, { "epoch": 3.1789770561780624, "grad_norm": 0.6006383299827576, "learning_rate": 2.931047305586061e-05, "loss": 0.053903496265411376, "memory(GiB)": 122.96, "step": 41705, "token_acc": 0.9800590841949779, "train_speed(iter/s)": 0.233699 }, { "epoch": 3.1793581827883224, "grad_norm": 1.6536331176757812, "learning_rate": 2.9299573358190246e-05, "loss": 0.09062209725379944, "memory(GiB)": 122.96, "step": 41710, "token_acc": 0.975100695715855, "train_speed(iter/s)": 0.233708 }, { "epoch": 3.1797393093985824, "grad_norm": 0.8072085976600647, "learning_rate": 2.928867484759328e-05, "loss": 0.04719461798667908, "memory(GiB)": 122.96, "step": 41715, "token_acc": 0.9837099316868103, "train_speed(iter/s)": 0.233712 }, { "epoch": 3.180120436008842, "grad_norm": 0.6361351609230042, "learning_rate": 2.9277777524694705e-05, "loss": 0.055147993564605716, "memory(GiB)": 122.96, "step": 41720, "token_acc": 0.9798206278026906, "train_speed(iter/s)": 0.233716 }, { "epoch": 3.180501562619102, "grad_norm": 1.6861317157745361, "learning_rate": 2.926688139011943e-05, "loss": 0.0712714970111847, "memory(GiB)": 122.96, "step": 41725, "token_acc": 0.9652080344332855, "train_speed(iter/s)": 0.233724 }, { "epoch": 3.180882689229362, "grad_norm": 0.6705349683761597, "learning_rate": 2.925598644449228e-05, "loss": 0.06979209780693055, "memory(GiB)": 122.96, "step": 41730, "token_acc": 0.9688052741598328, "train_speed(iter/s)": 0.233728 }, { "epoch": 3.181263815839622, "grad_norm": 1.554783582687378, "learning_rate": 2.9245092688438046e-05, "loss": 0.08325998783111573, "memory(GiB)": 122.96, "step": 41735, "token_acc": 0.9748995983935743, "train_speed(iter/s)": 0.233736 }, { "epoch": 3.181644942449882, "grad_norm": 1.439249873161316, "learning_rate": 2.9234200122581445e-05, "loss": 0.04538442492485047, "memory(GiB)": 122.96, "step": 41740, "token_acc": 0.9795310755489394, "train_speed(iter/s)": 0.233744 }, { "epoch": 3.1820260690601416, "grad_norm": 0.8866991996765137, "learning_rate": 2.9223308747547085e-05, "loss": 0.05833116173744202, "memory(GiB)": 122.96, "step": 41745, "token_acc": 0.974155069582505, "train_speed(iter/s)": 0.233754 }, { "epoch": 3.1824071956704016, "grad_norm": 0.6320008039474487, "learning_rate": 2.921241856395954e-05, "loss": 0.05486854910850525, "memory(GiB)": 122.96, "step": 41750, "token_acc": 0.9778657549037251, "train_speed(iter/s)": 0.233761 }, { "epoch": 3.1827883222806617, "grad_norm": 1.4702850580215454, "learning_rate": 2.9201529572443352e-05, "loss": 0.06429152488708496, "memory(GiB)": 122.96, "step": 41755, "token_acc": 0.9760294117647059, "train_speed(iter/s)": 0.233764 }, { "epoch": 3.1831694488909217, "grad_norm": 0.6732486486434937, "learning_rate": 2.9190641773622916e-05, "loss": 0.06327551007270812, "memory(GiB)": 122.96, "step": 41760, "token_acc": 0.977112676056338, "train_speed(iter/s)": 0.233772 }, { "epoch": 3.1835505755011813, "grad_norm": 0.9415630102157593, "learning_rate": 2.9179755168122625e-05, "loss": 0.05203160047531128, "memory(GiB)": 122.96, "step": 41765, "token_acc": 0.9781550203974207, "train_speed(iter/s)": 0.233774 }, { "epoch": 3.1839317021114413, "grad_norm": 1.2698845863342285, "learning_rate": 2.916886975656673e-05, "loss": 0.07148418426513672, "memory(GiB)": 122.96, "step": 41770, "token_acc": 0.974694046878241, "train_speed(iter/s)": 0.233779 }, { "epoch": 3.1843128287217013, "grad_norm": 0.5888361930847168, "learning_rate": 2.9157985539579496e-05, "loss": 0.03702530860900879, "memory(GiB)": 122.96, "step": 41775, "token_acc": 0.9827722459301407, "train_speed(iter/s)": 0.233783 }, { "epoch": 3.1846939553319613, "grad_norm": 1.5327491760253906, "learning_rate": 2.9147102517785084e-05, "loss": 0.08921311497688293, "memory(GiB)": 122.96, "step": 41780, "token_acc": 0.9691379921958141, "train_speed(iter/s)": 0.233791 }, { "epoch": 3.1850750819422213, "grad_norm": 1.0220773220062256, "learning_rate": 2.9136220691807565e-05, "loss": 0.09791937470436096, "memory(GiB)": 122.96, "step": 41785, "token_acc": 0.9635097031016753, "train_speed(iter/s)": 0.233795 }, { "epoch": 3.1854562085524813, "grad_norm": 0.8032410740852356, "learning_rate": 2.912534006227098e-05, "loss": 0.04858251810073853, "memory(GiB)": 122.96, "step": 41790, "token_acc": 0.9800294496594883, "train_speed(iter/s)": 0.233796 }, { "epoch": 3.185837335162741, "grad_norm": 2.0223424434661865, "learning_rate": 2.9114460629799257e-05, "loss": 0.10475491285324097, "memory(GiB)": 122.96, "step": 41795, "token_acc": 0.9703459637561779, "train_speed(iter/s)": 0.233804 }, { "epoch": 3.186218461773001, "grad_norm": 1.5567214488983154, "learning_rate": 2.9103582395016293e-05, "loss": 0.059638023376464844, "memory(GiB)": 122.96, "step": 41800, "token_acc": 0.9857425742574257, "train_speed(iter/s)": 0.233812 }, { "epoch": 3.186218461773001, "eval_loss": 0.06655236333608627, "eval_runtime": 221.0681, "eval_samples_per_second": 2.397, "eval_steps_per_second": 2.397, "eval_token_acc": 0.9718013975061743, "step": 41800 }, { "epoch": 3.186599588383261, "grad_norm": 0.7251859903335571, "learning_rate": 2.909270535854593e-05, "loss": 0.06731876134872436, "memory(GiB)": 122.96, "step": 41805, "token_acc": 0.9720658277982958, "train_speed(iter/s)": 0.233529 }, { "epoch": 3.186980714993521, "grad_norm": 1.7249196767807007, "learning_rate": 2.9081829521011873e-05, "loss": 0.07086960077285767, "memory(GiB)": 122.96, "step": 41810, "token_acc": 0.9748908296943232, "train_speed(iter/s)": 0.233536 }, { "epoch": 3.1873618416037806, "grad_norm": 1.1304986476898193, "learning_rate": 2.9070954883037815e-05, "loss": 0.049729830026626586, "memory(GiB)": 122.96, "step": 41815, "token_acc": 0.9804951237809453, "train_speed(iter/s)": 0.233545 }, { "epoch": 3.1877429682140406, "grad_norm": 2.143695831298828, "learning_rate": 2.90600814452474e-05, "loss": 0.07715476751327514, "memory(GiB)": 122.96, "step": 41820, "token_acc": 0.9769187464815162, "train_speed(iter/s)": 0.23355 }, { "epoch": 3.1881240948243006, "grad_norm": 1.068039894104004, "learning_rate": 2.9049209208264115e-05, "loss": 0.060927993059158324, "memory(GiB)": 122.96, "step": 41825, "token_acc": 0.9805574673090158, "train_speed(iter/s)": 0.233555 }, { "epoch": 3.1885052214345606, "grad_norm": 1.123646855354309, "learning_rate": 2.903833817271146e-05, "loss": 0.0747305691242218, "memory(GiB)": 122.96, "step": 41830, "token_acc": 0.9708692612444652, "train_speed(iter/s)": 0.233561 }, { "epoch": 3.1888863480448206, "grad_norm": 1.3242628574371338, "learning_rate": 2.902746833921286e-05, "loss": 0.03678010404109955, "memory(GiB)": 122.96, "step": 41835, "token_acc": 0.9797446059004844, "train_speed(iter/s)": 0.233571 }, { "epoch": 3.1892674746550806, "grad_norm": 0.8322247862815857, "learning_rate": 2.90165997083916e-05, "loss": 0.04504353404045105, "memory(GiB)": 122.96, "step": 41840, "token_acc": 0.9779969650986343, "train_speed(iter/s)": 0.233576 }, { "epoch": 3.18964860126534, "grad_norm": 0.6445869207382202, "learning_rate": 2.900573228087098e-05, "loss": 0.05141534805297852, "memory(GiB)": 122.96, "step": 41845, "token_acc": 0.9847801578354002, "train_speed(iter/s)": 0.233581 }, { "epoch": 3.1900297278756002, "grad_norm": 0.8549160957336426, "learning_rate": 2.8994866057274206e-05, "loss": 0.03571479916572571, "memory(GiB)": 122.96, "step": 41850, "token_acc": 0.9835983263598327, "train_speed(iter/s)": 0.233584 }, { "epoch": 3.1904108544858603, "grad_norm": 0.5422884225845337, "learning_rate": 2.8984001038224362e-05, "loss": 0.09740809798240661, "memory(GiB)": 122.96, "step": 41855, "token_acc": 0.969929046063746, "train_speed(iter/s)": 0.233586 }, { "epoch": 3.1907919810961203, "grad_norm": 1.1810939311981201, "learning_rate": 2.8973137224344537e-05, "loss": 0.08486742973327636, "memory(GiB)": 122.96, "step": 41860, "token_acc": 0.9607250755287009, "train_speed(iter/s)": 0.233593 }, { "epoch": 3.19117310770638, "grad_norm": 2.13507080078125, "learning_rate": 2.8962274616257734e-05, "loss": 0.059251779317855836, "memory(GiB)": 122.96, "step": 41865, "token_acc": 0.982089552238806, "train_speed(iter/s)": 0.233599 }, { "epoch": 3.19155423431664, "grad_norm": 0.869504451751709, "learning_rate": 2.8951413214586836e-05, "loss": 0.05158516764640808, "memory(GiB)": 122.96, "step": 41870, "token_acc": 0.9743360190987765, "train_speed(iter/s)": 0.233608 }, { "epoch": 3.1919353609269, "grad_norm": 1.5324188470840454, "learning_rate": 2.8940553019954707e-05, "loss": 0.0516184151172638, "memory(GiB)": 122.96, "step": 41875, "token_acc": 0.9800127578141612, "train_speed(iter/s)": 0.233614 }, { "epoch": 3.19231648753716, "grad_norm": 0.9871914982795715, "learning_rate": 2.8929694032984166e-05, "loss": 0.05978899598121643, "memory(GiB)": 122.96, "step": 41880, "token_acc": 0.9773117254528122, "train_speed(iter/s)": 0.233619 }, { "epoch": 3.19269761414742, "grad_norm": 1.7186481952667236, "learning_rate": 2.8918836254297844e-05, "loss": 0.06493679285049439, "memory(GiB)": 122.96, "step": 41885, "token_acc": 0.9702187063750581, "train_speed(iter/s)": 0.233628 }, { "epoch": 3.1930787407576795, "grad_norm": 0.33100444078445435, "learning_rate": 2.8907979684518483e-05, "loss": 0.06620514988899232, "memory(GiB)": 122.96, "step": 41890, "token_acc": 0.9721694036300778, "train_speed(iter/s)": 0.233633 }, { "epoch": 3.1934598673679395, "grad_norm": 0.8563975095748901, "learning_rate": 2.889712432426858e-05, "loss": 0.06153750419616699, "memory(GiB)": 122.96, "step": 41895, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.23364 }, { "epoch": 3.1938409939781995, "grad_norm": 1.35704505443573, "learning_rate": 2.888627017417067e-05, "loss": 0.06620625257492066, "memory(GiB)": 122.96, "step": 41900, "token_acc": 0.9684719043986778, "train_speed(iter/s)": 0.233647 }, { "epoch": 3.1942221205884596, "grad_norm": 1.499839186668396, "learning_rate": 2.8875417234847214e-05, "loss": 0.053743505477905275, "memory(GiB)": 122.96, "step": 41905, "token_acc": 0.9861551773867897, "train_speed(iter/s)": 0.233656 }, { "epoch": 3.1946032471987196, "grad_norm": 0.7952582836151123, "learning_rate": 2.8864565506920517e-05, "loss": 0.052200138568878174, "memory(GiB)": 122.96, "step": 41910, "token_acc": 0.9777335264301231, "train_speed(iter/s)": 0.23366 }, { "epoch": 3.194984373808979, "grad_norm": 0.8939226865768433, "learning_rate": 2.8853714991012915e-05, "loss": 0.05643333792686463, "memory(GiB)": 122.96, "step": 41915, "token_acc": 0.9752544752544753, "train_speed(iter/s)": 0.233664 }, { "epoch": 3.195365500419239, "grad_norm": 1.5397398471832275, "learning_rate": 2.8842865687746645e-05, "loss": 0.08442361950874329, "memory(GiB)": 122.96, "step": 41920, "token_acc": 0.9681750372948782, "train_speed(iter/s)": 0.23367 }, { "epoch": 3.195746627029499, "grad_norm": 1.6088355779647827, "learning_rate": 2.8832017597743827e-05, "loss": 0.07817186713218689, "memory(GiB)": 122.96, "step": 41925, "token_acc": 0.9679186228482003, "train_speed(iter/s)": 0.233677 }, { "epoch": 3.196127753639759, "grad_norm": 0.6909950375556946, "learning_rate": 2.8821170721626567e-05, "loss": 0.0733165442943573, "memory(GiB)": 122.96, "step": 41930, "token_acc": 0.9729235272594219, "train_speed(iter/s)": 0.233684 }, { "epoch": 3.1965088802500192, "grad_norm": 0.9355821013450623, "learning_rate": 2.881032506001691e-05, "loss": 0.04861036241054535, "memory(GiB)": 122.96, "step": 41935, "token_acc": 0.9804347826086957, "train_speed(iter/s)": 0.233693 }, { "epoch": 3.196890006860279, "grad_norm": 2.2333266735076904, "learning_rate": 2.8799480613536755e-05, "loss": 0.08451087474822998, "memory(GiB)": 122.96, "step": 41940, "token_acc": 0.9718563954537254, "train_speed(iter/s)": 0.233697 }, { "epoch": 3.197271133470539, "grad_norm": 1.583493709564209, "learning_rate": 2.8788637382808004e-05, "loss": 0.040038949251174925, "memory(GiB)": 122.96, "step": 41945, "token_acc": 0.9810379241516967, "train_speed(iter/s)": 0.233702 }, { "epoch": 3.197652260080799, "grad_norm": 1.087618350982666, "learning_rate": 2.877779536845249e-05, "loss": 0.06693893671035767, "memory(GiB)": 122.96, "step": 41950, "token_acc": 0.9783476472722874, "train_speed(iter/s)": 0.233702 }, { "epoch": 3.198033386691059, "grad_norm": 0.8555540442466736, "learning_rate": 2.87669545710919e-05, "loss": 0.07469189763069153, "memory(GiB)": 122.96, "step": 41955, "token_acc": 0.9739524348810872, "train_speed(iter/s)": 0.233709 }, { "epoch": 3.198414513301319, "grad_norm": 0.8266153931617737, "learning_rate": 2.875611499134796e-05, "loss": 0.0729659378528595, "memory(GiB)": 122.96, "step": 41960, "token_acc": 0.9746037156979717, "train_speed(iter/s)": 0.233713 }, { "epoch": 3.1987956399115784, "grad_norm": 0.5709372758865356, "learning_rate": 2.8745276629842216e-05, "loss": 0.04907079935073853, "memory(GiB)": 122.96, "step": 41965, "token_acc": 0.9845747025121199, "train_speed(iter/s)": 0.233717 }, { "epoch": 3.1991767665218385, "grad_norm": 1.1534748077392578, "learning_rate": 2.8734439487196228e-05, "loss": 0.06662549972534179, "memory(GiB)": 122.96, "step": 41970, "token_acc": 0.9743464052287582, "train_speed(iter/s)": 0.23372 }, { "epoch": 3.1995578931320985, "grad_norm": 1.1692193746566772, "learning_rate": 2.8723603564031466e-05, "loss": 0.042132461071014406, "memory(GiB)": 122.96, "step": 41975, "token_acc": 0.9783561643835617, "train_speed(iter/s)": 0.233727 }, { "epoch": 3.1999390197423585, "grad_norm": 0.851092517375946, "learning_rate": 2.8712768860969285e-05, "loss": 0.06523092985153198, "memory(GiB)": 122.96, "step": 41980, "token_acc": 0.978114023236963, "train_speed(iter/s)": 0.233734 }, { "epoch": 3.2003201463526185, "grad_norm": 1.7831929922103882, "learning_rate": 2.870193537863103e-05, "loss": 0.08090447783470153, "memory(GiB)": 122.96, "step": 41985, "token_acc": 0.9741847826086957, "train_speed(iter/s)": 0.233741 }, { "epoch": 3.200701272962878, "grad_norm": 0.9572160243988037, "learning_rate": 2.8691103117637964e-05, "loss": 0.06469687223434448, "memory(GiB)": 122.96, "step": 41990, "token_acc": 0.97901914503016, "train_speed(iter/s)": 0.233748 }, { "epoch": 3.201082399573138, "grad_norm": 0.7688141465187073, "learning_rate": 2.868027207861123e-05, "loss": 0.08029439449310302, "memory(GiB)": 122.96, "step": 41995, "token_acc": 0.9722578987927049, "train_speed(iter/s)": 0.233756 }, { "epoch": 3.201463526183398, "grad_norm": 1.1438865661621094, "learning_rate": 2.866944226217196e-05, "loss": 0.10112118721008301, "memory(GiB)": 122.96, "step": 42000, "token_acc": 0.9588571428571429, "train_speed(iter/s)": 0.233766 }, { "epoch": 3.201463526183398, "eval_loss": 0.06670878827571869, "eval_runtime": 223.0341, "eval_samples_per_second": 2.376, "eval_steps_per_second": 2.376, "eval_token_acc": 0.9718164568399494, "step": 42000 }, { "epoch": 3.201844652793658, "grad_norm": 0.6726539731025696, "learning_rate": 2.8658613668941203e-05, "loss": 0.06921271085739136, "memory(GiB)": 122.96, "step": 42005, "token_acc": 0.971821104102979, "train_speed(iter/s)": 0.23348 }, { "epoch": 3.202225779403918, "grad_norm": 1.5786051750183105, "learning_rate": 2.8647786299539902e-05, "loss": 0.07723052501678467, "memory(GiB)": 122.96, "step": 42010, "token_acc": 0.9809305873379099, "train_speed(iter/s)": 0.233481 }, { "epoch": 3.2026069060141777, "grad_norm": 2.877349853515625, "learning_rate": 2.8636960154588965e-05, "loss": 0.04806656241416931, "memory(GiB)": 122.96, "step": 42015, "token_acc": 0.9805571677307022, "train_speed(iter/s)": 0.233489 }, { "epoch": 3.2029880326244378, "grad_norm": 1.2190872430801392, "learning_rate": 2.8626135234709227e-05, "loss": 0.07134389281272888, "memory(GiB)": 122.96, "step": 42020, "token_acc": 0.9708240534521159, "train_speed(iter/s)": 0.233496 }, { "epoch": 3.203369159234698, "grad_norm": 0.7249446511268616, "learning_rate": 2.861531154052145e-05, "loss": 0.05651550889015198, "memory(GiB)": 122.96, "step": 42025, "token_acc": 0.9815063887020847, "train_speed(iter/s)": 0.233504 }, { "epoch": 3.203750285844958, "grad_norm": 0.7617385983467102, "learning_rate": 2.8604489072646333e-05, "loss": 0.059232407808303834, "memory(GiB)": 122.96, "step": 42030, "token_acc": 0.9770142753447859, "train_speed(iter/s)": 0.23351 }, { "epoch": 3.204131412455218, "grad_norm": 0.5910804867744446, "learning_rate": 2.8593667831704467e-05, "loss": 0.05051870346069336, "memory(GiB)": 122.96, "step": 42035, "token_acc": 0.9756165142698808, "train_speed(iter/s)": 0.233517 }, { "epoch": 3.2045125390654774, "grad_norm": 0.4959962069988251, "learning_rate": 2.8582847818316415e-05, "loss": 0.0621164083480835, "memory(GiB)": 122.96, "step": 42040, "token_acc": 0.9793190416141235, "train_speed(iter/s)": 0.23352 }, { "epoch": 3.2048936656757374, "grad_norm": 0.8037670850753784, "learning_rate": 2.8572029033102664e-05, "loss": 0.04299565255641937, "memory(GiB)": 122.96, "step": 42045, "token_acc": 0.9812382739212008, "train_speed(iter/s)": 0.233523 }, { "epoch": 3.2052747922859974, "grad_norm": 0.9555992484092712, "learning_rate": 2.8561211476683604e-05, "loss": 0.07040913105010986, "memory(GiB)": 122.96, "step": 42050, "token_acc": 0.9686159403928652, "train_speed(iter/s)": 0.233531 }, { "epoch": 3.2056559188962575, "grad_norm": 1.847030520439148, "learning_rate": 2.8550395149679565e-05, "loss": 0.08210671544075013, "memory(GiB)": 122.96, "step": 42055, "token_acc": 0.9743099207433725, "train_speed(iter/s)": 0.233538 }, { "epoch": 3.2060370455065175, "grad_norm": 0.8517504930496216, "learning_rate": 2.8539580052710846e-05, "loss": 0.051465940475463864, "memory(GiB)": 122.96, "step": 42060, "token_acc": 0.9755620723362659, "train_speed(iter/s)": 0.233546 }, { "epoch": 3.206418172116777, "grad_norm": 1.4270883798599243, "learning_rate": 2.8528766186397603e-05, "loss": 0.06202307343482971, "memory(GiB)": 122.96, "step": 42065, "token_acc": 0.9752611324903794, "train_speed(iter/s)": 0.233552 }, { "epoch": 3.206799298727037, "grad_norm": 1.7516244649887085, "learning_rate": 2.8517953551359988e-05, "loss": 0.06053359508514404, "memory(GiB)": 122.96, "step": 42070, "token_acc": 0.9743167599604873, "train_speed(iter/s)": 0.233561 }, { "epoch": 3.207180425337297, "grad_norm": 0.4553517699241638, "learning_rate": 2.8507142148218062e-05, "loss": 0.07278724312782288, "memory(GiB)": 122.96, "step": 42075, "token_acc": 0.9729004218330564, "train_speed(iter/s)": 0.233563 }, { "epoch": 3.207561551947557, "grad_norm": 1.1812692880630493, "learning_rate": 2.849633197759178e-05, "loss": 0.07626963257789612, "memory(GiB)": 122.96, "step": 42080, "token_acc": 0.97185667752443, "train_speed(iter/s)": 0.233565 }, { "epoch": 3.207942678557817, "grad_norm": 0.8561417460441589, "learning_rate": 2.8485523040101064e-05, "loss": 0.07135959863662719, "memory(GiB)": 122.96, "step": 42085, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.233571 }, { "epoch": 3.2083238051680767, "grad_norm": 1.2827221155166626, "learning_rate": 2.8474715336365787e-05, "loss": 0.05540143251419068, "memory(GiB)": 122.96, "step": 42090, "token_acc": 0.9738219895287958, "train_speed(iter/s)": 0.233581 }, { "epoch": 3.2087049317783367, "grad_norm": 0.8401476740837097, "learning_rate": 2.8463908867005675e-05, "loss": 0.06007001996040344, "memory(GiB)": 122.96, "step": 42095, "token_acc": 0.9825548677546426, "train_speed(iter/s)": 0.233588 }, { "epoch": 3.2090860583885967, "grad_norm": 1.5432426929473877, "learning_rate": 2.8453103632640443e-05, "loss": 0.060094451904296874, "memory(GiB)": 122.96, "step": 42100, "token_acc": 0.9798417483044461, "train_speed(iter/s)": 0.233593 }, { "epoch": 3.2094671849988567, "grad_norm": 0.6609178185462952, "learning_rate": 2.844229963388976e-05, "loss": 0.04187192618846893, "memory(GiB)": 122.96, "step": 42105, "token_acc": 0.9820193637621023, "train_speed(iter/s)": 0.233603 }, { "epoch": 3.2098483116091163, "grad_norm": 1.10936439037323, "learning_rate": 2.843149687137312e-05, "loss": 0.07802796363830566, "memory(GiB)": 122.96, "step": 42110, "token_acc": 0.975005680527153, "train_speed(iter/s)": 0.23361 }, { "epoch": 3.2102294382193763, "grad_norm": 0.9833263158798218, "learning_rate": 2.8420695345710053e-05, "loss": 0.07058386206626892, "memory(GiB)": 122.96, "step": 42115, "token_acc": 0.9663307938068322, "train_speed(iter/s)": 0.233617 }, { "epoch": 3.2106105648296364, "grad_norm": 1.2911237478256226, "learning_rate": 2.8409895057519985e-05, "loss": 0.051799678802490236, "memory(GiB)": 122.96, "step": 42120, "token_acc": 0.9853255069370331, "train_speed(iter/s)": 0.233624 }, { "epoch": 3.2109916914398964, "grad_norm": 1.903690218925476, "learning_rate": 2.839909600742222e-05, "loss": 0.09086737632751465, "memory(GiB)": 122.96, "step": 42125, "token_acc": 0.9676011637133034, "train_speed(iter/s)": 0.233627 }, { "epoch": 3.2113728180501564, "grad_norm": 0.8191927075386047, "learning_rate": 2.838829819603609e-05, "loss": 0.057541847229003906, "memory(GiB)": 122.96, "step": 42130, "token_acc": 0.9828136073706591, "train_speed(iter/s)": 0.23363 }, { "epoch": 3.2117539446604164, "grad_norm": 0.7303987145423889, "learning_rate": 2.837750162398074e-05, "loss": 0.05979266166687012, "memory(GiB)": 122.96, "step": 42135, "token_acc": 0.9722486219349934, "train_speed(iter/s)": 0.233636 }, { "epoch": 3.212135071270676, "grad_norm": 1.9157589673995972, "learning_rate": 2.8366706291875333e-05, "loss": 0.07706948518753051, "memory(GiB)": 122.96, "step": 42140, "token_acc": 0.9738004121283486, "train_speed(iter/s)": 0.233639 }, { "epoch": 3.212516197880936, "grad_norm": 2.5069448947906494, "learning_rate": 2.8355912200338952e-05, "loss": 0.07527621984481811, "memory(GiB)": 122.96, "step": 42145, "token_acc": 0.9660739832413652, "train_speed(iter/s)": 0.233645 }, { "epoch": 3.212897324491196, "grad_norm": 0.07322249561548233, "learning_rate": 2.8345119349990517e-05, "loss": 0.053117644786834714, "memory(GiB)": 122.96, "step": 42150, "token_acc": 0.9786541980077251, "train_speed(iter/s)": 0.233649 }, { "epoch": 3.213278451101456, "grad_norm": 1.0949307680130005, "learning_rate": 2.8334327741449025e-05, "loss": 0.06552187204360962, "memory(GiB)": 122.96, "step": 42155, "token_acc": 0.9729314057213165, "train_speed(iter/s)": 0.233652 }, { "epoch": 3.2136595777117156, "grad_norm": 1.506376028060913, "learning_rate": 2.8323537375333308e-05, "loss": 0.1002872109413147, "memory(GiB)": 122.96, "step": 42160, "token_acc": 0.9545983701979045, "train_speed(iter/s)": 0.233661 }, { "epoch": 3.2140407043219756, "grad_norm": 0.9093676805496216, "learning_rate": 2.831274825226212e-05, "loss": 0.053935778141021726, "memory(GiB)": 122.96, "step": 42165, "token_acc": 0.978756884343037, "train_speed(iter/s)": 0.233664 }, { "epoch": 3.2144218309322357, "grad_norm": 1.2035579681396484, "learning_rate": 2.8301960372854174e-05, "loss": 0.04539737403392792, "memory(GiB)": 122.96, "step": 42170, "token_acc": 0.9819628647214854, "train_speed(iter/s)": 0.233668 }, { "epoch": 3.2148029575424957, "grad_norm": 1.8014506101608276, "learning_rate": 2.8291173737728133e-05, "loss": 0.0655114471912384, "memory(GiB)": 122.96, "step": 42175, "token_acc": 0.9764409542154393, "train_speed(iter/s)": 0.233673 }, { "epoch": 3.2151840841527557, "grad_norm": 1.4017510414123535, "learning_rate": 2.828038834750252e-05, "loss": 0.048455968499183655, "memory(GiB)": 122.96, "step": 42180, "token_acc": 0.9808231992516371, "train_speed(iter/s)": 0.233682 }, { "epoch": 3.2155652107630153, "grad_norm": 1.5404976606369019, "learning_rate": 2.826960420279585e-05, "loss": 0.06078876256942749, "memory(GiB)": 122.96, "step": 42185, "token_acc": 0.9680885704982091, "train_speed(iter/s)": 0.23369 }, { "epoch": 3.2159463373732753, "grad_norm": 0.9111979007720947, "learning_rate": 2.825882130422653e-05, "loss": 0.05747076272964478, "memory(GiB)": 122.96, "step": 42190, "token_acc": 0.9829396325459318, "train_speed(iter/s)": 0.233697 }, { "epoch": 3.2163274639835353, "grad_norm": 2.2338764667510986, "learning_rate": 2.8248039652412912e-05, "loss": 0.08016111850738525, "memory(GiB)": 122.96, "step": 42195, "token_acc": 0.9625299760191847, "train_speed(iter/s)": 0.233704 }, { "epoch": 3.2167085905937953, "grad_norm": 0.861918568611145, "learning_rate": 2.8237259247973303e-05, "loss": 0.05830413103103638, "memory(GiB)": 122.96, "step": 42200, "token_acc": 0.9716268120431173, "train_speed(iter/s)": 0.233708 }, { "epoch": 3.2167085905937953, "eval_loss": 0.0659181997179985, "eval_runtime": 220.1876, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "eval_token_acc": 0.9721854105174387, "step": 42200 }, { "epoch": 3.2170897172040553, "grad_norm": 0.6821376085281372, "learning_rate": 2.8226480091525857e-05, "loss": 0.061796563863754275, "memory(GiB)": 122.96, "step": 42205, "token_acc": 0.9725842187599544, "train_speed(iter/s)": 0.233426 }, { "epoch": 3.217470843814315, "grad_norm": 1.0680803060531616, "learning_rate": 2.821570218368874e-05, "loss": 0.08358233571052551, "memory(GiB)": 122.96, "step": 42210, "token_acc": 0.9758992805755395, "train_speed(iter/s)": 0.233429 }, { "epoch": 3.217851970424575, "grad_norm": 1.0940077304840088, "learning_rate": 2.8204925525080034e-05, "loss": 0.042455455660820006, "memory(GiB)": 122.96, "step": 42215, "token_acc": 0.979381443298969, "train_speed(iter/s)": 0.233437 }, { "epoch": 3.218233097034835, "grad_norm": 1.3784866333007812, "learning_rate": 2.8194150116317687e-05, "loss": 0.05985978841781616, "memory(GiB)": 122.96, "step": 42220, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.233447 }, { "epoch": 3.218614223645095, "grad_norm": 1.018708348274231, "learning_rate": 2.818337595801963e-05, "loss": 0.05287977457046509, "memory(GiB)": 122.96, "step": 42225, "token_acc": 0.9785038693035254, "train_speed(iter/s)": 0.233452 }, { "epoch": 3.218995350255355, "grad_norm": 1.260504126548767, "learning_rate": 2.8172603050803752e-05, "loss": 0.09550297856330872, "memory(GiB)": 122.96, "step": 42230, "token_acc": 0.9653989133543037, "train_speed(iter/s)": 0.23346 }, { "epoch": 3.2193764768656146, "grad_norm": 0.3555678725242615, "learning_rate": 2.8161831395287776e-05, "loss": 0.052314257621765135, "memory(GiB)": 122.96, "step": 42235, "token_acc": 0.9788226848528356, "train_speed(iter/s)": 0.233469 }, { "epoch": 3.2197576034758746, "grad_norm": 1.6358704566955566, "learning_rate": 2.8151060992089423e-05, "loss": 0.06829994320869445, "memory(GiB)": 122.96, "step": 42240, "token_acc": 0.9768015794669299, "train_speed(iter/s)": 0.233478 }, { "epoch": 3.2201387300861346, "grad_norm": 0.8939799070358276, "learning_rate": 2.814029184182635e-05, "loss": 0.043975254893302916, "memory(GiB)": 122.96, "step": 42245, "token_acc": 0.980154355016538, "train_speed(iter/s)": 0.233484 }, { "epoch": 3.2205198566963946, "grad_norm": 0.785541832447052, "learning_rate": 2.8129523945116088e-05, "loss": 0.05905129909515381, "memory(GiB)": 122.96, "step": 42250, "token_acc": 0.9755034565674782, "train_speed(iter/s)": 0.233487 }, { "epoch": 3.2209009833066546, "grad_norm": 0.5917999744415283, "learning_rate": 2.8118757302576125e-05, "loss": 0.05921238660812378, "memory(GiB)": 122.96, "step": 42255, "token_acc": 0.9795684552224556, "train_speed(iter/s)": 0.233491 }, { "epoch": 3.221282109916914, "grad_norm": 0.8329206109046936, "learning_rate": 2.8107991914823916e-05, "loss": 0.041399520635604856, "memory(GiB)": 122.96, "step": 42260, "token_acc": 0.9821449827401499, "train_speed(iter/s)": 0.233492 }, { "epoch": 3.2216632365271742, "grad_norm": 1.259564995765686, "learning_rate": 2.8097227782476754e-05, "loss": 0.06436173915863037, "memory(GiB)": 122.96, "step": 42265, "token_acc": 0.9784133837021047, "train_speed(iter/s)": 0.233494 }, { "epoch": 3.2220443631374343, "grad_norm": 0.6623247265815735, "learning_rate": 2.8086464906151945e-05, "loss": 0.05489648580551147, "memory(GiB)": 122.96, "step": 42270, "token_acc": 0.9790360925005404, "train_speed(iter/s)": 0.233501 }, { "epoch": 3.2224254897476943, "grad_norm": 0.5724599361419678, "learning_rate": 2.8075703286466696e-05, "loss": 0.07188607454299926, "memory(GiB)": 122.96, "step": 42275, "token_acc": 0.9708708250724417, "train_speed(iter/s)": 0.233505 }, { "epoch": 3.2228066163579543, "grad_norm": 1.0917340517044067, "learning_rate": 2.8064942924038106e-05, "loss": 0.09012957215309143, "memory(GiB)": 122.96, "step": 42280, "token_acc": 0.9645456241836163, "train_speed(iter/s)": 0.23351 }, { "epoch": 3.223187742968214, "grad_norm": 1.8548345565795898, "learning_rate": 2.8054183819483248e-05, "loss": 0.10426207780838012, "memory(GiB)": 122.96, "step": 42285, "token_acc": 0.9549578742709008, "train_speed(iter/s)": 0.233518 }, { "epoch": 3.223568869578474, "grad_norm": 1.2381476163864136, "learning_rate": 2.80434259734191e-05, "loss": 0.043931832909584044, "memory(GiB)": 122.96, "step": 42290, "token_acc": 0.9818286703201615, "train_speed(iter/s)": 0.233526 }, { "epoch": 3.223949996188734, "grad_norm": 0.8472701907157898, "learning_rate": 2.8032669386462596e-05, "loss": 0.07227838635444642, "memory(GiB)": 122.96, "step": 42295, "token_acc": 0.9708664396519107, "train_speed(iter/s)": 0.233531 }, { "epoch": 3.224331122798994, "grad_norm": 1.9096413850784302, "learning_rate": 2.802191405923057e-05, "loss": 0.061519956588745116, "memory(GiB)": 122.96, "step": 42300, "token_acc": 0.9748181309457191, "train_speed(iter/s)": 0.233539 }, { "epoch": 3.224712249409254, "grad_norm": 1.86320161819458, "learning_rate": 2.8011159992339764e-05, "loss": 0.0994529366493225, "memory(GiB)": 122.96, "step": 42305, "token_acc": 0.9646896973402629, "train_speed(iter/s)": 0.233544 }, { "epoch": 3.2250933760195135, "grad_norm": 1.1967010498046875, "learning_rate": 2.8000407186406896e-05, "loss": 0.05222654342651367, "memory(GiB)": 122.96, "step": 42310, "token_acc": 0.9796124343964473, "train_speed(iter/s)": 0.233549 }, { "epoch": 3.2254745026297735, "grad_norm": 1.3244593143463135, "learning_rate": 2.79896556420486e-05, "loss": 0.043341583013534545, "memory(GiB)": 122.96, "step": 42315, "token_acc": 0.9825653798256538, "train_speed(iter/s)": 0.233556 }, { "epoch": 3.2258556292400336, "grad_norm": 2.3630356788635254, "learning_rate": 2.797890535988139e-05, "loss": 0.08988655805587768, "memory(GiB)": 122.96, "step": 42320, "token_acc": 0.9710998877665544, "train_speed(iter/s)": 0.233563 }, { "epoch": 3.2262367558502936, "grad_norm": 1.042773723602295, "learning_rate": 2.7968156340521777e-05, "loss": 0.10488255023956299, "memory(GiB)": 122.96, "step": 42325, "token_acc": 0.9610580455547392, "train_speed(iter/s)": 0.233572 }, { "epoch": 3.2266178824605536, "grad_norm": 1.1702932119369507, "learning_rate": 2.7957408584586175e-05, "loss": 0.08937157392501831, "memory(GiB)": 122.96, "step": 42330, "token_acc": 0.9668508287292817, "train_speed(iter/s)": 0.233579 }, { "epoch": 3.226999009070813, "grad_norm": 0.8519013524055481, "learning_rate": 2.7946662092690877e-05, "loss": 0.04470755457878113, "memory(GiB)": 122.96, "step": 42335, "token_acc": 0.9787760148361838, "train_speed(iter/s)": 0.233585 }, { "epoch": 3.227380135681073, "grad_norm": 0.9887986183166504, "learning_rate": 2.7935916865452165e-05, "loss": 0.08159132599830628, "memory(GiB)": 122.96, "step": 42340, "token_acc": 0.9681050656660413, "train_speed(iter/s)": 0.233587 }, { "epoch": 3.227761262291333, "grad_norm": 0.565908670425415, "learning_rate": 2.7925172903486258e-05, "loss": 0.04901260733604431, "memory(GiB)": 122.96, "step": 42345, "token_acc": 0.9828750981932443, "train_speed(iter/s)": 0.233591 }, { "epoch": 3.228142388901593, "grad_norm": 0.18376196920871735, "learning_rate": 2.791443020740922e-05, "loss": 0.06212977170944214, "memory(GiB)": 122.96, "step": 42350, "token_acc": 0.9728, "train_speed(iter/s)": 0.233598 }, { "epoch": 3.2285235155118532, "grad_norm": 0.9201083183288574, "learning_rate": 2.7903688777837144e-05, "loss": 0.047337332367897035, "memory(GiB)": 122.96, "step": 42355, "token_acc": 0.9805043441407078, "train_speed(iter/s)": 0.233604 }, { "epoch": 3.228904642122113, "grad_norm": 0.6614370346069336, "learning_rate": 2.7892948615385957e-05, "loss": 0.08147130608558655, "memory(GiB)": 122.96, "step": 42360, "token_acc": 0.9738537324744221, "train_speed(iter/s)": 0.233613 }, { "epoch": 3.229285768732373, "grad_norm": 0.9469267129898071, "learning_rate": 2.788220972067157e-05, "loss": 0.06097877025604248, "memory(GiB)": 122.96, "step": 42365, "token_acc": 0.9781337401474701, "train_speed(iter/s)": 0.233621 }, { "epoch": 3.229666895342633, "grad_norm": 0.7664675712585449, "learning_rate": 2.7871472094309847e-05, "loss": 0.05720457434654236, "memory(GiB)": 122.96, "step": 42370, "token_acc": 0.9764226638686582, "train_speed(iter/s)": 0.233624 }, { "epoch": 3.230048021952893, "grad_norm": 0.9239668846130371, "learning_rate": 2.7860735736916487e-05, "loss": 0.04413672685623169, "memory(GiB)": 122.96, "step": 42375, "token_acc": 0.981178196040088, "train_speed(iter/s)": 0.233632 }, { "epoch": 3.230429148563153, "grad_norm": 0.7836955785751343, "learning_rate": 2.7850000649107188e-05, "loss": 0.07724674344062805, "memory(GiB)": 122.96, "step": 42380, "token_acc": 0.9788797061524335, "train_speed(iter/s)": 0.23363 }, { "epoch": 3.2308102751734125, "grad_norm": 0.7979056239128113, "learning_rate": 2.783926683149759e-05, "loss": 0.06362406015396119, "memory(GiB)": 122.96, "step": 42385, "token_acc": 0.9770174172936206, "train_speed(iter/s)": 0.23363 }, { "epoch": 3.2311914017836725, "grad_norm": 0.6546550393104553, "learning_rate": 2.782853428470318e-05, "loss": 0.05029805898666382, "memory(GiB)": 122.96, "step": 42390, "token_acc": 0.9797803730172564, "train_speed(iter/s)": 0.233635 }, { "epoch": 3.2315725283939325, "grad_norm": 0.7344277501106262, "learning_rate": 2.7817803009339438e-05, "loss": 0.0731450378894806, "memory(GiB)": 122.96, "step": 42395, "token_acc": 0.9713396659023911, "train_speed(iter/s)": 0.23364 }, { "epoch": 3.2319536550041925, "grad_norm": 1.6547328233718872, "learning_rate": 2.7807073006021777e-05, "loss": 0.05156984329223633, "memory(GiB)": 122.96, "step": 42400, "token_acc": 0.9817422249729227, "train_speed(iter/s)": 0.233643 }, { "epoch": 3.2319536550041925, "eval_loss": 0.06537575274705887, "eval_runtime": 220.0535, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9728706102042045, "step": 42400 }, { "epoch": 3.232334781614452, "grad_norm": 0.683313250541687, "learning_rate": 2.7796344275365472e-05, "loss": 0.06546038985252381, "memory(GiB)": 122.96, "step": 42405, "token_acc": 0.9729916034172992, "train_speed(iter/s)": 0.233368 }, { "epoch": 3.232715908224712, "grad_norm": 1.1989493370056152, "learning_rate": 2.7785616817985783e-05, "loss": 0.05036668181419372, "memory(GiB)": 122.96, "step": 42410, "token_acc": 0.9780787738371057, "train_speed(iter/s)": 0.233373 }, { "epoch": 3.233097034834972, "grad_norm": 1.1419326066970825, "learning_rate": 2.7774890634497907e-05, "loss": 0.07751496434211731, "memory(GiB)": 122.96, "step": 42415, "token_acc": 0.9680019540791402, "train_speed(iter/s)": 0.233378 }, { "epoch": 3.233478161445232, "grad_norm": 0.7279849052429199, "learning_rate": 2.776416572551687e-05, "loss": 0.05963137149810791, "memory(GiB)": 122.96, "step": 42420, "token_acc": 0.9768885822697482, "train_speed(iter/s)": 0.233384 }, { "epoch": 3.233859288055492, "grad_norm": 0.7193878889083862, "learning_rate": 2.775344209165779e-05, "loss": 0.056727665662765506, "memory(GiB)": 122.96, "step": 42425, "token_acc": 0.9763549415515409, "train_speed(iter/s)": 0.233391 }, { "epoch": 3.234240414665752, "grad_norm": 2.8124334812164307, "learning_rate": 2.774271973353554e-05, "loss": 0.09787141680717468, "memory(GiB)": 122.96, "step": 42430, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.233398 }, { "epoch": 3.2346215412760118, "grad_norm": 0.8358386754989624, "learning_rate": 2.773199865176503e-05, "loss": 0.05534396171569824, "memory(GiB)": 122.96, "step": 42435, "token_acc": 0.9796009863259358, "train_speed(iter/s)": 0.233403 }, { "epoch": 3.2350026678862718, "grad_norm": 0.9465641379356384, "learning_rate": 2.7721278846961087e-05, "loss": 0.08611618280410767, "memory(GiB)": 122.96, "step": 42440, "token_acc": 0.9669946699466995, "train_speed(iter/s)": 0.233409 }, { "epoch": 3.235383794496532, "grad_norm": 1.398116111755371, "learning_rate": 2.771056031973839e-05, "loss": 0.052656954526901244, "memory(GiB)": 122.96, "step": 42445, "token_acc": 0.9758839157980789, "train_speed(iter/s)": 0.233417 }, { "epoch": 3.235764921106792, "grad_norm": 2.1333959102630615, "learning_rate": 2.7699843070711618e-05, "loss": 0.058349609375, "memory(GiB)": 122.96, "step": 42450, "token_acc": 0.9813307802776448, "train_speed(iter/s)": 0.233423 }, { "epoch": 3.2361460477170514, "grad_norm": 0.8491339087486267, "learning_rate": 2.7689127100495387e-05, "loss": 0.03567093908786774, "memory(GiB)": 122.96, "step": 42455, "token_acc": 0.9817015952455427, "train_speed(iter/s)": 0.233428 }, { "epoch": 3.2365271743273114, "grad_norm": 1.2560086250305176, "learning_rate": 2.7678412409704163e-05, "loss": 0.07583127617835998, "memory(GiB)": 122.96, "step": 42460, "token_acc": 0.9703764320785597, "train_speed(iter/s)": 0.233431 }, { "epoch": 3.2369083009375714, "grad_norm": 1.1431241035461426, "learning_rate": 2.7667698998952403e-05, "loss": 0.03927004039287567, "memory(GiB)": 122.96, "step": 42465, "token_acc": 0.9859518348623854, "train_speed(iter/s)": 0.233437 }, { "epoch": 3.2372894275478314, "grad_norm": 0.6823190450668335, "learning_rate": 2.7656986868854486e-05, "loss": 0.05524343252182007, "memory(GiB)": 122.96, "step": 42470, "token_acc": 0.9747334599094494, "train_speed(iter/s)": 0.233438 }, { "epoch": 3.2376705541580915, "grad_norm": 0.9150939583778381, "learning_rate": 2.7646276020024676e-05, "loss": 0.055554866790771484, "memory(GiB)": 122.96, "step": 42475, "token_acc": 0.9751364463311097, "train_speed(iter/s)": 0.233445 }, { "epoch": 3.2380516807683515, "grad_norm": 0.16562843322753906, "learning_rate": 2.763556645307719e-05, "loss": 0.05763424038887024, "memory(GiB)": 122.96, "step": 42480, "token_acc": 0.9798607657881651, "train_speed(iter/s)": 0.233453 }, { "epoch": 3.238432807378611, "grad_norm": 0.1558966040611267, "learning_rate": 2.762485816862621e-05, "loss": 0.09004406332969665, "memory(GiB)": 122.96, "step": 42485, "token_acc": 0.9781063406312198, "train_speed(iter/s)": 0.233459 }, { "epoch": 3.238813933988871, "grad_norm": 2.2350833415985107, "learning_rate": 2.761415116728576e-05, "loss": 0.06885814070701599, "memory(GiB)": 122.96, "step": 42490, "token_acc": 0.97362539114886, "train_speed(iter/s)": 0.233466 }, { "epoch": 3.239195060599131, "grad_norm": 0.7142639756202698, "learning_rate": 2.7603445449669863e-05, "loss": 0.05394957661628723, "memory(GiB)": 122.96, "step": 42495, "token_acc": 0.975059697532502, "train_speed(iter/s)": 0.233471 }, { "epoch": 3.239576187209391, "grad_norm": 0.9952199459075928, "learning_rate": 2.7592741016392452e-05, "loss": 0.06824737787246704, "memory(GiB)": 122.96, "step": 42500, "token_acc": 0.9745752201413865, "train_speed(iter/s)": 0.233473 }, { "epoch": 3.2399573138196507, "grad_norm": 2.21818470954895, "learning_rate": 2.7582037868067346e-05, "loss": 0.10317122936248779, "memory(GiB)": 122.96, "step": 42505, "token_acc": 0.972485592117494, "train_speed(iter/s)": 0.233478 }, { "epoch": 3.2403384404299107, "grad_norm": 1.1227920055389404, "learning_rate": 2.7571336005308335e-05, "loss": 0.05135197639465332, "memory(GiB)": 122.96, "step": 42510, "token_acc": 0.9788764044943821, "train_speed(iter/s)": 0.233482 }, { "epoch": 3.2407195670401707, "grad_norm": 2.1567065715789795, "learning_rate": 2.7560635428729135e-05, "loss": 0.06397298574447632, "memory(GiB)": 122.96, "step": 42515, "token_acc": 0.9758175955780746, "train_speed(iter/s)": 0.233488 }, { "epoch": 3.2411006936504307, "grad_norm": 0.5576828718185425, "learning_rate": 2.7549936138943345e-05, "loss": 0.05491306781768799, "memory(GiB)": 122.96, "step": 42520, "token_acc": 0.9773681055155875, "train_speed(iter/s)": 0.233494 }, { "epoch": 3.2414818202606908, "grad_norm": 0.8951201438903809, "learning_rate": 2.753923813656456e-05, "loss": 0.06830212473869324, "memory(GiB)": 122.96, "step": 42525, "token_acc": 0.9706293706293706, "train_speed(iter/s)": 0.233495 }, { "epoch": 3.2418629468709503, "grad_norm": 0.8029420971870422, "learning_rate": 2.7528541422206217e-05, "loss": 0.048886162042617795, "memory(GiB)": 122.96, "step": 42530, "token_acc": 0.9796131124753852, "train_speed(iter/s)": 0.233498 }, { "epoch": 3.2422440734812104, "grad_norm": 1.6024067401885986, "learning_rate": 2.751784599648174e-05, "loss": 0.06046299338340759, "memory(GiB)": 122.96, "step": 42535, "token_acc": 0.9790432801822323, "train_speed(iter/s)": 0.233503 }, { "epoch": 3.2426252000914704, "grad_norm": 1.3231877088546753, "learning_rate": 2.7507151860004487e-05, "loss": 0.07005232572555542, "memory(GiB)": 122.96, "step": 42540, "token_acc": 0.9747466071121801, "train_speed(iter/s)": 0.233509 }, { "epoch": 3.2430063267017304, "grad_norm": 0.4790107011795044, "learning_rate": 2.7496459013387675e-05, "loss": 0.06078174114227295, "memory(GiB)": 122.96, "step": 42545, "token_acc": 0.9767633875914131, "train_speed(iter/s)": 0.233511 }, { "epoch": 3.2433874533119904, "grad_norm": 1.602644443511963, "learning_rate": 2.7485767457244492e-05, "loss": 0.06068713068962097, "memory(GiB)": 122.96, "step": 42550, "token_acc": 0.9697794718213995, "train_speed(iter/s)": 0.233518 }, { "epoch": 3.24376857992225, "grad_norm": 1.1763359308242798, "learning_rate": 2.7475077192188104e-05, "loss": 0.06238076686859131, "memory(GiB)": 122.96, "step": 42555, "token_acc": 0.9803967327887981, "train_speed(iter/s)": 0.233524 }, { "epoch": 3.24414970653251, "grad_norm": 0.9470227360725403, "learning_rate": 2.746438821883149e-05, "loss": 0.05665228962898254, "memory(GiB)": 122.96, "step": 42560, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.233528 }, { "epoch": 3.24453083314277, "grad_norm": 0.8338575959205627, "learning_rate": 2.745370053778763e-05, "loss": 0.06641061305999756, "memory(GiB)": 122.96, "step": 42565, "token_acc": 0.9759557585958163, "train_speed(iter/s)": 0.233535 }, { "epoch": 3.24491195975303, "grad_norm": 0.0708136335015297, "learning_rate": 2.7443014149669444e-05, "loss": 0.05849201679229736, "memory(GiB)": 122.96, "step": 42570, "token_acc": 0.9648616125150421, "train_speed(iter/s)": 0.233541 }, { "epoch": 3.24529308636329, "grad_norm": 1.2843830585479736, "learning_rate": 2.7432329055089696e-05, "loss": 0.07030618786811829, "memory(GiB)": 122.96, "step": 42575, "token_acc": 0.971003717472119, "train_speed(iter/s)": 0.233548 }, { "epoch": 3.2456742129735496, "grad_norm": 1.155849575996399, "learning_rate": 2.7421645254661165e-05, "loss": 0.07388638257980347, "memory(GiB)": 122.96, "step": 42580, "token_acc": 0.9722522522522522, "train_speed(iter/s)": 0.233556 }, { "epoch": 3.2460553395838097, "grad_norm": 0.7255603671073914, "learning_rate": 2.7410962748996495e-05, "loss": 0.08492366075515748, "memory(GiB)": 122.96, "step": 42585, "token_acc": 0.9679817905918058, "train_speed(iter/s)": 0.233559 }, { "epoch": 3.2464364661940697, "grad_norm": 1.1885102987289429, "learning_rate": 2.7400281538708273e-05, "loss": 0.05834152102470398, "memory(GiB)": 122.96, "step": 42590, "token_acc": 0.9705254164886801, "train_speed(iter/s)": 0.233568 }, { "epoch": 3.2468175928043297, "grad_norm": 1.6499050855636597, "learning_rate": 2.7389601624409055e-05, "loss": 0.046013647317886354, "memory(GiB)": 122.96, "step": 42595, "token_acc": 0.9808013355592654, "train_speed(iter/s)": 0.233577 }, { "epoch": 3.2471987194145897, "grad_norm": 0.7977024912834167, "learning_rate": 2.7378923006711238e-05, "loss": 0.07959501147270202, "memory(GiB)": 122.96, "step": 42600, "token_acc": 0.9735254691689008, "train_speed(iter/s)": 0.233585 }, { "epoch": 3.2471987194145897, "eval_loss": 0.06596631556749344, "eval_runtime": 220.2794, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "eval_token_acc": 0.9723284741883019, "step": 42600 }, { "epoch": 3.2475798460248493, "grad_norm": 0.6999056935310364, "learning_rate": 2.736824568622721e-05, "loss": 0.06116113066673279, "memory(GiB)": 122.96, "step": 42605, "token_acc": 0.9726316395815182, "train_speed(iter/s)": 0.233306 }, { "epoch": 3.2479609726351093, "grad_norm": 1.7752422094345093, "learning_rate": 2.7357569663569293e-05, "loss": 0.06276138424873352, "memory(GiB)": 122.96, "step": 42610, "token_acc": 0.9783386874713171, "train_speed(iter/s)": 0.233304 }, { "epoch": 3.2483420992453693, "grad_norm": 2.740935802459717, "learning_rate": 2.7346894939349653e-05, "loss": 0.028673011064529418, "memory(GiB)": 122.96, "step": 42615, "token_acc": 0.9860180600058258, "train_speed(iter/s)": 0.233312 }, { "epoch": 3.2487232258556293, "grad_norm": 1.0245343446731567, "learning_rate": 2.733622151418047e-05, "loss": 0.07028174996376038, "memory(GiB)": 122.96, "step": 42620, "token_acc": 0.9723439211391018, "train_speed(iter/s)": 0.233319 }, { "epoch": 3.2491043524658894, "grad_norm": 0.6487531661987305, "learning_rate": 2.7325549388673833e-05, "loss": 0.05863792300224304, "memory(GiB)": 122.96, "step": 42625, "token_acc": 0.9810874704491725, "train_speed(iter/s)": 0.233323 }, { "epoch": 3.249485479076149, "grad_norm": 1.0830954313278198, "learning_rate": 2.7314878563441693e-05, "loss": 0.04603674709796905, "memory(GiB)": 122.96, "step": 42630, "token_acc": 0.9736328125, "train_speed(iter/s)": 0.233331 }, { "epoch": 3.249866605686409, "grad_norm": 0.8688685894012451, "learning_rate": 2.7304209039095995e-05, "loss": 0.0803896427154541, "memory(GiB)": 122.96, "step": 42635, "token_acc": 0.9615799697819987, "train_speed(iter/s)": 0.233338 }, { "epoch": 3.250247732296669, "grad_norm": 1.6832741498947144, "learning_rate": 2.7293540816248607e-05, "loss": 0.05064167976379395, "memory(GiB)": 122.96, "step": 42640, "token_acc": 0.9779168200220831, "train_speed(iter/s)": 0.233347 }, { "epoch": 3.250628858906929, "grad_norm": 1.3469057083129883, "learning_rate": 2.7282873895511267e-05, "loss": 0.04696687161922455, "memory(GiB)": 122.96, "step": 42645, "token_acc": 0.9838282078472959, "train_speed(iter/s)": 0.233353 }, { "epoch": 3.2510099855171886, "grad_norm": 0.7458909749984741, "learning_rate": 2.7272208277495686e-05, "loss": 0.05144243836402893, "memory(GiB)": 122.96, "step": 42650, "token_acc": 0.9753496089120645, "train_speed(iter/s)": 0.233361 }, { "epoch": 3.2513911121274486, "grad_norm": 1.1928120851516724, "learning_rate": 2.7261543962813512e-05, "loss": 0.08403071165084838, "memory(GiB)": 122.96, "step": 42655, "token_acc": 0.9706512425021423, "train_speed(iter/s)": 0.233368 }, { "epoch": 3.2517722387377086, "grad_norm": 3.7232553958892822, "learning_rate": 2.7250880952076253e-05, "loss": 0.06704549789428711, "memory(GiB)": 122.96, "step": 42660, "token_acc": 0.9752994011976048, "train_speed(iter/s)": 0.233376 }, { "epoch": 3.2521533653479686, "grad_norm": 1.1748361587524414, "learning_rate": 2.72402192458954e-05, "loss": 0.04871947467327118, "memory(GiB)": 122.96, "step": 42665, "token_acc": 0.982064491509254, "train_speed(iter/s)": 0.233382 }, { "epoch": 3.2525344919582286, "grad_norm": 0.6439116597175598, "learning_rate": 2.7229558844882374e-05, "loss": 0.055596137046813966, "memory(GiB)": 122.96, "step": 42670, "token_acc": 0.9774538057297847, "train_speed(iter/s)": 0.233386 }, { "epoch": 3.2529156185684887, "grad_norm": 1.0294725894927979, "learning_rate": 2.7218899749648463e-05, "loss": 0.059202718734741214, "memory(GiB)": 122.96, "step": 42675, "token_acc": 0.975785896346644, "train_speed(iter/s)": 0.233388 }, { "epoch": 3.2532967451787482, "grad_norm": 1.2087512016296387, "learning_rate": 2.7208241960804932e-05, "loss": 0.03812982439994812, "memory(GiB)": 122.96, "step": 42680, "token_acc": 0.9850195897672275, "train_speed(iter/s)": 0.233394 }, { "epoch": 3.2536778717890082, "grad_norm": 0.9924610257148743, "learning_rate": 2.719758547896296e-05, "loss": 0.06293061971664429, "memory(GiB)": 122.96, "step": 42685, "token_acc": 0.9782825263996812, "train_speed(iter/s)": 0.2334 }, { "epoch": 3.2540589983992683, "grad_norm": 1.272883415222168, "learning_rate": 2.718693030473364e-05, "loss": 0.04352408051490784, "memory(GiB)": 122.96, "step": 42690, "token_acc": 0.9817162359824476, "train_speed(iter/s)": 0.233407 }, { "epoch": 3.2544401250095283, "grad_norm": 1.3397119045257568, "learning_rate": 2.7176276438728027e-05, "loss": 0.05131037831306458, "memory(GiB)": 122.96, "step": 42695, "token_acc": 0.9758083832335329, "train_speed(iter/s)": 0.233415 }, { "epoch": 3.254821251619788, "grad_norm": 0.7463000416755676, "learning_rate": 2.7165623881557023e-05, "loss": 0.0645828366279602, "memory(GiB)": 122.96, "step": 42700, "token_acc": 0.9743533053283219, "train_speed(iter/s)": 0.23342 }, { "epoch": 3.255202378230048, "grad_norm": 1.0446393489837646, "learning_rate": 2.7154972633831522e-05, "loss": 0.0653878927230835, "memory(GiB)": 122.96, "step": 42705, "token_acc": 0.973252073810733, "train_speed(iter/s)": 0.233427 }, { "epoch": 3.255583504840308, "grad_norm": 0.7506847381591797, "learning_rate": 2.714432269616235e-05, "loss": 0.05527141094207764, "memory(GiB)": 122.96, "step": 42710, "token_acc": 0.976630083925113, "train_speed(iter/s)": 0.23343 }, { "epoch": 3.255964631450568, "grad_norm": 1.1880439519882202, "learning_rate": 2.7133674069160186e-05, "loss": 0.06855987906455993, "memory(GiB)": 122.96, "step": 42715, "token_acc": 0.9747606614447345, "train_speed(iter/s)": 0.233438 }, { "epoch": 3.256345758060828, "grad_norm": 1.8700916767120361, "learning_rate": 2.712302675343571e-05, "loss": 0.05160095095634461, "memory(GiB)": 122.96, "step": 42720, "token_acc": 0.9749646678780537, "train_speed(iter/s)": 0.233444 }, { "epoch": 3.256726884671088, "grad_norm": 1.023488163948059, "learning_rate": 2.7112380749599496e-05, "loss": 0.07144472002983093, "memory(GiB)": 122.96, "step": 42725, "token_acc": 0.9634594594594594, "train_speed(iter/s)": 0.233451 }, { "epoch": 3.2571080112813475, "grad_norm": 0.988815188407898, "learning_rate": 2.7101736058262016e-05, "loss": 0.0679600715637207, "memory(GiB)": 122.96, "step": 42730, "token_acc": 0.975397973950796, "train_speed(iter/s)": 0.233457 }, { "epoch": 3.2574891378916075, "grad_norm": 1.0338600873947144, "learning_rate": 2.709109268003372e-05, "loss": 0.05806206464767456, "memory(GiB)": 122.96, "step": 42735, "token_acc": 0.974973563623546, "train_speed(iter/s)": 0.233465 }, { "epoch": 3.2578702645018676, "grad_norm": 0.9501224756240845, "learning_rate": 2.7080450615524968e-05, "loss": 0.06254867911338806, "memory(GiB)": 122.96, "step": 42740, "token_acc": 0.9733100523821402, "train_speed(iter/s)": 0.233472 }, { "epoch": 3.2582513911121276, "grad_norm": 1.8176747560501099, "learning_rate": 2.7069809865345987e-05, "loss": 0.04526399075984955, "memory(GiB)": 122.96, "step": 42745, "token_acc": 0.9859180357465246, "train_speed(iter/s)": 0.233476 }, { "epoch": 3.258632517722387, "grad_norm": 1.0573756694793701, "learning_rate": 2.705917043010702e-05, "loss": 0.06219180226325989, "memory(GiB)": 122.96, "step": 42750, "token_acc": 0.9762395441871742, "train_speed(iter/s)": 0.233476 }, { "epoch": 3.259013644332647, "grad_norm": 0.8191227316856384, "learning_rate": 2.7048532310418156e-05, "loss": 0.05459409952163696, "memory(GiB)": 122.96, "step": 42755, "token_acc": 0.9813495661183784, "train_speed(iter/s)": 0.23348 }, { "epoch": 3.259394770942907, "grad_norm": 0.5790355801582336, "learning_rate": 2.7037895506889456e-05, "loss": 0.0374012291431427, "memory(GiB)": 122.96, "step": 42760, "token_acc": 0.9831748354059985, "train_speed(iter/s)": 0.233487 }, { "epoch": 3.259775897553167, "grad_norm": 2.1171586513519287, "learning_rate": 2.7027260020130905e-05, "loss": 0.04637461006641388, "memory(GiB)": 122.96, "step": 42765, "token_acc": 0.9774730656219393, "train_speed(iter/s)": 0.233494 }, { "epoch": 3.2601570241634272, "grad_norm": 0.8658002018928528, "learning_rate": 2.7016625850752374e-05, "loss": 0.07586174011230469, "memory(GiB)": 122.96, "step": 42770, "token_acc": 0.9712306872669153, "train_speed(iter/s)": 0.233503 }, { "epoch": 3.2605381507736872, "grad_norm": 1.0051231384277344, "learning_rate": 2.7005992999363688e-05, "loss": 0.06472283601760864, "memory(GiB)": 122.96, "step": 42775, "token_acc": 0.9743685687558465, "train_speed(iter/s)": 0.233507 }, { "epoch": 3.260919277383947, "grad_norm": 0.3723355531692505, "learning_rate": 2.699536146657462e-05, "loss": 0.04558103382587433, "memory(GiB)": 122.96, "step": 42780, "token_acc": 0.9857317570322055, "train_speed(iter/s)": 0.233515 }, { "epoch": 3.261300403994207, "grad_norm": 0.6477387547492981, "learning_rate": 2.69847312529948e-05, "loss": 0.05501596331596374, "memory(GiB)": 122.96, "step": 42785, "token_acc": 0.9769274057400112, "train_speed(iter/s)": 0.233523 }, { "epoch": 3.261681530604467, "grad_norm": 0.7046604156494141, "learning_rate": 2.6974102359233834e-05, "loss": 0.04391325414180756, "memory(GiB)": 122.96, "step": 42790, "token_acc": 0.981922525107604, "train_speed(iter/s)": 0.233527 }, { "epoch": 3.262062657214727, "grad_norm": 0.04762738198041916, "learning_rate": 2.6963474785901267e-05, "loss": 0.035621154308319095, "memory(GiB)": 122.96, "step": 42795, "token_acc": 0.9836692147324531, "train_speed(iter/s)": 0.233534 }, { "epoch": 3.2624437838249865, "grad_norm": 0.6623939275741577, "learning_rate": 2.69528485336065e-05, "loss": 0.0626000702381134, "memory(GiB)": 122.96, "step": 42800, "token_acc": 0.9743700193840189, "train_speed(iter/s)": 0.23354 }, { "epoch": 3.2624437838249865, "eval_loss": 0.06727106124162674, "eval_runtime": 218.6048, "eval_samples_per_second": 2.424, "eval_steps_per_second": 2.424, "eval_token_acc": 0.9724188301909523, "step": 42800 }, { "epoch": 3.2628249104352465, "grad_norm": 2.086151361465454, "learning_rate": 2.6942223602958917e-05, "loss": 0.08656785488128663, "memory(GiB)": 122.96, "step": 42805, "token_acc": 0.9723977091574659, "train_speed(iter/s)": 0.233269 }, { "epoch": 3.2632060370455065, "grad_norm": 1.3119465112686157, "learning_rate": 2.693159999456783e-05, "loss": 0.07125670313835145, "memory(GiB)": 122.96, "step": 42810, "token_acc": 0.966866009374495, "train_speed(iter/s)": 0.233275 }, { "epoch": 3.2635871636557665, "grad_norm": 1.1251730918884277, "learning_rate": 2.6920977709042412e-05, "loss": 0.06106266975402832, "memory(GiB)": 122.96, "step": 42815, "token_acc": 0.9716053299492385, "train_speed(iter/s)": 0.23328 }, { "epoch": 3.2639682902660265, "grad_norm": 1.5668684244155884, "learning_rate": 2.6910356746991823e-05, "loss": 0.04346319437026978, "memory(GiB)": 122.96, "step": 42820, "token_acc": 0.9799749687108886, "train_speed(iter/s)": 0.23329 }, { "epoch": 3.2643494168762865, "grad_norm": 1.2774184942245483, "learning_rate": 2.6899737109025125e-05, "loss": 0.06582842469215393, "memory(GiB)": 122.96, "step": 42825, "token_acc": 0.973630831643002, "train_speed(iter/s)": 0.233299 }, { "epoch": 3.264730543486546, "grad_norm": 1.1230212450027466, "learning_rate": 2.68891187957513e-05, "loss": 0.0769266963005066, "memory(GiB)": 122.96, "step": 42830, "token_acc": 0.975074646241724, "train_speed(iter/s)": 0.233303 }, { "epoch": 3.265111670096806, "grad_norm": 1.1386878490447998, "learning_rate": 2.6878501807779295e-05, "loss": 0.06817191243171691, "memory(GiB)": 122.96, "step": 42835, "token_acc": 0.9745587533789156, "train_speed(iter/s)": 0.233308 }, { "epoch": 3.265492796707066, "grad_norm": 0.8600606322288513, "learning_rate": 2.6867886145717886e-05, "loss": 0.06460253596305847, "memory(GiB)": 122.96, "step": 42840, "token_acc": 0.9771858428906154, "train_speed(iter/s)": 0.23331 }, { "epoch": 3.265873923317326, "grad_norm": 0.7039764523506165, "learning_rate": 2.6857271810175866e-05, "loss": 0.05943437218666077, "memory(GiB)": 122.96, "step": 42845, "token_acc": 0.9704757858963466, "train_speed(iter/s)": 0.233312 }, { "epoch": 3.2662550499275858, "grad_norm": 0.48147517442703247, "learning_rate": 2.6846658801761926e-05, "loss": 0.054924231767654416, "memory(GiB)": 122.96, "step": 42850, "token_acc": 0.9803733186933846, "train_speed(iter/s)": 0.233312 }, { "epoch": 3.2666361765378458, "grad_norm": 1.4190236330032349, "learning_rate": 2.6836047121084644e-05, "loss": 0.05920307040214538, "memory(GiB)": 122.96, "step": 42855, "token_acc": 0.9671111111111111, "train_speed(iter/s)": 0.233321 }, { "epoch": 3.267017303148106, "grad_norm": 1.7944756746292114, "learning_rate": 2.6825436768752565e-05, "loss": 0.05452497005462646, "memory(GiB)": 122.96, "step": 42860, "token_acc": 0.9803431022158685, "train_speed(iter/s)": 0.233329 }, { "epoch": 3.267398429758366, "grad_norm": 2.0745229721069336, "learning_rate": 2.6814827745374167e-05, "loss": 0.07820318937301636, "memory(GiB)": 122.96, "step": 42865, "token_acc": 0.9708597285067874, "train_speed(iter/s)": 0.233334 }, { "epoch": 3.267779556368626, "grad_norm": 0.6876674294471741, "learning_rate": 2.6804220051557782e-05, "loss": 0.06902634501457214, "memory(GiB)": 122.96, "step": 42870, "token_acc": 0.9721714401787528, "train_speed(iter/s)": 0.23334 }, { "epoch": 3.2681606829788854, "grad_norm": 2.2552127838134766, "learning_rate": 2.6793613687911732e-05, "loss": 0.10674506425857544, "memory(GiB)": 122.96, "step": 42875, "token_acc": 0.9669653839395537, "train_speed(iter/s)": 0.233345 }, { "epoch": 3.2685418095891454, "grad_norm": 0.7671197056770325, "learning_rate": 2.6783008655044273e-05, "loss": 0.03793119490146637, "memory(GiB)": 122.96, "step": 42880, "token_acc": 0.9812108559498957, "train_speed(iter/s)": 0.233351 }, { "epoch": 3.2689229361994054, "grad_norm": 1.1196846961975098, "learning_rate": 2.67724049535635e-05, "loss": 0.06871196627616882, "memory(GiB)": 122.96, "step": 42885, "token_acc": 0.9767589266849778, "train_speed(iter/s)": 0.233357 }, { "epoch": 3.2693040628096655, "grad_norm": 0.6253530383110046, "learning_rate": 2.6761802584077522e-05, "loss": 0.06854056119918824, "memory(GiB)": 122.96, "step": 42890, "token_acc": 0.9663120567375887, "train_speed(iter/s)": 0.233364 }, { "epoch": 3.2696851894199255, "grad_norm": 1.3982338905334473, "learning_rate": 2.6751201547194345e-05, "loss": 0.057530772686004636, "memory(GiB)": 122.96, "step": 42895, "token_acc": 0.9723132969034608, "train_speed(iter/s)": 0.233372 }, { "epoch": 3.270066316030185, "grad_norm": 0.0003903468023054302, "learning_rate": 2.6740601843521852e-05, "loss": 0.05774573683738708, "memory(GiB)": 122.96, "step": 42900, "token_acc": 0.9703125, "train_speed(iter/s)": 0.233378 }, { "epoch": 3.270447442640445, "grad_norm": 0.5279640555381775, "learning_rate": 2.6730003473667903e-05, "loss": 0.04635041058063507, "memory(GiB)": 122.96, "step": 42905, "token_acc": 0.9634551495016611, "train_speed(iter/s)": 0.233386 }, { "epoch": 3.270828569250705, "grad_norm": 1.2309820652008057, "learning_rate": 2.671940643824029e-05, "loss": 0.041554155945777896, "memory(GiB)": 122.96, "step": 42910, "token_acc": 0.9763277693474962, "train_speed(iter/s)": 0.233395 }, { "epoch": 3.271209695860965, "grad_norm": 1.037993311882019, "learning_rate": 2.670881073784666e-05, "loss": 0.061218470335006714, "memory(GiB)": 122.96, "step": 42915, "token_acc": 0.9771543086172345, "train_speed(iter/s)": 0.233397 }, { "epoch": 3.271590822471225, "grad_norm": 1.7300680875778198, "learning_rate": 2.6698216373094674e-05, "loss": 0.06739042401313781, "memory(GiB)": 122.96, "step": 42920, "token_acc": 0.9744140275541242, "train_speed(iter/s)": 0.233402 }, { "epoch": 3.2719719490814847, "grad_norm": 1.25575852394104, "learning_rate": 2.668762334459183e-05, "loss": 0.08511364459991455, "memory(GiB)": 122.96, "step": 42925, "token_acc": 0.96760710553814, "train_speed(iter/s)": 0.233408 }, { "epoch": 3.2723530756917447, "grad_norm": 0.5344502329826355, "learning_rate": 2.6677031652945593e-05, "loss": 0.03652408719062805, "memory(GiB)": 122.96, "step": 42930, "token_acc": 0.9838134430727024, "train_speed(iter/s)": 0.233414 }, { "epoch": 3.2727342023020047, "grad_norm": 1.253894329071045, "learning_rate": 2.666644129876339e-05, "loss": 0.07466774582862853, "memory(GiB)": 122.96, "step": 42935, "token_acc": 0.9690500071643502, "train_speed(iter/s)": 0.233419 }, { "epoch": 3.2731153289122648, "grad_norm": 1.7072559595108032, "learning_rate": 2.665585228265247e-05, "loss": 0.05427144169807434, "memory(GiB)": 122.96, "step": 42940, "token_acc": 0.9797320508416352, "train_speed(iter/s)": 0.233422 }, { "epoch": 3.2734964555225248, "grad_norm": 1.2604657411575317, "learning_rate": 2.66452646052201e-05, "loss": 0.050177091360092164, "memory(GiB)": 122.96, "step": 42945, "token_acc": 0.973968105065666, "train_speed(iter/s)": 0.233429 }, { "epoch": 3.2738775821327843, "grad_norm": 1.5712522268295288, "learning_rate": 2.6634678267073433e-05, "loss": 0.046173095703125, "memory(GiB)": 122.96, "step": 42950, "token_acc": 0.9814773488060701, "train_speed(iter/s)": 0.233436 }, { "epoch": 3.2742587087430444, "grad_norm": 1.0377196073532104, "learning_rate": 2.6624093268819505e-05, "loss": 0.06701436042785644, "memory(GiB)": 122.96, "step": 42955, "token_acc": 0.9753634894991923, "train_speed(iter/s)": 0.233442 }, { "epoch": 3.2746398353533044, "grad_norm": 1.124483346939087, "learning_rate": 2.6613509611065397e-05, "loss": 0.05004933476448059, "memory(GiB)": 122.96, "step": 42960, "token_acc": 0.9837013062073748, "train_speed(iter/s)": 0.23344 }, { "epoch": 3.2750209619635644, "grad_norm": 1.3881689310073853, "learning_rate": 2.6602927294417956e-05, "loss": 0.07120344638824463, "memory(GiB)": 122.96, "step": 42965, "token_acc": 0.9729959600255156, "train_speed(iter/s)": 0.233446 }, { "epoch": 3.2754020885738244, "grad_norm": 1.135448932647705, "learning_rate": 2.659234631948407e-05, "loss": 0.06625648736953735, "memory(GiB)": 122.96, "step": 42970, "token_acc": 0.9755388713974328, "train_speed(iter/s)": 0.233453 }, { "epoch": 3.275783215184084, "grad_norm": 2.003007411956787, "learning_rate": 2.6581766686870507e-05, "loss": 0.0742311954498291, "memory(GiB)": 122.96, "step": 42975, "token_acc": 0.9781704781704782, "train_speed(iter/s)": 0.23346 }, { "epoch": 3.276164341794344, "grad_norm": 0.5491850972175598, "learning_rate": 2.6571188397183938e-05, "loss": 0.04553976655006409, "memory(GiB)": 122.96, "step": 42980, "token_acc": 0.9779929577464789, "train_speed(iter/s)": 0.23347 }, { "epoch": 3.276545468404604, "grad_norm": 0.8654868602752686, "learning_rate": 2.6560611451030988e-05, "loss": 0.0394243061542511, "memory(GiB)": 122.96, "step": 42985, "token_acc": 0.9852981214266268, "train_speed(iter/s)": 0.233476 }, { "epoch": 3.276926595014864, "grad_norm": 1.4415119886398315, "learning_rate": 2.6550035849018217e-05, "loss": 0.07932916879653931, "memory(GiB)": 122.96, "step": 42990, "token_acc": 0.9688550302487117, "train_speed(iter/s)": 0.233483 }, { "epoch": 3.2773077216251236, "grad_norm": 0.7831910848617554, "learning_rate": 2.6539461591752056e-05, "loss": 0.04906262159347534, "memory(GiB)": 122.96, "step": 42995, "token_acc": 0.9768243895709753, "train_speed(iter/s)": 0.233486 }, { "epoch": 3.2776888482353836, "grad_norm": 2.097409725189209, "learning_rate": 2.65288886798389e-05, "loss": 0.052426719665527345, "memory(GiB)": 122.96, "step": 43000, "token_acc": 0.9805555555555555, "train_speed(iter/s)": 0.233494 }, { "epoch": 3.2776888482353836, "eval_loss": 0.06629740446805954, "eval_runtime": 220.0038, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9726748388651286, "step": 43000 }, { "epoch": 3.2780699748456437, "grad_norm": 1.3887238502502441, "learning_rate": 2.651831711388507e-05, "loss": 0.07164985537528992, "memory(GiB)": 122.96, "step": 43005, "token_acc": 0.9727641072679605, "train_speed(iter/s)": 0.233217 }, { "epoch": 3.2784511014559037, "grad_norm": 0.7943814992904663, "learning_rate": 2.6507746894496777e-05, "loss": 0.05299915075302124, "memory(GiB)": 122.96, "step": 43010, "token_acc": 0.9812108559498957, "train_speed(iter/s)": 0.233225 }, { "epoch": 3.2788322280661637, "grad_norm": 1.3195894956588745, "learning_rate": 2.649717802228018e-05, "loss": 0.05487884879112244, "memory(GiB)": 122.96, "step": 43015, "token_acc": 0.9774810681546433, "train_speed(iter/s)": 0.233232 }, { "epoch": 3.2792133546764237, "grad_norm": 1.5997304916381836, "learning_rate": 2.6486610497841367e-05, "loss": 0.07814643383026124, "memory(GiB)": 122.96, "step": 43020, "token_acc": 0.973456987527982, "train_speed(iter/s)": 0.233241 }, { "epoch": 3.2795944812866833, "grad_norm": 0.6714390516281128, "learning_rate": 2.6476044321786318e-05, "loss": 0.03801571130752564, "memory(GiB)": 122.96, "step": 43025, "token_acc": 0.984415139578695, "train_speed(iter/s)": 0.233244 }, { "epoch": 3.2799756078969433, "grad_norm": 0.6634875535964966, "learning_rate": 2.646547949472096e-05, "loss": 0.05601094961166382, "memory(GiB)": 122.96, "step": 43030, "token_acc": 0.9778652238071815, "train_speed(iter/s)": 0.233251 }, { "epoch": 3.2803567345072033, "grad_norm": 0.5223979353904724, "learning_rate": 2.6454916017251158e-05, "loss": 0.05309295654296875, "memory(GiB)": 122.96, "step": 43035, "token_acc": 0.978008658008658, "train_speed(iter/s)": 0.233257 }, { "epoch": 3.2807378611174633, "grad_norm": 1.6771008968353271, "learning_rate": 2.6444353889982642e-05, "loss": 0.07242034673690796, "memory(GiB)": 122.96, "step": 43040, "token_acc": 0.9767709291628335, "train_speed(iter/s)": 0.233263 }, { "epoch": 3.281118987727723, "grad_norm": 1.278633952140808, "learning_rate": 2.6433793113521116e-05, "loss": 0.05275624394416809, "memory(GiB)": 122.96, "step": 43045, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.233271 }, { "epoch": 3.281500114337983, "grad_norm": 0.5493510961532593, "learning_rate": 2.6423233688472217e-05, "loss": 0.06562319993972779, "memory(GiB)": 122.96, "step": 43050, "token_acc": 0.9762005949851253, "train_speed(iter/s)": 0.233274 }, { "epoch": 3.281881240948243, "grad_norm": 0.8573600649833679, "learning_rate": 2.6412675615441434e-05, "loss": 0.052223026752471924, "memory(GiB)": 122.96, "step": 43055, "token_acc": 0.9785262206148282, "train_speed(iter/s)": 0.233279 }, { "epoch": 3.282262367558503, "grad_norm": 1.6509133577346802, "learning_rate": 2.6402118895034245e-05, "loss": 0.0759082317352295, "memory(GiB)": 122.96, "step": 43060, "token_acc": 0.9692099147947327, "train_speed(iter/s)": 0.233283 }, { "epoch": 3.282643494168763, "grad_norm": 0.8339722156524658, "learning_rate": 2.6391563527856057e-05, "loss": 0.07879940271377564, "memory(GiB)": 122.96, "step": 43065, "token_acc": 0.9631929046563192, "train_speed(iter/s)": 0.233291 }, { "epoch": 3.283024620779023, "grad_norm": 1.0828884840011597, "learning_rate": 2.638100951451211e-05, "loss": 0.052259761095047, "memory(GiB)": 122.96, "step": 43070, "token_acc": 0.9643469317792253, "train_speed(iter/s)": 0.2333 }, { "epoch": 3.2834057473892826, "grad_norm": 0.7441643476486206, "learning_rate": 2.6370456855607673e-05, "loss": 0.04365265965461731, "memory(GiB)": 122.96, "step": 43075, "token_acc": 0.9768058316766071, "train_speed(iter/s)": 0.233305 }, { "epoch": 3.2837868739995426, "grad_norm": 2.9038941860198975, "learning_rate": 2.63599055517479e-05, "loss": 0.0848358154296875, "memory(GiB)": 122.96, "step": 43080, "token_acc": 0.9723456790123457, "train_speed(iter/s)": 0.233313 }, { "epoch": 3.2841680006098026, "grad_norm": 0.97982257604599, "learning_rate": 2.63493556035378e-05, "loss": 0.0463539183139801, "memory(GiB)": 122.96, "step": 43085, "token_acc": 0.9849902534113061, "train_speed(iter/s)": 0.233319 }, { "epoch": 3.2845491272200626, "grad_norm": 0.11799737066030502, "learning_rate": 2.6338807011582446e-05, "loss": 0.06758823990821838, "memory(GiB)": 122.96, "step": 43090, "token_acc": 0.9730421094057458, "train_speed(iter/s)": 0.233325 }, { "epoch": 3.2849302538303222, "grad_norm": 1.0011916160583496, "learning_rate": 2.632825977648668e-05, "loss": 0.07166142463684082, "memory(GiB)": 122.96, "step": 43095, "token_acc": 0.9710655235762401, "train_speed(iter/s)": 0.233329 }, { "epoch": 3.2853113804405822, "grad_norm": 1.1652113199234009, "learning_rate": 2.6317713898855368e-05, "loss": 0.07425054907798767, "memory(GiB)": 122.96, "step": 43100, "token_acc": 0.9689922480620154, "train_speed(iter/s)": 0.233334 }, { "epoch": 3.2856925070508423, "grad_norm": 1.444705605506897, "learning_rate": 2.630716937929329e-05, "loss": 0.05553842186927795, "memory(GiB)": 122.96, "step": 43105, "token_acc": 0.9785695151352799, "train_speed(iter/s)": 0.233341 }, { "epoch": 3.2860736336611023, "grad_norm": 0.6072777509689331, "learning_rate": 2.6296626218405073e-05, "loss": 0.050655257701873777, "memory(GiB)": 122.96, "step": 43110, "token_acc": 0.9809470124013528, "train_speed(iter/s)": 0.233344 }, { "epoch": 3.2864547602713623, "grad_norm": 1.986367106437683, "learning_rate": 2.6286084416795354e-05, "loss": 0.06505702137947082, "memory(GiB)": 122.96, "step": 43115, "token_acc": 0.9766123316796598, "train_speed(iter/s)": 0.233351 }, { "epoch": 3.2868358868816223, "grad_norm": 1.9065816402435303, "learning_rate": 2.6275543975068662e-05, "loss": 0.06778401136398315, "memory(GiB)": 122.96, "step": 43120, "token_acc": 0.977907333537895, "train_speed(iter/s)": 0.233359 }, { "epoch": 3.287217013491882, "grad_norm": 0.839687168598175, "learning_rate": 2.6265004893829408e-05, "loss": 0.03966890573501587, "memory(GiB)": 122.96, "step": 43125, "token_acc": 0.9859350581698212, "train_speed(iter/s)": 0.233364 }, { "epoch": 3.287598140102142, "grad_norm": 1.4101654291152954, "learning_rate": 2.6254467173682002e-05, "loss": 0.08644756078720092, "memory(GiB)": 122.96, "step": 43130, "token_acc": 0.9606379368849678, "train_speed(iter/s)": 0.233372 }, { "epoch": 3.287979266712402, "grad_norm": 1.2434519529342651, "learning_rate": 2.624393081523069e-05, "loss": 0.06445740461349488, "memory(GiB)": 122.96, "step": 43135, "token_acc": 0.9739696312364425, "train_speed(iter/s)": 0.233379 }, { "epoch": 3.288360393322662, "grad_norm": 2.1101233959198, "learning_rate": 2.6233395819079703e-05, "loss": 0.09202950596809387, "memory(GiB)": 122.96, "step": 43140, "token_acc": 0.9713558243157224, "train_speed(iter/s)": 0.233388 }, { "epoch": 3.2887415199329215, "grad_norm": 0.9122503399848938, "learning_rate": 2.6222862185833196e-05, "loss": 0.06035665869712829, "memory(GiB)": 122.96, "step": 43145, "token_acc": 0.9799679487179487, "train_speed(iter/s)": 0.233395 }, { "epoch": 3.2891226465431815, "grad_norm": 1.0373295545578003, "learning_rate": 2.6212329916095178e-05, "loss": 0.0486588180065155, "memory(GiB)": 122.96, "step": 43150, "token_acc": 0.9794021144732045, "train_speed(iter/s)": 0.233399 }, { "epoch": 3.2895037731534416, "grad_norm": 1.1227869987487793, "learning_rate": 2.6201799010469664e-05, "loss": 0.05163888931274414, "memory(GiB)": 122.96, "step": 43155, "token_acc": 0.9781625533014602, "train_speed(iter/s)": 0.233404 }, { "epoch": 3.2898848997637016, "grad_norm": 1.6563754081726074, "learning_rate": 2.6191269469560547e-05, "loss": 0.06953897476196289, "memory(GiB)": 122.96, "step": 43160, "token_acc": 0.9696663296258847, "train_speed(iter/s)": 0.233411 }, { "epoch": 3.2902660263739616, "grad_norm": 0.4903081953525543, "learning_rate": 2.6180741293971628e-05, "loss": 0.04076177477836609, "memory(GiB)": 122.96, "step": 43165, "token_acc": 0.9853285137146502, "train_speed(iter/s)": 0.233418 }, { "epoch": 3.290647152984221, "grad_norm": 1.1328871250152588, "learning_rate": 2.6170214484306653e-05, "loss": 0.0523238480091095, "memory(GiB)": 122.96, "step": 43170, "token_acc": 0.9791755206119847, "train_speed(iter/s)": 0.233422 }, { "epoch": 3.291028279594481, "grad_norm": 1.3890408277511597, "learning_rate": 2.615968904116932e-05, "loss": 0.07925750613212586, "memory(GiB)": 122.96, "step": 43175, "token_acc": 0.9724179283465747, "train_speed(iter/s)": 0.233426 }, { "epoch": 3.291409406204741, "grad_norm": 1.4719852209091187, "learning_rate": 2.6149164965163166e-05, "loss": 0.05045233368873596, "memory(GiB)": 122.96, "step": 43180, "token_acc": 0.9827351203998183, "train_speed(iter/s)": 0.233432 }, { "epoch": 3.2917905328150012, "grad_norm": 1.1074665784835815, "learning_rate": 2.6138642256891722e-05, "loss": 0.032188969850540164, "memory(GiB)": 122.96, "step": 43185, "token_acc": 0.9832369942196532, "train_speed(iter/s)": 0.23344 }, { "epoch": 3.2921716594252612, "grad_norm": 0.06933192908763885, "learning_rate": 2.612812091695843e-05, "loss": 0.02745142877101898, "memory(GiB)": 122.96, "step": 43190, "token_acc": 0.9853889127632144, "train_speed(iter/s)": 0.233448 }, { "epoch": 3.292552786035521, "grad_norm": 0.8318012356758118, "learning_rate": 2.6117600945966608e-05, "loss": 0.08856772780418395, "memory(GiB)": 122.96, "step": 43195, "token_acc": 0.9716408856536245, "train_speed(iter/s)": 0.233452 }, { "epoch": 3.292933912645781, "grad_norm": 1.1519464254379272, "learning_rate": 2.610708234451954e-05, "loss": 0.0589733898639679, "memory(GiB)": 122.96, "step": 43200, "token_acc": 0.9795094226742447, "train_speed(iter/s)": 0.233455 }, { "epoch": 3.292933912645781, "eval_loss": 0.06555566936731339, "eval_runtime": 218.1299, "eval_samples_per_second": 2.43, "eval_steps_per_second": 2.43, "eval_token_acc": 0.9729910848744051, "step": 43200 }, { "epoch": 3.293315039256041, "grad_norm": 1.306340217590332, "learning_rate": 2.609656511322045e-05, "loss": 0.04675832092761993, "memory(GiB)": 122.96, "step": 43205, "token_acc": 0.9734317130047433, "train_speed(iter/s)": 0.233185 }, { "epoch": 3.293696165866301, "grad_norm": 1.379501223564148, "learning_rate": 2.6086049252672396e-05, "loss": 0.07079951763153076, "memory(GiB)": 122.96, "step": 43210, "token_acc": 0.9763202405562864, "train_speed(iter/s)": 0.23319 }, { "epoch": 3.294077292476561, "grad_norm": 0.8375155329704285, "learning_rate": 2.6075534763478447e-05, "loss": 0.0443683922290802, "memory(GiB)": 122.96, "step": 43215, "token_acc": 0.9776526482491642, "train_speed(iter/s)": 0.233195 }, { "epoch": 3.2944584190868205, "grad_norm": 1.8716472387313843, "learning_rate": 2.606502164624156e-05, "loss": 0.12098459005355836, "memory(GiB)": 122.96, "step": 43220, "token_acc": 0.9515850144092219, "train_speed(iter/s)": 0.233203 }, { "epoch": 3.2948395456970805, "grad_norm": 1.555069923400879, "learning_rate": 2.6054509901564607e-05, "loss": 0.07459107637405396, "memory(GiB)": 122.96, "step": 43225, "token_acc": 0.9662162162162162, "train_speed(iter/s)": 0.233212 }, { "epoch": 3.2952206723073405, "grad_norm": 1.3115854263305664, "learning_rate": 2.604399953005041e-05, "loss": 0.044482851028442384, "memory(GiB)": 122.96, "step": 43230, "token_acc": 0.9835620810724545, "train_speed(iter/s)": 0.233216 }, { "epoch": 3.2956017989176005, "grad_norm": 0.593228280544281, "learning_rate": 2.6033490532301654e-05, "loss": 0.08005259037017823, "memory(GiB)": 122.96, "step": 43235, "token_acc": 0.9717094703049759, "train_speed(iter/s)": 0.233222 }, { "epoch": 3.2959829255278605, "grad_norm": 0.5631779432296753, "learning_rate": 2.6022982908920988e-05, "loss": 0.055846214294433594, "memory(GiB)": 122.96, "step": 43240, "token_acc": 0.9850673194614443, "train_speed(iter/s)": 0.233227 }, { "epoch": 3.29636405213812, "grad_norm": 1.0380909442901611, "learning_rate": 2.6012476660511013e-05, "loss": 0.07757140398025512, "memory(GiB)": 122.96, "step": 43245, "token_acc": 0.9692240052758848, "train_speed(iter/s)": 0.233234 }, { "epoch": 3.29674517874838, "grad_norm": 0.9685123562812805, "learning_rate": 2.6001971787674166e-05, "loss": 0.07173245549201965, "memory(GiB)": 122.96, "step": 43250, "token_acc": 0.9790849673202614, "train_speed(iter/s)": 0.233235 }, { "epoch": 3.29712630535864, "grad_norm": 2.2753586769104004, "learning_rate": 2.5991468291012876e-05, "loss": 0.05635181665420532, "memory(GiB)": 122.96, "step": 43255, "token_acc": 0.9789901129943502, "train_speed(iter/s)": 0.233239 }, { "epoch": 3.2975074319689, "grad_norm": 0.6287574768066406, "learning_rate": 2.598096617112948e-05, "loss": 0.04113634824752808, "memory(GiB)": 122.96, "step": 43260, "token_acc": 0.9793379717478389, "train_speed(iter/s)": 0.233244 }, { "epoch": 3.29788855857916, "grad_norm": 0.3938201069831848, "learning_rate": 2.597046542862619e-05, "loss": 0.07683411836624146, "memory(GiB)": 122.96, "step": 43265, "token_acc": 0.9761207826220921, "train_speed(iter/s)": 0.233247 }, { "epoch": 3.2982696851894198, "grad_norm": 0.5408012866973877, "learning_rate": 2.59599660641052e-05, "loss": 0.0613761305809021, "memory(GiB)": 122.96, "step": 43270, "token_acc": 0.974191931846655, "train_speed(iter/s)": 0.233254 }, { "epoch": 3.29865081179968, "grad_norm": 0.8508277535438538, "learning_rate": 2.594946807816862e-05, "loss": 0.04948974847793579, "memory(GiB)": 122.96, "step": 43275, "token_acc": 0.978796992481203, "train_speed(iter/s)": 0.233258 }, { "epoch": 3.29903193840994, "grad_norm": 0.41940754652023315, "learning_rate": 2.5938971471418417e-05, "loss": 0.05962207317352295, "memory(GiB)": 122.96, "step": 43280, "token_acc": 0.9804921586127757, "train_speed(iter/s)": 0.233261 }, { "epoch": 3.2994130650202, "grad_norm": 1.1734434366226196, "learning_rate": 2.592847624445654e-05, "loss": 0.059108293056488036, "memory(GiB)": 122.96, "step": 43285, "token_acc": 0.97837122048113, "train_speed(iter/s)": 0.233268 }, { "epoch": 3.2997941916304594, "grad_norm": 1.3188318014144897, "learning_rate": 2.5917982397884866e-05, "loss": 0.05860029458999634, "memory(GiB)": 122.96, "step": 43290, "token_acc": 0.9783856159143076, "train_speed(iter/s)": 0.233273 }, { "epoch": 3.3001753182407194, "grad_norm": 0.9537093043327332, "learning_rate": 2.590748993230513e-05, "loss": 0.05869206786155701, "memory(GiB)": 122.96, "step": 43295, "token_acc": 0.9773143066227589, "train_speed(iter/s)": 0.233277 }, { "epoch": 3.3005564448509794, "grad_norm": 0.5334658026695251, "learning_rate": 2.589699884831904e-05, "loss": 0.05696225166320801, "memory(GiB)": 122.96, "step": 43300, "token_acc": 0.98104929051531, "train_speed(iter/s)": 0.233279 }, { "epoch": 3.3009375714612395, "grad_norm": 0.5653412938117981, "learning_rate": 2.588650914652823e-05, "loss": 0.07813243269920349, "memory(GiB)": 122.96, "step": 43305, "token_acc": 0.9738089299077076, "train_speed(iter/s)": 0.233284 }, { "epoch": 3.3013186980714995, "grad_norm": 1.1537421941757202, "learning_rate": 2.5876020827534207e-05, "loss": 0.07212315201759338, "memory(GiB)": 122.96, "step": 43310, "token_acc": 0.9771428571428571, "train_speed(iter/s)": 0.233289 }, { "epoch": 3.3016998246817595, "grad_norm": 1.065189003944397, "learning_rate": 2.586553389193846e-05, "loss": 0.03707319498062134, "memory(GiB)": 122.96, "step": 43315, "token_acc": 0.984241083771081, "train_speed(iter/s)": 0.233293 }, { "epoch": 3.302080951292019, "grad_norm": 1.0739792585372925, "learning_rate": 2.5855048340342324e-05, "loss": 0.04096525311470032, "memory(GiB)": 122.96, "step": 43320, "token_acc": 0.9840487238979119, "train_speed(iter/s)": 0.2333 }, { "epoch": 3.302462077902279, "grad_norm": 1.3751819133758545, "learning_rate": 2.5844564173347124e-05, "loss": 0.10850030183792114, "memory(GiB)": 122.96, "step": 43325, "token_acc": 0.9724137931034482, "train_speed(iter/s)": 0.233306 }, { "epoch": 3.302843204512539, "grad_norm": 0.600742757320404, "learning_rate": 2.5834081391554087e-05, "loss": 0.06470043659210205, "memory(GiB)": 122.96, "step": 43330, "token_acc": 0.9786336059087312, "train_speed(iter/s)": 0.233311 }, { "epoch": 3.303224331122799, "grad_norm": 1.007386565208435, "learning_rate": 2.582359999556433e-05, "loss": 0.06740909814834595, "memory(GiB)": 122.96, "step": 43335, "token_acc": 0.9738562091503268, "train_speed(iter/s)": 0.233314 }, { "epoch": 3.3036054577330587, "grad_norm": 0.8662719130516052, "learning_rate": 2.581311998597891e-05, "loss": 0.07285399436950683, "memory(GiB)": 122.96, "step": 43340, "token_acc": 0.9758847478860007, "train_speed(iter/s)": 0.233321 }, { "epoch": 3.3039865843433187, "grad_norm": 1.238338589668274, "learning_rate": 2.5802641363398837e-05, "loss": 0.06057997941970825, "memory(GiB)": 122.96, "step": 43345, "token_acc": 0.9798829553767374, "train_speed(iter/s)": 0.233327 }, { "epoch": 3.3043677109535787, "grad_norm": 0.5237149596214294, "learning_rate": 2.579216412842498e-05, "loss": 0.06359029412269593, "memory(GiB)": 122.96, "step": 43350, "token_acc": 0.9710620525059666, "train_speed(iter/s)": 0.23333 }, { "epoch": 3.3047488375638387, "grad_norm": 2.287907838821411, "learning_rate": 2.5781688281658172e-05, "loss": 0.053119432926177976, "memory(GiB)": 122.96, "step": 43355, "token_acc": 0.9806900918012029, "train_speed(iter/s)": 0.233333 }, { "epoch": 3.3051299641740988, "grad_norm": 0.8908995389938354, "learning_rate": 2.577121382369915e-05, "loss": 0.06950249671936035, "memory(GiB)": 122.96, "step": 43360, "token_acc": 0.9687947323217865, "train_speed(iter/s)": 0.233341 }, { "epoch": 3.305511090784359, "grad_norm": 1.611815333366394, "learning_rate": 2.5760740755148583e-05, "loss": 0.06057101488113403, "memory(GiB)": 122.96, "step": 43365, "token_acc": 0.9773796597900832, "train_speed(iter/s)": 0.233348 }, { "epoch": 3.3058922173946184, "grad_norm": 0.7775827646255493, "learning_rate": 2.575026907660707e-05, "loss": 0.05794335603713989, "memory(GiB)": 122.96, "step": 43370, "token_acc": 0.9748618292030665, "train_speed(iter/s)": 0.233352 }, { "epoch": 3.3062733440048784, "grad_norm": 0.530635416507721, "learning_rate": 2.573979878867507e-05, "loss": 0.048687133193016055, "memory(GiB)": 122.96, "step": 43375, "token_acc": 0.9825119236883942, "train_speed(iter/s)": 0.233358 }, { "epoch": 3.3066544706151384, "grad_norm": 1.5021804571151733, "learning_rate": 2.572932989195303e-05, "loss": 0.049078845977783205, "memory(GiB)": 122.96, "step": 43380, "token_acc": 0.9792082027912276, "train_speed(iter/s)": 0.233365 }, { "epoch": 3.3070355972253984, "grad_norm": 0.9418014883995056, "learning_rate": 2.571886238704131e-05, "loss": 0.03942302763462067, "memory(GiB)": 122.96, "step": 43385, "token_acc": 0.9836588282184137, "train_speed(iter/s)": 0.233373 }, { "epoch": 3.307416723835658, "grad_norm": 2.5579335689544678, "learning_rate": 2.5708396274540138e-05, "loss": 0.059057211875915526, "memory(GiB)": 122.96, "step": 43390, "token_acc": 0.9816313823163139, "train_speed(iter/s)": 0.233381 }, { "epoch": 3.307797850445918, "grad_norm": 0.7712226510047913, "learning_rate": 2.5697931555049704e-05, "loss": 0.05392873287200928, "memory(GiB)": 122.96, "step": 43395, "token_acc": 0.9806237558062375, "train_speed(iter/s)": 0.233382 }, { "epoch": 3.308178977056178, "grad_norm": 0.812745988368988, "learning_rate": 2.5687468229170148e-05, "loss": 0.0674196720123291, "memory(GiB)": 122.96, "step": 43400, "token_acc": 0.9765372168284789, "train_speed(iter/s)": 0.233388 }, { "epoch": 3.308178977056178, "eval_loss": 0.06446022540330887, "eval_runtime": 218.51, "eval_samples_per_second": 2.426, "eval_steps_per_second": 2.426, "eval_token_acc": 0.9732245045479188, "step": 43400 }, { "epoch": 3.308560103666438, "grad_norm": 0.8570162653923035, "learning_rate": 2.567700629750144e-05, "loss": 0.04314883053302765, "memory(GiB)": 122.96, "step": 43405, "token_acc": 0.9733470813830143, "train_speed(iter/s)": 0.233123 }, { "epoch": 3.308941230276698, "grad_norm": 2.0318408012390137, "learning_rate": 2.566654576064355e-05, "loss": 0.06737207174301148, "memory(GiB)": 122.96, "step": 43410, "token_acc": 0.9742103158736506, "train_speed(iter/s)": 0.233127 }, { "epoch": 3.309322356886958, "grad_norm": 1.230647325515747, "learning_rate": 2.5656086619196363e-05, "loss": 0.05435967445373535, "memory(GiB)": 122.96, "step": 43415, "token_acc": 0.9772030651340996, "train_speed(iter/s)": 0.233132 }, { "epoch": 3.3097034834972177, "grad_norm": 1.8972312211990356, "learning_rate": 2.5645628873759616e-05, "loss": 0.06132686138153076, "memory(GiB)": 122.96, "step": 43420, "token_acc": 0.9769553072625698, "train_speed(iter/s)": 0.233135 }, { "epoch": 3.3100846101074777, "grad_norm": 0.8500514030456543, "learning_rate": 2.5635172524933038e-05, "loss": 0.0614313006401062, "memory(GiB)": 122.96, "step": 43425, "token_acc": 0.9772603633989773, "train_speed(iter/s)": 0.233135 }, { "epoch": 3.3104657367177377, "grad_norm": 1.096959114074707, "learning_rate": 2.5624717573316258e-05, "loss": 0.06796914935112, "memory(GiB)": 122.96, "step": 43430, "token_acc": 0.9776013031969049, "train_speed(iter/s)": 0.23314 }, { "epoch": 3.3108468633279977, "grad_norm": 1.077849268913269, "learning_rate": 2.5614264019508803e-05, "loss": 0.0526293158531189, "memory(GiB)": 122.96, "step": 43435, "token_acc": 0.9756592292089249, "train_speed(iter/s)": 0.233144 }, { "epoch": 3.3112279899382573, "grad_norm": 1.8198542594909668, "learning_rate": 2.5603811864110138e-05, "loss": 0.06116398572921753, "memory(GiB)": 122.96, "step": 43440, "token_acc": 0.9727534713125491, "train_speed(iter/s)": 0.233151 }, { "epoch": 3.3116091165485173, "grad_norm": 0.9005258679389954, "learning_rate": 2.559336110771967e-05, "loss": 0.06380571126937866, "memory(GiB)": 122.96, "step": 43445, "token_acc": 0.9816411682892907, "train_speed(iter/s)": 0.23316 }, { "epoch": 3.3119902431587773, "grad_norm": 0.8317550420761108, "learning_rate": 2.5582911750936665e-05, "loss": 0.05601559281349182, "memory(GiB)": 122.96, "step": 43450, "token_acc": 0.981694560669456, "train_speed(iter/s)": 0.23317 }, { "epoch": 3.3123713697690373, "grad_norm": 1.4843950271606445, "learning_rate": 2.5572463794360358e-05, "loss": 0.05815284848213196, "memory(GiB)": 122.96, "step": 43455, "token_acc": 0.9714015151515152, "train_speed(iter/s)": 0.233176 }, { "epoch": 3.3127524963792974, "grad_norm": 0.469039648771286, "learning_rate": 2.556201723858992e-05, "loss": 0.056556212902069095, "memory(GiB)": 122.96, "step": 43460, "token_acc": 0.9816058861164427, "train_speed(iter/s)": 0.23318 }, { "epoch": 3.313133622989557, "grad_norm": 0.8462313413619995, "learning_rate": 2.5551572084224363e-05, "loss": 0.08724916577339173, "memory(GiB)": 122.96, "step": 43465, "token_acc": 0.9674442896935933, "train_speed(iter/s)": 0.233184 }, { "epoch": 3.313514749599817, "grad_norm": 2.011833906173706, "learning_rate": 2.554112833186269e-05, "loss": 0.06701802611351013, "memory(GiB)": 122.96, "step": 43470, "token_acc": 0.9760826236637072, "train_speed(iter/s)": 0.23319 }, { "epoch": 3.313895876210077, "grad_norm": 1.0637280941009521, "learning_rate": 2.553068598210383e-05, "loss": 0.07009115815162659, "memory(GiB)": 122.96, "step": 43475, "token_acc": 0.9732163472639114, "train_speed(iter/s)": 0.233196 }, { "epoch": 3.314277002820337, "grad_norm": 1.1840953826904297, "learning_rate": 2.5520245035546554e-05, "loss": 0.06631782650947571, "memory(GiB)": 122.96, "step": 43480, "token_acc": 0.9696428571428571, "train_speed(iter/s)": 0.233204 }, { "epoch": 3.314658129430597, "grad_norm": 1.1116892099380493, "learning_rate": 2.550980549278964e-05, "loss": 0.056482553482055664, "memory(GiB)": 122.96, "step": 43485, "token_acc": 0.9774514167037531, "train_speed(iter/s)": 0.233207 }, { "epoch": 3.3150392560408566, "grad_norm": 0.6869076490402222, "learning_rate": 2.5499367354431702e-05, "loss": 0.051216882467269895, "memory(GiB)": 122.96, "step": 43490, "token_acc": 0.9792288894922617, "train_speed(iter/s)": 0.23321 }, { "epoch": 3.3154203826511166, "grad_norm": 0.8409478068351746, "learning_rate": 2.5488930621071365e-05, "loss": 0.044657614827156064, "memory(GiB)": 122.96, "step": 43495, "token_acc": 0.9763084212793453, "train_speed(iter/s)": 0.233217 }, { "epoch": 3.3158015092613766, "grad_norm": 1.3125985860824585, "learning_rate": 2.547849529330713e-05, "loss": 0.08092774152755737, "memory(GiB)": 122.96, "step": 43500, "token_acc": 0.973302822273074, "train_speed(iter/s)": 0.233225 }, { "epoch": 3.3161826358716366, "grad_norm": 0.6294171810150146, "learning_rate": 2.5468061371737384e-05, "loss": 0.05103256106376648, "memory(GiB)": 122.96, "step": 43505, "token_acc": 0.9829492224095934, "train_speed(iter/s)": 0.233227 }, { "epoch": 3.3165637624818967, "grad_norm": 1.7856898307800293, "learning_rate": 2.545762885696047e-05, "loss": 0.053104615211486815, "memory(GiB)": 122.96, "step": 43510, "token_acc": 0.9780192640158064, "train_speed(iter/s)": 0.233234 }, { "epoch": 3.3169448890921562, "grad_norm": 0.7250064611434937, "learning_rate": 2.544719774957467e-05, "loss": 0.046107977628707886, "memory(GiB)": 122.96, "step": 43515, "token_acc": 0.981372671583948, "train_speed(iter/s)": 0.233238 }, { "epoch": 3.3173260157024163, "grad_norm": 0.6059370636940002, "learning_rate": 2.5436768050178116e-05, "loss": 0.0446925550699234, "memory(GiB)": 122.96, "step": 43520, "token_acc": 0.9754283866795991, "train_speed(iter/s)": 0.233245 }, { "epoch": 3.3177071423126763, "grad_norm": 1.5341695547103882, "learning_rate": 2.5426339759368955e-05, "loss": 0.06653455495834351, "memory(GiB)": 122.96, "step": 43525, "token_acc": 0.9708945260347129, "train_speed(iter/s)": 0.233251 }, { "epoch": 3.3180882689229363, "grad_norm": 0.8813326358795166, "learning_rate": 2.541591287774515e-05, "loss": 0.07075355052947999, "memory(GiB)": 122.96, "step": 43530, "token_acc": 0.9738175675675675, "train_speed(iter/s)": 0.233254 }, { "epoch": 3.3184693955331963, "grad_norm": 1.3796311616897583, "learning_rate": 2.540548740590466e-05, "loss": 0.05609139204025269, "memory(GiB)": 122.96, "step": 43535, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.233257 }, { "epoch": 3.318850522143456, "grad_norm": 1.9510160684585571, "learning_rate": 2.539506334444535e-05, "loss": 0.06280604004859924, "memory(GiB)": 122.96, "step": 43540, "token_acc": 0.9773599386032233, "train_speed(iter/s)": 0.233263 }, { "epoch": 3.319231648753716, "grad_norm": 0.8234132528305054, "learning_rate": 2.5384640693964963e-05, "loss": 0.03851305544376373, "memory(GiB)": 122.96, "step": 43545, "token_acc": 0.9832344596337375, "train_speed(iter/s)": 0.233271 }, { "epoch": 3.319612775363976, "grad_norm": 0.7227919101715088, "learning_rate": 2.5374219455061197e-05, "loss": 0.037763357162475586, "memory(GiB)": 122.96, "step": 43550, "token_acc": 0.98430468664252, "train_speed(iter/s)": 0.23327 }, { "epoch": 3.319993901974236, "grad_norm": 1.1076167821884155, "learning_rate": 2.5363799628331693e-05, "loss": 0.06556554436683655, "memory(GiB)": 122.96, "step": 43555, "token_acc": 0.9763132452465356, "train_speed(iter/s)": 0.233272 }, { "epoch": 3.320375028584496, "grad_norm": 2.2598764896392822, "learning_rate": 2.5353381214373927e-05, "loss": 0.09332822561264038, "memory(GiB)": 122.96, "step": 43560, "token_acc": 0.9721025641025641, "train_speed(iter/s)": 0.233279 }, { "epoch": 3.3207561551947555, "grad_norm": 0.7781385779380798, "learning_rate": 2.534296421378538e-05, "loss": 0.06744717955589294, "memory(GiB)": 122.96, "step": 43565, "token_acc": 0.976939856653163, "train_speed(iter/s)": 0.233286 }, { "epoch": 3.3211372818050156, "grad_norm": 3.371396541595459, "learning_rate": 2.533254862716343e-05, "loss": 0.054049670696258545, "memory(GiB)": 122.96, "step": 43570, "token_acc": 0.9658081705150977, "train_speed(iter/s)": 0.233295 }, { "epoch": 3.3215184084152756, "grad_norm": 1.1022162437438965, "learning_rate": 2.532213445510533e-05, "loss": 0.08739669919013977, "memory(GiB)": 122.96, "step": 43575, "token_acc": 0.9639123942259831, "train_speed(iter/s)": 0.233302 }, { "epoch": 3.3218995350255356, "grad_norm": 1.2815632820129395, "learning_rate": 2.531172169820829e-05, "loss": 0.054278111457824706, "memory(GiB)": 122.96, "step": 43580, "token_acc": 0.9728958630527818, "train_speed(iter/s)": 0.233307 }, { "epoch": 3.322280661635795, "grad_norm": 0.8798713684082031, "learning_rate": 2.5301310357069475e-05, "loss": 0.05037150979042053, "memory(GiB)": 122.96, "step": 43585, "token_acc": 0.9795418527589684, "train_speed(iter/s)": 0.233309 }, { "epoch": 3.322661788246055, "grad_norm": 1.9138996601104736, "learning_rate": 2.529090043228587e-05, "loss": 0.044535106420516966, "memory(GiB)": 122.96, "step": 43590, "token_acc": 0.9793137016305671, "train_speed(iter/s)": 0.233314 }, { "epoch": 3.323042914856315, "grad_norm": 1.3547956943511963, "learning_rate": 2.5280491924454457e-05, "loss": 0.06682538986206055, "memory(GiB)": 122.96, "step": 43595, "token_acc": 0.9700292397660819, "train_speed(iter/s)": 0.233322 }, { "epoch": 3.323424041466575, "grad_norm": 1.511224389076233, "learning_rate": 2.527008483417214e-05, "loss": 0.050530529022216795, "memory(GiB)": 122.96, "step": 43600, "token_acc": 0.9805208553885243, "train_speed(iter/s)": 0.233327 }, { "epoch": 3.323424041466575, "eval_loss": 0.06466751545667648, "eval_runtime": 219.6808, "eval_samples_per_second": 2.413, "eval_steps_per_second": 2.413, "eval_token_acc": 0.9741883019095235, "step": 43600 }, { "epoch": 3.3238051680768352, "grad_norm": 0.644939124584198, "learning_rate": 2.5259679162035682e-05, "loss": 0.05153728723526001, "memory(GiB)": 122.96, "step": 43605, "token_acc": 0.9745424444584745, "train_speed(iter/s)": 0.233053 }, { "epoch": 3.3241862946870953, "grad_norm": 1.3360873460769653, "learning_rate": 2.5249274908641812e-05, "loss": 0.0776296854019165, "memory(GiB)": 122.96, "step": 43610, "token_acc": 0.9636752136752137, "train_speed(iter/s)": 0.23306 }, { "epoch": 3.324567421297355, "grad_norm": 1.2155892848968506, "learning_rate": 2.523887207458719e-05, "loss": 0.06167711615562439, "memory(GiB)": 122.96, "step": 43615, "token_acc": 0.9751322751322752, "train_speed(iter/s)": 0.233067 }, { "epoch": 3.324948547907615, "grad_norm": 1.2589771747589111, "learning_rate": 2.5228470660468305e-05, "loss": 0.0626868486404419, "memory(GiB)": 122.96, "step": 43620, "token_acc": 0.9670972459176213, "train_speed(iter/s)": 0.233075 }, { "epoch": 3.325329674517875, "grad_norm": 0.4537317156791687, "learning_rate": 2.521807066688172e-05, "loss": 0.041295981407165526, "memory(GiB)": 122.96, "step": 43625, "token_acc": 0.9863325740318907, "train_speed(iter/s)": 0.233081 }, { "epoch": 3.325710801128135, "grad_norm": 1.1126576662063599, "learning_rate": 2.5207672094423756e-05, "loss": 0.09216393232345581, "memory(GiB)": 122.96, "step": 43630, "token_acc": 0.9686690833764888, "train_speed(iter/s)": 0.233089 }, { "epoch": 3.3260919277383945, "grad_norm": 0.6229046583175659, "learning_rate": 2.5197274943690752e-05, "loss": 0.0681134045124054, "memory(GiB)": 122.96, "step": 43635, "token_acc": 0.9817895683453237, "train_speed(iter/s)": 0.233096 }, { "epoch": 3.3264730543486545, "grad_norm": 0.3182755410671234, "learning_rate": 2.518687921527894e-05, "loss": 0.03471006155014038, "memory(GiB)": 122.96, "step": 43640, "token_acc": 0.9793250950570342, "train_speed(iter/s)": 0.233103 }, { "epoch": 3.3268541809589145, "grad_norm": 1.8050720691680908, "learning_rate": 2.517648490978445e-05, "loss": 0.06595907807350158, "memory(GiB)": 122.96, "step": 43645, "token_acc": 0.9788359788359788, "train_speed(iter/s)": 0.233111 }, { "epoch": 3.3272353075691745, "grad_norm": 1.124678611755371, "learning_rate": 2.5166092027803346e-05, "loss": 0.0583000898361206, "memory(GiB)": 122.96, "step": 43650, "token_acc": 0.9747957992998834, "train_speed(iter/s)": 0.233114 }, { "epoch": 3.3276164341794345, "grad_norm": 1.6982529163360596, "learning_rate": 2.5155700569931645e-05, "loss": 0.06635286808013915, "memory(GiB)": 122.96, "step": 43655, "token_acc": 0.9694753577106519, "train_speed(iter/s)": 0.233123 }, { "epoch": 3.3279975607896946, "grad_norm": 1.554481863975525, "learning_rate": 2.514531053676521e-05, "loss": 0.04445186853408813, "memory(GiB)": 122.96, "step": 43660, "token_acc": 0.9820333041191937, "train_speed(iter/s)": 0.233132 }, { "epoch": 3.328378687399954, "grad_norm": 1.8577653169631958, "learning_rate": 2.5134921928899867e-05, "loss": 0.06762505769729614, "memory(GiB)": 122.96, "step": 43665, "token_acc": 0.9712844392704696, "train_speed(iter/s)": 0.23314 }, { "epoch": 3.328759814010214, "grad_norm": 0.5590760111808777, "learning_rate": 2.5124534746931382e-05, "loss": 0.06840322613716125, "memory(GiB)": 122.96, "step": 43670, "token_acc": 0.9751098096632503, "train_speed(iter/s)": 0.233146 }, { "epoch": 3.329140940620474, "grad_norm": 0.6067086458206177, "learning_rate": 2.5114148991455384e-05, "loss": 0.051205897331237794, "memory(GiB)": 122.96, "step": 43675, "token_acc": 0.97265625, "train_speed(iter/s)": 0.233155 }, { "epoch": 3.329522067230734, "grad_norm": 1.2094911336898804, "learning_rate": 2.5103764663067454e-05, "loss": 0.06961935758590698, "memory(GiB)": 122.96, "step": 43680, "token_acc": 0.9749689054726368, "train_speed(iter/s)": 0.233161 }, { "epoch": 3.3299031938409938, "grad_norm": 0.9792380928993225, "learning_rate": 2.50933817623631e-05, "loss": 0.05126116871833801, "memory(GiB)": 122.96, "step": 43685, "token_acc": 0.9824538258575198, "train_speed(iter/s)": 0.233166 }, { "epoch": 3.3302843204512538, "grad_norm": 0.6729854941368103, "learning_rate": 2.5083000289937708e-05, "loss": 0.06261124610900878, "memory(GiB)": 122.96, "step": 43690, "token_acc": 0.9779156327543425, "train_speed(iter/s)": 0.233167 }, { "epoch": 3.330665447061514, "grad_norm": 0.6824474930763245, "learning_rate": 2.5072620246386637e-05, "loss": 0.05964975953102112, "memory(GiB)": 122.96, "step": 43695, "token_acc": 0.9791271347248577, "train_speed(iter/s)": 0.233175 }, { "epoch": 3.331046573671774, "grad_norm": 1.7649608850479126, "learning_rate": 2.5062241632305095e-05, "loss": 0.07836387157440186, "memory(GiB)": 122.96, "step": 43700, "token_acc": 0.9704772475027746, "train_speed(iter/s)": 0.23318 }, { "epoch": 3.331427700282034, "grad_norm": 1.373166561126709, "learning_rate": 2.5051864448288275e-05, "loss": 0.049796459078788755, "memory(GiB)": 122.96, "step": 43705, "token_acc": 0.9822603719599428, "train_speed(iter/s)": 0.233183 }, { "epoch": 3.331808826892294, "grad_norm": 0.14401856064796448, "learning_rate": 2.5041488694931276e-05, "loss": 0.035978260636329654, "memory(GiB)": 122.96, "step": 43710, "token_acc": 0.9845840605002909, "train_speed(iter/s)": 0.23319 }, { "epoch": 3.3321899535025534, "grad_norm": 1.4736273288726807, "learning_rate": 2.5031114372829056e-05, "loss": 0.06637260913848878, "memory(GiB)": 122.96, "step": 43715, "token_acc": 0.9768734712030243, "train_speed(iter/s)": 0.233195 }, { "epoch": 3.3325710801128134, "grad_norm": 0.6535669565200806, "learning_rate": 2.502074148257656e-05, "loss": 0.06989907622337341, "memory(GiB)": 122.96, "step": 43720, "token_acc": 0.9763560500695411, "train_speed(iter/s)": 0.233204 }, { "epoch": 3.3329522067230735, "grad_norm": 0.6371797919273376, "learning_rate": 2.5010370024768637e-05, "loss": 0.04076791107654572, "memory(GiB)": 122.96, "step": 43725, "token_acc": 0.9864111498257839, "train_speed(iter/s)": 0.233208 }, { "epoch": 3.3333333333333335, "grad_norm": 1.5498253107070923, "learning_rate": 2.500000000000001e-05, "loss": 0.06601820588111877, "memory(GiB)": 122.96, "step": 43730, "token_acc": 0.9746660525103639, "train_speed(iter/s)": 0.233217 }, { "epoch": 3.333714459943593, "grad_norm": 0.5761213898658752, "learning_rate": 2.4989631408865372e-05, "loss": 0.040820419788360596, "memory(GiB)": 122.96, "step": 43735, "token_acc": 0.9806691449814127, "train_speed(iter/s)": 0.233221 }, { "epoch": 3.334095586553853, "grad_norm": 1.5061615705490112, "learning_rate": 2.4979264251959323e-05, "loss": 0.04720512926578522, "memory(GiB)": 122.96, "step": 43740, "token_acc": 0.9809791082008107, "train_speed(iter/s)": 0.233228 }, { "epoch": 3.334476713164113, "grad_norm": 0.8082138299942017, "learning_rate": 2.4968898529876343e-05, "loss": 0.049436911940574646, "memory(GiB)": 122.96, "step": 43745, "token_acc": 0.983424336973479, "train_speed(iter/s)": 0.233233 }, { "epoch": 3.334857839774373, "grad_norm": 1.275333046913147, "learning_rate": 2.4958534243210875e-05, "loss": 0.0780949592590332, "memory(GiB)": 122.96, "step": 43750, "token_acc": 0.9708881883923605, "train_speed(iter/s)": 0.233238 }, { "epoch": 3.335238966384633, "grad_norm": 1.8342673778533936, "learning_rate": 2.4948171392557264e-05, "loss": 0.08842989206314086, "memory(GiB)": 122.96, "step": 43755, "token_acc": 0.9711286089238845, "train_speed(iter/s)": 0.233245 }, { "epoch": 3.335620092994893, "grad_norm": 0.925440788269043, "learning_rate": 2.4937809978509762e-05, "loss": 0.060745644569396975, "memory(GiB)": 122.96, "step": 43760, "token_acc": 0.9733475479744137, "train_speed(iter/s)": 0.233251 }, { "epoch": 3.3360012196051527, "grad_norm": 0.919176459312439, "learning_rate": 2.4927450001662578e-05, "loss": 0.06560834646224975, "memory(GiB)": 122.96, "step": 43765, "token_acc": 0.9685752330226365, "train_speed(iter/s)": 0.233252 }, { "epoch": 3.3363823462154127, "grad_norm": 0.5483986735343933, "learning_rate": 2.4917091462609766e-05, "loss": 0.061206763982772826, "memory(GiB)": 122.96, "step": 43770, "token_acc": 0.9792511328404484, "train_speed(iter/s)": 0.233258 }, { "epoch": 3.3367634728256728, "grad_norm": 1.5045193433761597, "learning_rate": 2.490673436194536e-05, "loss": 0.04695393443107605, "memory(GiB)": 122.96, "step": 43775, "token_acc": 0.9817733990147783, "train_speed(iter/s)": 0.233263 }, { "epoch": 3.337144599435933, "grad_norm": 1.1470825672149658, "learning_rate": 2.489637870026331e-05, "loss": 0.0520973801612854, "memory(GiB)": 122.96, "step": 43780, "token_acc": 0.9780181437543615, "train_speed(iter/s)": 0.23327 }, { "epoch": 3.3375257260461924, "grad_norm": 1.266113042831421, "learning_rate": 2.4886024478157428e-05, "loss": 0.08785345554351806, "memory(GiB)": 122.96, "step": 43785, "token_acc": 0.9741460357254779, "train_speed(iter/s)": 0.233274 }, { "epoch": 3.3379068526564524, "grad_norm": 2.385178804397583, "learning_rate": 2.4875671696221496e-05, "loss": 0.05536339282989502, "memory(GiB)": 122.96, "step": 43790, "token_acc": 0.9774627408215194, "train_speed(iter/s)": 0.233278 }, { "epoch": 3.3382879792667124, "grad_norm": 1.0440956354141235, "learning_rate": 2.4865320355049227e-05, "loss": 0.05363719463348389, "memory(GiB)": 122.96, "step": 43795, "token_acc": 0.9811912225705329, "train_speed(iter/s)": 0.23328 }, { "epoch": 3.3386691058769724, "grad_norm": 0.24311289191246033, "learning_rate": 2.485497045523417e-05, "loss": 0.04497693777084351, "memory(GiB)": 122.96, "step": 43800, "token_acc": 0.9807174887892377, "train_speed(iter/s)": 0.233289 }, { "epoch": 3.3386691058769724, "eval_loss": 0.06455516070127487, "eval_runtime": 219.0885, "eval_samples_per_second": 2.419, "eval_steps_per_second": 2.419, "eval_token_acc": 0.9738419372326969, "step": 43800 }, { "epoch": 3.3390502324872324, "grad_norm": 1.2025376558303833, "learning_rate": 2.4844621997369877e-05, "loss": 0.08300713896751404, "memory(GiB)": 122.96, "step": 43805, "token_acc": 0.9737081613187422, "train_speed(iter/s)": 0.233022 }, { "epoch": 3.339431359097492, "grad_norm": 0.6402226686477661, "learning_rate": 2.483427498204979e-05, "loss": 0.04550257921218872, "memory(GiB)": 122.96, "step": 43810, "token_acc": 0.9825441850316387, "train_speed(iter/s)": 0.233028 }, { "epoch": 3.339812485707752, "grad_norm": 0.9381176233291626, "learning_rate": 2.4823929409867236e-05, "loss": 0.05181613564491272, "memory(GiB)": 122.96, "step": 43815, "token_acc": 0.9828945411274729, "train_speed(iter/s)": 0.233031 }, { "epoch": 3.340193612318012, "grad_norm": 1.7082713842391968, "learning_rate": 2.4813585281415497e-05, "loss": 0.08114227056503295, "memory(GiB)": 122.96, "step": 43820, "token_acc": 0.9712115860120099, "train_speed(iter/s)": 0.233035 }, { "epoch": 3.340574738928272, "grad_norm": 1.5848287343978882, "learning_rate": 2.4803242597287778e-05, "loss": 0.05699493288993836, "memory(GiB)": 122.96, "step": 43825, "token_acc": 0.9744801512287334, "train_speed(iter/s)": 0.233043 }, { "epoch": 3.340955865538532, "grad_norm": 1.319579839706421, "learning_rate": 2.4792901358077158e-05, "loss": 0.07841584086418152, "memory(GiB)": 122.96, "step": 43830, "token_acc": 0.9766601833842734, "train_speed(iter/s)": 0.23305 }, { "epoch": 3.3413369921487917, "grad_norm": 0.6124122142791748, "learning_rate": 2.4782561564376666e-05, "loss": 0.032730591297149655, "memory(GiB)": 122.96, "step": 43835, "token_acc": 0.9812738853503185, "train_speed(iter/s)": 0.233052 }, { "epoch": 3.3417181187590517, "grad_norm": 0.9673547148704529, "learning_rate": 2.477222321677926e-05, "loss": 0.06582072973251343, "memory(GiB)": 122.96, "step": 43840, "token_acc": 0.970780993992354, "train_speed(iter/s)": 0.233059 }, { "epoch": 3.3420992453693117, "grad_norm": 1.5894330739974976, "learning_rate": 2.4761886315877762e-05, "loss": 0.04536490738391876, "memory(GiB)": 122.96, "step": 43845, "token_acc": 0.9828478083310646, "train_speed(iter/s)": 0.233067 }, { "epoch": 3.3424803719795717, "grad_norm": 0.10198890417814255, "learning_rate": 2.4751550862264973e-05, "loss": 0.06523984670639038, "memory(GiB)": 122.96, "step": 43850, "token_acc": 0.9682937956204379, "train_speed(iter/s)": 0.233074 }, { "epoch": 3.3428614985898317, "grad_norm": 1.8042820692062378, "learning_rate": 2.4741216856533587e-05, "loss": 0.058815276622772215, "memory(GiB)": 122.96, "step": 43855, "token_acc": 0.9795617361989043, "train_speed(iter/s)": 0.233078 }, { "epoch": 3.3432426252000913, "grad_norm": 1.149338722229004, "learning_rate": 2.4730884299276185e-05, "loss": 0.06321935653686524, "memory(GiB)": 122.96, "step": 43860, "token_acc": 0.9771487542385375, "train_speed(iter/s)": 0.233081 }, { "epoch": 3.3436237518103513, "grad_norm": 0.9939195513725281, "learning_rate": 2.47205531910853e-05, "loss": 0.05889768600463867, "memory(GiB)": 122.96, "step": 43865, "token_acc": 0.9737724916132967, "train_speed(iter/s)": 0.233088 }, { "epoch": 3.3440048784206113, "grad_norm": 4.7021870613098145, "learning_rate": 2.47102235325534e-05, "loss": 0.12778213024139404, "memory(GiB)": 122.96, "step": 43870, "token_acc": 0.9476942123181678, "train_speed(iter/s)": 0.233095 }, { "epoch": 3.3443860050308714, "grad_norm": 1.4087525606155396, "learning_rate": 2.4699895324272805e-05, "loss": 0.06970478296279907, "memory(GiB)": 122.96, "step": 43875, "token_acc": 0.9732685297691372, "train_speed(iter/s)": 0.233105 }, { "epoch": 3.3447671316411314, "grad_norm": 0.9756368398666382, "learning_rate": 2.4689568566835825e-05, "loss": 0.051277446746826175, "memory(GiB)": 122.96, "step": 43880, "token_acc": 0.9765912677538138, "train_speed(iter/s)": 0.233113 }, { "epoch": 3.345148258251391, "grad_norm": 0.9426102638244629, "learning_rate": 2.467924326083461e-05, "loss": 0.07531914710998536, "memory(GiB)": 122.96, "step": 43885, "token_acc": 0.9674067076051016, "train_speed(iter/s)": 0.233115 }, { "epoch": 3.345529384861651, "grad_norm": 0.6925322413444519, "learning_rate": 2.4668919406861274e-05, "loss": 0.06596051454544068, "memory(GiB)": 122.96, "step": 43890, "token_acc": 0.9757437684266953, "train_speed(iter/s)": 0.23312 }, { "epoch": 3.345910511471911, "grad_norm": 1.156816005706787, "learning_rate": 2.46585970055079e-05, "loss": 0.055578690767288205, "memory(GiB)": 122.96, "step": 43895, "token_acc": 0.9760694968038026, "train_speed(iter/s)": 0.233126 }, { "epoch": 3.346291638082171, "grad_norm": 1.7518346309661865, "learning_rate": 2.4648276057366364e-05, "loss": 0.05713763236999512, "memory(GiB)": 122.96, "step": 43900, "token_acc": 0.981371231035145, "train_speed(iter/s)": 0.233132 }, { "epoch": 3.346672764692431, "grad_norm": 1.5152511596679688, "learning_rate": 2.4637956563028553e-05, "loss": 0.05599833726882934, "memory(GiB)": 122.96, "step": 43905, "token_acc": 0.9818529130850048, "train_speed(iter/s)": 0.233139 }, { "epoch": 3.3470538913026906, "grad_norm": 0.5966803431510925, "learning_rate": 2.462763852308626e-05, "loss": 0.06549359560012817, "memory(GiB)": 122.96, "step": 43910, "token_acc": 0.9740717029449424, "train_speed(iter/s)": 0.233147 }, { "epoch": 3.3474350179129506, "grad_norm": 2.56820011138916, "learning_rate": 2.4617321938131137e-05, "loss": 0.05900737047195435, "memory(GiB)": 122.96, "step": 43915, "token_acc": 0.9774281805745554, "train_speed(iter/s)": 0.233151 }, { "epoch": 3.3478161445232106, "grad_norm": 0.8917888402938843, "learning_rate": 2.460700680875483e-05, "loss": 0.054902517795562746, "memory(GiB)": 122.96, "step": 43920, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.233157 }, { "epoch": 3.3481972711334707, "grad_norm": 1.3632572889328003, "learning_rate": 2.459669313554882e-05, "loss": 0.05599033236503601, "memory(GiB)": 122.96, "step": 43925, "token_acc": 0.9743497757847533, "train_speed(iter/s)": 0.233163 }, { "epoch": 3.3485783977437302, "grad_norm": 1.5159212350845337, "learning_rate": 2.458638091910458e-05, "loss": 0.04724599719047547, "memory(GiB)": 122.96, "step": 43930, "token_acc": 0.9778704297626684, "train_speed(iter/s)": 0.23317 }, { "epoch": 3.3489595243539902, "grad_norm": 1.1915700435638428, "learning_rate": 2.4576070160013477e-05, "loss": 0.0532284140586853, "memory(GiB)": 122.96, "step": 43935, "token_acc": 0.9757623143080532, "train_speed(iter/s)": 0.233175 }, { "epoch": 3.3493406509642503, "grad_norm": 2.301565170288086, "learning_rate": 2.4565760858866745e-05, "loss": 0.08379313945770264, "memory(GiB)": 122.96, "step": 43940, "token_acc": 0.9747245625405055, "train_speed(iter/s)": 0.23318 }, { "epoch": 3.3497217775745103, "grad_norm": 1.1230695247650146, "learning_rate": 2.4555453016255603e-05, "loss": 0.08312181830406189, "memory(GiB)": 122.96, "step": 43945, "token_acc": 0.9655963302752294, "train_speed(iter/s)": 0.233181 }, { "epoch": 3.3501029041847703, "grad_norm": 0.053981512784957886, "learning_rate": 2.454514663277117e-05, "loss": 0.03673867881298065, "memory(GiB)": 122.96, "step": 43950, "token_acc": 0.9789951268694337, "train_speed(iter/s)": 0.233187 }, { "epoch": 3.3504840307950303, "grad_norm": 1.3009376525878906, "learning_rate": 2.4534841709004436e-05, "loss": 0.040092259645462036, "memory(GiB)": 122.96, "step": 43955, "token_acc": 0.9821098087600246, "train_speed(iter/s)": 0.233194 }, { "epoch": 3.35086515740529, "grad_norm": 0.7206600904464722, "learning_rate": 2.4524538245546357e-05, "loss": 0.04388972222805023, "memory(GiB)": 122.96, "step": 43960, "token_acc": 0.9778051787916153, "train_speed(iter/s)": 0.233198 }, { "epoch": 3.35124628401555, "grad_norm": 1.75179922580719, "learning_rate": 2.4514236242987808e-05, "loss": 0.061567765474319455, "memory(GiB)": 122.96, "step": 43965, "token_acc": 0.9660112359550562, "train_speed(iter/s)": 0.233206 }, { "epoch": 3.35162741062581, "grad_norm": 0.9129390716552734, "learning_rate": 2.4503935701919524e-05, "loss": 0.040321967005729674, "memory(GiB)": 122.96, "step": 43970, "token_acc": 0.9861224489795918, "train_speed(iter/s)": 0.233208 }, { "epoch": 3.35200853723607, "grad_norm": 0.6605785489082336, "learning_rate": 2.4493636622932208e-05, "loss": 0.059191399812698366, "memory(GiB)": 122.96, "step": 43975, "token_acc": 0.9771829749890303, "train_speed(iter/s)": 0.233217 }, { "epoch": 3.3523896638463295, "grad_norm": 0.7959014177322388, "learning_rate": 2.448333900661649e-05, "loss": 0.06732171177864074, "memory(GiB)": 122.96, "step": 43980, "token_acc": 0.9726520331054336, "train_speed(iter/s)": 0.233223 }, { "epoch": 3.3527707904565895, "grad_norm": 0.7398163080215454, "learning_rate": 2.447304285356285e-05, "loss": 0.06435588002204895, "memory(GiB)": 122.96, "step": 43985, "token_acc": 0.975882558545963, "train_speed(iter/s)": 0.233231 }, { "epoch": 3.3531519170668496, "grad_norm": 0.6480658650398254, "learning_rate": 2.4462748164361743e-05, "loss": 0.0635388195514679, "memory(GiB)": 122.96, "step": 43990, "token_acc": 0.9767640526225867, "train_speed(iter/s)": 0.233236 }, { "epoch": 3.3535330436771096, "grad_norm": 1.9184963703155518, "learning_rate": 2.4452454939603536e-05, "loss": 0.05610812902450561, "memory(GiB)": 122.96, "step": 43995, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.233246 }, { "epoch": 3.3539141702873696, "grad_norm": 0.8690956234931946, "learning_rate": 2.4442163179878468e-05, "loss": 0.049541366100311277, "memory(GiB)": 122.96, "step": 44000, "token_acc": 0.9808856926570779, "train_speed(iter/s)": 0.233251 }, { "epoch": 3.3539141702873696, "eval_loss": 0.0641208291053772, "eval_runtime": 219.9004, "eval_samples_per_second": 2.41, "eval_steps_per_second": 2.41, "eval_token_acc": 0.9739247635684597, "step": 44000 }, { "epoch": 3.3542952968976296, "grad_norm": 2.7612063884735107, "learning_rate": 2.4431872885776735e-05, "loss": 0.06798668503761292, "memory(GiB)": 122.96, "step": 44005, "token_acc": 0.9738963052214736, "train_speed(iter/s)": 0.232986 }, { "epoch": 3.354676423507889, "grad_norm": 0.7698947191238403, "learning_rate": 2.4421584057888464e-05, "loss": 0.06509630680084229, "memory(GiB)": 122.96, "step": 44010, "token_acc": 0.9722135007849294, "train_speed(iter/s)": 0.23299 }, { "epoch": 3.355057550118149, "grad_norm": 0.890510082244873, "learning_rate": 2.441129669680363e-05, "loss": 0.05350332260131836, "memory(GiB)": 122.96, "step": 44015, "token_acc": 0.9818941504178273, "train_speed(iter/s)": 0.232992 }, { "epoch": 3.3554386767284092, "grad_norm": 1.143143892288208, "learning_rate": 2.4401010803112185e-05, "loss": 0.057432925701141356, "memory(GiB)": 122.96, "step": 44020, "token_acc": 0.9717532467532467, "train_speed(iter/s)": 0.233 }, { "epoch": 3.3558198033386692, "grad_norm": 1.5885876417160034, "learning_rate": 2.439072637740397e-05, "loss": 0.08503876924514771, "memory(GiB)": 122.96, "step": 44025, "token_acc": 0.9662146632957941, "train_speed(iter/s)": 0.233007 }, { "epoch": 3.356200929948929, "grad_norm": 1.3217629194259644, "learning_rate": 2.4380443420268762e-05, "loss": 0.03536791205406189, "memory(GiB)": 122.96, "step": 44030, "token_acc": 0.9831291888144211, "train_speed(iter/s)": 0.233013 }, { "epoch": 3.356582056559189, "grad_norm": 1.4449936151504517, "learning_rate": 2.4370161932296255e-05, "loss": 0.07088247537612916, "memory(GiB)": 122.96, "step": 44035, "token_acc": 0.9761339662447257, "train_speed(iter/s)": 0.233016 }, { "epoch": 3.356963183169449, "grad_norm": 1.4985376596450806, "learning_rate": 2.4359881914076e-05, "loss": 0.0628083348274231, "memory(GiB)": 122.96, "step": 44040, "token_acc": 0.969220475075276, "train_speed(iter/s)": 0.233023 }, { "epoch": 3.357344309779709, "grad_norm": 0.505158007144928, "learning_rate": 2.4349603366197533e-05, "loss": 0.055997252464294434, "memory(GiB)": 122.96, "step": 44045, "token_acc": 0.9791870372957117, "train_speed(iter/s)": 0.233026 }, { "epoch": 3.357725436389969, "grad_norm": 0.823479175567627, "learning_rate": 2.4339326289250303e-05, "loss": 0.04296434819698334, "memory(GiB)": 122.96, "step": 44050, "token_acc": 0.9802990325417766, "train_speed(iter/s)": 0.233031 }, { "epoch": 3.358106563000229, "grad_norm": 0.8842217922210693, "learning_rate": 2.4329050683823607e-05, "loss": 0.07562910318374634, "memory(GiB)": 122.96, "step": 44055, "token_acc": 0.9578721885041057, "train_speed(iter/s)": 0.233039 }, { "epoch": 3.3584876896104885, "grad_norm": 0.40543967485427856, "learning_rate": 2.431877655050673e-05, "loss": 0.049786347150802615, "memory(GiB)": 122.96, "step": 44060, "token_acc": 0.9859555555555556, "train_speed(iter/s)": 0.233044 }, { "epoch": 3.3588688162207485, "grad_norm": 1.4748872518539429, "learning_rate": 2.430850388988886e-05, "loss": 0.07160642743110657, "memory(GiB)": 122.96, "step": 44065, "token_acc": 0.9768583450210379, "train_speed(iter/s)": 0.233047 }, { "epoch": 3.3592499428310085, "grad_norm": 1.589954137802124, "learning_rate": 2.429823270255905e-05, "loss": 0.05800718665122986, "memory(GiB)": 122.96, "step": 44070, "token_acc": 0.9756055953599454, "train_speed(iter/s)": 0.233052 }, { "epoch": 3.3596310694412685, "grad_norm": 1.0199363231658936, "learning_rate": 2.4287962989106327e-05, "loss": 0.06516411304473876, "memory(GiB)": 122.96, "step": 44075, "token_acc": 0.971699604743083, "train_speed(iter/s)": 0.233058 }, { "epoch": 3.360012196051528, "grad_norm": 0.8284401893615723, "learning_rate": 2.427769475011962e-05, "loss": 0.07612704038619995, "memory(GiB)": 122.96, "step": 44080, "token_acc": 0.9848853370396108, "train_speed(iter/s)": 0.233061 }, { "epoch": 3.360393322661788, "grad_norm": 1.637305498123169, "learning_rate": 2.426742798618774e-05, "loss": 0.040493893623352054, "memory(GiB)": 122.96, "step": 44085, "token_acc": 0.9844164456233422, "train_speed(iter/s)": 0.233069 }, { "epoch": 3.360774449272048, "grad_norm": 0.9701366424560547, "learning_rate": 2.4257162697899466e-05, "loss": 0.038423779606819156, "memory(GiB)": 122.96, "step": 44090, "token_acc": 0.9804956402019275, "train_speed(iter/s)": 0.233076 }, { "epoch": 3.361155575882308, "grad_norm": 1.2129161357879639, "learning_rate": 2.424689888584344e-05, "loss": 0.049733692407608034, "memory(GiB)": 122.96, "step": 44095, "token_acc": 0.9767744332961724, "train_speed(iter/s)": 0.233083 }, { "epoch": 3.361536702492568, "grad_norm": 1.0093894004821777, "learning_rate": 2.4236636550608244e-05, "loss": 0.05670593380928039, "memory(GiB)": 122.96, "step": 44100, "token_acc": 0.9710460772104608, "train_speed(iter/s)": 0.23309 }, { "epoch": 3.3619178291028278, "grad_norm": 1.3244441747665405, "learning_rate": 2.4226375692782404e-05, "loss": 0.09026939868927002, "memory(GiB)": 122.96, "step": 44105, "token_acc": 0.9615223755750731, "train_speed(iter/s)": 0.233095 }, { "epoch": 3.362298955713088, "grad_norm": 0.5751805901527405, "learning_rate": 2.42161163129543e-05, "loss": 0.04120635986328125, "memory(GiB)": 122.96, "step": 44110, "token_acc": 0.9817100044072279, "train_speed(iter/s)": 0.233101 }, { "epoch": 3.362680082323348, "grad_norm": 0.4790802597999573, "learning_rate": 2.4205858411712273e-05, "loss": 0.04508569836616516, "memory(GiB)": 122.96, "step": 44115, "token_acc": 0.9821490268340435, "train_speed(iter/s)": 0.233102 }, { "epoch": 3.363061208933608, "grad_norm": 0.9648776054382324, "learning_rate": 2.4195601989644583e-05, "loss": 0.04757298529148102, "memory(GiB)": 122.96, "step": 44120, "token_acc": 0.9741326216382673, "train_speed(iter/s)": 0.233109 }, { "epoch": 3.363442335543868, "grad_norm": 1.3644541501998901, "learning_rate": 2.4185347047339356e-05, "loss": 0.04893730282783508, "memory(GiB)": 122.96, "step": 44125, "token_acc": 0.9796817625458997, "train_speed(iter/s)": 0.233116 }, { "epoch": 3.3638234621541274, "grad_norm": 1.0198274850845337, "learning_rate": 2.417509358538468e-05, "loss": 0.04947633445262909, "memory(GiB)": 122.96, "step": 44130, "token_acc": 0.9790098436595251, "train_speed(iter/s)": 0.233118 }, { "epoch": 3.3642045887643874, "grad_norm": 1.2192336320877075, "learning_rate": 2.416484160436856e-05, "loss": 0.07878044247627258, "memory(GiB)": 122.96, "step": 44135, "token_acc": 0.9683597002497918, "train_speed(iter/s)": 0.233126 }, { "epoch": 3.3645857153746475, "grad_norm": 1.2690094709396362, "learning_rate": 2.415459110487887e-05, "loss": 0.07246209979057312, "memory(GiB)": 122.96, "step": 44140, "token_acc": 0.9737939249553306, "train_speed(iter/s)": 0.233132 }, { "epoch": 3.3649668419849075, "grad_norm": 0.7593148946762085, "learning_rate": 2.414434208750344e-05, "loss": 0.041126078367233275, "memory(GiB)": 122.96, "step": 44145, "token_acc": 0.9823505267478452, "train_speed(iter/s)": 0.233132 }, { "epoch": 3.3653479685951675, "grad_norm": 0.5682486891746521, "learning_rate": 2.413409455283003e-05, "loss": 0.06483138203620911, "memory(GiB)": 122.96, "step": 44150, "token_acc": 0.974955729825449, "train_speed(iter/s)": 0.233136 }, { "epoch": 3.365729095205427, "grad_norm": 2.1568024158477783, "learning_rate": 2.412384850144622e-05, "loss": 0.06908537149429321, "memory(GiB)": 122.96, "step": 44155, "token_acc": 0.9738276990185387, "train_speed(iter/s)": 0.233145 }, { "epoch": 3.366110221815687, "grad_norm": 0.8953927755355835, "learning_rate": 2.411360393393966e-05, "loss": 0.03600887060165405, "memory(GiB)": 122.96, "step": 44160, "token_acc": 0.977892621303474, "train_speed(iter/s)": 0.233153 }, { "epoch": 3.366491348425947, "grad_norm": 2.8665616512298584, "learning_rate": 2.4103360850897773e-05, "loss": 0.04886245131492615, "memory(GiB)": 122.96, "step": 44165, "token_acc": 0.9808743169398907, "train_speed(iter/s)": 0.233161 }, { "epoch": 3.366872475036207, "grad_norm": 0.6728200912475586, "learning_rate": 2.4093119252907958e-05, "loss": 0.048656615614891055, "memory(GiB)": 122.96, "step": 44170, "token_acc": 0.9839242788461539, "train_speed(iter/s)": 0.233163 }, { "epoch": 3.367253601646467, "grad_norm": 1.1258678436279297, "learning_rate": 2.408287914055755e-05, "loss": 0.06595627665519714, "memory(GiB)": 122.96, "step": 44175, "token_acc": 0.9753886010362695, "train_speed(iter/s)": 0.23317 }, { "epoch": 3.3676347282567267, "grad_norm": 2.0387094020843506, "learning_rate": 2.407264051443374e-05, "loss": 0.11055407524108887, "memory(GiB)": 122.96, "step": 44180, "token_acc": 0.9563586771224003, "train_speed(iter/s)": 0.233177 }, { "epoch": 3.3680158548669867, "grad_norm": 1.042396068572998, "learning_rate": 2.4062403375123676e-05, "loss": 0.05765061974525452, "memory(GiB)": 122.96, "step": 44185, "token_acc": 0.9784463403682083, "train_speed(iter/s)": 0.233186 }, { "epoch": 3.3683969814772468, "grad_norm": 1.14240562915802, "learning_rate": 2.405216772321443e-05, "loss": 0.04889840483665466, "memory(GiB)": 122.96, "step": 44190, "token_acc": 0.9818481848184818, "train_speed(iter/s)": 0.233191 }, { "epoch": 3.3687781080875068, "grad_norm": 1.8351939916610718, "learning_rate": 2.404193355929294e-05, "loss": 0.06059239506721496, "memory(GiB)": 122.96, "step": 44195, "token_acc": 0.9804798698657992, "train_speed(iter/s)": 0.233196 }, { "epoch": 3.369159234697767, "grad_norm": 0.1520765721797943, "learning_rate": 2.4031700883946097e-05, "loss": 0.03774245381355286, "memory(GiB)": 122.96, "step": 44200, "token_acc": 0.9772459788152217, "train_speed(iter/s)": 0.233204 }, { "epoch": 3.369159234697767, "eval_loss": 0.06408892571926117, "eval_runtime": 220.4716, "eval_samples_per_second": 2.404, "eval_steps_per_second": 2.404, "eval_token_acc": 0.9735859285585206, "step": 44200 }, { "epoch": 3.3695403613080264, "grad_norm": 1.700217604637146, "learning_rate": 2.402146969776072e-05, "loss": 0.06829191446304321, "memory(GiB)": 122.96, "step": 44205, "token_acc": 0.9736034825289237, "train_speed(iter/s)": 0.232939 }, { "epoch": 3.3699214879182864, "grad_norm": 0.7346890568733215, "learning_rate": 2.4011240001323476e-05, "loss": 0.05858136415481567, "memory(GiB)": 122.96, "step": 44210, "token_acc": 0.9777562862669246, "train_speed(iter/s)": 0.232942 }, { "epoch": 3.3703026145285464, "grad_norm": 0.6547887325286865, "learning_rate": 2.4001011795221022e-05, "loss": 0.05694189667701721, "memory(GiB)": 122.96, "step": 44215, "token_acc": 0.9747093023255814, "train_speed(iter/s)": 0.232945 }, { "epoch": 3.3706837411388064, "grad_norm": 0.5810291171073914, "learning_rate": 2.3990785080039907e-05, "loss": 0.07741057872772217, "memory(GiB)": 122.96, "step": 44220, "token_acc": 0.9734939759036144, "train_speed(iter/s)": 0.232952 }, { "epoch": 3.371064867749066, "grad_norm": 1.0284430980682373, "learning_rate": 2.3980559856366552e-05, "loss": 0.09234346151351928, "memory(GiB)": 122.96, "step": 44225, "token_acc": 0.953625081645983, "train_speed(iter/s)": 0.232958 }, { "epoch": 3.371445994359326, "grad_norm": 1.5088107585906982, "learning_rate": 2.3970336124787345e-05, "loss": 0.05481499433517456, "memory(GiB)": 122.96, "step": 44230, "token_acc": 0.977871786527823, "train_speed(iter/s)": 0.232966 }, { "epoch": 3.371827120969586, "grad_norm": 0.9391525983810425, "learning_rate": 2.396011388588859e-05, "loss": 0.057309466600418094, "memory(GiB)": 122.96, "step": 44235, "token_acc": 0.9788091068301226, "train_speed(iter/s)": 0.23297 }, { "epoch": 3.372208247579846, "grad_norm": 1.015039324760437, "learning_rate": 2.3949893140256442e-05, "loss": 0.038518857955932614, "memory(GiB)": 122.96, "step": 44240, "token_acc": 0.9835381405054486, "train_speed(iter/s)": 0.232976 }, { "epoch": 3.372589374190106, "grad_norm": 1.1808701753616333, "learning_rate": 2.3939673888477033e-05, "loss": 0.05093239545822144, "memory(GiB)": 122.96, "step": 44245, "token_acc": 0.9800115041702617, "train_speed(iter/s)": 0.23298 }, { "epoch": 3.372970500800366, "grad_norm": 1.8549648523330688, "learning_rate": 2.3929456131136413e-05, "loss": 0.0771709680557251, "memory(GiB)": 122.96, "step": 44250, "token_acc": 0.9698705707617229, "train_speed(iter/s)": 0.232986 }, { "epoch": 3.3733516274106257, "grad_norm": 0.854964554309845, "learning_rate": 2.3919239868820488e-05, "loss": 0.07730629444122314, "memory(GiB)": 122.96, "step": 44255, "token_acc": 0.9627064464571125, "train_speed(iter/s)": 0.232995 }, { "epoch": 3.3737327540208857, "grad_norm": 0.8983915448188782, "learning_rate": 2.390902510211514e-05, "loss": 0.038231202960014345, "memory(GiB)": 122.96, "step": 44260, "token_acc": 0.9803050782004248, "train_speed(iter/s)": 0.232999 }, { "epoch": 3.3741138806311457, "grad_norm": 0.8731570243835449, "learning_rate": 2.3898811831606105e-05, "loss": 0.05808987021446228, "memory(GiB)": 122.96, "step": 44265, "token_acc": 0.9792494481236204, "train_speed(iter/s)": 0.233006 }, { "epoch": 3.3744950072414057, "grad_norm": 1.7751328945159912, "learning_rate": 2.388860005787909e-05, "loss": 0.07657701969146728, "memory(GiB)": 122.96, "step": 44270, "token_acc": 0.9704268292682927, "train_speed(iter/s)": 0.233014 }, { "epoch": 3.3748761338516653, "grad_norm": 1.0858352184295654, "learning_rate": 2.387838978151971e-05, "loss": 0.10320591926574707, "memory(GiB)": 122.96, "step": 44275, "token_acc": 0.9769230769230769, "train_speed(iter/s)": 0.23302 }, { "epoch": 3.3752572604619253, "grad_norm": 1.4126191139221191, "learning_rate": 2.3868181003113437e-05, "loss": 0.07092752456665039, "memory(GiB)": 122.96, "step": 44280, "token_acc": 0.9658561821003622, "train_speed(iter/s)": 0.233026 }, { "epoch": 3.3756383870721853, "grad_norm": 1.0985511541366577, "learning_rate": 2.3857973723245713e-05, "loss": 0.07554703950881958, "memory(GiB)": 122.96, "step": 44285, "token_acc": 0.9714943342776204, "train_speed(iter/s)": 0.233032 }, { "epoch": 3.3760195136824453, "grad_norm": 0.8714030385017395, "learning_rate": 2.384776794250189e-05, "loss": 0.06717325448989868, "memory(GiB)": 122.96, "step": 44290, "token_acc": 0.967930029154519, "train_speed(iter/s)": 0.23304 }, { "epoch": 3.3764006402927054, "grad_norm": 0.5048011541366577, "learning_rate": 2.383756366146721e-05, "loss": 0.061321109533309937, "memory(GiB)": 122.96, "step": 44295, "token_acc": 0.9774297558728696, "train_speed(iter/s)": 0.233045 }, { "epoch": 3.3767817669029654, "grad_norm": 0.4765639901161194, "learning_rate": 2.3827360880726846e-05, "loss": 0.03653908371925354, "memory(GiB)": 122.96, "step": 44300, "token_acc": 0.9810204081632653, "train_speed(iter/s)": 0.233051 }, { "epoch": 3.377162893513225, "grad_norm": 1.22962486743927, "learning_rate": 2.3817159600865895e-05, "loss": 0.04256724417209625, "memory(GiB)": 122.96, "step": 44305, "token_acc": 0.9799511002444988, "train_speed(iter/s)": 0.233058 }, { "epoch": 3.377544020123485, "grad_norm": 0.23830270767211914, "learning_rate": 2.3806959822469326e-05, "loss": 0.06137362718582153, "memory(GiB)": 122.96, "step": 44310, "token_acc": 0.9686366545764882, "train_speed(iter/s)": 0.233064 }, { "epoch": 3.377925146733745, "grad_norm": 1.6633967161178589, "learning_rate": 2.3796761546122076e-05, "loss": 0.07580691576004028, "memory(GiB)": 122.96, "step": 44315, "token_acc": 0.9746967795901297, "train_speed(iter/s)": 0.233068 }, { "epoch": 3.378306273344005, "grad_norm": 1.3423500061035156, "learning_rate": 2.378656477240893e-05, "loss": 0.08400588035583496, "memory(GiB)": 122.96, "step": 44320, "token_acc": 0.9582863585118376, "train_speed(iter/s)": 0.233076 }, { "epoch": 3.3786873999542646, "grad_norm": 1.6682664155960083, "learning_rate": 2.3776369501914652e-05, "loss": 0.060811054706573484, "memory(GiB)": 122.96, "step": 44325, "token_acc": 0.9693593314763231, "train_speed(iter/s)": 0.233083 }, { "epoch": 3.3790685265645246, "grad_norm": 0.8014829158782959, "learning_rate": 2.376617573522392e-05, "loss": 0.05179721713066101, "memory(GiB)": 122.96, "step": 44330, "token_acc": 0.9758675356702484, "train_speed(iter/s)": 0.233088 }, { "epoch": 3.3794496531747846, "grad_norm": 1.2506321668624878, "learning_rate": 2.3755983472921233e-05, "loss": 0.05385288000106812, "memory(GiB)": 122.96, "step": 44335, "token_acc": 0.9801496411665903, "train_speed(iter/s)": 0.233092 }, { "epoch": 3.3798307797850446, "grad_norm": 0.7814849615097046, "learning_rate": 2.374579271559112e-05, "loss": 0.04675308167934418, "memory(GiB)": 122.96, "step": 44340, "token_acc": 0.98080531101169, "train_speed(iter/s)": 0.233096 }, { "epoch": 3.3802119063953047, "grad_norm": 0.940844714641571, "learning_rate": 2.3735603463817974e-05, "loss": 0.05378847122192383, "memory(GiB)": 122.96, "step": 44345, "token_acc": 0.9711668014012396, "train_speed(iter/s)": 0.233103 }, { "epoch": 3.3805930330055647, "grad_norm": 1.644495964050293, "learning_rate": 2.3725415718186066e-05, "loss": 0.07434041500091552, "memory(GiB)": 122.96, "step": 44350, "token_acc": 0.9719383336220336, "train_speed(iter/s)": 0.233107 }, { "epoch": 3.3809741596158243, "grad_norm": 1.661731481552124, "learning_rate": 2.3715229479279643e-05, "loss": 0.060552734136581424, "memory(GiB)": 122.96, "step": 44355, "token_acc": 0.9777846343721074, "train_speed(iter/s)": 0.233115 }, { "epoch": 3.3813552862260843, "grad_norm": 0.6413809657096863, "learning_rate": 2.3705044747682848e-05, "loss": 0.0710341453552246, "memory(GiB)": 122.96, "step": 44360, "token_acc": 0.9756055061857466, "train_speed(iter/s)": 0.23312 }, { "epoch": 3.3817364128363443, "grad_norm": 1.5539230108261108, "learning_rate": 2.36948615239797e-05, "loss": 0.051879340410232545, "memory(GiB)": 122.96, "step": 44365, "token_acc": 0.9697278911564626, "train_speed(iter/s)": 0.233127 }, { "epoch": 3.3821175394466043, "grad_norm": 1.1427555084228516, "learning_rate": 2.368467980875417e-05, "loss": 0.04319285750389099, "memory(GiB)": 122.96, "step": 44370, "token_acc": 0.9829358552631579, "train_speed(iter/s)": 0.233133 }, { "epoch": 3.382498666056864, "grad_norm": 1.2070260047912598, "learning_rate": 2.367449960259015e-05, "loss": 0.06274941563606262, "memory(GiB)": 122.96, "step": 44375, "token_acc": 0.9696880517951736, "train_speed(iter/s)": 0.23314 }, { "epoch": 3.382879792667124, "grad_norm": 0.9306278228759766, "learning_rate": 2.3664320906071396e-05, "loss": 0.04153009951114654, "memory(GiB)": 122.96, "step": 44380, "token_acc": 0.9817272552413926, "train_speed(iter/s)": 0.233145 }, { "epoch": 3.383260919277384, "grad_norm": 1.7035466432571411, "learning_rate": 2.3654143719781624e-05, "loss": 0.07101226449012757, "memory(GiB)": 122.96, "step": 44385, "token_acc": 0.9673260724605468, "train_speed(iter/s)": 0.23315 }, { "epoch": 3.383642045887644, "grad_norm": 0.5381147265434265, "learning_rate": 2.364396804430447e-05, "loss": 0.019391766190528868, "memory(GiB)": 122.96, "step": 44390, "token_acc": 0.9932918064206996, "train_speed(iter/s)": 0.23316 }, { "epoch": 3.384023172497904, "grad_norm": 1.138839840888977, "learning_rate": 2.3633793880223427e-05, "loss": 0.059366679191589354, "memory(GiB)": 122.96, "step": 44395, "token_acc": 0.9758564437194127, "train_speed(iter/s)": 0.233167 }, { "epoch": 3.3844042991081635, "grad_norm": 4.064382076263428, "learning_rate": 2.362362122812195e-05, "loss": 0.0722284734249115, "memory(GiB)": 122.96, "step": 44400, "token_acc": 0.9767841788478074, "train_speed(iter/s)": 0.233167 }, { "epoch": 3.3844042991081635, "eval_loss": 0.06438811123371124, "eval_runtime": 219.0412, "eval_samples_per_second": 2.42, "eval_steps_per_second": 2.42, "eval_token_acc": 0.973367568218782, "step": 44400 }, { "epoch": 3.3847854257184236, "grad_norm": 0.6649875640869141, "learning_rate": 2.361345008858341e-05, "loss": 0.05603752732276916, "memory(GiB)": 122.96, "step": 44405, "token_acc": 0.9735083375331419, "train_speed(iter/s)": 0.232906 }, { "epoch": 3.3851665523286836, "grad_norm": 0.6038417220115662, "learning_rate": 2.360328046219104e-05, "loss": 0.053580445051193235, "memory(GiB)": 122.96, "step": 44410, "token_acc": 0.9830335934848999, "train_speed(iter/s)": 0.232913 }, { "epoch": 3.3855476789389436, "grad_norm": 0.7689390182495117, "learning_rate": 2.359311234952804e-05, "loss": 0.05416521430015564, "memory(GiB)": 122.96, "step": 44415, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.232918 }, { "epoch": 3.3859288055492036, "grad_norm": 1.3844144344329834, "learning_rate": 2.3582945751177522e-05, "loss": 0.05461370348930359, "memory(GiB)": 122.96, "step": 44420, "token_acc": 0.9790059982862039, "train_speed(iter/s)": 0.232923 }, { "epoch": 3.386309932159463, "grad_norm": 1.824751853942871, "learning_rate": 2.357278066772244e-05, "loss": 0.03764788508415222, "memory(GiB)": 122.96, "step": 44425, "token_acc": 0.9779302117506711, "train_speed(iter/s)": 0.23293 }, { "epoch": 3.386691058769723, "grad_norm": 1.689367651939392, "learning_rate": 2.3562617099745787e-05, "loss": 0.07451211810111999, "memory(GiB)": 122.96, "step": 44430, "token_acc": 0.9700570342205324, "train_speed(iter/s)": 0.232935 }, { "epoch": 3.3870721853799832, "grad_norm": 1.4828636646270752, "learning_rate": 2.3552455047830337e-05, "loss": 0.04728330373764038, "memory(GiB)": 122.96, "step": 44435, "token_acc": 0.9817843096667357, "train_speed(iter/s)": 0.23294 }, { "epoch": 3.3874533119902432, "grad_norm": 1.3174493312835693, "learning_rate": 2.354229451255886e-05, "loss": 0.07907824516296387, "memory(GiB)": 122.96, "step": 44440, "token_acc": 0.9786614936954413, "train_speed(iter/s)": 0.232947 }, { "epoch": 3.3878344386005033, "grad_norm": 2.8016281127929688, "learning_rate": 2.3532135494514034e-05, "loss": 0.07729455232620239, "memory(GiB)": 122.96, "step": 44445, "token_acc": 0.9800386349001932, "train_speed(iter/s)": 0.232951 }, { "epoch": 3.388215565210763, "grad_norm": 1.230957269668579, "learning_rate": 2.3521977994278393e-05, "loss": 0.0666947066783905, "memory(GiB)": 122.96, "step": 44450, "token_acc": 0.9727235438884332, "train_speed(iter/s)": 0.232955 }, { "epoch": 3.388596691821023, "grad_norm": 1.1610887050628662, "learning_rate": 2.3511822012434438e-05, "loss": 0.0478197306394577, "memory(GiB)": 122.96, "step": 44455, "token_acc": 0.9793008279668813, "train_speed(iter/s)": 0.23296 }, { "epoch": 3.388977818431283, "grad_norm": 0.6149753332138062, "learning_rate": 2.3501667549564594e-05, "loss": 0.037169459462165835, "memory(GiB)": 122.96, "step": 44460, "token_acc": 0.9853128991060025, "train_speed(iter/s)": 0.232962 }, { "epoch": 3.389358945041543, "grad_norm": 1.625767707824707, "learning_rate": 2.3491514606251125e-05, "loss": 0.046656680107116696, "memory(GiB)": 122.96, "step": 44465, "token_acc": 0.9807764794572182, "train_speed(iter/s)": 0.232967 }, { "epoch": 3.389740071651803, "grad_norm": 0.43805819749832153, "learning_rate": 2.3481363183076275e-05, "loss": 0.05168574452400208, "memory(GiB)": 122.96, "step": 44470, "token_acc": 0.9795471945750848, "train_speed(iter/s)": 0.232968 }, { "epoch": 3.3901211982620625, "grad_norm": 1.2422552108764648, "learning_rate": 2.347121328062221e-05, "loss": 0.07009706497192383, "memory(GiB)": 122.96, "step": 44475, "token_acc": 0.9712021941185434, "train_speed(iter/s)": 0.232973 }, { "epoch": 3.3905023248723225, "grad_norm": 1.927981972694397, "learning_rate": 2.346106489947094e-05, "loss": 0.05807647705078125, "memory(GiB)": 122.96, "step": 44480, "token_acc": 0.9796278158667973, "train_speed(iter/s)": 0.23298 }, { "epoch": 3.3908834514825825, "grad_norm": 1.162038803100586, "learning_rate": 2.3450918040204455e-05, "loss": 0.0405582070350647, "memory(GiB)": 122.96, "step": 44485, "token_acc": 0.9836617011052379, "train_speed(iter/s)": 0.232986 }, { "epoch": 3.3912645780928425, "grad_norm": 1.7091064453125, "learning_rate": 2.34407727034046e-05, "loss": 0.09555755257606506, "memory(GiB)": 122.96, "step": 44490, "token_acc": 0.9676239253704042, "train_speed(iter/s)": 0.232991 }, { "epoch": 3.3916457047031026, "grad_norm": 1.7709009647369385, "learning_rate": 2.3430628889653184e-05, "loss": 0.031143051385879517, "memory(GiB)": 122.96, "step": 44495, "token_acc": 0.9809814510448462, "train_speed(iter/s)": 0.232998 }, { "epoch": 3.392026831313362, "grad_norm": 1.318396806716919, "learning_rate": 2.3420486599531915e-05, "loss": 0.06386517882347106, "memory(GiB)": 122.96, "step": 44500, "token_acc": 0.9631171921475312, "train_speed(iter/s)": 0.233006 }, { "epoch": 3.392407957923622, "grad_norm": 1.3138341903686523, "learning_rate": 2.341034583362238e-05, "loss": 0.06996110081672668, "memory(GiB)": 122.96, "step": 44505, "token_acc": 0.9790957705396208, "train_speed(iter/s)": 0.233011 }, { "epoch": 3.392789084533882, "grad_norm": 0.4875327944755554, "learning_rate": 2.3400206592506123e-05, "loss": 0.040217968821525577, "memory(GiB)": 122.96, "step": 44510, "token_acc": 0.9828839011787502, "train_speed(iter/s)": 0.233016 }, { "epoch": 3.393170211144142, "grad_norm": 0.45836466550827026, "learning_rate": 2.3390068876764604e-05, "loss": 0.07296997904777527, "memory(GiB)": 122.96, "step": 44515, "token_acc": 0.9795141937371964, "train_speed(iter/s)": 0.233022 }, { "epoch": 3.3935513377544018, "grad_norm": 1.0286352634429932, "learning_rate": 2.3379932686979123e-05, "loss": 0.07423176765441894, "memory(GiB)": 122.96, "step": 44520, "token_acc": 0.9729537366548042, "train_speed(iter/s)": 0.233027 }, { "epoch": 3.393932464364662, "grad_norm": 0.9243927001953125, "learning_rate": 2.3369798023730972e-05, "loss": 0.051657605171203616, "memory(GiB)": 122.96, "step": 44525, "token_acc": 0.9825691549829481, "train_speed(iter/s)": 0.233032 }, { "epoch": 3.394313590974922, "grad_norm": 1.279689908027649, "learning_rate": 2.335966488760134e-05, "loss": 0.06637429594993591, "memory(GiB)": 122.96, "step": 44530, "token_acc": 0.9671870022300095, "train_speed(iter/s)": 0.233039 }, { "epoch": 3.394694717585182, "grad_norm": 0.7261753082275391, "learning_rate": 2.3349533279171286e-05, "loss": 0.039176279306411745, "memory(GiB)": 122.96, "step": 44535, "token_acc": 0.9850794680506001, "train_speed(iter/s)": 0.233042 }, { "epoch": 3.395075844195442, "grad_norm": 1.3711705207824707, "learning_rate": 2.3339403199021826e-05, "loss": 0.06909368038177491, "memory(GiB)": 122.96, "step": 44540, "token_acc": 0.9739649249683602, "train_speed(iter/s)": 0.233048 }, { "epoch": 3.395456970805702, "grad_norm": 1.6918468475341797, "learning_rate": 2.332927464773389e-05, "loss": 0.05773085355758667, "memory(GiB)": 122.96, "step": 44545, "token_acc": 0.9776085982982534, "train_speed(iter/s)": 0.233057 }, { "epoch": 3.3958380974159614, "grad_norm": 1.0890835523605347, "learning_rate": 2.3319147625888264e-05, "loss": 0.044738557934761045, "memory(GiB)": 122.96, "step": 44550, "token_acc": 0.9836300976450316, "train_speed(iter/s)": 0.233064 }, { "epoch": 3.3962192240262215, "grad_norm": 1.2164998054504395, "learning_rate": 2.3309022134065712e-05, "loss": 0.056627899408340454, "memory(GiB)": 122.96, "step": 44555, "token_acc": 0.978617536309844, "train_speed(iter/s)": 0.233064 }, { "epoch": 3.3966003506364815, "grad_norm": 1.1534357070922852, "learning_rate": 2.3298898172846877e-05, "loss": 0.03355591595172882, "memory(GiB)": 122.96, "step": 44560, "token_acc": 0.9856304985337243, "train_speed(iter/s)": 0.233071 }, { "epoch": 3.3969814772467415, "grad_norm": 0.9010620713233948, "learning_rate": 2.3288775742812324e-05, "loss": 0.0567701518535614, "memory(GiB)": 122.96, "step": 44565, "token_acc": 0.9792214988225516, "train_speed(iter/s)": 0.233076 }, { "epoch": 3.397362603857001, "grad_norm": 0.809363603591919, "learning_rate": 2.3278654844542547e-05, "loss": 0.0632942795753479, "memory(GiB)": 122.96, "step": 44570, "token_acc": 0.9731432858214554, "train_speed(iter/s)": 0.233078 }, { "epoch": 3.397743730467261, "grad_norm": 0.9402106404304504, "learning_rate": 2.3268535478617893e-05, "loss": 0.05552844405174255, "memory(GiB)": 122.96, "step": 44575, "token_acc": 0.9792147806004619, "train_speed(iter/s)": 0.233085 }, { "epoch": 3.398124857077521, "grad_norm": 0.7543808221817017, "learning_rate": 2.3258417645618674e-05, "loss": 0.06237365603446961, "memory(GiB)": 122.96, "step": 44580, "token_acc": 0.9800218938149973, "train_speed(iter/s)": 0.233086 }, { "epoch": 3.398505983687781, "grad_norm": 0.9666885137557983, "learning_rate": 2.3248301346125135e-05, "loss": 0.06986541748046875, "memory(GiB)": 122.96, "step": 44585, "token_acc": 0.9720018665422305, "train_speed(iter/s)": 0.23309 }, { "epoch": 3.398887110298041, "grad_norm": 1.6397019624710083, "learning_rate": 2.3238186580717348e-05, "loss": 0.052738016843795775, "memory(GiB)": 122.96, "step": 44590, "token_acc": 0.9815950920245399, "train_speed(iter/s)": 0.2331 }, { "epoch": 3.399268236908301, "grad_norm": 1.0383042097091675, "learning_rate": 2.322807334997537e-05, "loss": 0.05521996021270752, "memory(GiB)": 122.96, "step": 44595, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.233104 }, { "epoch": 3.3996493635185607, "grad_norm": 0.8956355452537537, "learning_rate": 2.3217961654479163e-05, "loss": 0.03698550760746002, "memory(GiB)": 122.96, "step": 44600, "token_acc": 0.9846725499303298, "train_speed(iter/s)": 0.233111 }, { "epoch": 3.3996493635185607, "eval_loss": 0.06424329429864883, "eval_runtime": 219.2037, "eval_samples_per_second": 2.418, "eval_steps_per_second": 2.418, "eval_token_acc": 0.973450394554545, "step": 44600 }, { "epoch": 3.4000304901288207, "grad_norm": 2.0473475456237793, "learning_rate": 2.3207851494808565e-05, "loss": 0.024296510219573974, "memory(GiB)": 122.96, "step": 44605, "token_acc": 0.9737302457971359, "train_speed(iter/s)": 0.232853 }, { "epoch": 3.4004116167390808, "grad_norm": 0.5832228064537048, "learning_rate": 2.3197742871543345e-05, "loss": 0.06062343716621399, "memory(GiB)": 122.96, "step": 44610, "token_acc": 0.9787767379679144, "train_speed(iter/s)": 0.232859 }, { "epoch": 3.400792743349341, "grad_norm": 1.1779603958129883, "learning_rate": 2.3187635785263206e-05, "loss": 0.07031776905059814, "memory(GiB)": 122.96, "step": 44615, "token_acc": 0.9700626595497795, "train_speed(iter/s)": 0.232866 }, { "epoch": 3.4011738699596004, "grad_norm": 2.6449520587921143, "learning_rate": 2.317753023654772e-05, "loss": 0.0815818190574646, "memory(GiB)": 122.96, "step": 44620, "token_acc": 0.9799003055153561, "train_speed(iter/s)": 0.232871 }, { "epoch": 3.4015549965698604, "grad_norm": 0.7098424434661865, "learning_rate": 2.3167426225976402e-05, "loss": 0.08742862343788146, "memory(GiB)": 122.96, "step": 44625, "token_acc": 0.9743685687558465, "train_speed(iter/s)": 0.232875 }, { "epoch": 3.4019361231801204, "grad_norm": 2.295198917388916, "learning_rate": 2.315732375412869e-05, "loss": 0.04337025582790375, "memory(GiB)": 122.96, "step": 44630, "token_acc": 0.9842406876790831, "train_speed(iter/s)": 0.232883 }, { "epoch": 3.4023172497903804, "grad_norm": 1.45408296585083, "learning_rate": 2.3147222821583874e-05, "loss": 0.04487159848213196, "memory(GiB)": 122.96, "step": 44635, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.232888 }, { "epoch": 3.4026983764006404, "grad_norm": 2.845858097076416, "learning_rate": 2.313712342892122e-05, "loss": 0.08425554633140564, "memory(GiB)": 122.96, "step": 44640, "token_acc": 0.9567985447930878, "train_speed(iter/s)": 0.232896 }, { "epoch": 3.4030795030109005, "grad_norm": 1.8757227659225464, "learning_rate": 2.31270255767199e-05, "loss": 0.07356293201446533, "memory(GiB)": 122.96, "step": 44645, "token_acc": 0.9684353741496599, "train_speed(iter/s)": 0.232904 }, { "epoch": 3.40346062962116, "grad_norm": 0.8478085994720459, "learning_rate": 2.3116929265558935e-05, "loss": 0.054344451427459715, "memory(GiB)": 122.96, "step": 44650, "token_acc": 0.9767814251401121, "train_speed(iter/s)": 0.232911 }, { "epoch": 3.40384175623142, "grad_norm": 0.5739855170249939, "learning_rate": 2.3106834496017344e-05, "loss": 0.06884437203407287, "memory(GiB)": 122.96, "step": 44655, "token_acc": 0.9750132908027644, "train_speed(iter/s)": 0.23292 }, { "epoch": 3.40422288284168, "grad_norm": 0.537789523601532, "learning_rate": 2.3096741268673976e-05, "loss": 0.04070296883583069, "memory(GiB)": 122.96, "step": 44660, "token_acc": 0.9835298196948682, "train_speed(iter/s)": 0.232925 }, { "epoch": 3.40460400945194, "grad_norm": 1.2702643871307373, "learning_rate": 2.308664958410765e-05, "loss": 0.06397372484207153, "memory(GiB)": 122.96, "step": 44665, "token_acc": 0.9731792411687745, "train_speed(iter/s)": 0.232931 }, { "epoch": 3.4049851360621997, "grad_norm": 0.27819111943244934, "learning_rate": 2.3076559442897095e-05, "loss": 0.08050659894943238, "memory(GiB)": 122.96, "step": 44670, "token_acc": 0.961701069385725, "train_speed(iter/s)": 0.232937 }, { "epoch": 3.4053662626724597, "grad_norm": 0.7483713626861572, "learning_rate": 2.3066470845620897e-05, "loss": 0.08782613873481751, "memory(GiB)": 122.96, "step": 44675, "token_acc": 0.9735306377657357, "train_speed(iter/s)": 0.232943 }, { "epoch": 3.4057473892827197, "grad_norm": 1.4012302160263062, "learning_rate": 2.305638379285761e-05, "loss": 0.06701287031173705, "memory(GiB)": 122.96, "step": 44680, "token_acc": 0.9723346828609987, "train_speed(iter/s)": 0.232946 }, { "epoch": 3.4061285158929797, "grad_norm": 1.098692774772644, "learning_rate": 2.3046298285185698e-05, "loss": 0.06266499757766723, "memory(GiB)": 122.96, "step": 44685, "token_acc": 0.9767929330738134, "train_speed(iter/s)": 0.232949 }, { "epoch": 3.4065096425032397, "grad_norm": 1.6917115449905396, "learning_rate": 2.303621432318346e-05, "loss": 0.041795593500137326, "memory(GiB)": 122.96, "step": 44690, "token_acc": 0.980859375, "train_speed(iter/s)": 0.232956 }, { "epoch": 3.4068907691134998, "grad_norm": 0.7055837512016296, "learning_rate": 2.3026131907429237e-05, "loss": 0.05653186440467835, "memory(GiB)": 122.96, "step": 44695, "token_acc": 0.9776007215874924, "train_speed(iter/s)": 0.23296 }, { "epoch": 3.4072718957237593, "grad_norm": 0.46868836879730225, "learning_rate": 2.301605103850116e-05, "loss": 0.04407854676246643, "memory(GiB)": 122.96, "step": 44700, "token_acc": 0.9869190091845255, "train_speed(iter/s)": 0.232967 }, { "epoch": 3.4076530223340193, "grad_norm": 1.6238206624984741, "learning_rate": 2.3005971716977337e-05, "loss": 0.06622194051742554, "memory(GiB)": 122.96, "step": 44705, "token_acc": 0.9634630659253376, "train_speed(iter/s)": 0.232977 }, { "epoch": 3.4080341489442794, "grad_norm": 1.2246685028076172, "learning_rate": 2.299589394343579e-05, "loss": 0.05243744254112244, "memory(GiB)": 122.96, "step": 44710, "token_acc": 0.9852765618782332, "train_speed(iter/s)": 0.232985 }, { "epoch": 3.4084152755545394, "grad_norm": 1.1716135740280151, "learning_rate": 2.298581771845439e-05, "loss": 0.042613485455513, "memory(GiB)": 122.96, "step": 44715, "token_acc": 0.9824299065420561, "train_speed(iter/s)": 0.232989 }, { "epoch": 3.408796402164799, "grad_norm": 0.6542361378669739, "learning_rate": 2.2975743042610977e-05, "loss": 0.05059970617294311, "memory(GiB)": 122.96, "step": 44720, "token_acc": 0.977378408948096, "train_speed(iter/s)": 0.232993 }, { "epoch": 3.409177528775059, "grad_norm": 2.0905284881591797, "learning_rate": 2.2965669916483318e-05, "loss": 0.07727769613265992, "memory(GiB)": 122.96, "step": 44725, "token_acc": 0.9697048837816662, "train_speed(iter/s)": 0.232997 }, { "epoch": 3.409558655385319, "grad_norm": 1.3643220663070679, "learning_rate": 2.295559834064901e-05, "loss": 0.05204898118972778, "memory(GiB)": 122.96, "step": 44730, "token_acc": 0.9790776152980878, "train_speed(iter/s)": 0.233003 }, { "epoch": 3.409939781995579, "grad_norm": 0.2761351764202118, "learning_rate": 2.2945528315685638e-05, "loss": 0.03355906307697296, "memory(GiB)": 122.96, "step": 44735, "token_acc": 0.9840962819686224, "train_speed(iter/s)": 0.23301 }, { "epoch": 3.410320908605839, "grad_norm": 1.6953786611557007, "learning_rate": 2.2935459842170692e-05, "loss": 0.04756622910499573, "memory(GiB)": 122.96, "step": 44740, "token_acc": 0.9825769284811016, "train_speed(iter/s)": 0.233013 }, { "epoch": 3.4107020352160986, "grad_norm": 2.618507146835327, "learning_rate": 2.2925392920681504e-05, "loss": 0.06822892427444457, "memory(GiB)": 122.96, "step": 44745, "token_acc": 0.97456, "train_speed(iter/s)": 0.233018 }, { "epoch": 3.4110831618263586, "grad_norm": 0.8660376667976379, "learning_rate": 2.2915327551795396e-05, "loss": 0.04379658401012421, "memory(GiB)": 122.96, "step": 44750, "token_acc": 0.9806362378976486, "train_speed(iter/s)": 0.233027 }, { "epoch": 3.4114642884366186, "grad_norm": 0.5510324835777283, "learning_rate": 2.2905263736089583e-05, "loss": 0.05059828758239746, "memory(GiB)": 122.96, "step": 44755, "token_acc": 0.9840686274509803, "train_speed(iter/s)": 0.233029 }, { "epoch": 3.4118454150468787, "grad_norm": 2.3085544109344482, "learning_rate": 2.2895201474141136e-05, "loss": 0.06807131171226502, "memory(GiB)": 122.96, "step": 44760, "token_acc": 0.9775641025641025, "train_speed(iter/s)": 0.233036 }, { "epoch": 3.4122265416571387, "grad_norm": 0.8581545352935791, "learning_rate": 2.288514076652711e-05, "loss": 0.047981977462768555, "memory(GiB)": 122.96, "step": 44765, "token_acc": 0.9772897897897898, "train_speed(iter/s)": 0.233043 }, { "epoch": 3.4126076682673983, "grad_norm": 1.0957971811294556, "learning_rate": 2.2875081613824447e-05, "loss": 0.06051156520843506, "memory(GiB)": 122.96, "step": 44770, "token_acc": 0.979670522257273, "train_speed(iter/s)": 0.233046 }, { "epoch": 3.4129887948776583, "grad_norm": 1.3230106830596924, "learning_rate": 2.2865024016609958e-05, "loss": 0.06653358936309814, "memory(GiB)": 122.96, "step": 44775, "token_acc": 0.9742331288343559, "train_speed(iter/s)": 0.233053 }, { "epoch": 3.4133699214879183, "grad_norm": 0.681696891784668, "learning_rate": 2.2854967975460422e-05, "loss": 0.05562456846237183, "memory(GiB)": 122.96, "step": 44780, "token_acc": 0.9821830841695679, "train_speed(iter/s)": 0.233058 }, { "epoch": 3.4137510480981783, "grad_norm": 1.0680263042449951, "learning_rate": 2.2844913490952525e-05, "loss": 0.051200473308563234, "memory(GiB)": 122.96, "step": 44785, "token_acc": 0.979376340537865, "train_speed(iter/s)": 0.233063 }, { "epoch": 3.4141321747084383, "grad_norm": 1.4870893955230713, "learning_rate": 2.2834860563662802e-05, "loss": 0.08175615072250367, "memory(GiB)": 122.96, "step": 44790, "token_acc": 0.980682213713888, "train_speed(iter/s)": 0.233065 }, { "epoch": 3.414513301318698, "grad_norm": 0.9734242558479309, "learning_rate": 2.2824809194167768e-05, "loss": 0.04437652826309204, "memory(GiB)": 122.96, "step": 44795, "token_acc": 0.9822245688137979, "train_speed(iter/s)": 0.233069 }, { "epoch": 3.414894427928958, "grad_norm": 0.7441583275794983, "learning_rate": 2.281475938304383e-05, "loss": 0.04275440275669098, "memory(GiB)": 122.96, "step": 44800, "token_acc": 0.9801868556701031, "train_speed(iter/s)": 0.233073 }, { "epoch": 3.414894427928958, "eval_loss": 0.06318888813257217, "eval_runtime": 220.8465, "eval_samples_per_second": 2.4, "eval_steps_per_second": 2.4, "eval_token_acc": 0.9737139328956087, "step": 44800 }, { "epoch": 3.415275554539218, "grad_norm": 0.9297592043876648, "learning_rate": 2.280471113086728e-05, "loss": 0.05879397392272949, "memory(GiB)": 122.96, "step": 44805, "token_acc": 0.9735094644489642, "train_speed(iter/s)": 0.232808 }, { "epoch": 3.415656681149478, "grad_norm": 1.7653663158416748, "learning_rate": 2.2794664438214337e-05, "loss": 0.0679441213607788, "memory(GiB)": 122.96, "step": 44810, "token_acc": 0.9684862127180641, "train_speed(iter/s)": 0.232818 }, { "epoch": 3.416037807759738, "grad_norm": 2.3909811973571777, "learning_rate": 2.278461930566116e-05, "loss": 0.06415926218032837, "memory(GiB)": 122.96, "step": 44815, "token_acc": 0.978063900810682, "train_speed(iter/s)": 0.232827 }, { "epoch": 3.4164189343699976, "grad_norm": 0.9707624316215515, "learning_rate": 2.277457573378375e-05, "loss": 0.03789882957935333, "memory(GiB)": 122.96, "step": 44820, "token_acc": 0.9859042127182445, "train_speed(iter/s)": 0.232829 }, { "epoch": 3.4168000609802576, "grad_norm": 0.9600762128829956, "learning_rate": 2.276453372315808e-05, "loss": 0.04155534207820892, "memory(GiB)": 122.96, "step": 44825, "token_acc": 0.9839751873869217, "train_speed(iter/s)": 0.232836 }, { "epoch": 3.4171811875905176, "grad_norm": 0.3185662031173706, "learning_rate": 2.2754493274360017e-05, "loss": 0.03346228897571564, "memory(GiB)": 122.96, "step": 44830, "token_acc": 0.9816020379281064, "train_speed(iter/s)": 0.232843 }, { "epoch": 3.4175623142007776, "grad_norm": 1.7086162567138672, "learning_rate": 2.274445438796533e-05, "loss": 0.08135819435119629, "memory(GiB)": 122.96, "step": 44835, "token_acc": 0.9704668148618609, "train_speed(iter/s)": 0.232851 }, { "epoch": 3.4179434408110376, "grad_norm": 2.2745068073272705, "learning_rate": 2.2734417064549718e-05, "loss": 0.09969144463539123, "memory(GiB)": 122.96, "step": 44840, "token_acc": 0.9610187110187111, "train_speed(iter/s)": 0.232859 }, { "epoch": 3.418324567421297, "grad_norm": 0.8496828079223633, "learning_rate": 2.2724381304688742e-05, "loss": 0.05807327032089234, "memory(GiB)": 122.96, "step": 44845, "token_acc": 0.9757199322416714, "train_speed(iter/s)": 0.232866 }, { "epoch": 3.418705694031557, "grad_norm": 2.07663893699646, "learning_rate": 2.271434710895793e-05, "loss": 0.09426398277282715, "memory(GiB)": 122.96, "step": 44850, "token_acc": 0.9654680817361001, "train_speed(iter/s)": 0.23287 }, { "epoch": 3.4190868206418172, "grad_norm": 1.3196810483932495, "learning_rate": 2.2704314477932696e-05, "loss": 0.11339769363403321, "memory(GiB)": 122.96, "step": 44855, "token_acc": 0.9586669552045012, "train_speed(iter/s)": 0.232876 }, { "epoch": 3.4194679472520773, "grad_norm": 0.8061447739601135, "learning_rate": 2.269428341218835e-05, "loss": 0.04080787897109985, "memory(GiB)": 122.96, "step": 44860, "token_acc": 0.9808673469387755, "train_speed(iter/s)": 0.232881 }, { "epoch": 3.419849073862337, "grad_norm": 0.9080660343170166, "learning_rate": 2.2684253912300136e-05, "loss": 0.059151333570480344, "memory(GiB)": 122.96, "step": 44865, "token_acc": 0.9819587628865979, "train_speed(iter/s)": 0.232885 }, { "epoch": 3.420230200472597, "grad_norm": 1.1304931640625, "learning_rate": 2.2674225978843216e-05, "loss": 0.039825713634490965, "memory(GiB)": 122.96, "step": 44870, "token_acc": 0.9854705186686941, "train_speed(iter/s)": 0.232889 }, { "epoch": 3.420611327082857, "grad_norm": 1.8925048112869263, "learning_rate": 2.2664199612392613e-05, "loss": 0.07359997034072877, "memory(GiB)": 122.96, "step": 44875, "token_acc": 0.9754464285714286, "train_speed(iter/s)": 0.232895 }, { "epoch": 3.420992453693117, "grad_norm": 1.00221586227417, "learning_rate": 2.2654174813523327e-05, "loss": 0.05020250678062439, "memory(GiB)": 122.96, "step": 44880, "token_acc": 0.9820489012689569, "train_speed(iter/s)": 0.232902 }, { "epoch": 3.421373580303377, "grad_norm": 1.0710502862930298, "learning_rate": 2.2644151582810193e-05, "loss": 0.06331337690353393, "memory(GiB)": 122.96, "step": 44885, "token_acc": 0.9719488188976378, "train_speed(iter/s)": 0.232907 }, { "epoch": 3.421754706913637, "grad_norm": 0.7516010403633118, "learning_rate": 2.2634129920828023e-05, "loss": 0.0546191930770874, "memory(GiB)": 122.96, "step": 44890, "token_acc": 0.9761774868157846, "train_speed(iter/s)": 0.232912 }, { "epoch": 3.4221358335238965, "grad_norm": 1.692085862159729, "learning_rate": 2.2624109828151523e-05, "loss": 0.06684125661849975, "memory(GiB)": 122.96, "step": 44895, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.23292 }, { "epoch": 3.4225169601341565, "grad_norm": 1.6461995840072632, "learning_rate": 2.2614091305355272e-05, "loss": 0.049228453636169435, "memory(GiB)": 122.96, "step": 44900, "token_acc": 0.9806172374858864, "train_speed(iter/s)": 0.232925 }, { "epoch": 3.4228980867444165, "grad_norm": 1.7640323638916016, "learning_rate": 2.2604074353013793e-05, "loss": 0.06725164651870727, "memory(GiB)": 122.96, "step": 44905, "token_acc": 0.9743431221020092, "train_speed(iter/s)": 0.232933 }, { "epoch": 3.4232792133546766, "grad_norm": 2.2136282920837402, "learning_rate": 2.2594058971701536e-05, "loss": 0.056569886207580564, "memory(GiB)": 122.96, "step": 44910, "token_acc": 0.9832775919732442, "train_speed(iter/s)": 0.23294 }, { "epoch": 3.423660339964936, "grad_norm": 1.5948423147201538, "learning_rate": 2.2584045161992807e-05, "loss": 0.06861868500709534, "memory(GiB)": 122.96, "step": 44915, "token_acc": 0.9728621016093405, "train_speed(iter/s)": 0.232947 }, { "epoch": 3.424041466575196, "grad_norm": 1.0242313146591187, "learning_rate": 2.257403292446185e-05, "loss": 0.04585750699043274, "memory(GiB)": 122.96, "step": 44920, "token_acc": 0.982085732565579, "train_speed(iter/s)": 0.232954 }, { "epoch": 3.424422593185456, "grad_norm": 1.5753129720687866, "learning_rate": 2.256402225968286e-05, "loss": 0.05965622663497925, "memory(GiB)": 122.96, "step": 44925, "token_acc": 0.9817518248175182, "train_speed(iter/s)": 0.232959 }, { "epoch": 3.424803719795716, "grad_norm": 0.8447466492652893, "learning_rate": 2.255401316822986e-05, "loss": 0.036798608303070066, "memory(GiB)": 122.96, "step": 44930, "token_acc": 0.9867829021372329, "train_speed(iter/s)": 0.232965 }, { "epoch": 3.425184846405976, "grad_norm": 1.3980082273483276, "learning_rate": 2.254400565067683e-05, "loss": 0.07781851291656494, "memory(GiB)": 122.96, "step": 44935, "token_acc": 0.9718866171003717, "train_speed(iter/s)": 0.23297 }, { "epoch": 3.425565973016236, "grad_norm": 1.13558030128479, "learning_rate": 2.2533999707597686e-05, "loss": 0.050400960445404056, "memory(GiB)": 122.96, "step": 44940, "token_acc": 0.976544289044289, "train_speed(iter/s)": 0.232975 }, { "epoch": 3.425947099626496, "grad_norm": 2.0734434127807617, "learning_rate": 2.2523995339566184e-05, "loss": 0.07354960441589356, "memory(GiB)": 122.96, "step": 44945, "token_acc": 0.9775811209439528, "train_speed(iter/s)": 0.232979 }, { "epoch": 3.426328226236756, "grad_norm": 0.9540282487869263, "learning_rate": 2.251399254715605e-05, "loss": 0.0476239413022995, "memory(GiB)": 122.96, "step": 44950, "token_acc": 0.9750118990956688, "train_speed(iter/s)": 0.232985 }, { "epoch": 3.426709352847016, "grad_norm": 0.47220560908317566, "learning_rate": 2.2503991330940887e-05, "loss": 0.04844440817832947, "memory(GiB)": 122.96, "step": 44955, "token_acc": 0.9855232100708103, "train_speed(iter/s)": 0.232989 }, { "epoch": 3.427090479457276, "grad_norm": 1.7117893695831299, "learning_rate": 2.2493991691494222e-05, "loss": 0.04848337471485138, "memory(GiB)": 122.96, "step": 44960, "token_acc": 0.987051206592113, "train_speed(iter/s)": 0.232995 }, { "epoch": 3.4274716060675354, "grad_norm": 1.13459312915802, "learning_rate": 2.248399362938951e-05, "loss": 0.05712783336639404, "memory(GiB)": 122.96, "step": 44965, "token_acc": 0.9769277474195507, "train_speed(iter/s)": 0.232999 }, { "epoch": 3.4278527326777954, "grad_norm": 0.9561129808425903, "learning_rate": 2.247399714520006e-05, "loss": 0.05551947951316834, "memory(GiB)": 122.96, "step": 44970, "token_acc": 0.9778945064565551, "train_speed(iter/s)": 0.233005 }, { "epoch": 3.4282338592880555, "grad_norm": 0.9765750765800476, "learning_rate": 2.246400223949913e-05, "loss": 0.038078561425209045, "memory(GiB)": 122.96, "step": 44975, "token_acc": 0.9841746794871795, "train_speed(iter/s)": 0.233009 }, { "epoch": 3.4286149858983155, "grad_norm": 0.7561149597167969, "learning_rate": 2.2454008912859914e-05, "loss": 0.030833399295806883, "memory(GiB)": 122.96, "step": 44980, "token_acc": 0.9848688634835239, "train_speed(iter/s)": 0.233014 }, { "epoch": 3.4289961125085755, "grad_norm": 1.4561941623687744, "learning_rate": 2.2444017165855435e-05, "loss": 0.04865076541900635, "memory(GiB)": 122.96, "step": 44985, "token_acc": 0.9775570272259014, "train_speed(iter/s)": 0.233021 }, { "epoch": 3.4293772391188355, "grad_norm": 2.5909974575042725, "learning_rate": 2.24340269990587e-05, "loss": 0.05438899397850037, "memory(GiB)": 122.96, "step": 44990, "token_acc": 0.9785443836769037, "train_speed(iter/s)": 0.233029 }, { "epoch": 3.429758365729095, "grad_norm": 0.9542593359947205, "learning_rate": 2.2424038413042608e-05, "loss": 0.06821859478950501, "memory(GiB)": 122.96, "step": 44995, "token_acc": 0.9699556723033984, "train_speed(iter/s)": 0.233034 }, { "epoch": 3.430139492339355, "grad_norm": 2.1722631454467773, "learning_rate": 2.2414051408379933e-05, "loss": 0.05988548398017883, "memory(GiB)": 122.96, "step": 45000, "token_acc": 0.978806907378336, "train_speed(iter/s)": 0.233041 }, { "epoch": 3.430139492339355, "eval_loss": 0.06367901712656021, "eval_runtime": 222.2072, "eval_samples_per_second": 2.385, "eval_steps_per_second": 2.385, "eval_token_acc": 0.973872055900247, "step": 45000 }, { "epoch": 3.430520618949615, "grad_norm": 1.0729546546936035, "learning_rate": 2.240406598564339e-05, "loss": 0.05831748843193054, "memory(GiB)": 122.96, "step": 45005, "token_acc": 0.9740004401731348, "train_speed(iter/s)": 0.232778 }, { "epoch": 3.430901745559875, "grad_norm": 0.830093502998352, "learning_rate": 2.239408214540562e-05, "loss": 0.08225314617156983, "memory(GiB)": 122.96, "step": 45010, "token_acc": 0.973874862788145, "train_speed(iter/s)": 0.232784 }, { "epoch": 3.4312828721701347, "grad_norm": 0.7935507297515869, "learning_rate": 2.238409988823912e-05, "loss": 0.06296311020851135, "memory(GiB)": 122.96, "step": 45015, "token_acc": 0.9763644845280671, "train_speed(iter/s)": 0.232787 }, { "epoch": 3.4316639987803947, "grad_norm": 1.4880785942077637, "learning_rate": 2.2374119214716332e-05, "loss": 0.078467458486557, "memory(GiB)": 122.96, "step": 45020, "token_acc": 0.9752932368355378, "train_speed(iter/s)": 0.232794 }, { "epoch": 3.4320451253906548, "grad_norm": 1.2265498638153076, "learning_rate": 2.2364140125409626e-05, "loss": 0.049119746685028075, "memory(GiB)": 122.96, "step": 45025, "token_acc": 0.9826796735018913, "train_speed(iter/s)": 0.2328 }, { "epoch": 3.432426252000915, "grad_norm": 0.8574033379554749, "learning_rate": 2.2354162620891223e-05, "loss": 0.058259105682373045, "memory(GiB)": 122.96, "step": 45030, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.232806 }, { "epoch": 3.432807378611175, "grad_norm": 1.60665762424469, "learning_rate": 2.23441867017333e-05, "loss": 0.06164218187332153, "memory(GiB)": 122.96, "step": 45035, "token_acc": 0.9798792756539235, "train_speed(iter/s)": 0.23281 }, { "epoch": 3.4331885052214344, "grad_norm": 0.6134791374206543, "learning_rate": 2.2334212368507945e-05, "loss": 0.03807723224163055, "memory(GiB)": 122.96, "step": 45040, "token_acc": 0.980641164627536, "train_speed(iter/s)": 0.232814 }, { "epoch": 3.4335696318316944, "grad_norm": 1.401603102684021, "learning_rate": 2.2324239621787112e-05, "loss": 0.049556410312652587, "memory(GiB)": 122.96, "step": 45045, "token_acc": 0.9817927170868347, "train_speed(iter/s)": 0.232822 }, { "epoch": 3.4339507584419544, "grad_norm": 0.8441397547721863, "learning_rate": 2.2314268462142724e-05, "loss": 0.05769633650779724, "memory(GiB)": 122.96, "step": 45050, "token_acc": 0.9779836015791072, "train_speed(iter/s)": 0.232824 }, { "epoch": 3.4343318850522144, "grad_norm": 0.9169077277183533, "learning_rate": 2.2304298890146542e-05, "loss": 0.05947350859642029, "memory(GiB)": 122.96, "step": 45055, "token_acc": 0.9789661319073084, "train_speed(iter/s)": 0.232832 }, { "epoch": 3.4347130116624744, "grad_norm": 1.506417989730835, "learning_rate": 2.2294330906370292e-05, "loss": 0.07941017746925354, "memory(GiB)": 122.96, "step": 45060, "token_acc": 0.9668008048289738, "train_speed(iter/s)": 0.23284 }, { "epoch": 3.435094138272734, "grad_norm": 1.879273533821106, "learning_rate": 2.228436451138562e-05, "loss": 0.07426203489303589, "memory(GiB)": 122.96, "step": 45065, "token_acc": 0.9754385964912281, "train_speed(iter/s)": 0.232846 }, { "epoch": 3.435475264882994, "grad_norm": 0.7355198860168457, "learning_rate": 2.2274399705764005e-05, "loss": 0.06896369457244873, "memory(GiB)": 122.96, "step": 45070, "token_acc": 0.9730406097144807, "train_speed(iter/s)": 0.232847 }, { "epoch": 3.435856391493254, "grad_norm": 1.0533682107925415, "learning_rate": 2.22644364900769e-05, "loss": 0.07325604557991028, "memory(GiB)": 122.96, "step": 45075, "token_acc": 0.9702244617498855, "train_speed(iter/s)": 0.232853 }, { "epoch": 3.436237518103514, "grad_norm": 0.8070945739746094, "learning_rate": 2.225447486489568e-05, "loss": 0.05115926265716553, "memory(GiB)": 122.96, "step": 45080, "token_acc": 0.9828240824444042, "train_speed(iter/s)": 0.232857 }, { "epoch": 3.436618644713774, "grad_norm": 0.9233098030090332, "learning_rate": 2.2244514830791546e-05, "loss": 0.047272658348083495, "memory(GiB)": 122.96, "step": 45085, "token_acc": 0.9838398813936249, "train_speed(iter/s)": 0.23286 }, { "epoch": 3.4369997713240337, "grad_norm": 1.5134053230285645, "learning_rate": 2.2234556388335694e-05, "loss": 0.06077989935874939, "memory(GiB)": 122.96, "step": 45090, "token_acc": 0.9765347238682153, "train_speed(iter/s)": 0.232866 }, { "epoch": 3.4373808979342937, "grad_norm": 1.0293810367584229, "learning_rate": 2.222459953809918e-05, "loss": 0.061869841814041135, "memory(GiB)": 122.96, "step": 45095, "token_acc": 0.9816687737041719, "train_speed(iter/s)": 0.232873 }, { "epoch": 3.4377620245445537, "grad_norm": 0.07360219210386276, "learning_rate": 2.2214644280652986e-05, "loss": 0.03280588984489441, "memory(GiB)": 122.96, "step": 45100, "token_acc": 0.9848966613672496, "train_speed(iter/s)": 0.232877 }, { "epoch": 3.4381431511548137, "grad_norm": 3.419821262359619, "learning_rate": 2.2204690616568025e-05, "loss": 0.061108851432800294, "memory(GiB)": 122.96, "step": 45105, "token_acc": 0.9719311377245509, "train_speed(iter/s)": 0.232885 }, { "epoch": 3.4385242777650737, "grad_norm": 0.939553439617157, "learning_rate": 2.219473854641505e-05, "loss": 0.07432513833045959, "memory(GiB)": 122.96, "step": 45110, "token_acc": 0.970640344216654, "train_speed(iter/s)": 0.232892 }, { "epoch": 3.4389054043753333, "grad_norm": 0.8780707716941833, "learning_rate": 2.218478807076479e-05, "loss": 0.05283675193786621, "memory(GiB)": 122.96, "step": 45115, "token_acc": 0.9798251513113652, "train_speed(iter/s)": 0.232896 }, { "epoch": 3.4392865309855933, "grad_norm": 1.3269312381744385, "learning_rate": 2.2174839190187875e-05, "loss": 0.04531490206718445, "memory(GiB)": 122.96, "step": 45120, "token_acc": 0.9808641232398604, "train_speed(iter/s)": 0.232899 }, { "epoch": 3.4396676575958534, "grad_norm": 1.7130590677261353, "learning_rate": 2.2164891905254787e-05, "loss": 0.036972776055336, "memory(GiB)": 122.96, "step": 45125, "token_acc": 0.9843096234309623, "train_speed(iter/s)": 0.232906 }, { "epoch": 3.4400487842061134, "grad_norm": 0.14971446990966797, "learning_rate": 2.2154946216535976e-05, "loss": 0.05062277317047119, "memory(GiB)": 122.96, "step": 45130, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.232909 }, { "epoch": 3.4404299108163734, "grad_norm": 1.5663849115371704, "learning_rate": 2.2145002124601804e-05, "loss": 0.06038525104522705, "memory(GiB)": 122.96, "step": 45135, "token_acc": 0.9653164556962025, "train_speed(iter/s)": 0.232916 }, { "epoch": 3.440811037426633, "grad_norm": 1.0063234567642212, "learning_rate": 2.213505963002248e-05, "loss": 0.07549288272857665, "memory(GiB)": 122.96, "step": 45140, "token_acc": 0.9703021882598124, "train_speed(iter/s)": 0.232919 }, { "epoch": 3.441192164036893, "grad_norm": 1.3275728225708008, "learning_rate": 2.212511873336818e-05, "loss": 0.05870128870010376, "memory(GiB)": 122.96, "step": 45145, "token_acc": 0.979000884173298, "train_speed(iter/s)": 0.232925 }, { "epoch": 3.441573290647153, "grad_norm": 3.0689945220947266, "learning_rate": 2.2115179435208978e-05, "loss": 0.07459087371826172, "memory(GiB)": 122.96, "step": 45150, "token_acc": 0.9666075650118203, "train_speed(iter/s)": 0.232933 }, { "epoch": 3.441954417257413, "grad_norm": 1.1504340171813965, "learning_rate": 2.210524173611481e-05, "loss": 0.05880054831504822, "memory(GiB)": 122.96, "step": 45155, "token_acc": 0.9732388823297914, "train_speed(iter/s)": 0.232941 }, { "epoch": 3.4423355438676726, "grad_norm": 1.1274086236953735, "learning_rate": 2.2095305636655593e-05, "loss": 0.07028623819351196, "memory(GiB)": 122.96, "step": 45160, "token_acc": 0.9726832133940373, "train_speed(iter/s)": 0.232943 }, { "epoch": 3.4427166704779326, "grad_norm": 1.664359211921692, "learning_rate": 2.208537113740112e-05, "loss": 0.03891247510910034, "memory(GiB)": 122.96, "step": 45165, "token_acc": 0.9859284890426759, "train_speed(iter/s)": 0.23295 }, { "epoch": 3.4430977970881926, "grad_norm": 1.985236644744873, "learning_rate": 2.2075438238921048e-05, "loss": 0.06343857645988464, "memory(GiB)": 122.96, "step": 45170, "token_acc": 0.974569536423841, "train_speed(iter/s)": 0.232957 }, { "epoch": 3.4434789236984527, "grad_norm": 0.987095832824707, "learning_rate": 2.2065506941785008e-05, "loss": 0.056367266178131106, "memory(GiB)": 122.96, "step": 45175, "token_acc": 0.9805302402651201, "train_speed(iter/s)": 0.232962 }, { "epoch": 3.4438600503087127, "grad_norm": 1.616195797920227, "learning_rate": 2.2055577246562536e-05, "loss": 0.07741478681564332, "memory(GiB)": 122.96, "step": 45180, "token_acc": 0.972812781858008, "train_speed(iter/s)": 0.232965 }, { "epoch": 3.4442411769189727, "grad_norm": 1.8417623043060303, "learning_rate": 2.204564915382301e-05, "loss": 0.06621620059013367, "memory(GiB)": 122.96, "step": 45185, "token_acc": 0.9794893861158921, "train_speed(iter/s)": 0.232968 }, { "epoch": 3.4446223035292323, "grad_norm": 1.9196538925170898, "learning_rate": 2.2035722664135783e-05, "loss": 0.04340148270130158, "memory(GiB)": 122.96, "step": 45190, "token_acc": 0.9822981366459628, "train_speed(iter/s)": 0.232974 }, { "epoch": 3.4450034301394923, "grad_norm": 2.3200559616088867, "learning_rate": 2.2025797778070107e-05, "loss": 0.05473848581314087, "memory(GiB)": 122.96, "step": 45195, "token_acc": 0.9822411649795774, "train_speed(iter/s)": 0.232978 }, { "epoch": 3.4453845567497523, "grad_norm": 0.5618265867233276, "learning_rate": 2.2015874496195095e-05, "loss": 0.0591754674911499, "memory(GiB)": 122.96, "step": 45200, "token_acc": 0.9814015277316506, "train_speed(iter/s)": 0.232981 }, { "epoch": 3.4453845567497523, "eval_loss": 0.06219211965799332, "eval_runtime": 221.2737, "eval_samples_per_second": 2.395, "eval_steps_per_second": 2.395, "eval_token_acc": 0.974391602915487, "step": 45200 }, { "epoch": 3.4457656833600123, "grad_norm": 0.8313685655593872, "learning_rate": 2.2005952819079818e-05, "loss": 0.043881377577781676, "memory(GiB)": 122.96, "step": 45205, "token_acc": 0.9745452156393261, "train_speed(iter/s)": 0.232721 }, { "epoch": 3.446146809970272, "grad_norm": 0.4827812612056732, "learning_rate": 2.199603274729326e-05, "loss": 0.07583792805671692, "memory(GiB)": 122.96, "step": 45210, "token_acc": 0.9749754661432777, "train_speed(iter/s)": 0.232724 }, { "epoch": 3.446527936580532, "grad_norm": 1.0161316394805908, "learning_rate": 2.1986114281404248e-05, "loss": 0.050187486410140994, "memory(GiB)": 122.96, "step": 45215, "token_acc": 0.9791955617198336, "train_speed(iter/s)": 0.23273 }, { "epoch": 3.446909063190792, "grad_norm": 1.4389375448226929, "learning_rate": 2.19761974219816e-05, "loss": 0.05433968901634216, "memory(GiB)": 122.96, "step": 45220, "token_acc": 0.9793200689331035, "train_speed(iter/s)": 0.232736 }, { "epoch": 3.447290189801052, "grad_norm": 1.0110160112380981, "learning_rate": 2.196628216959395e-05, "loss": 0.06922097206115722, "memory(GiB)": 122.96, "step": 45225, "token_acc": 0.9723994894703255, "train_speed(iter/s)": 0.232741 }, { "epoch": 3.447671316411312, "grad_norm": 1.3081225156784058, "learning_rate": 2.195636852480994e-05, "loss": 0.05955634117126465, "memory(GiB)": 122.96, "step": 45230, "token_acc": 0.981827111984283, "train_speed(iter/s)": 0.232749 }, { "epoch": 3.448052443021572, "grad_norm": 1.3090782165527344, "learning_rate": 2.1946456488198075e-05, "loss": 0.06339813470840454, "memory(GiB)": 122.96, "step": 45235, "token_acc": 0.9752304147465438, "train_speed(iter/s)": 0.232758 }, { "epoch": 3.4484335696318316, "grad_norm": 1.515677571296692, "learning_rate": 2.1936546060326728e-05, "loss": 0.06343318223953247, "memory(GiB)": 122.96, "step": 45240, "token_acc": 0.9739450580572075, "train_speed(iter/s)": 0.232766 }, { "epoch": 3.4488146962420916, "grad_norm": 1.0709668397903442, "learning_rate": 2.1926637241764236e-05, "loss": 0.03897995352745056, "memory(GiB)": 122.96, "step": 45245, "token_acc": 0.979509119567665, "train_speed(iter/s)": 0.232771 }, { "epoch": 3.4491958228523516, "grad_norm": 1.2891544103622437, "learning_rate": 2.191673003307884e-05, "loss": 0.06364188194274903, "memory(GiB)": 122.96, "step": 45250, "token_acc": 0.9677320221266134, "train_speed(iter/s)": 0.232779 }, { "epoch": 3.4495769494626116, "grad_norm": 1.6605703830718994, "learning_rate": 2.1906824434838635e-05, "loss": 0.06384528875350952, "memory(GiB)": 122.96, "step": 45255, "token_acc": 0.9762057877813505, "train_speed(iter/s)": 0.232782 }, { "epoch": 3.449958076072871, "grad_norm": 1.2980294227600098, "learning_rate": 2.1896920447611696e-05, "loss": 0.06739249229431152, "memory(GiB)": 122.96, "step": 45260, "token_acc": 0.9699907663896583, "train_speed(iter/s)": 0.232791 }, { "epoch": 3.450339202683131, "grad_norm": 2.117784023284912, "learning_rate": 2.1887018071965944e-05, "loss": 0.06596599817276001, "memory(GiB)": 122.96, "step": 45265, "token_acc": 0.9740345327533939, "train_speed(iter/s)": 0.232794 }, { "epoch": 3.4507203292933912, "grad_norm": 0.9883667230606079, "learning_rate": 2.187711730846924e-05, "loss": 0.03947869241237641, "memory(GiB)": 122.96, "step": 45270, "token_acc": 0.9853589196872778, "train_speed(iter/s)": 0.232796 }, { "epoch": 3.4511014559036512, "grad_norm": 1.3558802604675293, "learning_rate": 2.186721815768937e-05, "loss": 0.0524524450302124, "memory(GiB)": 122.96, "step": 45275, "token_acc": 0.9790655339805825, "train_speed(iter/s)": 0.232803 }, { "epoch": 3.4514825825139113, "grad_norm": 0.8608250021934509, "learning_rate": 2.1857320620193973e-05, "loss": 0.08758134245872498, "memory(GiB)": 122.96, "step": 45280, "token_acc": 0.9680032401782098, "train_speed(iter/s)": 0.232811 }, { "epoch": 3.4518637091241713, "grad_norm": 0.9818025231361389, "learning_rate": 2.1847424696550635e-05, "loss": 0.06350049376487732, "memory(GiB)": 122.96, "step": 45285, "token_acc": 0.9799670044779637, "train_speed(iter/s)": 0.232816 }, { "epoch": 3.452244835734431, "grad_norm": 1.0767451524734497, "learning_rate": 2.1837530387326867e-05, "loss": 0.06039189696311951, "memory(GiB)": 122.96, "step": 45290, "token_acc": 0.9720394736842105, "train_speed(iter/s)": 0.232824 }, { "epoch": 3.452625962344691, "grad_norm": 1.5162702798843384, "learning_rate": 2.1827637693090024e-05, "loss": 0.055663669109344484, "memory(GiB)": 122.96, "step": 45295, "token_acc": 0.9634936881610372, "train_speed(iter/s)": 0.232831 }, { "epoch": 3.453007088954951, "grad_norm": 1.8333370685577393, "learning_rate": 2.1817746614407426e-05, "loss": 0.06623818278312683, "memory(GiB)": 122.96, "step": 45300, "token_acc": 0.9801728520589731, "train_speed(iter/s)": 0.232837 }, { "epoch": 3.453388215565211, "grad_norm": 1.1191003322601318, "learning_rate": 2.180785715184629e-05, "loss": 0.08530397415161133, "memory(GiB)": 122.96, "step": 45305, "token_acc": 0.9635719706551986, "train_speed(iter/s)": 0.232843 }, { "epoch": 3.4537693421754705, "grad_norm": 1.371564269065857, "learning_rate": 2.1797969305973704e-05, "loss": 0.04689441025257111, "memory(GiB)": 122.96, "step": 45310, "token_acc": 0.9837513246202755, "train_speed(iter/s)": 0.232851 }, { "epoch": 3.4541504687857305, "grad_norm": 0.9433755278587341, "learning_rate": 2.178808307735671e-05, "loss": 0.05835524797439575, "memory(GiB)": 122.96, "step": 45315, "token_acc": 0.9711067580803134, "train_speed(iter/s)": 0.232854 }, { "epoch": 3.4545315953959905, "grad_norm": 0.9724857807159424, "learning_rate": 2.1778198466562243e-05, "loss": 0.040848612785339355, "memory(GiB)": 122.96, "step": 45320, "token_acc": 0.985, "train_speed(iter/s)": 0.232859 }, { "epoch": 3.4549127220062505, "grad_norm": 2.2537782192230225, "learning_rate": 2.1768315474157115e-05, "loss": 0.05940612554550171, "memory(GiB)": 122.96, "step": 45325, "token_acc": 0.9756711409395973, "train_speed(iter/s)": 0.232867 }, { "epoch": 3.4552938486165106, "grad_norm": 1.354131817817688, "learning_rate": 2.1758434100708082e-05, "loss": 0.06856381893157959, "memory(GiB)": 122.96, "step": 45330, "token_acc": 0.9735491512041058, "train_speed(iter/s)": 0.232876 }, { "epoch": 3.45567497522677, "grad_norm": 1.0650523900985718, "learning_rate": 2.174855434678181e-05, "loss": 0.06634034514427185, "memory(GiB)": 122.96, "step": 45335, "token_acc": 0.9688888888888889, "train_speed(iter/s)": 0.232883 }, { "epoch": 3.45605610183703, "grad_norm": 0.8153066039085388, "learning_rate": 2.1738676212944832e-05, "loss": 0.035853144526481626, "memory(GiB)": 122.96, "step": 45340, "token_acc": 0.9855813953488373, "train_speed(iter/s)": 0.232889 }, { "epoch": 3.45643722844729, "grad_norm": 0.632796585559845, "learning_rate": 2.172879969976362e-05, "loss": 0.04423660635948181, "memory(GiB)": 122.96, "step": 45345, "token_acc": 0.9831600831600832, "train_speed(iter/s)": 0.232894 }, { "epoch": 3.45681835505755, "grad_norm": 0.5279809832572937, "learning_rate": 2.171892480780457e-05, "loss": 0.06785483360290527, "memory(GiB)": 122.96, "step": 45350, "token_acc": 0.9761464807660003, "train_speed(iter/s)": 0.232898 }, { "epoch": 3.45719948166781, "grad_norm": 1.1810177564620972, "learning_rate": 2.1709051537633927e-05, "loss": 0.062383002042770384, "memory(GiB)": 122.96, "step": 45355, "token_acc": 0.9754651737197432, "train_speed(iter/s)": 0.232902 }, { "epoch": 3.45758060827807, "grad_norm": 0.7604886293411255, "learning_rate": 2.169917988981789e-05, "loss": 0.049132540822029114, "memory(GiB)": 122.96, "step": 45360, "token_acc": 0.9794786466999446, "train_speed(iter/s)": 0.232908 }, { "epoch": 3.45796173488833, "grad_norm": 0.7514955401420593, "learning_rate": 2.168930986492255e-05, "loss": 0.03613582849502563, "memory(GiB)": 122.96, "step": 45365, "token_acc": 0.9844585561497327, "train_speed(iter/s)": 0.23291 }, { "epoch": 3.45834286149859, "grad_norm": 0.9083470106124878, "learning_rate": 2.167944146351392e-05, "loss": 0.07789779901504516, "memory(GiB)": 122.96, "step": 45370, "token_acc": 0.9728201782996303, "train_speed(iter/s)": 0.232914 }, { "epoch": 3.45872398810885, "grad_norm": 0.6812936067581177, "learning_rate": 2.1669574686157913e-05, "loss": 0.03419223427772522, "memory(GiB)": 122.96, "step": 45375, "token_acc": 0.9835325365205844, "train_speed(iter/s)": 0.232915 }, { "epoch": 3.45910511471911, "grad_norm": 0.8549937009811401, "learning_rate": 2.165970953342031e-05, "loss": 0.0678126037120819, "memory(GiB)": 122.96, "step": 45380, "token_acc": 0.9752057717711615, "train_speed(iter/s)": 0.232915 }, { "epoch": 3.4594862413293694, "grad_norm": 1.6441999673843384, "learning_rate": 2.164984600586685e-05, "loss": 0.0690489649772644, "memory(GiB)": 122.96, "step": 45385, "token_acc": 0.9755694948827996, "train_speed(iter/s)": 0.23292 }, { "epoch": 3.4598673679396295, "grad_norm": 0.9395938515663147, "learning_rate": 2.1639984104063184e-05, "loss": 0.07576008439064026, "memory(GiB)": 122.96, "step": 45390, "token_acc": 0.9674074074074074, "train_speed(iter/s)": 0.232928 }, { "epoch": 3.4602484945498895, "grad_norm": 0.5714975595474243, "learning_rate": 2.16301238285748e-05, "loss": 0.040801575779914855, "memory(GiB)": 122.96, "step": 45395, "token_acc": 0.9776018861569552, "train_speed(iter/s)": 0.232934 }, { "epoch": 3.4606296211601495, "grad_norm": 1.669095754623413, "learning_rate": 2.1620265179967157e-05, "loss": 0.061686772108078006, "memory(GiB)": 122.96, "step": 45400, "token_acc": 0.9794776119402985, "train_speed(iter/s)": 0.232939 }, { "epoch": 3.4606296211601495, "eval_loss": 0.06145572289824486, "eval_runtime": 220.7587, "eval_samples_per_second": 2.401, "eval_steps_per_second": 2.401, "eval_token_acc": 0.9746250225890006, "step": 45400 }, { "epoch": 3.4610107477704095, "grad_norm": 0.871698260307312, "learning_rate": 2.1610408158805634e-05, "loss": 0.05769317746162415, "memory(GiB)": 122.96, "step": 45405, "token_acc": 0.974868738880773, "train_speed(iter/s)": 0.232681 }, { "epoch": 3.461391874380669, "grad_norm": 0.8524215817451477, "learning_rate": 2.160055276565544e-05, "loss": 0.05424030423164368, "memory(GiB)": 122.96, "step": 45410, "token_acc": 0.9814251401120897, "train_speed(iter/s)": 0.232684 }, { "epoch": 3.461773000990929, "grad_norm": 0.7810723185539246, "learning_rate": 2.1590699001081753e-05, "loss": 0.06508281230926513, "memory(GiB)": 122.96, "step": 45415, "token_acc": 0.9786971830985915, "train_speed(iter/s)": 0.232689 }, { "epoch": 3.462154127601189, "grad_norm": 1.4505157470703125, "learning_rate": 2.1580846865649662e-05, "loss": 0.0689714014530182, "memory(GiB)": 122.96, "step": 45420, "token_acc": 0.9701103309929789, "train_speed(iter/s)": 0.232694 }, { "epoch": 3.462535254211449, "grad_norm": 1.13093900680542, "learning_rate": 2.1570996359924106e-05, "loss": 0.06275312900543213, "memory(GiB)": 122.96, "step": 45425, "token_acc": 0.9775943396226415, "train_speed(iter/s)": 0.2327 }, { "epoch": 3.462916380821709, "grad_norm": 0.9029223322868347, "learning_rate": 2.156114748446998e-05, "loss": 0.05117926597595215, "memory(GiB)": 122.96, "step": 45430, "token_acc": 0.9761653659565492, "train_speed(iter/s)": 0.232706 }, { "epoch": 3.4632975074319687, "grad_norm": 1.1034107208251953, "learning_rate": 2.1551300239852095e-05, "loss": 0.06647626161575318, "memory(GiB)": 122.96, "step": 45435, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.232715 }, { "epoch": 3.4636786340422288, "grad_norm": 2.064685583114624, "learning_rate": 2.1541454626635098e-05, "loss": 0.058794498443603516, "memory(GiB)": 122.96, "step": 45440, "token_acc": 0.971644264408028, "train_speed(iter/s)": 0.23272 }, { "epoch": 3.4640597606524888, "grad_norm": 0.7677899599075317, "learning_rate": 2.153161064538364e-05, "loss": 0.031951296329498294, "memory(GiB)": 122.96, "step": 45445, "token_acc": 0.9867054424594931, "train_speed(iter/s)": 0.232726 }, { "epoch": 3.464440887262749, "grad_norm": 0.7323102355003357, "learning_rate": 2.152176829666218e-05, "loss": 0.052821391820907594, "memory(GiB)": 122.96, "step": 45450, "token_acc": 0.9757952097326068, "train_speed(iter/s)": 0.232731 }, { "epoch": 3.4648220138730084, "grad_norm": 1.2035472393035889, "learning_rate": 2.1511927581035153e-05, "loss": 0.07341600060462952, "memory(GiB)": 122.96, "step": 45455, "token_acc": 0.973542041834049, "train_speed(iter/s)": 0.232733 }, { "epoch": 3.4652031404832684, "grad_norm": 1.3000563383102417, "learning_rate": 2.1502088499066896e-05, "loss": 0.05518950223922729, "memory(GiB)": 122.96, "step": 45460, "token_acc": 0.9774960380348653, "train_speed(iter/s)": 0.232738 }, { "epoch": 3.4655842670935284, "grad_norm": 1.2549231052398682, "learning_rate": 2.1492251051321598e-05, "loss": 0.11701295375823975, "memory(GiB)": 122.96, "step": 45465, "token_acc": 0.9611846533542742, "train_speed(iter/s)": 0.232745 }, { "epoch": 3.4659653937037884, "grad_norm": 1.00435471534729, "learning_rate": 2.1482415238363417e-05, "loss": 0.06862907409667969, "memory(GiB)": 122.96, "step": 45470, "token_acc": 0.9718161618524149, "train_speed(iter/s)": 0.232745 }, { "epoch": 3.4663465203140484, "grad_norm": 0.9796452522277832, "learning_rate": 2.1472581060756396e-05, "loss": 0.04723505973815918, "memory(GiB)": 122.96, "step": 45475, "token_acc": 0.9796468123316372, "train_speed(iter/s)": 0.232748 }, { "epoch": 3.4667276469243085, "grad_norm": 0.9445042610168457, "learning_rate": 2.146274851906445e-05, "loss": 0.06765291094779968, "memory(GiB)": 122.96, "step": 45480, "token_acc": 0.9749113716934824, "train_speed(iter/s)": 0.232756 }, { "epoch": 3.467108773534568, "grad_norm": 0.6449015140533447, "learning_rate": 2.1452917613851454e-05, "loss": 0.06539470553398133, "memory(GiB)": 122.96, "step": 45485, "token_acc": 0.9767590618336887, "train_speed(iter/s)": 0.23276 }, { "epoch": 3.467489900144828, "grad_norm": 0.6671376824378967, "learning_rate": 2.144308834568115e-05, "loss": 0.05540565252304077, "memory(GiB)": 122.96, "step": 45490, "token_acc": 0.9736456808199122, "train_speed(iter/s)": 0.232765 }, { "epoch": 3.467871026755088, "grad_norm": 1.652592658996582, "learning_rate": 2.143326071511721e-05, "loss": 0.055988460779190063, "memory(GiB)": 122.96, "step": 45495, "token_acc": 0.9821383647798743, "train_speed(iter/s)": 0.232773 }, { "epoch": 3.468252153365348, "grad_norm": 0.8878817558288574, "learning_rate": 2.1423434722723224e-05, "loss": 0.04629412889480591, "memory(GiB)": 122.96, "step": 45500, "token_acc": 0.9782041242495432, "train_speed(iter/s)": 0.232774 }, { "epoch": 3.4686332799756077, "grad_norm": 0.8809770345687866, "learning_rate": 2.1413610369062625e-05, "loss": 0.042269963026046756, "memory(GiB)": 122.96, "step": 45505, "token_acc": 0.9791579824927052, "train_speed(iter/s)": 0.23278 }, { "epoch": 3.4690144065858677, "grad_norm": 0.8297970294952393, "learning_rate": 2.1403787654698813e-05, "loss": 0.06449523568153381, "memory(GiB)": 122.96, "step": 45510, "token_acc": 0.9739361702127659, "train_speed(iter/s)": 0.232787 }, { "epoch": 3.4693955331961277, "grad_norm": 2.358611822128296, "learning_rate": 2.1393966580195095e-05, "loss": 0.10312068462371826, "memory(GiB)": 122.96, "step": 45515, "token_acc": 0.9580301235403622, "train_speed(iter/s)": 0.232791 }, { "epoch": 3.4697766598063877, "grad_norm": 0.45927196741104126, "learning_rate": 2.138414714611463e-05, "loss": 0.047816476225852965, "memory(GiB)": 122.96, "step": 45520, "token_acc": 0.9799700702198688, "train_speed(iter/s)": 0.232792 }, { "epoch": 3.4701577864166477, "grad_norm": 0.9601935148239136, "learning_rate": 2.1374329353020533e-05, "loss": 0.04691123068332672, "memory(GiB)": 122.96, "step": 45525, "token_acc": 0.9864018994172242, "train_speed(iter/s)": 0.232797 }, { "epoch": 3.4705389130269078, "grad_norm": 1.2984322309494019, "learning_rate": 2.1364513201475823e-05, "loss": 0.07405023574829102, "memory(GiB)": 122.96, "step": 45530, "token_acc": 0.9706441029183215, "train_speed(iter/s)": 0.232802 }, { "epoch": 3.4709200396371673, "grad_norm": 1.2576429843902588, "learning_rate": 2.135469869204338e-05, "loss": 0.046726527810096743, "memory(GiB)": 122.96, "step": 45535, "token_acc": 0.9778573754477369, "train_speed(iter/s)": 0.232808 }, { "epoch": 3.4713011662474273, "grad_norm": 0.718582272529602, "learning_rate": 2.134488582528604e-05, "loss": 0.05680583715438843, "memory(GiB)": 122.96, "step": 45540, "token_acc": 0.9753937007874016, "train_speed(iter/s)": 0.232814 }, { "epoch": 3.4716822928576874, "grad_norm": 0.7944990396499634, "learning_rate": 2.133507460176653e-05, "loss": 0.06901566982269287, "memory(GiB)": 122.96, "step": 45545, "token_acc": 0.9793771482137278, "train_speed(iter/s)": 0.232814 }, { "epoch": 3.4720634194679474, "grad_norm": 0.8480201959609985, "learning_rate": 2.132526502204746e-05, "loss": 0.04543834924697876, "memory(GiB)": 122.96, "step": 45550, "token_acc": 0.982368832646097, "train_speed(iter/s)": 0.232818 }, { "epoch": 3.472444546078207, "grad_norm": 0.5613117814064026, "learning_rate": 2.131545708669137e-05, "loss": 0.054096716642379764, "memory(GiB)": 122.96, "step": 45555, "token_acc": 0.9741206030150754, "train_speed(iter/s)": 0.232823 }, { "epoch": 3.472825672688467, "grad_norm": 0.5866463780403137, "learning_rate": 2.1305650796260723e-05, "loss": 0.040168476104736325, "memory(GiB)": 122.96, "step": 45560, "token_acc": 0.9802405498281787, "train_speed(iter/s)": 0.232829 }, { "epoch": 3.473206799298727, "grad_norm": 0.7897183299064636, "learning_rate": 2.1295846151317828e-05, "loss": 0.0351487398147583, "memory(GiB)": 122.96, "step": 45565, "token_acc": 0.9848377997179125, "train_speed(iter/s)": 0.232834 }, { "epoch": 3.473587925908987, "grad_norm": 0.7294285893440247, "learning_rate": 2.128604315242495e-05, "loss": 0.04257303774356842, "memory(GiB)": 122.96, "step": 45570, "token_acc": 0.9800705467372134, "train_speed(iter/s)": 0.232839 }, { "epoch": 3.473969052519247, "grad_norm": 1.8372489213943481, "learning_rate": 2.127624180014427e-05, "loss": 0.05803642868995666, "memory(GiB)": 122.96, "step": 45575, "token_acc": 0.9778129952456418, "train_speed(iter/s)": 0.232842 }, { "epoch": 3.474350179129507, "grad_norm": 1.463348388671875, "learning_rate": 2.126644209503781e-05, "loss": 0.05881203413009643, "memory(GiB)": 122.96, "step": 45580, "token_acc": 0.9764832793959007, "train_speed(iter/s)": 0.232848 }, { "epoch": 3.4747313057397666, "grad_norm": 2.5429534912109375, "learning_rate": 2.125664403766755e-05, "loss": 0.04949098229408264, "memory(GiB)": 122.96, "step": 45585, "token_acc": 0.9859985261606485, "train_speed(iter/s)": 0.232853 }, { "epoch": 3.4751124323500266, "grad_norm": 1.1219232082366943, "learning_rate": 2.124684762859539e-05, "loss": 0.06389384865760803, "memory(GiB)": 122.96, "step": 45590, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.232859 }, { "epoch": 3.4754935589602867, "grad_norm": 1.3053443431854248, "learning_rate": 2.1237052868383072e-05, "loss": 0.07849932312965394, "memory(GiB)": 122.96, "step": 45595, "token_acc": 0.9697041420118343, "train_speed(iter/s)": 0.232866 }, { "epoch": 3.4758746855705467, "grad_norm": 0.9387605786323547, "learning_rate": 2.1227259757592293e-05, "loss": 0.06763590574264526, "memory(GiB)": 122.96, "step": 45600, "token_acc": 0.9749869723814487, "train_speed(iter/s)": 0.232873 }, { "epoch": 3.4758746855705467, "eval_loss": 0.06129448488354683, "eval_runtime": 222.3326, "eval_samples_per_second": 2.384, "eval_steps_per_second": 2.384, "eval_token_acc": 0.974896090596952, "step": 45600 }, { "epoch": 3.4762558121808063, "grad_norm": 0.5404269099235535, "learning_rate": 2.1217468296784665e-05, "loss": 0.0323899507522583, "memory(GiB)": 122.96, "step": 45605, "token_acc": 0.9753155842002437, "train_speed(iter/s)": 0.232612 }, { "epoch": 3.4766369387910663, "grad_norm": 1.279731273651123, "learning_rate": 2.1207678486521644e-05, "loss": 0.05108104348182678, "memory(GiB)": 122.96, "step": 45610, "token_acc": 0.9772526891200847, "train_speed(iter/s)": 0.232618 }, { "epoch": 3.4770180654013263, "grad_norm": 2.9462409019470215, "learning_rate": 2.1197890327364666e-05, "loss": 0.07645491361618043, "memory(GiB)": 122.96, "step": 45615, "token_acc": 0.9725274725274725, "train_speed(iter/s)": 0.232625 }, { "epoch": 3.4773991920115863, "grad_norm": 1.1436007022857666, "learning_rate": 2.1188103819875004e-05, "loss": 0.05076541900634766, "memory(GiB)": 122.96, "step": 45620, "token_acc": 0.9736748488082533, "train_speed(iter/s)": 0.232633 }, { "epoch": 3.4777803186218463, "grad_norm": 0.48035794496536255, "learning_rate": 2.1178318964613862e-05, "loss": 0.06473852396011352, "memory(GiB)": 122.96, "step": 45625, "token_acc": 0.9764434643143545, "train_speed(iter/s)": 0.232633 }, { "epoch": 3.4781614452321064, "grad_norm": 0.05489281192421913, "learning_rate": 2.1168535762142422e-05, "loss": 0.048590391874313354, "memory(GiB)": 122.96, "step": 45630, "token_acc": 0.9784665579119086, "train_speed(iter/s)": 0.23264 }, { "epoch": 3.478542571842366, "grad_norm": 1.382334589958191, "learning_rate": 2.1158754213021643e-05, "loss": 0.060788053274154666, "memory(GiB)": 122.96, "step": 45635, "token_acc": 0.9698204036289576, "train_speed(iter/s)": 0.232644 }, { "epoch": 3.478923698452626, "grad_norm": 1.0205919742584229, "learning_rate": 2.1148974317812463e-05, "loss": 0.06498830914497375, "memory(GiB)": 122.96, "step": 45640, "token_acc": 0.9651810584958217, "train_speed(iter/s)": 0.232651 }, { "epoch": 3.479304825062886, "grad_norm": 0.428172767162323, "learning_rate": 2.113919607707574e-05, "loss": 0.04394850730895996, "memory(GiB)": 122.96, "step": 45645, "token_acc": 0.9830981547526748, "train_speed(iter/s)": 0.232655 }, { "epoch": 3.479685951673146, "grad_norm": 1.0690404176712036, "learning_rate": 2.1129419491372178e-05, "loss": 0.05586216449737549, "memory(GiB)": 122.96, "step": 45650, "token_acc": 0.9781503107929931, "train_speed(iter/s)": 0.232659 }, { "epoch": 3.4800670782834056, "grad_norm": 0.6871334910392761, "learning_rate": 2.1119644561262446e-05, "loss": 0.04871627688407898, "memory(GiB)": 122.96, "step": 45655, "token_acc": 0.9806608801583676, "train_speed(iter/s)": 0.232663 }, { "epoch": 3.4804482048936656, "grad_norm": 0.6085583567619324, "learning_rate": 2.1109871287307064e-05, "loss": 0.06148748397827149, "memory(GiB)": 122.96, "step": 45660, "token_acc": 0.9763690070438537, "train_speed(iter/s)": 0.232668 }, { "epoch": 3.4808293315039256, "grad_norm": 0.9081423282623291, "learning_rate": 2.1100099670066493e-05, "loss": 0.07246443033218383, "memory(GiB)": 122.96, "step": 45665, "token_acc": 0.9734776725304466, "train_speed(iter/s)": 0.232672 }, { "epoch": 3.4812104581141856, "grad_norm": 1.0667616128921509, "learning_rate": 2.1090329710101113e-05, "loss": 0.044239723682403566, "memory(GiB)": 122.96, "step": 45670, "token_acc": 0.9837032874402922, "train_speed(iter/s)": 0.232678 }, { "epoch": 3.4815915847244456, "grad_norm": 0.9836214184761047, "learning_rate": 2.108056140797115e-05, "loss": 0.06769940853118897, "memory(GiB)": 122.96, "step": 45675, "token_acc": 0.977577834904313, "train_speed(iter/s)": 0.23268 }, { "epoch": 3.481972711334705, "grad_norm": 0.6712965369224548, "learning_rate": 2.107079476423679e-05, "loss": 0.0731998085975647, "memory(GiB)": 122.96, "step": 45680, "token_acc": 0.9744688142563399, "train_speed(iter/s)": 0.232686 }, { "epoch": 3.4823538379449652, "grad_norm": 0.7678058743476868, "learning_rate": 2.1061029779458112e-05, "loss": 0.05543010234832764, "memory(GiB)": 122.96, "step": 45685, "token_acc": 0.9739724414085502, "train_speed(iter/s)": 0.232689 }, { "epoch": 3.4827349645552252, "grad_norm": 3.574603796005249, "learning_rate": 2.1051266454195072e-05, "loss": 0.10301198959350585, "memory(GiB)": 122.96, "step": 45690, "token_acc": 0.965526247061896, "train_speed(iter/s)": 0.232694 }, { "epoch": 3.4831160911654853, "grad_norm": 1.8791759014129639, "learning_rate": 2.104150478900756e-05, "loss": 0.05302752256393432, "memory(GiB)": 122.96, "step": 45695, "token_acc": 0.9825923312161844, "train_speed(iter/s)": 0.2327 }, { "epoch": 3.4834972177757453, "grad_norm": 0.6351460218429565, "learning_rate": 2.1031744784455387e-05, "loss": 0.07178840041160583, "memory(GiB)": 122.96, "step": 45700, "token_acc": 0.9773844641101278, "train_speed(iter/s)": 0.232702 }, { "epoch": 3.483878344386005, "grad_norm": 0.8995490670204163, "learning_rate": 2.1021986441098196e-05, "loss": 0.0454012542963028, "memory(GiB)": 122.96, "step": 45705, "token_acc": 0.9759450171821306, "train_speed(iter/s)": 0.232709 }, { "epoch": 3.484259470996265, "grad_norm": 1.6305568218231201, "learning_rate": 2.101222975949561e-05, "loss": 0.06787364482879639, "memory(GiB)": 122.96, "step": 45710, "token_acc": 0.9738601823708206, "train_speed(iter/s)": 0.232713 }, { "epoch": 3.484640597606525, "grad_norm": 1.3048747777938843, "learning_rate": 2.100247474020715e-05, "loss": 0.07315539717674255, "memory(GiB)": 122.96, "step": 45715, "token_acc": 0.9667411268304791, "train_speed(iter/s)": 0.23272 }, { "epoch": 3.485021724216785, "grad_norm": 1.1352070569992065, "learning_rate": 2.099272138379218e-05, "loss": 0.05162966251373291, "memory(GiB)": 122.96, "step": 45720, "token_acc": 0.9789932112206994, "train_speed(iter/s)": 0.232721 }, { "epoch": 3.485402850827045, "grad_norm": 1.1906036138534546, "learning_rate": 2.0982969690810023e-05, "loss": 0.05077582597732544, "memory(GiB)": 122.96, "step": 45725, "token_acc": 0.9775183744055339, "train_speed(iter/s)": 0.23273 }, { "epoch": 3.4857839774373045, "grad_norm": 1.115087866783142, "learning_rate": 2.097321966181992e-05, "loss": 0.06823775172233582, "memory(GiB)": 122.96, "step": 45730, "token_acc": 0.9712320200125079, "train_speed(iter/s)": 0.232734 }, { "epoch": 3.4861651040475645, "grad_norm": 1.0002802610397339, "learning_rate": 2.0963471297380953e-05, "loss": 0.05600858926773071, "memory(GiB)": 122.96, "step": 45735, "token_acc": 0.979443115075049, "train_speed(iter/s)": 0.232733 }, { "epoch": 3.4865462306578245, "grad_norm": 3.488795518875122, "learning_rate": 2.095372459805216e-05, "loss": 0.06474840641021729, "memory(GiB)": 122.96, "step": 45740, "token_acc": 0.9806228373702423, "train_speed(iter/s)": 0.232739 }, { "epoch": 3.4869273572680846, "grad_norm": 1.491994857788086, "learning_rate": 2.0943979564392487e-05, "loss": 0.07057619094848633, "memory(GiB)": 122.96, "step": 45745, "token_acc": 0.9705704495061479, "train_speed(iter/s)": 0.232745 }, { "epoch": 3.4873084838783446, "grad_norm": 0.9350032210350037, "learning_rate": 2.0934236196960733e-05, "loss": 0.037183725833892824, "memory(GiB)": 122.96, "step": 45750, "token_acc": 0.9853254734729713, "train_speed(iter/s)": 0.232747 }, { "epoch": 3.487689610488604, "grad_norm": 1.668515920639038, "learning_rate": 2.0924494496315648e-05, "loss": 0.05650555491447449, "memory(GiB)": 122.96, "step": 45755, "token_acc": 0.9750659314313115, "train_speed(iter/s)": 0.232753 }, { "epoch": 3.488070737098864, "grad_norm": 0.781704843044281, "learning_rate": 2.091475446301588e-05, "loss": 0.041441628336906434, "memory(GiB)": 122.96, "step": 45760, "token_acc": 0.9842133913990201, "train_speed(iter/s)": 0.23276 }, { "epoch": 3.488451863709124, "grad_norm": 1.0339933633804321, "learning_rate": 2.090501609761997e-05, "loss": 0.054609501361846925, "memory(GiB)": 122.96, "step": 45765, "token_acc": 0.9759812536613942, "train_speed(iter/s)": 0.232765 }, { "epoch": 3.488832990319384, "grad_norm": 1.1245914697647095, "learning_rate": 2.089527940068639e-05, "loss": 0.07173210382461548, "memory(GiB)": 122.96, "step": 45770, "token_acc": 0.9688270912760709, "train_speed(iter/s)": 0.232769 }, { "epoch": 3.4892141169296442, "grad_norm": 1.0721688270568848, "learning_rate": 2.0885544372773453e-05, "loss": 0.0313246488571167, "memory(GiB)": 122.96, "step": 45775, "token_acc": 0.9873307121013543, "train_speed(iter/s)": 0.232778 }, { "epoch": 3.489595243539904, "grad_norm": 0.7294628620147705, "learning_rate": 2.087581101443944e-05, "loss": 0.04572803974151611, "memory(GiB)": 122.96, "step": 45780, "token_acc": 0.9787685774946921, "train_speed(iter/s)": 0.232783 }, { "epoch": 3.489976370150164, "grad_norm": 1.3219822645187378, "learning_rate": 2.0866079326242528e-05, "loss": 0.0652586579322815, "memory(GiB)": 122.96, "step": 45785, "token_acc": 0.9694624555532315, "train_speed(iter/s)": 0.23279 }, { "epoch": 3.490357496760424, "grad_norm": 0.719188928604126, "learning_rate": 2.085634930874075e-05, "loss": 0.06442931294441223, "memory(GiB)": 122.96, "step": 45790, "token_acc": 0.9791487532244196, "train_speed(iter/s)": 0.232796 }, { "epoch": 3.490738623370684, "grad_norm": 0.75143963098526, "learning_rate": 2.0846620962492102e-05, "loss": 0.08720421195030212, "memory(GiB)": 122.96, "step": 45795, "token_acc": 0.9729931085863289, "train_speed(iter/s)": 0.232802 }, { "epoch": 3.4911197499809434, "grad_norm": 0.9573978185653687, "learning_rate": 2.083689428805447e-05, "loss": 0.10561884641647339, "memory(GiB)": 122.96, "step": 45800, "token_acc": 0.9567338282078472, "train_speed(iter/s)": 0.232808 }, { "epoch": 3.4911197499809434, "eval_loss": 0.06182347238063812, "eval_runtime": 226.3186, "eval_samples_per_second": 2.342, "eval_steps_per_second": 2.342, "eval_token_acc": 0.9751596289380158, "step": 45800 }, { "epoch": 3.4915008765912035, "grad_norm": 0.5485325455665588, "learning_rate": 2.08271692859856e-05, "loss": 0.04437752068042755, "memory(GiB)": 122.96, "step": 45805, "token_acc": 0.975398123186164, "train_speed(iter/s)": 0.232548 }, { "epoch": 3.4918820032014635, "grad_norm": 0.8883798122406006, "learning_rate": 2.081744595684319e-05, "loss": 0.03662385046482086, "memory(GiB)": 122.96, "step": 45810, "token_acc": 0.985014619883041, "train_speed(iter/s)": 0.232554 }, { "epoch": 3.4922631298117235, "grad_norm": 0.5619946122169495, "learning_rate": 2.080772430118485e-05, "loss": 0.043571746349334715, "memory(GiB)": 122.96, "step": 45815, "token_acc": 0.9822148881554822, "train_speed(iter/s)": 0.232557 }, { "epoch": 3.4926442564219835, "grad_norm": 1.2625367641448975, "learning_rate": 2.0798004319568032e-05, "loss": 0.0764374554157257, "memory(GiB)": 122.96, "step": 45820, "token_acc": 0.9696127799965272, "train_speed(iter/s)": 0.232561 }, { "epoch": 3.4930253830322435, "grad_norm": 1.3199982643127441, "learning_rate": 2.0788286012550173e-05, "loss": 0.05671080946922302, "memory(GiB)": 122.96, "step": 45825, "token_acc": 0.9698919197117859, "train_speed(iter/s)": 0.232567 }, { "epoch": 3.493406509642503, "grad_norm": 0.8210570812225342, "learning_rate": 2.0778569380688533e-05, "loss": 0.07261161208152771, "memory(GiB)": 122.96, "step": 45830, "token_acc": 0.9713932908005762, "train_speed(iter/s)": 0.23257 }, { "epoch": 3.493787636252763, "grad_norm": 0.9715078473091125, "learning_rate": 2.076885442454034e-05, "loss": 0.061373007297515866, "memory(GiB)": 122.96, "step": 45835, "token_acc": 0.9731467473524962, "train_speed(iter/s)": 0.232576 }, { "epoch": 3.494168762863023, "grad_norm": 1.3919907808303833, "learning_rate": 2.075914114466271e-05, "loss": 0.04735950231552124, "memory(GiB)": 122.96, "step": 45840, "token_acc": 0.9789439585357953, "train_speed(iter/s)": 0.232583 }, { "epoch": 3.494549889473283, "grad_norm": 0.8763614296913147, "learning_rate": 2.0749429541612624e-05, "loss": 0.04717410802841186, "memory(GiB)": 122.96, "step": 45845, "token_acc": 0.9783762786481937, "train_speed(iter/s)": 0.232585 }, { "epoch": 3.4949310160835427, "grad_norm": 1.2443774938583374, "learning_rate": 2.073971961594701e-05, "loss": 0.05147503018379211, "memory(GiB)": 122.96, "step": 45850, "token_acc": 0.9775530340404539, "train_speed(iter/s)": 0.232593 }, { "epoch": 3.4953121426938027, "grad_norm": 0.8994700312614441, "learning_rate": 2.0730011368222718e-05, "loss": 0.05053727030754089, "memory(GiB)": 122.96, "step": 45855, "token_acc": 0.9802748585286984, "train_speed(iter/s)": 0.232597 }, { "epoch": 3.4956932693040628, "grad_norm": 1.186975121498108, "learning_rate": 2.072030479899642e-05, "loss": 0.04194928705692291, "memory(GiB)": 122.96, "step": 45860, "token_acc": 0.976937984496124, "train_speed(iter/s)": 0.232602 }, { "epoch": 3.496074395914323, "grad_norm": 1.6463955640792847, "learning_rate": 2.0710599908824775e-05, "loss": 0.05754992961883545, "memory(GiB)": 122.96, "step": 45865, "token_acc": 0.9786419207541611, "train_speed(iter/s)": 0.232606 }, { "epoch": 3.496455522524583, "grad_norm": 0.9804288148880005, "learning_rate": 2.0700896698264315e-05, "loss": 0.047250676155090335, "memory(GiB)": 122.96, "step": 45870, "token_acc": 0.9777863627583543, "train_speed(iter/s)": 0.232611 }, { "epoch": 3.496836649134843, "grad_norm": 0.5005662441253662, "learning_rate": 2.0691195167871453e-05, "loss": 0.054424136877059937, "memory(GiB)": 122.96, "step": 45875, "token_acc": 0.9794903666873834, "train_speed(iter/s)": 0.232617 }, { "epoch": 3.4972177757451024, "grad_norm": 1.174930453300476, "learning_rate": 2.0681495318202538e-05, "loss": 0.06093894839286804, "memory(GiB)": 122.96, "step": 45880, "token_acc": 0.9743279244369096, "train_speed(iter/s)": 0.232624 }, { "epoch": 3.4975989023553624, "grad_norm": 1.3117767572402954, "learning_rate": 2.0671797149813826e-05, "loss": 0.06457515954971313, "memory(GiB)": 122.96, "step": 45885, "token_acc": 0.9804255319148936, "train_speed(iter/s)": 0.232632 }, { "epoch": 3.4979800289656224, "grad_norm": 3.543818235397339, "learning_rate": 2.0662100663261418e-05, "loss": 0.05853534340858459, "memory(GiB)": 122.96, "step": 45890, "token_acc": 0.9781094527363184, "train_speed(iter/s)": 0.232641 }, { "epoch": 3.4983611555758825, "grad_norm": 2.7436232566833496, "learning_rate": 2.0652405859101425e-05, "loss": 0.05648674368858338, "memory(GiB)": 122.96, "step": 45895, "token_acc": 0.977856860419138, "train_speed(iter/s)": 0.232649 }, { "epoch": 3.498742282186142, "grad_norm": 1.3570802211761475, "learning_rate": 2.0642712737889748e-05, "loss": 0.05645543336868286, "memory(GiB)": 122.96, "step": 45900, "token_acc": 0.9795201872440024, "train_speed(iter/s)": 0.232657 }, { "epoch": 3.499123408796402, "grad_norm": 1.8019976615905762, "learning_rate": 2.063302130018226e-05, "loss": 0.051719683408737185, "memory(GiB)": 122.96, "step": 45905, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.232663 }, { "epoch": 3.499504535406662, "grad_norm": 1.906590223312378, "learning_rate": 2.0623331546534742e-05, "loss": 0.06969367265701294, "memory(GiB)": 122.96, "step": 45910, "token_acc": 0.979606595313856, "train_speed(iter/s)": 0.232667 }, { "epoch": 3.499885662016922, "grad_norm": 0.7500190138816833, "learning_rate": 2.0613643477502813e-05, "loss": 0.06738216876983642, "memory(GiB)": 122.96, "step": 45915, "token_acc": 0.9777530589543938, "train_speed(iter/s)": 0.232674 }, { "epoch": 3.500266788627182, "grad_norm": 2.2650322914123535, "learning_rate": 2.0603957093642068e-05, "loss": 0.06168020963668823, "memory(GiB)": 122.96, "step": 45920, "token_acc": 0.9781445138269402, "train_speed(iter/s)": 0.23268 }, { "epoch": 3.500647915237442, "grad_norm": 0.7972579002380371, "learning_rate": 2.0594272395507985e-05, "loss": 0.03812500238418579, "memory(GiB)": 122.96, "step": 45925, "token_acc": 0.9849837662337663, "train_speed(iter/s)": 0.232689 }, { "epoch": 3.5010290418477017, "grad_norm": 1.9578025341033936, "learning_rate": 2.05845893836559e-05, "loss": 0.06808693408966064, "memory(GiB)": 122.96, "step": 45930, "token_acc": 0.9768292682926829, "train_speed(iter/s)": 0.232693 }, { "epoch": 3.5014101684579617, "grad_norm": 1.0077353715896606, "learning_rate": 2.057490805864111e-05, "loss": 0.048974961042404175, "memory(GiB)": 122.96, "step": 45935, "token_acc": 0.9843908629441624, "train_speed(iter/s)": 0.232694 }, { "epoch": 3.5017912950682217, "grad_norm": 2.153729200363159, "learning_rate": 2.0565228421018818e-05, "loss": 0.058753442764282224, "memory(GiB)": 122.96, "step": 45940, "token_acc": 0.9793466807165437, "train_speed(iter/s)": 0.2327 }, { "epoch": 3.5021724216784818, "grad_norm": 0.014175181277096272, "learning_rate": 2.0555550471344054e-05, "loss": 0.06040411591529846, "memory(GiB)": 122.96, "step": 45945, "token_acc": 0.9706242350061199, "train_speed(iter/s)": 0.232706 }, { "epoch": 3.5025535482887413, "grad_norm": 1.1168289184570312, "learning_rate": 2.054587421017184e-05, "loss": 0.07539054751396179, "memory(GiB)": 122.96, "step": 45950, "token_acc": 0.9686411149825784, "train_speed(iter/s)": 0.232712 }, { "epoch": 3.5029346748990013, "grad_norm": 1.7553696632385254, "learning_rate": 2.053619963805707e-05, "loss": 0.07214447855949402, "memory(GiB)": 122.96, "step": 45955, "token_acc": 0.9767566123430403, "train_speed(iter/s)": 0.232713 }, { "epoch": 3.5033158015092614, "grad_norm": 1.647191047668457, "learning_rate": 2.0526526755554502e-05, "loss": 0.06568965911865235, "memory(GiB)": 122.96, "step": 45960, "token_acc": 0.9710848459265513, "train_speed(iter/s)": 0.232718 }, { "epoch": 3.5036969281195214, "grad_norm": 1.2985783815383911, "learning_rate": 2.0516855563218858e-05, "loss": 0.051267361640930174, "memory(GiB)": 122.96, "step": 45965, "token_acc": 0.9748013620885357, "train_speed(iter/s)": 0.232724 }, { "epoch": 3.5040780547297814, "grad_norm": 0.7569097876548767, "learning_rate": 2.0507186061604738e-05, "loss": 0.04899776577949524, "memory(GiB)": 122.96, "step": 45970, "token_acc": 0.9792133686570206, "train_speed(iter/s)": 0.232725 }, { "epoch": 3.5044591813400414, "grad_norm": 1.3438225984573364, "learning_rate": 2.0497518251266622e-05, "loss": 0.0999957025051117, "memory(GiB)": 122.96, "step": 45975, "token_acc": 0.9600298841987299, "train_speed(iter/s)": 0.232732 }, { "epoch": 3.504840307950301, "grad_norm": 0.6926179528236389, "learning_rate": 2.048785213275893e-05, "loss": 0.05862660408020019, "memory(GiB)": 122.96, "step": 45980, "token_acc": 0.9765296431963099, "train_speed(iter/s)": 0.232735 }, { "epoch": 3.505221434560561, "grad_norm": 0.16106900572776794, "learning_rate": 2.0478187706635977e-05, "loss": 0.050265824794769286, "memory(GiB)": 122.96, "step": 45985, "token_acc": 0.9771807140228193, "train_speed(iter/s)": 0.232743 }, { "epoch": 3.505602561170821, "grad_norm": 0.5569798350334167, "learning_rate": 2.046852497345194e-05, "loss": 0.04796984195709229, "memory(GiB)": 122.96, "step": 45990, "token_acc": 0.9813501699854298, "train_speed(iter/s)": 0.232742 }, { "epoch": 3.5059836877810806, "grad_norm": 1.264333724975586, "learning_rate": 2.0458863933760973e-05, "loss": 0.03199462592601776, "memory(GiB)": 122.96, "step": 45995, "token_acc": 0.9819102749638206, "train_speed(iter/s)": 0.23275 }, { "epoch": 3.5063648143913406, "grad_norm": 0.7602733373641968, "learning_rate": 2.044920458811706e-05, "loss": 0.05693466067314148, "memory(GiB)": 122.96, "step": 46000, "token_acc": 0.9753280839895013, "train_speed(iter/s)": 0.232757 }, { "epoch": 3.5063648143913406, "eval_loss": 0.06152508407831192, "eval_runtime": 224.0345, "eval_samples_per_second": 2.366, "eval_steps_per_second": 2.366, "eval_token_acc": 0.9749262092645021, "step": 46000 }, { "epoch": 3.5067459410016006, "grad_norm": 0.6322149038314819, "learning_rate": 2.0439546937074127e-05, "loss": 0.074873948097229, "memory(GiB)": 122.96, "step": 46005, "token_acc": 0.975016762382299, "train_speed(iter/s)": 0.232496 }, { "epoch": 3.5071270676118607, "grad_norm": 1.7046021223068237, "learning_rate": 2.0429890981186017e-05, "loss": 0.07756444215774536, "memory(GiB)": 122.96, "step": 46010, "token_acc": 0.9678099252730408, "train_speed(iter/s)": 0.232501 }, { "epoch": 3.5075081942221207, "grad_norm": 1.5571244955062866, "learning_rate": 2.0420236721006418e-05, "loss": 0.07456346750259399, "memory(GiB)": 122.96, "step": 46015, "token_acc": 0.9743505425846761, "train_speed(iter/s)": 0.232508 }, { "epoch": 3.5078893208323807, "grad_norm": 1.1427301168441772, "learning_rate": 2.041058415708898e-05, "loss": 0.054862552881240846, "memory(GiB)": 122.96, "step": 46020, "token_acc": 0.9746628524724152, "train_speed(iter/s)": 0.232514 }, { "epoch": 3.5082704474426407, "grad_norm": 0.7011398077011108, "learning_rate": 2.040093328998723e-05, "loss": 0.0648173987865448, "memory(GiB)": 122.96, "step": 46025, "token_acc": 0.979019330504479, "train_speed(iter/s)": 0.232515 }, { "epoch": 3.5086515740529003, "grad_norm": 0.8604607582092285, "learning_rate": 2.0391284120254594e-05, "loss": 0.059105384349823, "memory(GiB)": 122.96, "step": 46030, "token_acc": 0.9775767853778029, "train_speed(iter/s)": 0.23252 }, { "epoch": 3.5090327006631603, "grad_norm": 0.6817874312400818, "learning_rate": 2.0381636648444413e-05, "loss": 0.04441819489002228, "memory(GiB)": 122.96, "step": 46035, "token_acc": 0.9822762814943528, "train_speed(iter/s)": 0.232525 }, { "epoch": 3.5094138272734203, "grad_norm": 2.0082995891571045, "learning_rate": 2.0371990875109948e-05, "loss": 0.05625681877136231, "memory(GiB)": 122.96, "step": 46040, "token_acc": 0.9767296904217744, "train_speed(iter/s)": 0.232532 }, { "epoch": 3.50979495388368, "grad_norm": 0.5969873666763306, "learning_rate": 2.0362346800804294e-05, "loss": 0.059069865942001344, "memory(GiB)": 122.96, "step": 46045, "token_acc": 0.9749936208216382, "train_speed(iter/s)": 0.232538 }, { "epoch": 3.51017608049394, "grad_norm": 0.8075672388076782, "learning_rate": 2.035270442608053e-05, "loss": 0.05012603998184204, "memory(GiB)": 122.96, "step": 46050, "token_acc": 0.9789612097304405, "train_speed(iter/s)": 0.23254 }, { "epoch": 3.5105572071042, "grad_norm": 0.8069801926612854, "learning_rate": 2.0343063751491575e-05, "loss": 0.05600680112838745, "memory(GiB)": 122.96, "step": 46055, "token_acc": 0.9744479495268139, "train_speed(iter/s)": 0.232548 }, { "epoch": 3.51093833371446, "grad_norm": 0.88272625207901, "learning_rate": 2.033342477759029e-05, "loss": 0.10414590835571289, "memory(GiB)": 122.96, "step": 46060, "token_acc": 0.9685990338164251, "train_speed(iter/s)": 0.232551 }, { "epoch": 3.51131946032472, "grad_norm": 0.7826039791107178, "learning_rate": 2.0323787504929433e-05, "loss": 0.04015759825706482, "memory(GiB)": 122.96, "step": 46065, "token_acc": 0.9871092491137609, "train_speed(iter/s)": 0.232559 }, { "epoch": 3.51170058693498, "grad_norm": 2.4727163314819336, "learning_rate": 2.0314151934061637e-05, "loss": 0.06349260210990906, "memory(GiB)": 122.96, "step": 46070, "token_acc": 0.975020587427944, "train_speed(iter/s)": 0.232562 }, { "epoch": 3.5120817135452396, "grad_norm": 1.177433729171753, "learning_rate": 2.0304518065539467e-05, "loss": 0.05157172679901123, "memory(GiB)": 122.96, "step": 46075, "token_acc": 0.9753761969904241, "train_speed(iter/s)": 0.232568 }, { "epoch": 3.5124628401554996, "grad_norm": 0.8229337930679321, "learning_rate": 2.02948858999154e-05, "loss": 0.08985299468040467, "memory(GiB)": 122.96, "step": 46080, "token_acc": 0.9770612768687841, "train_speed(iter/s)": 0.232568 }, { "epoch": 3.5128439667657596, "grad_norm": 0.8853925466537476, "learning_rate": 2.0285255437741756e-05, "loss": 0.07765599489212036, "memory(GiB)": 122.96, "step": 46085, "token_acc": 0.9769404672192916, "train_speed(iter/s)": 0.232573 }, { "epoch": 3.5132250933760196, "grad_norm": 1.4233826398849487, "learning_rate": 2.0275626679570824e-05, "loss": 0.0519239604473114, "memory(GiB)": 122.96, "step": 46090, "token_acc": 0.9794431433775697, "train_speed(iter/s)": 0.23258 }, { "epoch": 3.513606219986279, "grad_norm": 0.6709772348403931, "learning_rate": 2.0265999625954786e-05, "loss": 0.04199601411819458, "memory(GiB)": 122.96, "step": 46095, "token_acc": 0.9777236338322312, "train_speed(iter/s)": 0.232583 }, { "epoch": 3.513987346596539, "grad_norm": 0.608588695526123, "learning_rate": 2.0256374277445662e-05, "loss": 0.0628137469291687, "memory(GiB)": 122.96, "step": 46100, "token_acc": 0.9793618034608668, "train_speed(iter/s)": 0.232586 }, { "epoch": 3.5143684732067992, "grad_norm": 1.5200026035308838, "learning_rate": 2.024675063459545e-05, "loss": 0.06833767890930176, "memory(GiB)": 122.96, "step": 46105, "token_acc": 0.9739417387346382, "train_speed(iter/s)": 0.232587 }, { "epoch": 3.5147495998170593, "grad_norm": 1.4021960496902466, "learning_rate": 2.0237128697956033e-05, "loss": 0.05672979950904846, "memory(GiB)": 122.96, "step": 46110, "token_acc": 0.9761258817145958, "train_speed(iter/s)": 0.23259 }, { "epoch": 3.5151307264273193, "grad_norm": 0.07328073680400848, "learning_rate": 2.022750846807915e-05, "loss": 0.04986168742179871, "memory(GiB)": 122.96, "step": 46115, "token_acc": 0.9807524059492564, "train_speed(iter/s)": 0.232596 }, { "epoch": 3.5155118530375793, "grad_norm": 0.8316476345062256, "learning_rate": 2.02178899455165e-05, "loss": 0.06388010382652283, "memory(GiB)": 122.96, "step": 46120, "token_acc": 0.9756790903348074, "train_speed(iter/s)": 0.232599 }, { "epoch": 3.515892979647839, "grad_norm": 0.879058837890625, "learning_rate": 2.0208273130819665e-05, "loss": 0.054142999649047854, "memory(GiB)": 122.96, "step": 46125, "token_acc": 0.9763903462749213, "train_speed(iter/s)": 0.232604 }, { "epoch": 3.516274106258099, "grad_norm": 0.9695757031440735, "learning_rate": 2.0198658024540102e-05, "loss": 0.03327067792415619, "memory(GiB)": 122.96, "step": 46130, "token_acc": 0.9822703200552614, "train_speed(iter/s)": 0.232611 }, { "epoch": 3.516655232868359, "grad_norm": 1.2468252182006836, "learning_rate": 2.018904462722921e-05, "loss": 0.05499972701072693, "memory(GiB)": 122.96, "step": 46135, "token_acc": 0.9828411811652035, "train_speed(iter/s)": 0.232616 }, { "epoch": 3.517036359478619, "grad_norm": 0.7676615118980408, "learning_rate": 2.017943293943828e-05, "loss": 0.055069136619567874, "memory(GiB)": 122.96, "step": 46140, "token_acc": 0.9771428571428571, "train_speed(iter/s)": 0.232619 }, { "epoch": 3.5174174860888785, "grad_norm": 0.6986245512962341, "learning_rate": 2.0169822961718476e-05, "loss": 0.05534220933914184, "memory(GiB)": 122.96, "step": 46145, "token_acc": 0.9800214822771214, "train_speed(iter/s)": 0.232627 }, { "epoch": 3.5177986126991385, "grad_norm": 1.2194929122924805, "learning_rate": 2.0160214694620887e-05, "loss": 0.07410275936126709, "memory(GiB)": 122.96, "step": 46150, "token_acc": 0.9769295302013423, "train_speed(iter/s)": 0.232635 }, { "epoch": 3.5181797393093985, "grad_norm": 0.7096107006072998, "learning_rate": 2.0150608138696538e-05, "loss": 0.051240730285644534, "memory(GiB)": 122.96, "step": 46155, "token_acc": 0.9822473270123059, "train_speed(iter/s)": 0.232641 }, { "epoch": 3.5185608659196586, "grad_norm": 1.469334602355957, "learning_rate": 2.0141003294496253e-05, "loss": 0.04515378773212433, "memory(GiB)": 122.96, "step": 46160, "token_acc": 0.9822380106571936, "train_speed(iter/s)": 0.232648 }, { "epoch": 3.5189419925299186, "grad_norm": 1.4720138311386108, "learning_rate": 2.0131400162570907e-05, "loss": 0.050509297847747804, "memory(GiB)": 122.96, "step": 46165, "token_acc": 0.9814029809927526, "train_speed(iter/s)": 0.23265 }, { "epoch": 3.5193231191401786, "grad_norm": 2.2241880893707275, "learning_rate": 2.012179874347113e-05, "loss": 0.03953019380569458, "memory(GiB)": 122.96, "step": 46170, "token_acc": 0.9827364081422314, "train_speed(iter/s)": 0.232657 }, { "epoch": 3.519704245750438, "grad_norm": 3.7867226600646973, "learning_rate": 2.0112199037747553e-05, "loss": 0.09225237369537354, "memory(GiB)": 122.96, "step": 46175, "token_acc": 0.966791199667912, "train_speed(iter/s)": 0.232664 }, { "epoch": 3.520085372360698, "grad_norm": 1.2564043998718262, "learning_rate": 2.0102601045950676e-05, "loss": 0.06710940599441528, "memory(GiB)": 122.96, "step": 46180, "token_acc": 0.9732595666205625, "train_speed(iter/s)": 0.232667 }, { "epoch": 3.520466498970958, "grad_norm": 2.2581868171691895, "learning_rate": 2.009300476863087e-05, "loss": 0.053464758396148684, "memory(GiB)": 122.96, "step": 46185, "token_acc": 0.9812332439678284, "train_speed(iter/s)": 0.232676 }, { "epoch": 3.520847625581218, "grad_norm": 1.5457837581634521, "learning_rate": 2.0083410206338455e-05, "loss": 0.08877884149551392, "memory(GiB)": 122.96, "step": 46190, "token_acc": 0.9693308550185874, "train_speed(iter/s)": 0.23268 }, { "epoch": 3.521228752191478, "grad_norm": 1.5769003629684448, "learning_rate": 2.0073817359623654e-05, "loss": 0.0888767421245575, "memory(GiB)": 122.96, "step": 46195, "token_acc": 0.967429177397364, "train_speed(iter/s)": 0.232685 }, { "epoch": 3.521609878801738, "grad_norm": 0.8570423126220703, "learning_rate": 2.0064226229036538e-05, "loss": 0.058543580770492556, "memory(GiB)": 122.96, "step": 46200, "token_acc": 0.9751920614596671, "train_speed(iter/s)": 0.232689 }, { "epoch": 3.521609878801738, "eval_loss": 0.06178486719727516, "eval_runtime": 220.9613, "eval_samples_per_second": 2.399, "eval_steps_per_second": 2.399, "eval_token_acc": 0.9748584422625143, "step": 46200 }, { "epoch": 3.521991005411998, "grad_norm": 0.9546840786933899, "learning_rate": 2.0054636815127125e-05, "loss": 0.03944112658500672, "memory(GiB)": 122.96, "step": 46205, "token_acc": 0.9749647519486876, "train_speed(iter/s)": 0.232437 }, { "epoch": 3.522372132022258, "grad_norm": 1.3733474016189575, "learning_rate": 2.0045049118445346e-05, "loss": 0.04989332556724548, "memory(GiB)": 122.96, "step": 46210, "token_acc": 0.980319803198032, "train_speed(iter/s)": 0.232444 }, { "epoch": 3.522753258632518, "grad_norm": 4.39862060546875, "learning_rate": 2.003546313954097e-05, "loss": 0.06647968292236328, "memory(GiB)": 122.96, "step": 46215, "token_acc": 0.9831610044313146, "train_speed(iter/s)": 0.232451 }, { "epoch": 3.523134385242778, "grad_norm": 1.4540331363677979, "learning_rate": 2.0025878878963756e-05, "loss": 0.05379894971847534, "memory(GiB)": 122.96, "step": 46220, "token_acc": 0.9757350842135313, "train_speed(iter/s)": 0.232458 }, { "epoch": 3.5235155118530375, "grad_norm": 0.8511389493942261, "learning_rate": 2.0016296337263267e-05, "loss": 0.04057295620441437, "memory(GiB)": 122.96, "step": 46225, "token_acc": 0.9811029032812232, "train_speed(iter/s)": 0.232465 }, { "epoch": 3.5238966384632975, "grad_norm": 2.454878568649292, "learning_rate": 2.000671551498905e-05, "loss": 0.04477899372577667, "memory(GiB)": 122.96, "step": 46230, "token_acc": 0.985838779956427, "train_speed(iter/s)": 0.23247 }, { "epoch": 3.5242777650735575, "grad_norm": 0.788938581943512, "learning_rate": 1.999713641269052e-05, "loss": 0.07031739354133607, "memory(GiB)": 122.96, "step": 46235, "token_acc": 0.9813147845062739, "train_speed(iter/s)": 0.232473 }, { "epoch": 3.5246588916838175, "grad_norm": 2.132746934890747, "learning_rate": 1.9987559030916976e-05, "loss": 0.1013608455657959, "memory(GiB)": 122.96, "step": 46240, "token_acc": 0.9573542210617929, "train_speed(iter/s)": 0.232481 }, { "epoch": 3.525040018294077, "grad_norm": 0.6466344594955444, "learning_rate": 1.9977983370217645e-05, "loss": 0.05814366936683655, "memory(GiB)": 122.96, "step": 46245, "token_acc": 0.981555333998006, "train_speed(iter/s)": 0.232484 }, { "epoch": 3.525421144904337, "grad_norm": 0.9116997718811035, "learning_rate": 1.9968409431141666e-05, "loss": 0.05039224624633789, "memory(GiB)": 122.96, "step": 46250, "token_acc": 0.9778787280268616, "train_speed(iter/s)": 0.232489 }, { "epoch": 3.525802271514597, "grad_norm": 0.8482143878936768, "learning_rate": 1.9958837214238025e-05, "loss": 0.05010125637054443, "memory(GiB)": 122.96, "step": 46255, "token_acc": 0.979548845782498, "train_speed(iter/s)": 0.232487 }, { "epoch": 3.526183398124857, "grad_norm": 0.45206859707832336, "learning_rate": 1.994926672005566e-05, "loss": 0.04483374655246734, "memory(GiB)": 122.96, "step": 46260, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.23249 }, { "epoch": 3.526564524735117, "grad_norm": 1.170863151550293, "learning_rate": 1.9939697949143416e-05, "loss": 0.0769361436367035, "memory(GiB)": 122.96, "step": 46265, "token_acc": 0.9678462061925825, "train_speed(iter/s)": 0.232494 }, { "epoch": 3.526945651345377, "grad_norm": 1.2434415817260742, "learning_rate": 1.9930130902049976e-05, "loss": 0.05903710126876831, "memory(GiB)": 122.96, "step": 46270, "token_acc": 0.9813333333333333, "train_speed(iter/s)": 0.2325 }, { "epoch": 3.5273267779556368, "grad_norm": 1.469412088394165, "learning_rate": 1.992056557932399e-05, "loss": 0.05915390253067017, "memory(GiB)": 122.96, "step": 46275, "token_acc": 0.9734111543450065, "train_speed(iter/s)": 0.232505 }, { "epoch": 3.527707904565897, "grad_norm": 1.3485966920852661, "learning_rate": 1.9911001981514006e-05, "loss": 0.03972023129463196, "memory(GiB)": 122.96, "step": 46280, "token_acc": 0.9821131447587355, "train_speed(iter/s)": 0.232511 }, { "epoch": 3.528089031176157, "grad_norm": 0.7872074842453003, "learning_rate": 1.99014401091684e-05, "loss": 0.04189539849758148, "memory(GiB)": 122.96, "step": 46285, "token_acc": 0.9807907080634354, "train_speed(iter/s)": 0.232517 }, { "epoch": 3.5284701577864164, "grad_norm": 2.1449637413024902, "learning_rate": 1.9891879962835535e-05, "loss": 0.05509365200996399, "memory(GiB)": 122.96, "step": 46290, "token_acc": 0.9760493241640977, "train_speed(iter/s)": 0.232523 }, { "epoch": 3.5288512843966764, "grad_norm": 0.8847402334213257, "learning_rate": 1.9882321543063636e-05, "loss": 0.053091973066329956, "memory(GiB)": 122.96, "step": 46295, "token_acc": 0.9742424242424242, "train_speed(iter/s)": 0.23253 }, { "epoch": 3.5292324110069364, "grad_norm": 0.6113286018371582, "learning_rate": 1.9872764850400834e-05, "loss": 0.060461944341659545, "memory(GiB)": 122.96, "step": 46300, "token_acc": 0.9769487412799515, "train_speed(iter/s)": 0.232533 }, { "epoch": 3.5296135376171964, "grad_norm": 1.4088778495788574, "learning_rate": 1.986320988539518e-05, "loss": 0.030284661054611205, "memory(GiB)": 122.96, "step": 46305, "token_acc": 0.9895498392282959, "train_speed(iter/s)": 0.232542 }, { "epoch": 3.5299946642274564, "grad_norm": 1.830336332321167, "learning_rate": 1.9853656648594567e-05, "loss": 0.07856906652450561, "memory(GiB)": 122.96, "step": 46310, "token_acc": 0.9668759471747131, "train_speed(iter/s)": 0.232547 }, { "epoch": 3.5303757908377165, "grad_norm": 0.8462036848068237, "learning_rate": 1.984410514054684e-05, "loss": 0.04973042011260986, "memory(GiB)": 122.96, "step": 46315, "token_acc": 0.9810184579133394, "train_speed(iter/s)": 0.232549 }, { "epoch": 3.5307569174479765, "grad_norm": 0.9528764486312866, "learning_rate": 1.983455536179977e-05, "loss": 0.06579681634902954, "memory(GiB)": 122.96, "step": 46320, "token_acc": 0.9766835626357712, "train_speed(iter/s)": 0.232552 }, { "epoch": 3.531138044058236, "grad_norm": 2.5461556911468506, "learning_rate": 1.982500731290095e-05, "loss": 0.0813897967338562, "memory(GiB)": 122.96, "step": 46325, "token_acc": 0.9710720363017583, "train_speed(iter/s)": 0.23256 }, { "epoch": 3.531519170668496, "grad_norm": 0.8801552057266235, "learning_rate": 1.9815460994397928e-05, "loss": 0.06899614334106445, "memory(GiB)": 122.96, "step": 46330, "token_acc": 0.97420063126115, "train_speed(iter/s)": 0.232562 }, { "epoch": 3.531900297278756, "grad_norm": 1.2618391513824463, "learning_rate": 1.980591640683816e-05, "loss": 0.056531739234924314, "memory(GiB)": 122.96, "step": 46335, "token_acc": 0.9674716609167078, "train_speed(iter/s)": 0.232569 }, { "epoch": 3.5322814238890157, "grad_norm": 1.9378024339675903, "learning_rate": 1.9796373550768952e-05, "loss": 0.056615781784057614, "memory(GiB)": 122.96, "step": 46340, "token_acc": 0.9803964757709251, "train_speed(iter/s)": 0.232576 }, { "epoch": 3.5326625504992757, "grad_norm": 1.0068256855010986, "learning_rate": 1.9786832426737563e-05, "loss": 0.05144315361976624, "memory(GiB)": 122.96, "step": 46345, "token_acc": 0.9781138790035587, "train_speed(iter/s)": 0.232581 }, { "epoch": 3.5330436771095357, "grad_norm": 0.8637687563896179, "learning_rate": 1.977729303529114e-05, "loss": 0.058718568086624144, "memory(GiB)": 122.96, "step": 46350, "token_acc": 0.9738496354035705, "train_speed(iter/s)": 0.232581 }, { "epoch": 3.5334248037197957, "grad_norm": 1.7702547311782837, "learning_rate": 1.97677553769767e-05, "loss": 0.04716317653656006, "memory(GiB)": 122.96, "step": 46355, "token_acc": 0.9793125397835774, "train_speed(iter/s)": 0.232588 }, { "epoch": 3.5338059303300557, "grad_norm": 1.024194359779358, "learning_rate": 1.9758219452341186e-05, "loss": 0.05220912098884582, "memory(GiB)": 122.96, "step": 46360, "token_acc": 0.9802806219188471, "train_speed(iter/s)": 0.232596 }, { "epoch": 3.5341870569403158, "grad_norm": 0.9323307871818542, "learning_rate": 1.974868526193147e-05, "loss": 0.04159487783908844, "memory(GiB)": 122.96, "step": 46365, "token_acc": 0.9841459766676638, "train_speed(iter/s)": 0.232603 }, { "epoch": 3.5345681835505753, "grad_norm": 0.8474838137626648, "learning_rate": 1.973915280629425e-05, "loss": 0.04836551249027252, "memory(GiB)": 122.96, "step": 46370, "token_acc": 0.9749334168886103, "train_speed(iter/s)": 0.232609 }, { "epoch": 3.5349493101608354, "grad_norm": 1.1813925504684448, "learning_rate": 1.9729622085976197e-05, "loss": 0.03494167029857635, "memory(GiB)": 122.96, "step": 46375, "token_acc": 0.9838411819021238, "train_speed(iter/s)": 0.232617 }, { "epoch": 3.5353304367710954, "grad_norm": 0.7246534824371338, "learning_rate": 1.9720093101523857e-05, "loss": 0.04111408293247223, "memory(GiB)": 122.96, "step": 46380, "token_acc": 0.9858718792335979, "train_speed(iter/s)": 0.232621 }, { "epoch": 3.5357115633813554, "grad_norm": 0.320162296295166, "learning_rate": 1.9710565853483643e-05, "loss": 0.04697161316871643, "memory(GiB)": 122.96, "step": 46385, "token_acc": 0.9745620350371971, "train_speed(iter/s)": 0.232627 }, { "epoch": 3.536092689991615, "grad_norm": 1.055559515953064, "learning_rate": 1.9701040342401938e-05, "loss": 0.05758148431777954, "memory(GiB)": 122.96, "step": 46390, "token_acc": 0.9713163064833006, "train_speed(iter/s)": 0.232634 }, { "epoch": 3.536473816601875, "grad_norm": 1.1093605756759644, "learning_rate": 1.969151656882495e-05, "loss": 0.04179710447788239, "memory(GiB)": 122.96, "step": 46395, "token_acc": 0.9821720398157777, "train_speed(iter/s)": 0.232639 }, { "epoch": 3.536854943212135, "grad_norm": 1.3724945783615112, "learning_rate": 1.968199453329883e-05, "loss": 0.03223057985305786, "memory(GiB)": 122.96, "step": 46400, "token_acc": 0.9883570504527813, "train_speed(iter/s)": 0.232646 }, { "epoch": 3.536854943212135, "eval_loss": 0.06018233671784401, "eval_runtime": 220.4067, "eval_samples_per_second": 2.405, "eval_steps_per_second": 2.405, "eval_token_acc": 0.9755361122823927, "step": 46400 }, { "epoch": 3.537236069822395, "grad_norm": 1.3051786422729492, "learning_rate": 1.9672474236369654e-05, "loss": 0.0579162061214447, "memory(GiB)": 122.96, "step": 46405, "token_acc": 0.9755937061300889, "train_speed(iter/s)": 0.232396 }, { "epoch": 3.537617196432655, "grad_norm": 1.0037888288497925, "learning_rate": 1.9662955678583324e-05, "loss": 0.04800903797149658, "memory(GiB)": 122.96, "step": 46410, "token_acc": 0.9753782260456838, "train_speed(iter/s)": 0.232403 }, { "epoch": 3.537998323042915, "grad_norm": 0.9799992442131042, "learning_rate": 1.9653438860485705e-05, "loss": 0.03647419810295105, "memory(GiB)": 122.96, "step": 46415, "token_acc": 0.9793055874913773, "train_speed(iter/s)": 0.232409 }, { "epoch": 3.5383794496531746, "grad_norm": 1.5945441722869873, "learning_rate": 1.964392378262256e-05, "loss": 0.04175435304641724, "memory(GiB)": 122.96, "step": 46420, "token_acc": 0.9847942754919499, "train_speed(iter/s)": 0.232415 }, { "epoch": 3.5387605762634347, "grad_norm": 2.349905014038086, "learning_rate": 1.963441044553948e-05, "loss": 0.060189032554626466, "memory(GiB)": 122.96, "step": 46425, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.232422 }, { "epoch": 3.5391417028736947, "grad_norm": 1.5676814317703247, "learning_rate": 1.962489884978207e-05, "loss": 0.06092397570610046, "memory(GiB)": 122.96, "step": 46430, "token_acc": 0.9802140117100747, "train_speed(iter/s)": 0.232428 }, { "epoch": 3.5395228294839547, "grad_norm": 1.7719299793243408, "learning_rate": 1.9615388995895768e-05, "loss": 0.03606921434402466, "memory(GiB)": 122.96, "step": 46435, "token_acc": 0.9817596566523605, "train_speed(iter/s)": 0.232438 }, { "epoch": 3.5399039560942143, "grad_norm": 0.8293426632881165, "learning_rate": 1.9605880884425888e-05, "loss": 0.044089436531066895, "memory(GiB)": 122.96, "step": 46440, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.232442 }, { "epoch": 3.5402850827044743, "grad_norm": 0.9187294244766235, "learning_rate": 1.959637451591771e-05, "loss": 0.05848800539970398, "memory(GiB)": 122.96, "step": 46445, "token_acc": 0.9779210629152012, "train_speed(iter/s)": 0.232441 }, { "epoch": 3.5406662093147343, "grad_norm": 1.5700602531433105, "learning_rate": 1.9586869890916344e-05, "loss": 0.07417986392974854, "memory(GiB)": 122.96, "step": 46450, "token_acc": 0.9689922480620154, "train_speed(iter/s)": 0.232448 }, { "epoch": 3.5410473359249943, "grad_norm": 0.452915757894516, "learning_rate": 1.9577367009966857e-05, "loss": 0.04246101379394531, "memory(GiB)": 122.96, "step": 46455, "token_acc": 0.9816969696969697, "train_speed(iter/s)": 0.232447 }, { "epoch": 3.5414284625352543, "grad_norm": 1.6963430643081665, "learning_rate": 1.9567865873614217e-05, "loss": 0.07409543991088867, "memory(GiB)": 122.96, "step": 46460, "token_acc": 0.9790446841294299, "train_speed(iter/s)": 0.232454 }, { "epoch": 3.5418095891455144, "grad_norm": 0.8971818089485168, "learning_rate": 1.9558366482403223e-05, "loss": 0.05037371516227722, "memory(GiB)": 122.96, "step": 46465, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.232461 }, { "epoch": 3.542190715755774, "grad_norm": 0.7752423286437988, "learning_rate": 1.9548868836878643e-05, "loss": 0.0497003048658371, "memory(GiB)": 122.96, "step": 46470, "token_acc": 0.982296867907399, "train_speed(iter/s)": 0.232462 }, { "epoch": 3.542571842366034, "grad_norm": 0.4826826751232147, "learning_rate": 1.9539372937585148e-05, "loss": 0.03642245829105377, "memory(GiB)": 122.96, "step": 46475, "token_acc": 0.9853562801932367, "train_speed(iter/s)": 0.232464 }, { "epoch": 3.542952968976294, "grad_norm": 1.7694352865219116, "learning_rate": 1.952987878506724e-05, "loss": 0.06577748656272889, "memory(GiB)": 122.96, "step": 46480, "token_acc": 0.9779804270462633, "train_speed(iter/s)": 0.23247 }, { "epoch": 3.543334095586554, "grad_norm": 0.9567990303039551, "learning_rate": 1.9520386379869383e-05, "loss": 0.051110750436782836, "memory(GiB)": 122.96, "step": 46485, "token_acc": 0.9731400047539814, "train_speed(iter/s)": 0.232477 }, { "epoch": 3.5437152221968136, "grad_norm": 0.8953652381896973, "learning_rate": 1.9510895722535943e-05, "loss": 0.03891555070877075, "memory(GiB)": 122.96, "step": 46490, "token_acc": 0.9800386349001932, "train_speed(iter/s)": 0.232483 }, { "epoch": 3.5440963488070736, "grad_norm": 0.751865804195404, "learning_rate": 1.9501406813611134e-05, "loss": 0.05440452098846436, "memory(GiB)": 122.96, "step": 46495, "token_acc": 0.980661658941985, "train_speed(iter/s)": 0.232487 }, { "epoch": 3.5444774754173336, "grad_norm": 2.1480767726898193, "learning_rate": 1.9491919653639107e-05, "loss": 0.05926657915115356, "memory(GiB)": 122.96, "step": 46500, "token_acc": 0.9764376996805112, "train_speed(iter/s)": 0.232492 }, { "epoch": 3.5448586020275936, "grad_norm": 0.8232603669166565, "learning_rate": 1.9482434243163933e-05, "loss": 0.053774744272232056, "memory(GiB)": 122.96, "step": 46505, "token_acc": 0.9793584793584793, "train_speed(iter/s)": 0.232495 }, { "epoch": 3.5452397286378536, "grad_norm": 3.2298083305358887, "learning_rate": 1.947295058272952e-05, "loss": 0.08111108541488647, "memory(GiB)": 122.96, "step": 46510, "token_acc": 0.9816910785619174, "train_speed(iter/s)": 0.232501 }, { "epoch": 3.5456208552481137, "grad_norm": 0.484810471534729, "learning_rate": 1.9463468672879724e-05, "loss": 0.05054931044578552, "memory(GiB)": 122.96, "step": 46515, "token_acc": 0.9842051630434783, "train_speed(iter/s)": 0.232503 }, { "epoch": 3.5460019818583732, "grad_norm": 0.7116111516952515, "learning_rate": 1.9453988514158317e-05, "loss": 0.0615386962890625, "memory(GiB)": 122.96, "step": 46520, "token_acc": 0.9810665825181445, "train_speed(iter/s)": 0.232509 }, { "epoch": 3.5463831084686332, "grad_norm": 1.34740149974823, "learning_rate": 1.94445101071089e-05, "loss": 0.0754745602607727, "memory(GiB)": 122.96, "step": 46525, "token_acc": 0.9763194551926946, "train_speed(iter/s)": 0.232514 }, { "epoch": 3.5467642350788933, "grad_norm": 1.9555062055587769, "learning_rate": 1.943503345227503e-05, "loss": 0.06733548641204834, "memory(GiB)": 122.96, "step": 46530, "token_acc": 0.9711445198836082, "train_speed(iter/s)": 0.232519 }, { "epoch": 3.5471453616891533, "grad_norm": 3.080296516418457, "learning_rate": 1.942555855020017e-05, "loss": 0.07735669016838073, "memory(GiB)": 122.96, "step": 46535, "token_acc": 0.9782788580885395, "train_speed(iter/s)": 0.232522 }, { "epoch": 3.547526488299413, "grad_norm": 1.4671250581741333, "learning_rate": 1.9416085401427624e-05, "loss": 0.06380118727684021, "memory(GiB)": 122.96, "step": 46540, "token_acc": 0.9750851788756388, "train_speed(iter/s)": 0.232528 }, { "epoch": 3.547907614909673, "grad_norm": 2.0047996044158936, "learning_rate": 1.9406614006500663e-05, "loss": 0.07913438081741334, "memory(GiB)": 122.96, "step": 46545, "token_acc": 0.96875, "train_speed(iter/s)": 0.232535 }, { "epoch": 3.548288741519933, "grad_norm": 1.2483727931976318, "learning_rate": 1.9397144365962426e-05, "loss": 0.0686512291431427, "memory(GiB)": 122.96, "step": 46550, "token_acc": 0.9667226890756303, "train_speed(iter/s)": 0.232542 }, { "epoch": 3.548669868130193, "grad_norm": 0.5558659434318542, "learning_rate": 1.9387676480355934e-05, "loss": 0.03189200162887573, "memory(GiB)": 122.96, "step": 46555, "token_acc": 0.9844717538344289, "train_speed(iter/s)": 0.232541 }, { "epoch": 3.549050994740453, "grad_norm": 3.1626100540161133, "learning_rate": 1.9378210350224137e-05, "loss": 0.08162121772766114, "memory(GiB)": 122.96, "step": 46560, "token_acc": 0.9652288732394366, "train_speed(iter/s)": 0.23255 }, { "epoch": 3.549432121350713, "grad_norm": 0.6716939806938171, "learning_rate": 1.9368745976109868e-05, "loss": 0.03679351806640625, "memory(GiB)": 122.96, "step": 46565, "token_acc": 0.9865347622973344, "train_speed(iter/s)": 0.232557 }, { "epoch": 3.5498132479609725, "grad_norm": 1.5060303211212158, "learning_rate": 1.9359283358555873e-05, "loss": 0.09341654777526856, "memory(GiB)": 122.96, "step": 46570, "token_acc": 0.9655762973111834, "train_speed(iter/s)": 0.232562 }, { "epoch": 3.5501943745712325, "grad_norm": 1.1053876876831055, "learning_rate": 1.934982249810481e-05, "loss": 0.05068185329437256, "memory(GiB)": 122.96, "step": 46575, "token_acc": 0.9803632793323515, "train_speed(iter/s)": 0.232571 }, { "epoch": 3.5505755011814926, "grad_norm": 1.6908422708511353, "learning_rate": 1.934036339529917e-05, "loss": 0.0559003472328186, "memory(GiB)": 122.96, "step": 46580, "token_acc": 0.9755043227665706, "train_speed(iter/s)": 0.232575 }, { "epoch": 3.5509566277917526, "grad_norm": 1.6736327409744263, "learning_rate": 1.9330906050681415e-05, "loss": 0.05929225087165833, "memory(GiB)": 122.96, "step": 46585, "token_acc": 0.9795753829615694, "train_speed(iter/s)": 0.232582 }, { "epoch": 3.551337754402012, "grad_norm": 1.9404592514038086, "learning_rate": 1.9321450464793888e-05, "loss": 0.06202995777130127, "memory(GiB)": 122.96, "step": 46590, "token_acc": 0.9728813559322034, "train_speed(iter/s)": 0.232588 }, { "epoch": 3.551718881012272, "grad_norm": 1.0544624328613281, "learning_rate": 1.9311996638178797e-05, "loss": 0.05335569381713867, "memory(GiB)": 122.96, "step": 46595, "token_acc": 0.9766587024444036, "train_speed(iter/s)": 0.232594 }, { "epoch": 3.552100007622532, "grad_norm": 1.229246735572815, "learning_rate": 1.930254457137829e-05, "loss": 0.038326188921928406, "memory(GiB)": 122.96, "step": 46600, "token_acc": 0.9825384904243335, "train_speed(iter/s)": 0.232599 }, { "epoch": 3.552100007622532, "eval_loss": 0.06004065275192261, "eval_runtime": 220.0813, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "eval_token_acc": 0.9757318836214686, "step": 46600 }, { "epoch": 3.552481134232792, "grad_norm": 0.4382794499397278, "learning_rate": 1.9293094264934413e-05, "loss": 0.0706146240234375, "memory(GiB)": 122.96, "step": 46605, "token_acc": 0.9756352013258949, "train_speed(iter/s)": 0.232351 }, { "epoch": 3.5528622608430522, "grad_norm": 1.7722821235656738, "learning_rate": 1.9283645719389066e-05, "loss": 0.047273358702659606, "memory(GiB)": 122.96, "step": 46610, "token_acc": 0.9785211267605634, "train_speed(iter/s)": 0.232356 }, { "epoch": 3.5532433874533123, "grad_norm": 0.5163552165031433, "learning_rate": 1.9274198935284114e-05, "loss": 0.04250532388687134, "memory(GiB)": 122.96, "step": 46615, "token_acc": 0.9838756536897153, "train_speed(iter/s)": 0.232359 }, { "epoch": 3.553624514063572, "grad_norm": 2.0309224128723145, "learning_rate": 1.926475391316125e-05, "loss": 0.055315279960632326, "memory(GiB)": 122.96, "step": 46620, "token_acc": 0.9764049764049764, "train_speed(iter/s)": 0.232367 }, { "epoch": 3.554005640673832, "grad_norm": 1.0391697883605957, "learning_rate": 1.925531065356212e-05, "loss": 0.055416280031204225, "memory(GiB)": 122.96, "step": 46625, "token_acc": 0.9780375487070493, "train_speed(iter/s)": 0.232372 }, { "epoch": 3.554386767284092, "grad_norm": 1.0408554077148438, "learning_rate": 1.9245869157028267e-05, "loss": 0.06594306230545044, "memory(GiB)": 122.96, "step": 46630, "token_acc": 0.9752918287937743, "train_speed(iter/s)": 0.232378 }, { "epoch": 3.5547678938943514, "grad_norm": 0.5788599848747253, "learning_rate": 1.923642942410109e-05, "loss": 0.057189762592315674, "memory(GiB)": 122.96, "step": 46635, "token_acc": 0.9725412778176598, "train_speed(iter/s)": 0.232384 }, { "epoch": 3.5551490205046115, "grad_norm": 2.3125197887420654, "learning_rate": 1.922699145532192e-05, "loss": 0.04145443737506867, "memory(GiB)": 122.96, "step": 46640, "token_acc": 0.9794640621086902, "train_speed(iter/s)": 0.232389 }, { "epoch": 3.5555301471148715, "grad_norm": 1.3623069524765015, "learning_rate": 1.9217555251232e-05, "loss": 0.02923901379108429, "memory(GiB)": 122.96, "step": 46645, "token_acc": 0.986416148279802, "train_speed(iter/s)": 0.232392 }, { "epoch": 3.5559112737251315, "grad_norm": 1.2559106349945068, "learning_rate": 1.9208120812372428e-05, "loss": 0.04972193837165832, "memory(GiB)": 122.96, "step": 46650, "token_acc": 0.978772378516624, "train_speed(iter/s)": 0.232399 }, { "epoch": 3.5562924003353915, "grad_norm": 0.7354571223258972, "learning_rate": 1.9198688139284228e-05, "loss": 0.060518664121627805, "memory(GiB)": 122.96, "step": 46655, "token_acc": 0.9792416860834569, "train_speed(iter/s)": 0.232403 }, { "epoch": 3.5566735269456515, "grad_norm": 0.9055386781692505, "learning_rate": 1.918925723250835e-05, "loss": 0.048020491003990175, "memory(GiB)": 122.96, "step": 46660, "token_acc": 0.9787701317715959, "train_speed(iter/s)": 0.232408 }, { "epoch": 3.5570546535559115, "grad_norm": 3.669761896133423, "learning_rate": 1.9179828092585572e-05, "loss": 0.06707985401153564, "memory(GiB)": 122.96, "step": 46665, "token_acc": 0.9754886751473782, "train_speed(iter/s)": 0.232415 }, { "epoch": 3.557435780166171, "grad_norm": 0.694337785243988, "learning_rate": 1.9170400720056632e-05, "loss": 0.04676980376243591, "memory(GiB)": 122.96, "step": 46670, "token_acc": 0.9815289438798056, "train_speed(iter/s)": 0.232415 }, { "epoch": 3.557816906776431, "grad_norm": 1.0258665084838867, "learning_rate": 1.916097511546216e-05, "loss": 0.04263062179088593, "memory(GiB)": 122.96, "step": 46675, "token_acc": 0.9835572024085225, "train_speed(iter/s)": 0.23242 }, { "epoch": 3.558198033386691, "grad_norm": 1.4888147115707397, "learning_rate": 1.9151551279342634e-05, "loss": 0.05471775531768799, "memory(GiB)": 122.96, "step": 46680, "token_acc": 0.9797388563710041, "train_speed(iter/s)": 0.232426 }, { "epoch": 3.5585791599969507, "grad_norm": 1.089923620223999, "learning_rate": 1.914212921223849e-05, "loss": 0.08222234845161439, "memory(GiB)": 122.96, "step": 46685, "token_acc": 0.9745783427034955, "train_speed(iter/s)": 0.232432 }, { "epoch": 3.5589602866072108, "grad_norm": 0.7181756496429443, "learning_rate": 1.9132708914690055e-05, "loss": 0.0498701274394989, "memory(GiB)": 122.96, "step": 46690, "token_acc": 0.9777080581241744, "train_speed(iter/s)": 0.232437 }, { "epoch": 3.5593414132174708, "grad_norm": 1.48860502243042, "learning_rate": 1.912329038723749e-05, "loss": 0.04854249060153961, "memory(GiB)": 122.96, "step": 46695, "token_acc": 0.9809913378248316, "train_speed(iter/s)": 0.232444 }, { "epoch": 3.559722539827731, "grad_norm": 1.5654714107513428, "learning_rate": 1.911387363042097e-05, "loss": 0.06345379948616028, "memory(GiB)": 122.96, "step": 46700, "token_acc": 0.9810597766586381, "train_speed(iter/s)": 0.232445 }, { "epoch": 3.560103666437991, "grad_norm": 0.860392689704895, "learning_rate": 1.9104458644780448e-05, "loss": 0.053161400556564334, "memory(GiB)": 122.96, "step": 46705, "token_acc": 0.9793103448275862, "train_speed(iter/s)": 0.232451 }, { "epoch": 3.560484793048251, "grad_norm": 0.7851295471191406, "learning_rate": 1.909504543085585e-05, "loss": 0.07290263772010804, "memory(GiB)": 122.96, "step": 46710, "token_acc": 0.977366529090667, "train_speed(iter/s)": 0.232452 }, { "epoch": 3.5608659196585104, "grad_norm": 1.3623138666152954, "learning_rate": 1.9085633989187003e-05, "loss": 0.06825804114341735, "memory(GiB)": 122.96, "step": 46715, "token_acc": 0.9701935301596271, "train_speed(iter/s)": 0.232457 }, { "epoch": 3.5612470462687704, "grad_norm": 1.5112305879592896, "learning_rate": 1.9076224320313564e-05, "loss": 0.06235749125480652, "memory(GiB)": 122.96, "step": 46720, "token_acc": 0.9779980657640233, "train_speed(iter/s)": 0.232459 }, { "epoch": 3.5616281728790304, "grad_norm": 1.3388617038726807, "learning_rate": 1.9066816424775157e-05, "loss": 0.06584340333938599, "memory(GiB)": 122.96, "step": 46725, "token_acc": 0.976878612716763, "train_speed(iter/s)": 0.232465 }, { "epoch": 3.5620092994892905, "grad_norm": 0.8380825519561768, "learning_rate": 1.9057410303111295e-05, "loss": 0.057466650009155275, "memory(GiB)": 122.96, "step": 46730, "token_acc": 0.9781653197221041, "train_speed(iter/s)": 0.232467 }, { "epoch": 3.56239042609955, "grad_norm": 1.2624478340148926, "learning_rate": 1.9048005955861348e-05, "loss": 0.03178880214691162, "memory(GiB)": 122.96, "step": 46735, "token_acc": 0.9853973602920528, "train_speed(iter/s)": 0.232474 }, { "epoch": 3.56277155270981, "grad_norm": 0.6632423400878906, "learning_rate": 1.9038603383564622e-05, "loss": 0.04137397110462189, "memory(GiB)": 122.96, "step": 46740, "token_acc": 0.9808749175642999, "train_speed(iter/s)": 0.232475 }, { "epoch": 3.56315267932007, "grad_norm": 1.4044800996780396, "learning_rate": 1.9029202586760335e-05, "loss": 0.07065856456756592, "memory(GiB)": 122.96, "step": 46745, "token_acc": 0.9693486590038314, "train_speed(iter/s)": 0.232483 }, { "epoch": 3.56353380593033, "grad_norm": 0.7929338812828064, "learning_rate": 1.9019803565987536e-05, "loss": 0.048653244972229004, "memory(GiB)": 122.96, "step": 46750, "token_acc": 0.983561998446803, "train_speed(iter/s)": 0.232486 }, { "epoch": 3.56391493254059, "grad_norm": 0.7877162098884583, "learning_rate": 1.901040632178524e-05, "loss": 0.08200629353523255, "memory(GiB)": 122.96, "step": 46755, "token_acc": 0.9781810326204364, "train_speed(iter/s)": 0.23249 }, { "epoch": 3.56429605915085, "grad_norm": 0.8441221117973328, "learning_rate": 1.9001010854692348e-05, "loss": 0.056585633754730226, "memory(GiB)": 122.96, "step": 46760, "token_acc": 0.9777680906713164, "train_speed(iter/s)": 0.232494 }, { "epoch": 3.5646771857611097, "grad_norm": 1.3324960470199585, "learning_rate": 1.899161716524761e-05, "loss": 0.09119502305984498, "memory(GiB)": 122.96, "step": 46765, "token_acc": 0.9760647263737499, "train_speed(iter/s)": 0.232497 }, { "epoch": 3.5650583123713697, "grad_norm": 0.6199893355369568, "learning_rate": 1.8982225253989732e-05, "loss": 0.0670171856880188, "memory(GiB)": 122.96, "step": 46770, "token_acc": 0.9759829968119023, "train_speed(iter/s)": 0.232501 }, { "epoch": 3.5654394389816297, "grad_norm": 1.3446202278137207, "learning_rate": 1.8972835121457316e-05, "loss": 0.07434442043304443, "memory(GiB)": 122.96, "step": 46775, "token_acc": 0.9776442307692308, "train_speed(iter/s)": 0.232508 }, { "epoch": 3.5658205655918898, "grad_norm": 0.8536604642868042, "learning_rate": 1.8963446768188808e-05, "loss": 0.04837429821491242, "memory(GiB)": 122.96, "step": 46780, "token_acc": 0.9753265602322206, "train_speed(iter/s)": 0.232514 }, { "epoch": 3.5662016922021493, "grad_norm": 0.6545916199684143, "learning_rate": 1.895406019472261e-05, "loss": 0.03368062376976013, "memory(GiB)": 122.96, "step": 46785, "token_acc": 0.9848665870171247, "train_speed(iter/s)": 0.232517 }, { "epoch": 3.5665828188124094, "grad_norm": 1.2157056331634521, "learning_rate": 1.894467540159698e-05, "loss": 0.05305518507957459, "memory(GiB)": 122.96, "step": 46790, "token_acc": 0.9764243614931237, "train_speed(iter/s)": 0.232523 }, { "epoch": 3.5669639454226694, "grad_norm": 0.5284175872802734, "learning_rate": 1.8935292389350095e-05, "loss": 0.045314455032348634, "memory(GiB)": 122.96, "step": 46795, "token_acc": 0.9830148619957537, "train_speed(iter/s)": 0.232532 }, { "epoch": 3.5673450720329294, "grad_norm": 2.2114906311035156, "learning_rate": 1.8925911158520056e-05, "loss": 0.05562052130699158, "memory(GiB)": 122.96, "step": 46800, "token_acc": 0.9822499400335812, "train_speed(iter/s)": 0.232536 }, { "epoch": 3.5673450720329294, "eval_loss": 0.05934963747859001, "eval_runtime": 225.4803, "eval_samples_per_second": 2.351, "eval_steps_per_second": 2.351, "eval_token_acc": 0.9757620022890188, "step": 46800 }, { "epoch": 3.5677261986431894, "grad_norm": 1.445575475692749, "learning_rate": 1.8916531709644785e-05, "loss": 0.06269552111625672, "memory(GiB)": 122.96, "step": 46805, "token_acc": 0.9759651151656219, "train_speed(iter/s)": 0.232283 }, { "epoch": 3.5681073252534494, "grad_norm": 0.9567502737045288, "learning_rate": 1.8907154043262182e-05, "loss": 0.06412315964698792, "memory(GiB)": 122.96, "step": 46810, "token_acc": 0.9733428367783321, "train_speed(iter/s)": 0.232285 }, { "epoch": 3.568488451863709, "grad_norm": 2.387922763824463, "learning_rate": 1.8897778159910022e-05, "loss": 0.049033185839653014, "memory(GiB)": 122.96, "step": 46815, "token_acc": 0.9812274368231046, "train_speed(iter/s)": 0.232292 }, { "epoch": 3.568869578473969, "grad_norm": 0.9654027223587036, "learning_rate": 1.8888404060125937e-05, "loss": 0.061077868938446044, "memory(GiB)": 122.96, "step": 46820, "token_acc": 0.9791802212101497, "train_speed(iter/s)": 0.232295 }, { "epoch": 3.569250705084229, "grad_norm": 2.002194881439209, "learning_rate": 1.88790317444475e-05, "loss": 0.04631061851978302, "memory(GiB)": 122.96, "step": 46825, "token_acc": 0.9819653179190752, "train_speed(iter/s)": 0.232302 }, { "epoch": 3.569631831694489, "grad_norm": 1.6122676134109497, "learning_rate": 1.8869661213412177e-05, "loss": 0.04250850081443787, "memory(GiB)": 122.96, "step": 46830, "token_acc": 0.9836481981266868, "train_speed(iter/s)": 0.232305 }, { "epoch": 3.5700129583047486, "grad_norm": 1.1179932355880737, "learning_rate": 1.8860292467557323e-05, "loss": 0.05037444233894348, "memory(GiB)": 122.96, "step": 46835, "token_acc": 0.9829931972789115, "train_speed(iter/s)": 0.232313 }, { "epoch": 3.5703940849150086, "grad_norm": 0.6762893795967102, "learning_rate": 1.8850925507420204e-05, "loss": 0.06316714882850646, "memory(GiB)": 122.96, "step": 46840, "token_acc": 0.9788812785388128, "train_speed(iter/s)": 0.232316 }, { "epoch": 3.5707752115252687, "grad_norm": 1.1523724794387817, "learning_rate": 1.884156033353794e-05, "loss": 0.03417057991027832, "memory(GiB)": 122.96, "step": 46845, "token_acc": 0.9841864336246359, "train_speed(iter/s)": 0.232324 }, { "epoch": 3.5711563381355287, "grad_norm": 1.4642258882522583, "learning_rate": 1.88321969464476e-05, "loss": 0.04842900633811951, "memory(GiB)": 122.96, "step": 46850, "token_acc": 0.9810233005044439, "train_speed(iter/s)": 0.232328 }, { "epoch": 3.5715374647457887, "grad_norm": 1.0618579387664795, "learning_rate": 1.882283534668614e-05, "loss": 0.05420316457748413, "memory(GiB)": 122.96, "step": 46855, "token_acc": 0.9803828206837444, "train_speed(iter/s)": 0.232327 }, { "epoch": 3.5719185913560487, "grad_norm": 0.6396664977073669, "learning_rate": 1.8813475534790382e-05, "loss": 0.062465840578079225, "memory(GiB)": 122.96, "step": 46860, "token_acc": 0.9780353874313605, "train_speed(iter/s)": 0.232332 }, { "epoch": 3.5722997179663083, "grad_norm": 1.0759085416793823, "learning_rate": 1.880411751129708e-05, "loss": 0.06518712043762206, "memory(GiB)": 122.96, "step": 46865, "token_acc": 0.9736356446084893, "train_speed(iter/s)": 0.232339 }, { "epoch": 3.5726808445765683, "grad_norm": 2.05277156829834, "learning_rate": 1.8794761276742884e-05, "loss": 0.08465058207511902, "memory(GiB)": 122.96, "step": 46870, "token_acc": 0.9679874048806087, "train_speed(iter/s)": 0.232345 }, { "epoch": 3.5730619711868283, "grad_norm": 0.9153910279273987, "learning_rate": 1.8785406831664305e-05, "loss": 0.0397034227848053, "memory(GiB)": 122.96, "step": 46875, "token_acc": 0.9817732468334878, "train_speed(iter/s)": 0.232353 }, { "epoch": 3.5734430977970884, "grad_norm": 1.124255895614624, "learning_rate": 1.877605417659779e-05, "loss": 0.05024414658546448, "memory(GiB)": 122.96, "step": 46880, "token_acc": 0.9723623601667032, "train_speed(iter/s)": 0.232359 }, { "epoch": 3.573824224407348, "grad_norm": 0.704498827457428, "learning_rate": 1.8766703312079693e-05, "loss": 0.05106344819068909, "memory(GiB)": 122.96, "step": 46885, "token_acc": 0.9827665441176471, "train_speed(iter/s)": 0.232363 }, { "epoch": 3.574205351017608, "grad_norm": 0.635569155216217, "learning_rate": 1.8757354238646195e-05, "loss": 0.044501110911369324, "memory(GiB)": 122.96, "step": 46890, "token_acc": 0.9830637007077856, "train_speed(iter/s)": 0.232371 }, { "epoch": 3.574586477627868, "grad_norm": 1.60356605052948, "learning_rate": 1.8748006956833453e-05, "loss": 0.07547378540039062, "memory(GiB)": 122.96, "step": 46895, "token_acc": 0.9784370698883621, "train_speed(iter/s)": 0.232374 }, { "epoch": 3.574967604238128, "grad_norm": 0.8736376762390137, "learning_rate": 1.8738661467177502e-05, "loss": 0.05892327427864075, "memory(GiB)": 122.96, "step": 46900, "token_acc": 0.9782768237193601, "train_speed(iter/s)": 0.232376 }, { "epoch": 3.575348730848388, "grad_norm": 2.3551392555236816, "learning_rate": 1.872931777021423e-05, "loss": 0.07203641533851624, "memory(GiB)": 122.96, "step": 46905, "token_acc": 0.978563015312132, "train_speed(iter/s)": 0.232384 }, { "epoch": 3.575729857458648, "grad_norm": 1.2258864641189575, "learning_rate": 1.8719975866479467e-05, "loss": 0.061588054895401, "memory(GiB)": 122.96, "step": 46910, "token_acc": 0.9720354523227384, "train_speed(iter/s)": 0.232388 }, { "epoch": 3.5761109840689076, "grad_norm": 0.7664706110954285, "learning_rate": 1.871063575650895e-05, "loss": 0.053876572847366334, "memory(GiB)": 122.96, "step": 46915, "token_acc": 0.9796407185628743, "train_speed(iter/s)": 0.232393 }, { "epoch": 3.5764921106791676, "grad_norm": 1.612284779548645, "learning_rate": 1.8701297440838255e-05, "loss": 0.08210026025772095, "memory(GiB)": 122.96, "step": 46920, "token_acc": 0.9716334164588528, "train_speed(iter/s)": 0.232397 }, { "epoch": 3.5768732372894276, "grad_norm": 1.2753187417984009, "learning_rate": 1.8691960920002905e-05, "loss": 0.05902968049049377, "memory(GiB)": 122.96, "step": 46925, "token_acc": 0.9787444389520514, "train_speed(iter/s)": 0.2324 }, { "epoch": 3.577254363899687, "grad_norm": 1.8052647113800049, "learning_rate": 1.868262619453833e-05, "loss": 0.0838977575302124, "memory(GiB)": 122.96, "step": 46930, "token_acc": 0.9741750358680057, "train_speed(iter/s)": 0.232405 }, { "epoch": 3.5776354905099472, "grad_norm": 0.6171461939811707, "learning_rate": 1.867329326497979e-05, "loss": 0.043978333473205566, "memory(GiB)": 122.96, "step": 46935, "token_acc": 0.9810269537720223, "train_speed(iter/s)": 0.23241 }, { "epoch": 3.5780166171202072, "grad_norm": 0.6597920656204224, "learning_rate": 1.866396213186251e-05, "loss": 0.044922256469726564, "memory(GiB)": 122.96, "step": 46940, "token_acc": 0.9790832220738763, "train_speed(iter/s)": 0.232414 }, { "epoch": 3.5783977437304673, "grad_norm": 1.5426667928695679, "learning_rate": 1.8654632795721606e-05, "loss": 0.043079647421836856, "memory(GiB)": 122.96, "step": 46945, "token_acc": 0.9848305752561072, "train_speed(iter/s)": 0.232418 }, { "epoch": 3.5787788703407273, "grad_norm": 1.4843192100524902, "learning_rate": 1.8645305257092033e-05, "loss": 0.08024131059646607, "memory(GiB)": 122.96, "step": 46950, "token_acc": 0.9701865423051299, "train_speed(iter/s)": 0.232423 }, { "epoch": 3.5791599969509873, "grad_norm": 1.2520023584365845, "learning_rate": 1.8635979516508716e-05, "loss": 0.06362650394439698, "memory(GiB)": 122.96, "step": 46955, "token_acc": 0.9773175542406312, "train_speed(iter/s)": 0.232428 }, { "epoch": 3.5795411235612473, "grad_norm": 1.5821611881256104, "learning_rate": 1.862665557450639e-05, "loss": 0.0647797167301178, "memory(GiB)": 122.96, "step": 46960, "token_acc": 0.9693165969316597, "train_speed(iter/s)": 0.232437 }, { "epoch": 3.579922250171507, "grad_norm": 0.8349012732505798, "learning_rate": 1.8617333431619793e-05, "loss": 0.06424868106842041, "memory(GiB)": 122.96, "step": 46965, "token_acc": 0.9721743412952475, "train_speed(iter/s)": 0.232443 }, { "epoch": 3.580303376781767, "grad_norm": 2.1380057334899902, "learning_rate": 1.8608013088383515e-05, "loss": 0.04805622398853302, "memory(GiB)": 122.96, "step": 46970, "token_acc": 0.9792161520190024, "train_speed(iter/s)": 0.232451 }, { "epoch": 3.580684503392027, "grad_norm": 1.549740195274353, "learning_rate": 1.8598694545331984e-05, "loss": 0.06654325127601624, "memory(GiB)": 122.96, "step": 46975, "token_acc": 0.9720683287165282, "train_speed(iter/s)": 0.232457 }, { "epoch": 3.5810656300022865, "grad_norm": 1.0449352264404297, "learning_rate": 1.8589377802999606e-05, "loss": 0.06904310584068299, "memory(GiB)": 122.96, "step": 46980, "token_acc": 0.981242436466317, "train_speed(iter/s)": 0.232462 }, { "epoch": 3.5814467566125465, "grad_norm": 0.9228067994117737, "learning_rate": 1.858006286192066e-05, "loss": 0.033990538120269774, "memory(GiB)": 122.96, "step": 46985, "token_acc": 0.986126224156692, "train_speed(iter/s)": 0.23247 }, { "epoch": 3.5818278832228065, "grad_norm": 1.631448745727539, "learning_rate": 1.8570749722629295e-05, "loss": 0.032961130142211914, "memory(GiB)": 122.96, "step": 46990, "token_acc": 0.9821741206430049, "train_speed(iter/s)": 0.232474 }, { "epoch": 3.5822090098330666, "grad_norm": 1.1341471672058105, "learning_rate": 1.8561438385659592e-05, "loss": 0.061360675096511844, "memory(GiB)": 122.96, "step": 46995, "token_acc": 0.9734345351043643, "train_speed(iter/s)": 0.232479 }, { "epoch": 3.5825901364433266, "grad_norm": 0.9918317198753357, "learning_rate": 1.8552128851545493e-05, "loss": 0.05981239080429077, "memory(GiB)": 122.96, "step": 47000, "token_acc": 0.9752860411899313, "train_speed(iter/s)": 0.232481 }, { "epoch": 3.5825901364433266, "eval_loss": 0.05900820717215538, "eval_runtime": 221.918, "eval_samples_per_second": 2.388, "eval_steps_per_second": 2.388, "eval_token_acc": 0.9757921209565689, "step": 47000 }, { "epoch": 3.5829712630535866, "grad_norm": 0.7144765853881836, "learning_rate": 1.8542821120820863e-05, "loss": 0.03040274977684021, "memory(GiB)": 122.96, "step": 47005, "token_acc": 0.9762360537706605, "train_speed(iter/s)": 0.232229 }, { "epoch": 3.583352389663846, "grad_norm": 3.652787446975708, "learning_rate": 1.8533515194019486e-05, "loss": 0.06989901065826416, "memory(GiB)": 122.96, "step": 47010, "token_acc": 0.9806931844615027, "train_speed(iter/s)": 0.232233 }, { "epoch": 3.583733516274106, "grad_norm": 1.489815354347229, "learning_rate": 1.8524211071674967e-05, "loss": 0.05056637525558472, "memory(GiB)": 122.96, "step": 47015, "token_acc": 0.981000802782981, "train_speed(iter/s)": 0.232236 }, { "epoch": 3.584114642884366, "grad_norm": 0.7473633289337158, "learning_rate": 1.851490875432088e-05, "loss": 0.04124007225036621, "memory(GiB)": 122.96, "step": 47020, "token_acc": 0.9846030793841232, "train_speed(iter/s)": 0.232241 }, { "epoch": 3.5844957694946262, "grad_norm": 1.7853816747665405, "learning_rate": 1.8505608242490686e-05, "loss": 0.0627809464931488, "memory(GiB)": 122.96, "step": 47025, "token_acc": 0.9699606380774808, "train_speed(iter/s)": 0.232247 }, { "epoch": 3.584876896104886, "grad_norm": 0.7042953968048096, "learning_rate": 1.8496309536717686e-05, "loss": 0.05734219551086426, "memory(GiB)": 122.96, "step": 47030, "token_acc": 0.9812179016874542, "train_speed(iter/s)": 0.232249 }, { "epoch": 3.585258022715146, "grad_norm": 1.3742696046829224, "learning_rate": 1.8487012637535144e-05, "loss": 0.05341324806213379, "memory(GiB)": 122.96, "step": 47035, "token_acc": 0.9751764973810066, "train_speed(iter/s)": 0.232255 }, { "epoch": 3.585639149325406, "grad_norm": 1.2992419004440308, "learning_rate": 1.847771754547621e-05, "loss": 0.04224950075149536, "memory(GiB)": 122.96, "step": 47040, "token_acc": 0.9801421917136554, "train_speed(iter/s)": 0.232262 }, { "epoch": 3.586020275935666, "grad_norm": 1.1067533493041992, "learning_rate": 1.846842426107387e-05, "loss": 0.042961719632148745, "memory(GiB)": 122.96, "step": 47045, "token_acc": 0.9802660753880266, "train_speed(iter/s)": 0.232267 }, { "epoch": 3.586401402545926, "grad_norm": 1.7166286706924438, "learning_rate": 1.8459132784861073e-05, "loss": 0.03731703758239746, "memory(GiB)": 122.96, "step": 47050, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.232276 }, { "epoch": 3.586782529156186, "grad_norm": 0.6317066550254822, "learning_rate": 1.8449843117370664e-05, "loss": 0.0366670548915863, "memory(GiB)": 122.96, "step": 47055, "token_acc": 0.9834983498349835, "train_speed(iter/s)": 0.232281 }, { "epoch": 3.5871636557664455, "grad_norm": 1.8171716928482056, "learning_rate": 1.844055525913532e-05, "loss": 0.07504984140396118, "memory(GiB)": 122.96, "step": 47060, "token_acc": 0.979417268110131, "train_speed(iter/s)": 0.232288 }, { "epoch": 3.5875447823767055, "grad_norm": 1.2058758735656738, "learning_rate": 1.8431269210687685e-05, "loss": 0.06833293437957763, "memory(GiB)": 122.96, "step": 47065, "token_acc": 0.9765616111047901, "train_speed(iter/s)": 0.232289 }, { "epoch": 3.5879259089869655, "grad_norm": 1.2428706884384155, "learning_rate": 1.8421984972560276e-05, "loss": 0.05005948543548584, "memory(GiB)": 122.96, "step": 47070, "token_acc": 0.9724592707525214, "train_speed(iter/s)": 0.232297 }, { "epoch": 3.5883070355972255, "grad_norm": 1.1434329748153687, "learning_rate": 1.8412702545285472e-05, "loss": 0.06922175884246826, "memory(GiB)": 122.96, "step": 47075, "token_acc": 0.9771948129378447, "train_speed(iter/s)": 0.232298 }, { "epoch": 3.588688162207485, "grad_norm": 1.4985953569412231, "learning_rate": 1.8403421929395597e-05, "loss": 0.029758870601654053, "memory(GiB)": 122.96, "step": 47080, "token_acc": 0.985827664399093, "train_speed(iter/s)": 0.232308 }, { "epoch": 3.589069288817745, "grad_norm": 2.395111083984375, "learning_rate": 1.8394143125422864e-05, "loss": 0.04621181488037109, "memory(GiB)": 122.96, "step": 47085, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.232313 }, { "epoch": 3.589450415428005, "grad_norm": 1.20294988155365, "learning_rate": 1.8384866133899335e-05, "loss": 0.03152124285697937, "memory(GiB)": 122.96, "step": 47090, "token_acc": 0.9848134697920106, "train_speed(iter/s)": 0.23232 }, { "epoch": 3.589831542038265, "grad_norm": 1.6464755535125732, "learning_rate": 1.837559095535702e-05, "loss": 0.07433818578720093, "memory(GiB)": 122.96, "step": 47095, "token_acc": 0.9772357723577236, "train_speed(iter/s)": 0.232324 }, { "epoch": 3.590212668648525, "grad_norm": 0.6206871867179871, "learning_rate": 1.8366317590327816e-05, "loss": 0.060637271404266356, "memory(GiB)": 122.96, "step": 47100, "token_acc": 0.9817715019255455, "train_speed(iter/s)": 0.23233 }, { "epoch": 3.590593795258785, "grad_norm": 0.4666770100593567, "learning_rate": 1.835704603934349e-05, "loss": 0.06033395528793335, "memory(GiB)": 122.96, "step": 47105, "token_acc": 0.9732722413134784, "train_speed(iter/s)": 0.232339 }, { "epoch": 3.5909749218690448, "grad_norm": 0.9988734126091003, "learning_rate": 1.8347776302935755e-05, "loss": 0.040096724033355714, "memory(GiB)": 122.96, "step": 47110, "token_acc": 0.9815795045521915, "train_speed(iter/s)": 0.232345 }, { "epoch": 3.591356048479305, "grad_norm": 1.224653959274292, "learning_rate": 1.833850838163615e-05, "loss": 0.05620102882385254, "memory(GiB)": 122.96, "step": 47115, "token_acc": 0.9775464857911356, "train_speed(iter/s)": 0.232346 }, { "epoch": 3.591737175089565, "grad_norm": 1.6122561693191528, "learning_rate": 1.8329242275976154e-05, "loss": 0.05222952365875244, "memory(GiB)": 122.96, "step": 47120, "token_acc": 0.9843302697001657, "train_speed(iter/s)": 0.232349 }, { "epoch": 3.592118301699825, "grad_norm": 1.1553281545639038, "learning_rate": 1.831997798648717e-05, "loss": 0.06638657450675964, "memory(GiB)": 122.96, "step": 47125, "token_acc": 0.9768054823405377, "train_speed(iter/s)": 0.232356 }, { "epoch": 3.5924994283100844, "grad_norm": 0.8611319661140442, "learning_rate": 1.8310715513700412e-05, "loss": 0.06013938784599304, "memory(GiB)": 122.96, "step": 47130, "token_acc": 0.9765549312148808, "train_speed(iter/s)": 0.232359 }, { "epoch": 3.5928805549203444, "grad_norm": 0.928214430809021, "learning_rate": 1.8301454858147065e-05, "loss": 0.044680291414260866, "memory(GiB)": 122.96, "step": 47135, "token_acc": 0.9810979847116053, "train_speed(iter/s)": 0.232361 }, { "epoch": 3.5932616815306044, "grad_norm": 0.9823131561279297, "learning_rate": 1.82921960203582e-05, "loss": 0.0658108115196228, "memory(GiB)": 122.96, "step": 47140, "token_acc": 0.9657802964254577, "train_speed(iter/s)": 0.232367 }, { "epoch": 3.5936428081408645, "grad_norm": 0.4703455865383148, "learning_rate": 1.828293900086473e-05, "loss": 0.04294630289077759, "memory(GiB)": 122.96, "step": 47145, "token_acc": 0.9869894099848714, "train_speed(iter/s)": 0.232369 }, { "epoch": 3.5940239347511245, "grad_norm": 1.4363523721694946, "learning_rate": 1.827368380019753e-05, "loss": 0.05068590044975281, "memory(GiB)": 122.96, "step": 47150, "token_acc": 0.9843808107103013, "train_speed(iter/s)": 0.232375 }, { "epoch": 3.5944050613613845, "grad_norm": 1.2436727285385132, "learning_rate": 1.8264430418887347e-05, "loss": 0.06382756829261779, "memory(GiB)": 122.96, "step": 47155, "token_acc": 0.9737015121630507, "train_speed(iter/s)": 0.23238 }, { "epoch": 3.594786187971644, "grad_norm": 0.9764981269836426, "learning_rate": 1.8255178857464796e-05, "loss": 0.04905833005905151, "memory(GiB)": 122.96, "step": 47160, "token_acc": 0.9853747714808044, "train_speed(iter/s)": 0.232388 }, { "epoch": 3.595167314581904, "grad_norm": 0.7267060279846191, "learning_rate": 1.824592911646042e-05, "loss": 0.04492635130882263, "memory(GiB)": 122.96, "step": 47165, "token_acc": 0.9805346127484579, "train_speed(iter/s)": 0.232392 }, { "epoch": 3.595548441192164, "grad_norm": 2.544975757598877, "learning_rate": 1.823668119640467e-05, "loss": 0.07157142162322998, "memory(GiB)": 122.96, "step": 47170, "token_acc": 0.9739683763979946, "train_speed(iter/s)": 0.232396 }, { "epoch": 3.595929567802424, "grad_norm": 1.6592223644256592, "learning_rate": 1.822743509782784e-05, "loss": 0.041348579525947574, "memory(GiB)": 122.96, "step": 47175, "token_acc": 0.9815270935960592, "train_speed(iter/s)": 0.232401 }, { "epoch": 3.5963106944126837, "grad_norm": 1.409091830253601, "learning_rate": 1.8218190821260178e-05, "loss": 0.05678023099899292, "memory(GiB)": 122.96, "step": 47180, "token_acc": 0.9798689138576779, "train_speed(iter/s)": 0.232409 }, { "epoch": 3.5966918210229437, "grad_norm": 1.4399526119232178, "learning_rate": 1.820894836723177e-05, "loss": 0.07048658132553101, "memory(GiB)": 122.96, "step": 47185, "token_acc": 0.9730085582620145, "train_speed(iter/s)": 0.232415 }, { "epoch": 3.5970729476332037, "grad_norm": 1.274295449256897, "learning_rate": 1.8199707736272643e-05, "loss": 0.05799223184585571, "memory(GiB)": 122.96, "step": 47190, "token_acc": 0.9809111324991979, "train_speed(iter/s)": 0.232417 }, { "epoch": 3.5974540742434638, "grad_norm": 0.9154659509658813, "learning_rate": 1.8190468928912723e-05, "loss": 0.0732722520828247, "memory(GiB)": 122.96, "step": 47195, "token_acc": 0.9721213881225498, "train_speed(iter/s)": 0.232419 }, { "epoch": 3.5978352008537238, "grad_norm": 1.6220835447311401, "learning_rate": 1.8181231945681782e-05, "loss": 0.06106681227684021, "memory(GiB)": 122.96, "step": 47200, "token_acc": 0.9771508510142224, "train_speed(iter/s)": 0.232424 }, { "epoch": 3.5978352008537238, "eval_loss": 0.0594787560403347, "eval_runtime": 219.1843, "eval_samples_per_second": 2.418, "eval_steps_per_second": 2.418, "eval_token_acc": 0.9756716462863683, "step": 47200 }, { "epoch": 3.598216327463984, "grad_norm": 1.8037583827972412, "learning_rate": 1.817199678710953e-05, "loss": 0.04851398468017578, "memory(GiB)": 122.96, "step": 47205, "token_acc": 0.9758176269969485, "train_speed(iter/s)": 0.232177 }, { "epoch": 3.5985974540742434, "grad_norm": 1.1298267841339111, "learning_rate": 1.8162763453725586e-05, "loss": 0.03791375160217285, "memory(GiB)": 122.96, "step": 47210, "token_acc": 0.9847009735744089, "train_speed(iter/s)": 0.232186 }, { "epoch": 3.5989785806845034, "grad_norm": 0.8652254343032837, "learning_rate": 1.815353194605939e-05, "loss": 0.0533711850643158, "memory(GiB)": 122.96, "step": 47215, "token_acc": 0.9774600504625736, "train_speed(iter/s)": 0.232184 }, { "epoch": 3.5993597072947634, "grad_norm": 1.8170115947723389, "learning_rate": 1.814430226464035e-05, "loss": 0.0678840696811676, "memory(GiB)": 122.96, "step": 47220, "token_acc": 0.9744655581947743, "train_speed(iter/s)": 0.232191 }, { "epoch": 3.599740833905023, "grad_norm": 0.9284037947654724, "learning_rate": 1.8135074409997764e-05, "loss": 0.05981945991516113, "memory(GiB)": 122.96, "step": 47225, "token_acc": 0.9757004711133151, "train_speed(iter/s)": 0.232198 }, { "epoch": 3.600121960515283, "grad_norm": 1.2877708673477173, "learning_rate": 1.812584838266076e-05, "loss": 0.06271784901618957, "memory(GiB)": 122.96, "step": 47230, "token_acc": 0.975729034595182, "train_speed(iter/s)": 0.232203 }, { "epoch": 3.600503087125543, "grad_norm": 0.989707887172699, "learning_rate": 1.8116624183158472e-05, "loss": 0.051410382986068724, "memory(GiB)": 122.96, "step": 47235, "token_acc": 0.9796802131912059, "train_speed(iter/s)": 0.232207 }, { "epoch": 3.600884213735803, "grad_norm": 0.9919970631599426, "learning_rate": 1.810740181201981e-05, "loss": 0.053725212812423706, "memory(GiB)": 122.96, "step": 47240, "token_acc": 0.9758981355161437, "train_speed(iter/s)": 0.232213 }, { "epoch": 3.601265340346063, "grad_norm": 1.095314621925354, "learning_rate": 1.8098181269773655e-05, "loss": 0.020884917676448823, "memory(GiB)": 122.96, "step": 47245, "token_acc": 0.9896994496966276, "train_speed(iter/s)": 0.232216 }, { "epoch": 3.601646466956323, "grad_norm": 0.9280776977539062, "learning_rate": 1.808896255694878e-05, "loss": 0.0473389208316803, "memory(GiB)": 122.96, "step": 47250, "token_acc": 0.9808585503166783, "train_speed(iter/s)": 0.232219 }, { "epoch": 3.602027593566583, "grad_norm": 0.8161941170692444, "learning_rate": 1.8079745674073795e-05, "loss": 0.051980823278427124, "memory(GiB)": 122.96, "step": 47255, "token_acc": 0.9792582176129373, "train_speed(iter/s)": 0.232224 }, { "epoch": 3.6024087201768427, "grad_norm": 2.988579511642456, "learning_rate": 1.807053062167727e-05, "loss": 0.05582446455955505, "memory(GiB)": 122.96, "step": 47260, "token_acc": 0.9851345922057051, "train_speed(iter/s)": 0.232231 }, { "epoch": 3.6027898467871027, "grad_norm": 1.264708161354065, "learning_rate": 1.8061317400287665e-05, "loss": 0.04419417381286621, "memory(GiB)": 122.96, "step": 47265, "token_acc": 0.9815414964069326, "train_speed(iter/s)": 0.232236 }, { "epoch": 3.6031709733973627, "grad_norm": 0.5623130202293396, "learning_rate": 1.8052106010433267e-05, "loss": 0.04226417541503906, "memory(GiB)": 122.96, "step": 47270, "token_acc": 0.9858323494687131, "train_speed(iter/s)": 0.232242 }, { "epoch": 3.6035521000076223, "grad_norm": 1.45103120803833, "learning_rate": 1.8042896452642344e-05, "loss": 0.08083772659301758, "memory(GiB)": 122.96, "step": 47275, "token_acc": 0.9679554162312783, "train_speed(iter/s)": 0.232246 }, { "epoch": 3.6039332266178823, "grad_norm": 0.28289496898651123, "learning_rate": 1.8033688727443022e-05, "loss": 0.04851097464561462, "memory(GiB)": 122.96, "step": 47280, "token_acc": 0.9784285714285714, "train_speed(iter/s)": 0.232247 }, { "epoch": 3.6043143532281423, "grad_norm": 1.2884248495101929, "learning_rate": 1.8024482835363287e-05, "loss": 0.03840899765491486, "memory(GiB)": 122.96, "step": 47285, "token_acc": 0.9843689695098418, "train_speed(iter/s)": 0.232252 }, { "epoch": 3.6046954798384023, "grad_norm": 0.7698943018913269, "learning_rate": 1.8015278776931084e-05, "loss": 0.03828598260879516, "memory(GiB)": 122.96, "step": 47290, "token_acc": 0.9742742916739092, "train_speed(iter/s)": 0.232258 }, { "epoch": 3.6050766064486623, "grad_norm": 0.9557726979255676, "learning_rate": 1.8006076552674234e-05, "loss": 0.05516451597213745, "memory(GiB)": 122.96, "step": 47295, "token_acc": 0.9760016270083384, "train_speed(iter/s)": 0.232264 }, { "epoch": 3.6054577330589224, "grad_norm": 1.8380497694015503, "learning_rate": 1.79968761631204e-05, "loss": 0.05986540317535401, "memory(GiB)": 122.96, "step": 47300, "token_acc": 0.9769004619907602, "train_speed(iter/s)": 0.23227 }, { "epoch": 3.605838859669182, "grad_norm": 0.7367437481880188, "learning_rate": 1.798767760879721e-05, "loss": 0.053400707244873044, "memory(GiB)": 122.96, "step": 47305, "token_acc": 0.9818294445591575, "train_speed(iter/s)": 0.232274 }, { "epoch": 3.606219986279442, "grad_norm": 0.8276212215423584, "learning_rate": 1.797848089023217e-05, "loss": 0.06602771282196045, "memory(GiB)": 122.96, "step": 47310, "token_acc": 0.9771808579997392, "train_speed(iter/s)": 0.232278 }, { "epoch": 3.606601112889702, "grad_norm": 1.6977864503860474, "learning_rate": 1.7969286007952636e-05, "loss": 0.07041627168655396, "memory(GiB)": 122.96, "step": 47315, "token_acc": 0.9725330620549338, "train_speed(iter/s)": 0.23228 }, { "epoch": 3.606982239499962, "grad_norm": 0.8128845691680908, "learning_rate": 1.796009296248591e-05, "loss": 0.04533239901065826, "memory(GiB)": 122.96, "step": 47320, "token_acc": 0.9820182183840057, "train_speed(iter/s)": 0.232283 }, { "epoch": 3.6073633661102216, "grad_norm": 1.160190463066101, "learning_rate": 1.7950901754359185e-05, "loss": 0.0728375792503357, "memory(GiB)": 122.96, "step": 47325, "token_acc": 0.9739069111424542, "train_speed(iter/s)": 0.232289 }, { "epoch": 3.6077444927204816, "grad_norm": 0.9889615774154663, "learning_rate": 1.79417123840995e-05, "loss": 0.04795294106006622, "memory(GiB)": 122.96, "step": 47330, "token_acc": 0.9832434514637904, "train_speed(iter/s)": 0.232294 }, { "epoch": 3.6081256193307416, "grad_norm": 2.9192662239074707, "learning_rate": 1.7932524852233845e-05, "loss": 0.0460934042930603, "memory(GiB)": 122.96, "step": 47335, "token_acc": 0.9794225024892134, "train_speed(iter/s)": 0.232302 }, { "epoch": 3.6085067459410016, "grad_norm": 1.1166999340057373, "learning_rate": 1.7923339159289094e-05, "loss": 0.045385292172431944, "memory(GiB)": 122.96, "step": 47340, "token_acc": 0.9801295896328294, "train_speed(iter/s)": 0.232307 }, { "epoch": 3.6088878725512616, "grad_norm": 0.3596144914627075, "learning_rate": 1.7914155305791968e-05, "loss": 0.05092054009437561, "memory(GiB)": 122.96, "step": 47345, "token_acc": 0.9796782387806944, "train_speed(iter/s)": 0.232313 }, { "epoch": 3.6092689991615217, "grad_norm": 0.9796237349510193, "learning_rate": 1.790497329226916e-05, "loss": 0.04205400943756103, "memory(GiB)": 122.96, "step": 47350, "token_acc": 0.9751369574378423, "train_speed(iter/s)": 0.232319 }, { "epoch": 3.6096501257717812, "grad_norm": 0.7389909029006958, "learning_rate": 1.7895793119247172e-05, "loss": 0.04233803749084473, "memory(GiB)": 122.96, "step": 47355, "token_acc": 0.9851422815411736, "train_speed(iter/s)": 0.232325 }, { "epoch": 3.6100312523820413, "grad_norm": 1.139735460281372, "learning_rate": 1.788661478725246e-05, "loss": 0.038258200883865355, "memory(GiB)": 122.96, "step": 47360, "token_acc": 0.9810450819672131, "train_speed(iter/s)": 0.232334 }, { "epoch": 3.6104123789923013, "grad_norm": 0.41071707010269165, "learning_rate": 1.787743829681139e-05, "loss": 0.031751468777656555, "memory(GiB)": 122.96, "step": 47365, "token_acc": 0.9817559863169898, "train_speed(iter/s)": 0.232341 }, { "epoch": 3.6107935056025613, "grad_norm": 1.9983606338500977, "learning_rate": 1.7868263648450157e-05, "loss": 0.0705912947654724, "memory(GiB)": 122.96, "step": 47370, "token_acc": 0.9737092176116566, "train_speed(iter/s)": 0.232348 }, { "epoch": 3.611174632212821, "grad_norm": 1.2292011976242065, "learning_rate": 1.7859090842694887e-05, "loss": 0.03917264938354492, "memory(GiB)": 122.96, "step": 47375, "token_acc": 0.9853192215773301, "train_speed(iter/s)": 0.232355 }, { "epoch": 3.611555758823081, "grad_norm": 1.748838186264038, "learning_rate": 1.7849919880071625e-05, "loss": 0.06175339221954346, "memory(GiB)": 122.96, "step": 47380, "token_acc": 0.9779843444227005, "train_speed(iter/s)": 0.23236 }, { "epoch": 3.611936885433341, "grad_norm": 0.7711045742034912, "learning_rate": 1.7840750761106246e-05, "loss": 0.04176511764526367, "memory(GiB)": 122.96, "step": 47385, "token_acc": 0.9818682579756713, "train_speed(iter/s)": 0.232365 }, { "epoch": 3.612318012043601, "grad_norm": 0.7081757187843323, "learning_rate": 1.7831583486324594e-05, "loss": 0.06046620011329651, "memory(GiB)": 122.96, "step": 47390, "token_acc": 0.9765491535952787, "train_speed(iter/s)": 0.23237 }, { "epoch": 3.612699138653861, "grad_norm": 1.0125676393508911, "learning_rate": 1.7822418056252325e-05, "loss": 0.061809098720550536, "memory(GiB)": 122.96, "step": 47395, "token_acc": 0.9704286489419425, "train_speed(iter/s)": 0.232376 }, { "epoch": 3.613080265264121, "grad_norm": 0.8500528931617737, "learning_rate": 1.781325447141507e-05, "loss": 0.05391446352005005, "memory(GiB)": 122.96, "step": 47400, "token_acc": 0.9803411860276198, "train_speed(iter/s)": 0.23238 }, { "epoch": 3.613080265264121, "eval_loss": 0.05899503454566002, "eval_runtime": 223.6561, "eval_samples_per_second": 2.37, "eval_steps_per_second": 2.37, "eval_token_acc": 0.9760481296307452, "step": 47400 }, { "epoch": 3.6134613918743805, "grad_norm": 0.7402609586715698, "learning_rate": 1.7804092732338312e-05, "loss": 0.03917487859725952, "memory(GiB)": 122.96, "step": 47405, "token_acc": 0.9762316742081448, "train_speed(iter/s)": 0.23213 }, { "epoch": 3.6138425184846406, "grad_norm": 1.134986162185669, "learning_rate": 1.7794932839547418e-05, "loss": 0.07701539993286133, "memory(GiB)": 122.96, "step": 47410, "token_acc": 0.970783015192832, "train_speed(iter/s)": 0.232137 }, { "epoch": 3.6142236450949006, "grad_norm": 0.90165114402771, "learning_rate": 1.7785774793567673e-05, "loss": 0.045980268716812135, "memory(GiB)": 122.96, "step": 47415, "token_acc": 0.9791039156626506, "train_speed(iter/s)": 0.232142 }, { "epoch": 3.6146047717051606, "grad_norm": 1.128702163696289, "learning_rate": 1.777661859492427e-05, "loss": 0.04720664620399475, "memory(GiB)": 122.96, "step": 47420, "token_acc": 0.9821344616831218, "train_speed(iter/s)": 0.232147 }, { "epoch": 3.61498589831542, "grad_norm": 0.4910225570201874, "learning_rate": 1.776746424414224e-05, "loss": 0.03528856337070465, "memory(GiB)": 122.96, "step": 47425, "token_acc": 0.9819347319347319, "train_speed(iter/s)": 0.232156 }, { "epoch": 3.61536702492568, "grad_norm": 0.8407129049301147, "learning_rate": 1.775831174174655e-05, "loss": 0.025632518529891967, "memory(GiB)": 122.96, "step": 47430, "token_acc": 0.9868977176669484, "train_speed(iter/s)": 0.232161 }, { "epoch": 3.61574815153594, "grad_norm": 1.3456968069076538, "learning_rate": 1.774916108826209e-05, "loss": 0.052316421270370485, "memory(GiB)": 122.96, "step": 47435, "token_acc": 0.9684252597921663, "train_speed(iter/s)": 0.232169 }, { "epoch": 3.6161292781462, "grad_norm": 0.7614882588386536, "learning_rate": 1.774001228421356e-05, "loss": 0.044124957919120786, "memory(GiB)": 122.96, "step": 47440, "token_acc": 0.9784770669437435, "train_speed(iter/s)": 0.232173 }, { "epoch": 3.6165104047564602, "grad_norm": 1.2382704019546509, "learning_rate": 1.7730865330125617e-05, "loss": 0.04821193218231201, "memory(GiB)": 122.96, "step": 47445, "token_acc": 0.9781255892890817, "train_speed(iter/s)": 0.232179 }, { "epoch": 3.6168915313667203, "grad_norm": 1.0875635147094727, "learning_rate": 1.772172022652282e-05, "loss": 0.062191063165664674, "memory(GiB)": 122.96, "step": 47450, "token_acc": 0.9796722009365687, "train_speed(iter/s)": 0.232181 }, { "epoch": 3.61727265797698, "grad_norm": 1.3738534450531006, "learning_rate": 1.7712576973929557e-05, "loss": 0.06152011156082153, "memory(GiB)": 122.96, "step": 47455, "token_acc": 0.9779661016949153, "train_speed(iter/s)": 0.232186 }, { "epoch": 3.61765378458724, "grad_norm": 0.548699676990509, "learning_rate": 1.7703435572870174e-05, "loss": 0.02621033489704132, "memory(GiB)": 122.96, "step": 47460, "token_acc": 0.9877650897226754, "train_speed(iter/s)": 0.232194 }, { "epoch": 3.6180349111975, "grad_norm": 1.2715814113616943, "learning_rate": 1.7694296023868907e-05, "loss": 0.05066499710083008, "memory(GiB)": 122.96, "step": 47465, "token_acc": 0.9796938456732271, "train_speed(iter/s)": 0.232197 }, { "epoch": 3.61841603780776, "grad_norm": 1.4759024381637573, "learning_rate": 1.768515832744983e-05, "loss": 0.06140434741973877, "memory(GiB)": 122.96, "step": 47470, "token_acc": 0.9739079448841982, "train_speed(iter/s)": 0.232203 }, { "epoch": 3.6187971644180195, "grad_norm": 0.786737859249115, "learning_rate": 1.767602248413696e-05, "loss": 0.09284164905548095, "memory(GiB)": 122.96, "step": 47475, "token_acc": 0.9698365185355745, "train_speed(iter/s)": 0.232208 }, { "epoch": 3.6191782910282795, "grad_norm": 1.1604071855545044, "learning_rate": 1.766688849445422e-05, "loss": 0.05211270451545715, "memory(GiB)": 122.96, "step": 47480, "token_acc": 0.9845417740154582, "train_speed(iter/s)": 0.232216 }, { "epoch": 3.6195594176385395, "grad_norm": 2.387571096420288, "learning_rate": 1.7657756358925358e-05, "loss": 0.07560851573944091, "memory(GiB)": 122.96, "step": 47485, "token_acc": 0.9802011313639221, "train_speed(iter/s)": 0.232221 }, { "epoch": 3.6199405442487995, "grad_norm": 1.6118093729019165, "learning_rate": 1.7648626078074086e-05, "loss": 0.07603476643562317, "memory(GiB)": 122.96, "step": 47490, "token_acc": 0.9752024291497976, "train_speed(iter/s)": 0.232229 }, { "epoch": 3.6203216708590595, "grad_norm": 1.0152109861373901, "learning_rate": 1.763949765242398e-05, "loss": 0.054453814029693605, "memory(GiB)": 122.96, "step": 47495, "token_acc": 0.9759233926128591, "train_speed(iter/s)": 0.232232 }, { "epoch": 3.6207027974693196, "grad_norm": 0.5905246138572693, "learning_rate": 1.7630371082498516e-05, "loss": 0.0884295403957367, "memory(GiB)": 122.96, "step": 47500, "token_acc": 0.975249500998004, "train_speed(iter/s)": 0.232236 }, { "epoch": 3.621083924079579, "grad_norm": 0.6929469108581543, "learning_rate": 1.7621246368821065e-05, "loss": 0.047768494486808775, "memory(GiB)": 122.96, "step": 47505, "token_acc": 0.9804221995233231, "train_speed(iter/s)": 0.232239 }, { "epoch": 3.621465050689839, "grad_norm": 1.5383646488189697, "learning_rate": 1.7612123511914868e-05, "loss": 0.07661871910095215, "memory(GiB)": 122.96, "step": 47510, "token_acc": 0.9713306325958242, "train_speed(iter/s)": 0.232246 }, { "epoch": 3.621846177300099, "grad_norm": 2.2604787349700928, "learning_rate": 1.760300251230309e-05, "loss": 0.06505971550941467, "memory(GiB)": 122.96, "step": 47515, "token_acc": 0.9698033707865169, "train_speed(iter/s)": 0.232253 }, { "epoch": 3.622227303910359, "grad_norm": 1.6242302656173706, "learning_rate": 1.759388337050879e-05, "loss": 0.051475238800048825, "memory(GiB)": 122.96, "step": 47520, "token_acc": 0.976831091180867, "train_speed(iter/s)": 0.232259 }, { "epoch": 3.6226084305206188, "grad_norm": 1.0099433660507202, "learning_rate": 1.7584766087054884e-05, "loss": 0.04905773401260376, "memory(GiB)": 122.96, "step": 47525, "token_acc": 0.9858537557282328, "train_speed(iter/s)": 0.232263 }, { "epoch": 3.622989557130879, "grad_norm": 0.9047166705131531, "learning_rate": 1.7575650662464216e-05, "loss": 0.04197709858417511, "memory(GiB)": 122.96, "step": 47530, "token_acc": 0.9851354591225125, "train_speed(iter/s)": 0.23227 }, { "epoch": 3.623370683741139, "grad_norm": 1.2369072437286377, "learning_rate": 1.7566537097259535e-05, "loss": 0.04675836265087128, "memory(GiB)": 122.96, "step": 47535, "token_acc": 0.9841552782729254, "train_speed(iter/s)": 0.232274 }, { "epoch": 3.623751810351399, "grad_norm": 0.10166900604963303, "learning_rate": 1.7557425391963416e-05, "loss": 0.03423279523849487, "memory(GiB)": 122.96, "step": 47540, "token_acc": 0.9799927246271372, "train_speed(iter/s)": 0.232282 }, { "epoch": 3.624132936961659, "grad_norm": 1.497053623199463, "learning_rate": 1.7548315547098405e-05, "loss": 0.055977606773376466, "memory(GiB)": 122.96, "step": 47545, "token_acc": 0.9767853194782224, "train_speed(iter/s)": 0.232288 }, { "epoch": 3.624514063571919, "grad_norm": 0.9703657031059265, "learning_rate": 1.753920756318692e-05, "loss": 0.044962641596794126, "memory(GiB)": 122.96, "step": 47550, "token_acc": 0.9828003875968992, "train_speed(iter/s)": 0.232293 }, { "epoch": 3.6248951901821784, "grad_norm": 1.365577220916748, "learning_rate": 1.753010144075123e-05, "loss": 0.05316944122314453, "memory(GiB)": 122.96, "step": 47555, "token_acc": 0.9823717948717948, "train_speed(iter/s)": 0.2323 }, { "epoch": 3.6252763167924384, "grad_norm": 1.003524899482727, "learning_rate": 1.7520997180313557e-05, "loss": 0.04015655219554901, "memory(GiB)": 122.96, "step": 47560, "token_acc": 0.9799509913120963, "train_speed(iter/s)": 0.232306 }, { "epoch": 3.6256574434026985, "grad_norm": 1.932751178741455, "learning_rate": 1.751189478239596e-05, "loss": 0.06435790657997131, "memory(GiB)": 122.96, "step": 47565, "token_acc": 0.9783793946230495, "train_speed(iter/s)": 0.232311 }, { "epoch": 3.626038570012958, "grad_norm": 0.6418894529342651, "learning_rate": 1.7502794247520433e-05, "loss": 0.06235827207565307, "memory(GiB)": 122.96, "step": 47570, "token_acc": 0.9788262370540851, "train_speed(iter/s)": 0.232318 }, { "epoch": 3.626419696623218, "grad_norm": 2.1610803604125977, "learning_rate": 1.7493695576208868e-05, "loss": 0.07291669249534607, "memory(GiB)": 122.96, "step": 47575, "token_acc": 0.9704060564349621, "train_speed(iter/s)": 0.232327 }, { "epoch": 3.626800823233478, "grad_norm": 1.0146440267562866, "learning_rate": 1.7484598768982994e-05, "loss": 0.07093042731285096, "memory(GiB)": 122.96, "step": 47580, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.232333 }, { "epoch": 3.627181949843738, "grad_norm": 1.301300048828125, "learning_rate": 1.7475503826364493e-05, "loss": 0.05369055867195129, "memory(GiB)": 122.96, "step": 47585, "token_acc": 0.9805115712545676, "train_speed(iter/s)": 0.23234 }, { "epoch": 3.627563076453998, "grad_norm": 1.1104676723480225, "learning_rate": 1.7466410748874934e-05, "loss": 0.0951920747756958, "memory(GiB)": 122.96, "step": 47590, "token_acc": 0.9684986595174263, "train_speed(iter/s)": 0.232347 }, { "epoch": 3.627944203064258, "grad_norm": 1.8204164505004883, "learning_rate": 1.7457319537035726e-05, "loss": 0.06986138820648194, "memory(GiB)": 122.96, "step": 47595, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.232355 }, { "epoch": 3.628325329674518, "grad_norm": 1.9393055438995361, "learning_rate": 1.7448230191368225e-05, "loss": 0.05152398347854614, "memory(GiB)": 122.96, "step": 47600, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.23236 }, { "epoch": 3.628325329674518, "eval_loss": 0.05922512710094452, "eval_runtime": 220.0247, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9759050659598819, "step": 47600 }, { "epoch": 3.6287064562847777, "grad_norm": 0.9465760588645935, "learning_rate": 1.743914271239368e-05, "loss": 0.056246399879455566, "memory(GiB)": 122.96, "step": 47605, "token_acc": 0.9760454908512681, "train_speed(iter/s)": 0.232119 }, { "epoch": 3.6290875828950377, "grad_norm": 0.9288109540939331, "learning_rate": 1.7430057100633186e-05, "loss": 0.05463117957115173, "memory(GiB)": 122.96, "step": 47610, "token_acc": 0.9755294117647059, "train_speed(iter/s)": 0.232124 }, { "epoch": 3.6294687095052978, "grad_norm": 0.9820647835731506, "learning_rate": 1.7420973356607773e-05, "loss": 0.0666085124015808, "memory(GiB)": 122.96, "step": 47615, "token_acc": 0.9722500835840856, "train_speed(iter/s)": 0.232132 }, { "epoch": 3.6298498361155573, "grad_norm": 1.0574134588241577, "learning_rate": 1.7411891480838365e-05, "loss": 0.07623971104621888, "memory(GiB)": 122.96, "step": 47620, "token_acc": 0.973404255319149, "train_speed(iter/s)": 0.23214 }, { "epoch": 3.6302309627258174, "grad_norm": 2.0081863403320312, "learning_rate": 1.7402811473845744e-05, "loss": 0.09845021367073059, "memory(GiB)": 122.96, "step": 47625, "token_acc": 0.965098299094323, "train_speed(iter/s)": 0.232146 }, { "epoch": 3.6306120893360774, "grad_norm": 0.7158924341201782, "learning_rate": 1.7393733336150615e-05, "loss": 0.040968358516693115, "memory(GiB)": 122.96, "step": 47630, "token_acc": 0.9839338994721138, "train_speed(iter/s)": 0.232145 }, { "epoch": 3.6309932159463374, "grad_norm": 1.9703623056411743, "learning_rate": 1.7384657068273565e-05, "loss": 0.07067152261734008, "memory(GiB)": 122.96, "step": 47635, "token_acc": 0.9708924705316334, "train_speed(iter/s)": 0.232151 }, { "epoch": 3.6313743425565974, "grad_norm": 1.917624831199646, "learning_rate": 1.7375582670735075e-05, "loss": 0.0656814455986023, "memory(GiB)": 122.96, "step": 47640, "token_acc": 0.9744302390216787, "train_speed(iter/s)": 0.232155 }, { "epoch": 3.6317554691668574, "grad_norm": 1.3861640691757202, "learning_rate": 1.736651014405554e-05, "loss": 0.056943339109420774, "memory(GiB)": 122.96, "step": 47645, "token_acc": 0.9783741120757695, "train_speed(iter/s)": 0.23216 }, { "epoch": 3.632136595777117, "grad_norm": 1.6887177228927612, "learning_rate": 1.73574394887552e-05, "loss": 0.05863676071166992, "memory(GiB)": 122.96, "step": 47650, "token_acc": 0.9752716373173473, "train_speed(iter/s)": 0.232167 }, { "epoch": 3.632517722387377, "grad_norm": 0.30417925119400024, "learning_rate": 1.734837070535422e-05, "loss": 0.049505564570426944, "memory(GiB)": 122.96, "step": 47655, "token_acc": 0.9796274738067521, "train_speed(iter/s)": 0.232171 }, { "epoch": 3.632898848997637, "grad_norm": 1.6731693744659424, "learning_rate": 1.733930379437268e-05, "loss": 0.053939664363861085, "memory(GiB)": 122.96, "step": 47660, "token_acc": 0.9815624406006462, "train_speed(iter/s)": 0.232176 }, { "epoch": 3.633279975607897, "grad_norm": 1.265807867050171, "learning_rate": 1.733023875633048e-05, "loss": 0.03941224217414856, "memory(GiB)": 122.96, "step": 47665, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.232184 }, { "epoch": 3.6336611022181566, "grad_norm": 1.2417453527450562, "learning_rate": 1.7321175591747484e-05, "loss": 0.05610345602035523, "memory(GiB)": 122.96, "step": 47670, "token_acc": 0.9758922240605058, "train_speed(iter/s)": 0.232189 }, { "epoch": 3.6340422288284167, "grad_norm": 1.828171730041504, "learning_rate": 1.731211430114344e-05, "loss": 0.04406516551971436, "memory(GiB)": 122.96, "step": 47675, "token_acc": 0.9831876260928043, "train_speed(iter/s)": 0.232197 }, { "epoch": 3.6344233554386767, "grad_norm": 0.6655412912368774, "learning_rate": 1.7303054885037928e-05, "loss": 0.0656819462776184, "memory(GiB)": 122.96, "step": 47680, "token_acc": 0.9718258324185877, "train_speed(iter/s)": 0.232204 }, { "epoch": 3.6348044820489367, "grad_norm": 0.9383029937744141, "learning_rate": 1.729399734395049e-05, "loss": 0.06071552038192749, "memory(GiB)": 122.96, "step": 47685, "token_acc": 0.9823857302118172, "train_speed(iter/s)": 0.232209 }, { "epoch": 3.6351856086591967, "grad_norm": 0.6466847062110901, "learning_rate": 1.7284941678400546e-05, "loss": 0.057598966360092166, "memory(GiB)": 122.96, "step": 47690, "token_acc": 0.9810459810459811, "train_speed(iter/s)": 0.232214 }, { "epoch": 3.6355667352694567, "grad_norm": 0.9205085635185242, "learning_rate": 1.7275887888907362e-05, "loss": 0.06998350620269775, "memory(GiB)": 122.96, "step": 47695, "token_acc": 0.9810864306135914, "train_speed(iter/s)": 0.232218 }, { "epoch": 3.6359478618797163, "grad_norm": 0.7856445908546448, "learning_rate": 1.7266835975990154e-05, "loss": 0.06631748676300049, "memory(GiB)": 122.96, "step": 47700, "token_acc": 0.9800913838120104, "train_speed(iter/s)": 0.232226 }, { "epoch": 3.6363289884899763, "grad_norm": 0.8880451917648315, "learning_rate": 1.7257785940168013e-05, "loss": 0.058634668588638306, "memory(GiB)": 122.96, "step": 47705, "token_acc": 0.9800847457627119, "train_speed(iter/s)": 0.232231 }, { "epoch": 3.6367101151002363, "grad_norm": 0.8842060565948486, "learning_rate": 1.724873778195989e-05, "loss": 0.054058611392974854, "memory(GiB)": 122.96, "step": 47710, "token_acc": 0.9840848806366048, "train_speed(iter/s)": 0.232234 }, { "epoch": 3.6370912417104964, "grad_norm": 0.753688633441925, "learning_rate": 1.723969150188467e-05, "loss": 0.037414976954460145, "memory(GiB)": 122.96, "step": 47715, "token_acc": 0.983343319352906, "train_speed(iter/s)": 0.232237 }, { "epoch": 3.637472368320756, "grad_norm": 1.6569169759750366, "learning_rate": 1.723064710046114e-05, "loss": 0.047268688678741455, "memory(GiB)": 122.96, "step": 47720, "token_acc": 0.9836233367451381, "train_speed(iter/s)": 0.232245 }, { "epoch": 3.637853494931016, "grad_norm": 2.77325701713562, "learning_rate": 1.7221604578207908e-05, "loss": 0.05992996096611023, "memory(GiB)": 122.96, "step": 47725, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.23225 }, { "epoch": 3.638234621541276, "grad_norm": 1.5017508268356323, "learning_rate": 1.7212563935643538e-05, "loss": 0.06100462675094605, "memory(GiB)": 122.96, "step": 47730, "token_acc": 0.9776048284625158, "train_speed(iter/s)": 0.232254 }, { "epoch": 3.638615748151536, "grad_norm": 1.0390727519989014, "learning_rate": 1.72035251732865e-05, "loss": 0.04228924810886383, "memory(GiB)": 122.96, "step": 47735, "token_acc": 0.9826532838751909, "train_speed(iter/s)": 0.232254 }, { "epoch": 3.638996874761796, "grad_norm": 1.3094216585159302, "learning_rate": 1.7194488291655077e-05, "loss": 0.06521302461624146, "memory(GiB)": 122.96, "step": 47740, "token_acc": 0.9771769662921348, "train_speed(iter/s)": 0.23226 }, { "epoch": 3.639378001372056, "grad_norm": 1.7683950662612915, "learning_rate": 1.7185453291267534e-05, "loss": 0.03874517679214477, "memory(GiB)": 122.96, "step": 47745, "token_acc": 0.9821200510855683, "train_speed(iter/s)": 0.232268 }, { "epoch": 3.6397591279823156, "grad_norm": 1.4274749755859375, "learning_rate": 1.717642017264195e-05, "loss": 0.06403992772102356, "memory(GiB)": 122.96, "step": 47750, "token_acc": 0.9739616957176673, "train_speed(iter/s)": 0.232274 }, { "epoch": 3.6401402545925756, "grad_norm": 1.5095325708389282, "learning_rate": 1.7167388936296347e-05, "loss": 0.05742988586425781, "memory(GiB)": 122.96, "step": 47755, "token_acc": 0.979750039550704, "train_speed(iter/s)": 0.232279 }, { "epoch": 3.6405213812028356, "grad_norm": 0.8805214762687683, "learning_rate": 1.7158359582748638e-05, "loss": 0.037418439984321594, "memory(GiB)": 122.96, "step": 47760, "token_acc": 0.9882396177875781, "train_speed(iter/s)": 0.232286 }, { "epoch": 3.6409025078130957, "grad_norm": 0.7769801616668701, "learning_rate": 1.714933211251658e-05, "loss": 0.0330279529094696, "memory(GiB)": 122.96, "step": 47765, "token_acc": 0.9812372283113505, "train_speed(iter/s)": 0.232291 }, { "epoch": 3.6412836344233552, "grad_norm": 0.5937682390213013, "learning_rate": 1.714030652611789e-05, "loss": 0.031880933046340945, "memory(GiB)": 122.96, "step": 47770, "token_acc": 0.9864537977745524, "train_speed(iter/s)": 0.232293 }, { "epoch": 3.6416647610336152, "grad_norm": 0.9511445164680481, "learning_rate": 1.713128282407015e-05, "loss": 0.05185995101928711, "memory(GiB)": 122.96, "step": 47775, "token_acc": 0.9717013296965564, "train_speed(iter/s)": 0.2323 }, { "epoch": 3.6420458876438753, "grad_norm": 2.344233989715576, "learning_rate": 1.71222610068908e-05, "loss": 0.04885266721248627, "memory(GiB)": 122.96, "step": 47780, "token_acc": 0.9826493159826493, "train_speed(iter/s)": 0.232307 }, { "epoch": 3.6424270142541353, "grad_norm": 1.4126336574554443, "learning_rate": 1.711324107509722e-05, "loss": 0.07625150084495544, "memory(GiB)": 122.96, "step": 47785, "token_acc": 0.9671175373134329, "train_speed(iter/s)": 0.232313 }, { "epoch": 3.6428081408643953, "grad_norm": 0.5987164974212646, "learning_rate": 1.7104223029206635e-05, "loss": 0.0352380633354187, "memory(GiB)": 122.96, "step": 47790, "token_acc": 0.9769530588035926, "train_speed(iter/s)": 0.232317 }, { "epoch": 3.6431892674746553, "grad_norm": 0.7462478280067444, "learning_rate": 1.7095206869736207e-05, "loss": 0.060283929109573364, "memory(GiB)": 122.96, "step": 47795, "token_acc": 0.9799627213420317, "train_speed(iter/s)": 0.232321 }, { "epoch": 3.643570394084915, "grad_norm": 0.7728898525238037, "learning_rate": 1.7086192597202982e-05, "loss": 0.07239018678665161, "memory(GiB)": 122.96, "step": 47800, "token_acc": 0.9699140401146131, "train_speed(iter/s)": 0.232327 }, { "epoch": 3.643570394084915, "eval_loss": 0.05931966379284859, "eval_runtime": 218.1075, "eval_samples_per_second": 2.43, "eval_steps_per_second": 2.43, "eval_token_acc": 0.9758297692910066, "step": 47800 }, { "epoch": 3.643951520695175, "grad_norm": 1.1376514434814453, "learning_rate": 1.7077180212123862e-05, "loss": 0.05890887975692749, "memory(GiB)": 122.96, "step": 47805, "token_acc": 0.975807727148547, "train_speed(iter/s)": 0.232085 }, { "epoch": 3.644332647305435, "grad_norm": 1.5291908979415894, "learning_rate": 1.7068169715015668e-05, "loss": 0.05574790835380554, "memory(GiB)": 122.96, "step": 47810, "token_acc": 0.9793878825733916, "train_speed(iter/s)": 0.232087 }, { "epoch": 3.644713773915695, "grad_norm": 1.2032731771469116, "learning_rate": 1.705916110639514e-05, "loss": 0.04272624552249908, "memory(GiB)": 122.96, "step": 47815, "token_acc": 0.9838395096127055, "train_speed(iter/s)": 0.232094 }, { "epoch": 3.6450949005259545, "grad_norm": 1.470335602760315, "learning_rate": 1.7050154386778844e-05, "loss": 0.05284435153007507, "memory(GiB)": 122.96, "step": 47820, "token_acc": 0.9724875811998471, "train_speed(iter/s)": 0.2321 }, { "epoch": 3.6454760271362145, "grad_norm": 2.282376527786255, "learning_rate": 1.7041149556683283e-05, "loss": 0.05055549740791321, "memory(GiB)": 122.96, "step": 47825, "token_acc": 0.9793769197016235, "train_speed(iter/s)": 0.232105 }, { "epoch": 3.6458571537464746, "grad_norm": 1.719781756401062, "learning_rate": 1.7032146616624866e-05, "loss": 0.04864290058612823, "memory(GiB)": 122.96, "step": 47830, "token_acc": 0.980962840929892, "train_speed(iter/s)": 0.23211 }, { "epoch": 3.6462382803567346, "grad_norm": 0.9051420092582703, "learning_rate": 1.7023145567119837e-05, "loss": 0.03226599097251892, "memory(GiB)": 122.96, "step": 47835, "token_acc": 0.9794392523364486, "train_speed(iter/s)": 0.232117 }, { "epoch": 3.6466194069669946, "grad_norm": 1.8147262334823608, "learning_rate": 1.7014146408684374e-05, "loss": 0.05465726256370544, "memory(GiB)": 122.96, "step": 47840, "token_acc": 0.9796493425172198, "train_speed(iter/s)": 0.232124 }, { "epoch": 3.6470005335772546, "grad_norm": 2.54840350151062, "learning_rate": 1.7005149141834557e-05, "loss": 0.06274861097335815, "memory(GiB)": 122.96, "step": 47845, "token_acc": 0.9802749551703527, "train_speed(iter/s)": 0.232133 }, { "epoch": 3.647381660187514, "grad_norm": 1.830100655555725, "learning_rate": 1.6996153767086308e-05, "loss": 0.056211167573928834, "memory(GiB)": 122.96, "step": 47850, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.23214 }, { "epoch": 3.647762786797774, "grad_norm": 0.33382993936538696, "learning_rate": 1.6987160284955483e-05, "loss": 0.06042122840881348, "memory(GiB)": 122.96, "step": 47855, "token_acc": 0.9843652282676673, "train_speed(iter/s)": 0.232143 }, { "epoch": 3.6481439134080342, "grad_norm": 1.338073968887329, "learning_rate": 1.6978168695957837e-05, "loss": 0.03632608950138092, "memory(GiB)": 122.96, "step": 47860, "token_acc": 0.9804166666666667, "train_speed(iter/s)": 0.232151 }, { "epoch": 3.648525040018294, "grad_norm": 0.549271285533905, "learning_rate": 1.696917900060896e-05, "loss": 0.04478556215763092, "memory(GiB)": 122.96, "step": 47865, "token_acc": 0.9845639032815199, "train_speed(iter/s)": 0.23215 }, { "epoch": 3.648906166628554, "grad_norm": 1.0777959823608398, "learning_rate": 1.696019119942438e-05, "loss": 0.03317604064941406, "memory(GiB)": 122.96, "step": 47870, "token_acc": 0.9839525553811268, "train_speed(iter/s)": 0.232155 }, { "epoch": 3.649287293238814, "grad_norm": 1.1444809436798096, "learning_rate": 1.695120529291953e-05, "loss": 0.032970663905143735, "memory(GiB)": 122.96, "step": 47875, "token_acc": 0.9856850715746421, "train_speed(iter/s)": 0.23216 }, { "epoch": 3.649668419849074, "grad_norm": 1.6117008924484253, "learning_rate": 1.6942221281609675e-05, "loss": 0.04201326966285705, "memory(GiB)": 122.96, "step": 47880, "token_acc": 0.9796206618240516, "train_speed(iter/s)": 0.232164 }, { "epoch": 3.650049546459334, "grad_norm": 2.267258882522583, "learning_rate": 1.6933239166010024e-05, "loss": 0.07109785079956055, "memory(GiB)": 122.96, "step": 47885, "token_acc": 0.9692242833052277, "train_speed(iter/s)": 0.232171 }, { "epoch": 3.650430673069594, "grad_norm": 1.2512543201446533, "learning_rate": 1.6924258946635675e-05, "loss": 0.03970286250114441, "memory(GiB)": 122.96, "step": 47890, "token_acc": 0.9798294863797047, "train_speed(iter/s)": 0.232177 }, { "epoch": 3.650811799679854, "grad_norm": 1.6166025400161743, "learning_rate": 1.691528062400155e-05, "loss": 0.05653139352798462, "memory(GiB)": 122.96, "step": 47895, "token_acc": 0.9793792959800042, "train_speed(iter/s)": 0.232183 }, { "epoch": 3.6511929262901135, "grad_norm": 1.0473705530166626, "learning_rate": 1.6906304198622584e-05, "loss": 0.08742517232894897, "memory(GiB)": 122.96, "step": 47900, "token_acc": 0.9707859958265709, "train_speed(iter/s)": 0.232189 }, { "epoch": 3.6515740529003735, "grad_norm": 0.5780665278434753, "learning_rate": 1.6897329671013484e-05, "loss": 0.03909661173820496, "memory(GiB)": 122.96, "step": 47905, "token_acc": 0.9825829773250082, "train_speed(iter/s)": 0.232191 }, { "epoch": 3.6519551795106335, "grad_norm": 1.2663543224334717, "learning_rate": 1.6888357041688907e-05, "loss": 0.0636956512928009, "memory(GiB)": 122.96, "step": 47910, "token_acc": 0.9761176641910587, "train_speed(iter/s)": 0.232193 }, { "epoch": 3.652336306120893, "grad_norm": 1.2166259288787842, "learning_rate": 1.687938631116342e-05, "loss": 0.02992349863052368, "memory(GiB)": 122.96, "step": 47915, "token_acc": 0.982831825733801, "train_speed(iter/s)": 0.232195 }, { "epoch": 3.652717432731153, "grad_norm": 1.6655292510986328, "learning_rate": 1.6870417479951406e-05, "loss": 0.05300288200378418, "memory(GiB)": 122.96, "step": 47920, "token_acc": 0.9795087923266967, "train_speed(iter/s)": 0.2322 }, { "epoch": 3.653098559341413, "grad_norm": 1.3579692840576172, "learning_rate": 1.6861450548567215e-05, "loss": 0.06861122250556946, "memory(GiB)": 122.96, "step": 47925, "token_acc": 0.9706452506230961, "train_speed(iter/s)": 0.232206 }, { "epoch": 3.653479685951673, "grad_norm": 0.7963428497314453, "learning_rate": 1.685248551752507e-05, "loss": 0.05292418599128723, "memory(GiB)": 122.96, "step": 47930, "token_acc": 0.9740458015267176, "train_speed(iter/s)": 0.232211 }, { "epoch": 3.653860812561933, "grad_norm": 1.220557689666748, "learning_rate": 1.6843522387339035e-05, "loss": 0.05075922608375549, "memory(GiB)": 122.96, "step": 47935, "token_acc": 0.9789306300934207, "train_speed(iter/s)": 0.232216 }, { "epoch": 3.654241939172193, "grad_norm": 2.1784205436706543, "learning_rate": 1.683456115852313e-05, "loss": 0.09971869587898255, "memory(GiB)": 122.96, "step": 47940, "token_acc": 0.9608987452582434, "train_speed(iter/s)": 0.232222 }, { "epoch": 3.6546230657824528, "grad_norm": 2.614711046218872, "learning_rate": 1.6825601831591252e-05, "loss": 0.0449992448091507, "memory(GiB)": 122.96, "step": 47945, "token_acc": 0.9798183652875883, "train_speed(iter/s)": 0.232227 }, { "epoch": 3.655004192392713, "grad_norm": 2.7913143634796143, "learning_rate": 1.6816644407057148e-05, "loss": 0.05405691862106323, "memory(GiB)": 122.96, "step": 47950, "token_acc": 0.9850489774875408, "train_speed(iter/s)": 0.232233 }, { "epoch": 3.655385319002973, "grad_norm": 1.4592957496643066, "learning_rate": 1.680768888543451e-05, "loss": 0.045831909775733946, "memory(GiB)": 122.96, "step": 47955, "token_acc": 0.9820804195804196, "train_speed(iter/s)": 0.232238 }, { "epoch": 3.655766445613233, "grad_norm": 1.7272624969482422, "learning_rate": 1.6798735267236863e-05, "loss": 0.06353203058242798, "memory(GiB)": 122.96, "step": 47960, "token_acc": 0.969782034346103, "train_speed(iter/s)": 0.232242 }, { "epoch": 3.6561475722234924, "grad_norm": 1.8976670503616333, "learning_rate": 1.6789783552977684e-05, "loss": 0.0625275731086731, "memory(GiB)": 122.96, "step": 47965, "token_acc": 0.9825895875591616, "train_speed(iter/s)": 0.232244 }, { "epoch": 3.6565286988337524, "grad_norm": 0.9626296758651733, "learning_rate": 1.6780833743170317e-05, "loss": 0.03540968894958496, "memory(GiB)": 122.96, "step": 47970, "token_acc": 0.9835684943739954, "train_speed(iter/s)": 0.23225 }, { "epoch": 3.6569098254440124, "grad_norm": 0.08254506438970566, "learning_rate": 1.6771885838327966e-05, "loss": 0.05199010968208313, "memory(GiB)": 122.96, "step": 47975, "token_acc": 0.978915001098177, "train_speed(iter/s)": 0.232254 }, { "epoch": 3.6572909520542725, "grad_norm": 1.2929847240447998, "learning_rate": 1.676293983896376e-05, "loss": 0.03660974204540253, "memory(GiB)": 122.96, "step": 47980, "token_acc": 0.984390837218731, "train_speed(iter/s)": 0.232258 }, { "epoch": 3.6576720786645325, "grad_norm": 1.5124449729919434, "learning_rate": 1.6753995745590735e-05, "loss": 0.07228717803955079, "memory(GiB)": 122.96, "step": 47985, "token_acc": 0.9651682048228639, "train_speed(iter/s)": 0.232264 }, { "epoch": 3.6580532052747925, "grad_norm": 1.233964204788208, "learning_rate": 1.674505355872175e-05, "loss": 0.060631567239761354, "memory(GiB)": 122.96, "step": 47990, "token_acc": 0.979784996494508, "train_speed(iter/s)": 0.232266 }, { "epoch": 3.658434331885052, "grad_norm": 1.5195633172988892, "learning_rate": 1.673611327886963e-05, "loss": 0.07084540128707886, "memory(GiB)": 122.96, "step": 47995, "token_acc": 0.9644619940769991, "train_speed(iter/s)": 0.232274 }, { "epoch": 3.658815458495312, "grad_norm": 0.7902447581291199, "learning_rate": 1.6727174906547065e-05, "loss": 0.051998329162597653, "memory(GiB)": 122.96, "step": 48000, "token_acc": 0.9802062344435226, "train_speed(iter/s)": 0.232277 }, { "epoch": 3.658815458495312, "eval_loss": 0.05878680199384689, "eval_runtime": 220.4197, "eval_samples_per_second": 2.405, "eval_steps_per_second": 2.405, "eval_token_acc": 0.9764170833082344, "step": 48000 }, { "epoch": 3.659196585105572, "grad_norm": 1.1177029609680176, "learning_rate": 1.6718238442266597e-05, "loss": 0.03565287292003631, "memory(GiB)": 122.96, "step": 48005, "token_acc": 0.9766207635613422, "train_speed(iter/s)": 0.232035 }, { "epoch": 3.659577711715832, "grad_norm": 1.1681771278381348, "learning_rate": 1.6709303886540706e-05, "loss": 0.06048480272293091, "memory(GiB)": 122.96, "step": 48010, "token_acc": 0.9723979298447384, "train_speed(iter/s)": 0.232044 }, { "epoch": 3.6599588383260917, "grad_norm": 2.0282559394836426, "learning_rate": 1.6700371239881768e-05, "loss": 0.032705605030059814, "memory(GiB)": 122.96, "step": 48015, "token_acc": 0.9816360601001669, "train_speed(iter/s)": 0.23205 }, { "epoch": 3.6603399649363517, "grad_norm": 1.0503597259521484, "learning_rate": 1.6691440502801997e-05, "loss": 0.04945161640644073, "memory(GiB)": 122.96, "step": 48020, "token_acc": 0.9846205282514209, "train_speed(iter/s)": 0.232058 }, { "epoch": 3.6607210915466117, "grad_norm": 0.771257221698761, "learning_rate": 1.668251167581354e-05, "loss": 0.04196532964706421, "memory(GiB)": 122.96, "step": 48025, "token_acc": 0.9836563156665888, "train_speed(iter/s)": 0.232064 }, { "epoch": 3.6611022181568718, "grad_norm": 1.7336751222610474, "learning_rate": 1.6673584759428424e-05, "loss": 0.06518191695213318, "memory(GiB)": 122.96, "step": 48030, "token_acc": 0.977404098791382, "train_speed(iter/s)": 0.232072 }, { "epoch": 3.6614833447671318, "grad_norm": 1.8255970478057861, "learning_rate": 1.6664659754158567e-05, "loss": 0.03973201513290405, "memory(GiB)": 122.96, "step": 48035, "token_acc": 0.9828715365239294, "train_speed(iter/s)": 0.232079 }, { "epoch": 3.661864471377392, "grad_norm": 1.28916597366333, "learning_rate": 1.6655736660515803e-05, "loss": 0.03886716961860657, "memory(GiB)": 122.96, "step": 48040, "token_acc": 0.9844246662428481, "train_speed(iter/s)": 0.232086 }, { "epoch": 3.6622455979876514, "grad_norm": 0.8008208870887756, "learning_rate": 1.6646815479011778e-05, "loss": 0.06824996471405029, "memory(GiB)": 122.96, "step": 48045, "token_acc": 0.9810309278350515, "train_speed(iter/s)": 0.232094 }, { "epoch": 3.6626267245979114, "grad_norm": 1.26068913936615, "learning_rate": 1.6637896210158115e-05, "loss": 0.03955523669719696, "memory(GiB)": 122.96, "step": 48050, "token_acc": 0.9793684210526316, "train_speed(iter/s)": 0.232102 }, { "epoch": 3.6630078512081714, "grad_norm": 1.0715631246566772, "learning_rate": 1.6628978854466304e-05, "loss": 0.038085955381393435, "memory(GiB)": 122.96, "step": 48055, "token_acc": 0.9872039205009528, "train_speed(iter/s)": 0.232108 }, { "epoch": 3.6633889778184314, "grad_norm": 0.7420515418052673, "learning_rate": 1.6620063412447673e-05, "loss": 0.05656273365020752, "memory(GiB)": 122.96, "step": 48060, "token_acc": 0.9772033746185604, "train_speed(iter/s)": 0.232113 }, { "epoch": 3.663770104428691, "grad_norm": 2.2701187133789062, "learning_rate": 1.6611149884613504e-05, "loss": 0.05292450785636902, "memory(GiB)": 122.96, "step": 48065, "token_acc": 0.9797492767598843, "train_speed(iter/s)": 0.232122 }, { "epoch": 3.664151231038951, "grad_norm": 1.5229096412658691, "learning_rate": 1.6602238271474963e-05, "loss": 0.04687563478946686, "memory(GiB)": 122.96, "step": 48070, "token_acc": 0.9811419591409115, "train_speed(iter/s)": 0.232128 }, { "epoch": 3.664532357649211, "grad_norm": 1.0770761966705322, "learning_rate": 1.659332857354306e-05, "loss": 0.06040109395980835, "memory(GiB)": 122.96, "step": 48075, "token_acc": 0.9721526908635795, "train_speed(iter/s)": 0.232136 }, { "epoch": 3.664913484259471, "grad_norm": 0.6295843124389648, "learning_rate": 1.658442079132873e-05, "loss": 0.03707034587860107, "memory(GiB)": 122.96, "step": 48080, "token_acc": 0.9853421912896763, "train_speed(iter/s)": 0.232133 }, { "epoch": 3.665294610869731, "grad_norm": 1.371564269065857, "learning_rate": 1.6575514925342823e-05, "loss": 0.050472313165664674, "memory(GiB)": 122.96, "step": 48085, "token_acc": 0.9704813805631244, "train_speed(iter/s)": 0.232141 }, { "epoch": 3.665675737479991, "grad_norm": 2.668579578399658, "learning_rate": 1.656661097609601e-05, "loss": 0.06933038234710694, "memory(GiB)": 122.96, "step": 48090, "token_acc": 0.9730392156862745, "train_speed(iter/s)": 0.232148 }, { "epoch": 3.6660568640902507, "grad_norm": 0.8513806462287903, "learning_rate": 1.6557708944098906e-05, "loss": 0.05955277681350708, "memory(GiB)": 122.96, "step": 48095, "token_acc": 0.9751080024686278, "train_speed(iter/s)": 0.232154 }, { "epoch": 3.6664379907005107, "grad_norm": 0.7242007851600647, "learning_rate": 1.6548808829862016e-05, "loss": 0.060566335916519165, "memory(GiB)": 122.96, "step": 48100, "token_acc": 0.9750075278530563, "train_speed(iter/s)": 0.232161 }, { "epoch": 3.6668191173107707, "grad_norm": 0.9351616501808167, "learning_rate": 1.6539910633895695e-05, "loss": 0.03421060442924499, "memory(GiB)": 122.96, "step": 48105, "token_acc": 0.984986812740921, "train_speed(iter/s)": 0.232165 }, { "epoch": 3.6672002439210307, "grad_norm": 1.6127039194107056, "learning_rate": 1.6531014356710222e-05, "loss": 0.052091628313064575, "memory(GiB)": 122.96, "step": 48110, "token_acc": 0.9860349127182045, "train_speed(iter/s)": 0.232172 }, { "epoch": 3.6675813705312903, "grad_norm": 2.268441677093506, "learning_rate": 1.6522119998815784e-05, "loss": 0.0549412727355957, "memory(GiB)": 122.96, "step": 48115, "token_acc": 0.9784539044136851, "train_speed(iter/s)": 0.232175 }, { "epoch": 3.6679624971415503, "grad_norm": 1.273640513420105, "learning_rate": 1.6513227560722384e-05, "loss": 0.04528163075447082, "memory(GiB)": 122.96, "step": 48120, "token_acc": 0.9831018905805589, "train_speed(iter/s)": 0.232178 }, { "epoch": 3.6683436237518103, "grad_norm": 0.7694916129112244, "learning_rate": 1.6504337042940005e-05, "loss": 0.05279173851013184, "memory(GiB)": 122.96, "step": 48125, "token_acc": 0.9795591182364729, "train_speed(iter/s)": 0.232182 }, { "epoch": 3.6687247503620704, "grad_norm": 0.8424879908561707, "learning_rate": 1.6495448445978438e-05, "loss": 0.050432682037353516, "memory(GiB)": 122.96, "step": 48130, "token_acc": 0.9807750991760756, "train_speed(iter/s)": 0.232186 }, { "epoch": 3.6691058769723304, "grad_norm": 1.6479442119598389, "learning_rate": 1.6486561770347425e-05, "loss": 0.08307241797447204, "memory(GiB)": 122.96, "step": 48135, "token_acc": 0.9675509756632318, "train_speed(iter/s)": 0.232192 }, { "epoch": 3.6694870035825904, "grad_norm": 0.8473480343818665, "learning_rate": 1.647767701655659e-05, "loss": 0.04499098062515259, "memory(GiB)": 122.96, "step": 48140, "token_acc": 0.9807295251204404, "train_speed(iter/s)": 0.232194 }, { "epoch": 3.66986813019285, "grad_norm": 0.6770869493484497, "learning_rate": 1.64687941851154e-05, "loss": 0.05598995089530945, "memory(GiB)": 122.96, "step": 48145, "token_acc": 0.9819018404907975, "train_speed(iter/s)": 0.232201 }, { "epoch": 3.67024925680311, "grad_norm": 0.5390996336936951, "learning_rate": 1.6459913276533262e-05, "loss": 0.03256575465202331, "memory(GiB)": 122.96, "step": 48150, "token_acc": 0.9871420222092344, "train_speed(iter/s)": 0.232208 }, { "epoch": 3.67063038341337, "grad_norm": 2.0292603969573975, "learning_rate": 1.645103429131946e-05, "loss": 0.05319755077362061, "memory(GiB)": 122.96, "step": 48155, "token_acc": 0.9796817625458997, "train_speed(iter/s)": 0.232209 }, { "epoch": 3.6710115100236296, "grad_norm": 1.1795772314071655, "learning_rate": 1.6442157229983146e-05, "loss": 0.05296671390533447, "memory(GiB)": 122.96, "step": 48160, "token_acc": 0.9805203650728783, "train_speed(iter/s)": 0.232212 }, { "epoch": 3.6713926366338896, "grad_norm": 1.208998203277588, "learning_rate": 1.643328209303337e-05, "loss": 0.04531992673873901, "memory(GiB)": 122.96, "step": 48165, "token_acc": 0.9848927038626609, "train_speed(iter/s)": 0.232215 }, { "epoch": 3.6717737632441496, "grad_norm": 1.4459234476089478, "learning_rate": 1.642440888097913e-05, "loss": 0.065830397605896, "memory(GiB)": 122.96, "step": 48170, "token_acc": 0.9768591327791507, "train_speed(iter/s)": 0.23222 }, { "epoch": 3.6721548898544096, "grad_norm": 0.921940267086029, "learning_rate": 1.6415537594329216e-05, "loss": 0.0465308278799057, "memory(GiB)": 122.96, "step": 48175, "token_acc": 0.9807603152526657, "train_speed(iter/s)": 0.232226 }, { "epoch": 3.6725360164646697, "grad_norm": 2.3774189949035645, "learning_rate": 1.640666823359238e-05, "loss": 0.04490221738815307, "memory(GiB)": 122.96, "step": 48180, "token_acc": 0.9856938483547926, "train_speed(iter/s)": 0.232232 }, { "epoch": 3.6729171430749297, "grad_norm": 1.6197043657302856, "learning_rate": 1.639780079927722e-05, "loss": 0.05760207176208496, "memory(GiB)": 122.96, "step": 48185, "token_acc": 0.9720554272517321, "train_speed(iter/s)": 0.232239 }, { "epoch": 3.6732982696851897, "grad_norm": 0.8382560610771179, "learning_rate": 1.638893529189225e-05, "loss": 0.035894864797592164, "memory(GiB)": 122.96, "step": 48190, "token_acc": 0.9816023738872404, "train_speed(iter/s)": 0.232243 }, { "epoch": 3.6736793962954493, "grad_norm": 1.9648873805999756, "learning_rate": 1.6380071711945876e-05, "loss": 0.054339814186096194, "memory(GiB)": 122.96, "step": 48195, "token_acc": 0.9830144055041926, "train_speed(iter/s)": 0.23225 }, { "epoch": 3.6740605229057093, "grad_norm": 1.6480191946029663, "learning_rate": 1.637121005994637e-05, "loss": 0.05863473415374756, "memory(GiB)": 122.96, "step": 48200, "token_acc": 0.9764988897113249, "train_speed(iter/s)": 0.232253 }, { "epoch": 3.6740605229057093, "eval_loss": 0.05810040235519409, "eval_runtime": 222.2027, "eval_samples_per_second": 2.385, "eval_steps_per_second": 2.385, "eval_token_acc": 0.9762740196373713, "step": 48200 }, { "epoch": 3.6744416495159693, "grad_norm": 0.7535433173179626, "learning_rate": 1.63623503364019e-05, "loss": 0.029051649570465087, "memory(GiB)": 122.96, "step": 48205, "token_acc": 0.9767053934622926, "train_speed(iter/s)": 0.232012 }, { "epoch": 3.674822776126229, "grad_norm": 1.3645621538162231, "learning_rate": 1.635349254182056e-05, "loss": 0.049265730381011966, "memory(GiB)": 122.96, "step": 48210, "token_acc": 0.9812775330396476, "train_speed(iter/s)": 0.232018 }, { "epoch": 3.675203902736489, "grad_norm": 1.8363505601882935, "learning_rate": 1.6344636676710262e-05, "loss": 0.09498717784881591, "memory(GiB)": 122.96, "step": 48215, "token_acc": 0.9653353428786737, "train_speed(iter/s)": 0.232025 }, { "epoch": 3.675585029346749, "grad_norm": 2.8972411155700684, "learning_rate": 1.633578274157888e-05, "loss": 0.05635427236557007, "memory(GiB)": 122.96, "step": 48220, "token_acc": 0.983358547655068, "train_speed(iter/s)": 0.232032 }, { "epoch": 3.675966155957009, "grad_norm": 0.8019871115684509, "learning_rate": 1.6326930736934148e-05, "loss": 0.06921446323394775, "memory(GiB)": 122.96, "step": 48225, "token_acc": 0.9770966655439542, "train_speed(iter/s)": 0.232039 }, { "epoch": 3.676347282567269, "grad_norm": 0.8099725842475891, "learning_rate": 1.6318080663283658e-05, "loss": 0.052753770351409913, "memory(GiB)": 122.96, "step": 48230, "token_acc": 0.9803103620891039, "train_speed(iter/s)": 0.232041 }, { "epoch": 3.676728409177529, "grad_norm": 2.132075548171997, "learning_rate": 1.6309232521134944e-05, "loss": 0.07401596307754517, "memory(GiB)": 122.96, "step": 48235, "token_acc": 0.9819537073362102, "train_speed(iter/s)": 0.232045 }, { "epoch": 3.6771095357877885, "grad_norm": 0.9877547025680542, "learning_rate": 1.6300386310995413e-05, "loss": 0.04916301369667053, "memory(GiB)": 122.96, "step": 48240, "token_acc": 0.9757418747056052, "train_speed(iter/s)": 0.232051 }, { "epoch": 3.6774906623980486, "grad_norm": 0.06091378256678581, "learning_rate": 1.6291542033372325e-05, "loss": 0.033946821093559267, "memory(GiB)": 122.96, "step": 48245, "token_acc": 0.9782637863947403, "train_speed(iter/s)": 0.232056 }, { "epoch": 3.6778717890083086, "grad_norm": 0.8548510670661926, "learning_rate": 1.6282699688772877e-05, "loss": 0.025692084431648256, "memory(GiB)": 122.96, "step": 48250, "token_acc": 0.9873577749683944, "train_speed(iter/s)": 0.232062 }, { "epoch": 3.6782529156185686, "grad_norm": 0.9286959171295166, "learning_rate": 1.627385927770415e-05, "loss": 0.03849512934684753, "memory(GiB)": 122.96, "step": 48255, "token_acc": 0.9800159872102319, "train_speed(iter/s)": 0.232069 }, { "epoch": 3.678634042228828, "grad_norm": 2.155122756958008, "learning_rate": 1.626502080067307e-05, "loss": 0.09634953141212463, "memory(GiB)": 122.96, "step": 48260, "token_acc": 0.96037804434751, "train_speed(iter/s)": 0.232077 }, { "epoch": 3.679015168839088, "grad_norm": 0.6940560936927795, "learning_rate": 1.6256184258186496e-05, "loss": 0.04620637893676758, "memory(GiB)": 122.96, "step": 48265, "token_acc": 0.9821134868421053, "train_speed(iter/s)": 0.232081 }, { "epoch": 3.679396295449348, "grad_norm": 2.0207934379577637, "learning_rate": 1.624734965075118e-05, "loss": 0.05696791410446167, "memory(GiB)": 122.96, "step": 48270, "token_acc": 0.9786928908010654, "train_speed(iter/s)": 0.232087 }, { "epoch": 3.6797774220596082, "grad_norm": 1.522004246711731, "learning_rate": 1.6238516978873718e-05, "loss": 0.07894476056098938, "memory(GiB)": 122.96, "step": 48275, "token_acc": 0.9774350185661239, "train_speed(iter/s)": 0.232093 }, { "epoch": 3.6801585486698682, "grad_norm": 0.6156745553016663, "learning_rate": 1.622968624306063e-05, "loss": 0.0616912841796875, "memory(GiB)": 122.96, "step": 48280, "token_acc": 0.9776303317535545, "train_speed(iter/s)": 0.232097 }, { "epoch": 3.6805396752801283, "grad_norm": 1.5485402345657349, "learning_rate": 1.6220857443818344e-05, "loss": 0.06718974113464356, "memory(GiB)": 122.96, "step": 48285, "token_acc": 0.9766233766233766, "train_speed(iter/s)": 0.232102 }, { "epoch": 3.680920801890388, "grad_norm": 0.7601261734962463, "learning_rate": 1.621203058165311e-05, "loss": 0.044635072350502014, "memory(GiB)": 122.96, "step": 48290, "token_acc": 0.9846385039238604, "train_speed(iter/s)": 0.232107 }, { "epoch": 3.681301928500648, "grad_norm": 1.6727445125579834, "learning_rate": 1.6203205657071126e-05, "loss": 0.050914764404296875, "memory(GiB)": 122.96, "step": 48295, "token_acc": 0.9859355527861848, "train_speed(iter/s)": 0.23211 }, { "epoch": 3.681683055110908, "grad_norm": 1.0336617231369019, "learning_rate": 1.619438267057846e-05, "loss": 0.045072078704833984, "memory(GiB)": 122.96, "step": 48300, "token_acc": 0.9838152105593966, "train_speed(iter/s)": 0.232113 }, { "epoch": 3.682064181721168, "grad_norm": 2.1194939613342285, "learning_rate": 1.618556162268107e-05, "loss": 0.07241759300231934, "memory(GiB)": 122.96, "step": 48305, "token_acc": 0.9777660137638963, "train_speed(iter/s)": 0.23212 }, { "epoch": 3.6824453083314275, "grad_norm": 0.9463962912559509, "learning_rate": 1.6176742513884825e-05, "loss": 0.030112722516059877, "memory(GiB)": 122.96, "step": 48310, "token_acc": 0.98532874479261, "train_speed(iter/s)": 0.232123 }, { "epoch": 3.6828264349416875, "grad_norm": 1.7319163084030151, "learning_rate": 1.6167925344695416e-05, "loss": 0.08415945172309876, "memory(GiB)": 122.96, "step": 48315, "token_acc": 0.9642782097079219, "train_speed(iter/s)": 0.232128 }, { "epoch": 3.6832075615519475, "grad_norm": 1.8599900007247925, "learning_rate": 1.6159110115618493e-05, "loss": 0.07805742621421814, "memory(GiB)": 122.96, "step": 48320, "token_acc": 0.9715811965811966, "train_speed(iter/s)": 0.232135 }, { "epoch": 3.6835886881622075, "grad_norm": 0.5375562906265259, "learning_rate": 1.6150296827159578e-05, "loss": 0.039122378826141356, "memory(GiB)": 122.96, "step": 48325, "token_acc": 0.9856099180872261, "train_speed(iter/s)": 0.232139 }, { "epoch": 3.6839698147724675, "grad_norm": 1.6492680311203003, "learning_rate": 1.6141485479824043e-05, "loss": 0.048486185073852536, "memory(GiB)": 122.96, "step": 48330, "token_acc": 0.9821673525377229, "train_speed(iter/s)": 0.232142 }, { "epoch": 3.6843509413827276, "grad_norm": 0.8854400515556335, "learning_rate": 1.6132676074117192e-05, "loss": 0.05629817247390747, "memory(GiB)": 122.96, "step": 48335, "token_acc": 0.9771442614893094, "train_speed(iter/s)": 0.232149 }, { "epoch": 3.684732067992987, "grad_norm": 0.6061491370201111, "learning_rate": 1.6123868610544217e-05, "loss": 0.047524815797805785, "memory(GiB)": 122.96, "step": 48340, "token_acc": 0.9818103074924209, "train_speed(iter/s)": 0.23215 }, { "epoch": 3.685113194603247, "grad_norm": 2.0609936714172363, "learning_rate": 1.611506308961016e-05, "loss": 0.0627691090106964, "memory(GiB)": 122.96, "step": 48345, "token_acc": 0.9785313001605136, "train_speed(iter/s)": 0.232155 }, { "epoch": 3.685494321213507, "grad_norm": 2.329880952835083, "learning_rate": 1.610625951182001e-05, "loss": 0.05459084510803223, "memory(GiB)": 122.96, "step": 48350, "token_acc": 0.980650277557494, "train_speed(iter/s)": 0.232159 }, { "epoch": 3.685875447823767, "grad_norm": 0.4827536940574646, "learning_rate": 1.6097457877678567e-05, "loss": 0.03923773467540741, "memory(GiB)": 122.96, "step": 48355, "token_acc": 0.9884514435695538, "train_speed(iter/s)": 0.23216 }, { "epoch": 3.6862565744340268, "grad_norm": 0.7513619661331177, "learning_rate": 1.608865818769059e-05, "loss": 0.03520242273807526, "memory(GiB)": 122.96, "step": 48360, "token_acc": 0.9847583643122677, "train_speed(iter/s)": 0.232167 }, { "epoch": 3.686637701044287, "grad_norm": 1.3567614555358887, "learning_rate": 1.6079860442360716e-05, "loss": 0.040508699417114255, "memory(GiB)": 122.96, "step": 48365, "token_acc": 0.9853039412157648, "train_speed(iter/s)": 0.232174 }, { "epoch": 3.687018827654547, "grad_norm": 1.4228169918060303, "learning_rate": 1.6071064642193422e-05, "loss": 0.04824328422546387, "memory(GiB)": 122.96, "step": 48370, "token_acc": 0.9768073032321737, "train_speed(iter/s)": 0.232181 }, { "epoch": 3.687399954264807, "grad_norm": 0.6925344467163086, "learning_rate": 1.6062270787693117e-05, "loss": 0.05991653800010681, "memory(GiB)": 122.96, "step": 48375, "token_acc": 0.9743669896842764, "train_speed(iter/s)": 0.232189 }, { "epoch": 3.687781080875067, "grad_norm": 1.335773229598999, "learning_rate": 1.6053478879364115e-05, "loss": 0.029550814628601076, "memory(GiB)": 122.96, "step": 48380, "token_acc": 0.988592014410087, "train_speed(iter/s)": 0.232196 }, { "epoch": 3.688162207485327, "grad_norm": 2.7997865676879883, "learning_rate": 1.604468891771054e-05, "loss": 0.06762516498565674, "memory(GiB)": 122.96, "step": 48385, "token_acc": 0.972351357964277, "train_speed(iter/s)": 0.232201 }, { "epoch": 3.6885433340955864, "grad_norm": 0.809954047203064, "learning_rate": 1.6035900903236494e-05, "loss": 0.040023711323738095, "memory(GiB)": 122.96, "step": 48390, "token_acc": 0.9837476099426387, "train_speed(iter/s)": 0.232204 }, { "epoch": 3.6889244607058465, "grad_norm": 0.8898844718933105, "learning_rate": 1.6027114836445933e-05, "loss": 0.04361325800418854, "memory(GiB)": 122.96, "step": 48395, "token_acc": 0.9815570136549033, "train_speed(iter/s)": 0.232208 }, { "epoch": 3.6893055873161065, "grad_norm": 0.6146228909492493, "learning_rate": 1.6018330717842666e-05, "loss": 0.05757214426994324, "memory(GiB)": 122.96, "step": 48400, "token_acc": 0.9790705924086556, "train_speed(iter/s)": 0.232212 }, { "epoch": 3.6893055873161065, "eval_loss": 0.059177279472351074, "eval_runtime": 222.1724, "eval_samples_per_second": 2.386, "eval_steps_per_second": 2.386, "eval_token_acc": 0.9762589603035962, "step": 48400 }, { "epoch": 3.6896867139263665, "grad_norm": 0.8625184297561646, "learning_rate": 1.6009548547930436e-05, "loss": 0.046514520049095155, "memory(GiB)": 122.96, "step": 48405, "token_acc": 0.9765289757704477, "train_speed(iter/s)": 0.231969 }, { "epoch": 3.690067840536626, "grad_norm": 0.6489352583885193, "learning_rate": 1.6000768327212883e-05, "loss": 0.046875306963920595, "memory(GiB)": 122.96, "step": 48410, "token_acc": 0.9797682863565623, "train_speed(iter/s)": 0.231973 }, { "epoch": 3.690448967146886, "grad_norm": 1.4069807529449463, "learning_rate": 1.5991990056193468e-05, "loss": 0.0655810534954071, "memory(GiB)": 122.96, "step": 48415, "token_acc": 0.9719864176570459, "train_speed(iter/s)": 0.231978 }, { "epoch": 3.690830093757146, "grad_norm": 1.0049433708190918, "learning_rate": 1.5983213735375613e-05, "loss": 0.04664726853370667, "memory(GiB)": 122.96, "step": 48420, "token_acc": 0.9809797236676835, "train_speed(iter/s)": 0.231982 }, { "epoch": 3.691211220367406, "grad_norm": 1.188881516456604, "learning_rate": 1.5974439365262607e-05, "loss": 0.04738571941852569, "memory(GiB)": 122.96, "step": 48425, "token_acc": 0.9804131054131054, "train_speed(iter/s)": 0.231987 }, { "epoch": 3.691592346977666, "grad_norm": 0.9814119338989258, "learning_rate": 1.596566694635757e-05, "loss": 0.060292786359786986, "memory(GiB)": 122.96, "step": 48430, "token_acc": 0.9755333432791288, "train_speed(iter/s)": 0.231989 }, { "epoch": 3.691973473587926, "grad_norm": 1.616642951965332, "learning_rate": 1.595689647916363e-05, "loss": 0.07547932863235474, "memory(GiB)": 122.96, "step": 48435, "token_acc": 0.9730596536241181, "train_speed(iter/s)": 0.231996 }, { "epoch": 3.6923546001981857, "grad_norm": 0.7356760501861572, "learning_rate": 1.5948127964183683e-05, "loss": 0.03916893303394318, "memory(GiB)": 122.96, "step": 48440, "token_acc": 0.9858841010401189, "train_speed(iter/s)": 0.232 }, { "epoch": 3.6927357268084458, "grad_norm": 1.3649760484695435, "learning_rate": 1.593936140192057e-05, "loss": 0.05226744413375854, "memory(GiB)": 122.96, "step": 48445, "token_acc": 0.9823490956635432, "train_speed(iter/s)": 0.232006 }, { "epoch": 3.6931168534187058, "grad_norm": 1.1437256336212158, "learning_rate": 1.5930596792877044e-05, "loss": 0.05172572135925293, "memory(GiB)": 122.96, "step": 48450, "token_acc": 0.9792243767313019, "train_speed(iter/s)": 0.232012 }, { "epoch": 3.693497980028966, "grad_norm": 0.8684987425804138, "learning_rate": 1.5921834137555674e-05, "loss": 0.04258478581905365, "memory(GiB)": 122.96, "step": 48455, "token_acc": 0.9819254185692542, "train_speed(iter/s)": 0.232016 }, { "epoch": 3.6938791066392254, "grad_norm": 1.192156195640564, "learning_rate": 1.5913073436458976e-05, "loss": 0.05748317241668701, "memory(GiB)": 122.96, "step": 48460, "token_acc": 0.9784681636419563, "train_speed(iter/s)": 0.232024 }, { "epoch": 3.6942602332494854, "grad_norm": 1.1472495794296265, "learning_rate": 1.5904314690089344e-05, "loss": 0.06327325701713563, "memory(GiB)": 122.96, "step": 48465, "token_acc": 0.9771197846567967, "train_speed(iter/s)": 0.232031 }, { "epoch": 3.6946413598597454, "grad_norm": 0.9456011056900024, "learning_rate": 1.5895557898949026e-05, "loss": 0.05184919238090515, "memory(GiB)": 122.96, "step": 48470, "token_acc": 0.9808030715085586, "train_speed(iter/s)": 0.232034 }, { "epoch": 3.6950224864700054, "grad_norm": 1.0404905080795288, "learning_rate": 1.58868030635402e-05, "loss": 0.028860560059547423, "memory(GiB)": 122.96, "step": 48475, "token_acc": 0.9860440150295223, "train_speed(iter/s)": 0.232041 }, { "epoch": 3.6954036130802654, "grad_norm": 0.8925660252571106, "learning_rate": 1.587805018436493e-05, "loss": 0.05596458911895752, "memory(GiB)": 122.96, "step": 48480, "token_acc": 0.9756549965221424, "train_speed(iter/s)": 0.232047 }, { "epoch": 3.6957847396905255, "grad_norm": 0.6341900825500488, "learning_rate": 1.586929926192512e-05, "loss": 0.06573118567466736, "memory(GiB)": 122.96, "step": 48485, "token_acc": 0.9774040313262293, "train_speed(iter/s)": 0.232049 }, { "epoch": 3.696165866300785, "grad_norm": 1.0370584726333618, "learning_rate": 1.586055029672261e-05, "loss": 0.050352704524993894, "memory(GiB)": 122.96, "step": 48490, "token_acc": 0.9796411318150449, "train_speed(iter/s)": 0.232054 }, { "epoch": 3.696546992911045, "grad_norm": 1.1615376472473145, "learning_rate": 1.585180328925913e-05, "loss": 0.06879176497459412, "memory(GiB)": 122.96, "step": 48495, "token_acc": 0.9704565801253358, "train_speed(iter/s)": 0.232059 }, { "epoch": 3.696928119521305, "grad_norm": 1.0408551692962646, "learning_rate": 1.584305824003625e-05, "loss": 0.060592269897460936, "memory(GiB)": 122.96, "step": 48500, "token_acc": 0.9784984138174128, "train_speed(iter/s)": 0.232063 }, { "epoch": 3.6973092461315646, "grad_norm": 1.666353702545166, "learning_rate": 1.5834315149555477e-05, "loss": 0.05862908959388733, "memory(GiB)": 122.96, "step": 48505, "token_acc": 0.9704375246353961, "train_speed(iter/s)": 0.232068 }, { "epoch": 3.6976903727418247, "grad_norm": 1.731019139289856, "learning_rate": 1.5825574018318194e-05, "loss": 0.07606738805770874, "memory(GiB)": 122.96, "step": 48510, "token_acc": 0.9726277372262774, "train_speed(iter/s)": 0.232075 }, { "epoch": 3.6980714993520847, "grad_norm": 1.8112565279006958, "learning_rate": 1.5816834846825635e-05, "loss": 0.05146102905273438, "memory(GiB)": 122.96, "step": 48515, "token_acc": 0.9812583668005355, "train_speed(iter/s)": 0.232082 }, { "epoch": 3.6984526259623447, "grad_norm": 0.990960955619812, "learning_rate": 1.5808097635578982e-05, "loss": 0.0400552898645401, "memory(GiB)": 122.96, "step": 48520, "token_acc": 0.9830985915492958, "train_speed(iter/s)": 0.232088 }, { "epoch": 3.6988337525726047, "grad_norm": 0.8230341076850891, "learning_rate": 1.5799362385079253e-05, "loss": 0.05660185217857361, "memory(GiB)": 122.96, "step": 48525, "token_acc": 0.981578593518892, "train_speed(iter/s)": 0.232092 }, { "epoch": 3.6992148791828647, "grad_norm": 1.6877230405807495, "learning_rate": 1.579062909582737e-05, "loss": 0.0644965946674347, "memory(GiB)": 122.96, "step": 48530, "token_acc": 0.9787928221859706, "train_speed(iter/s)": 0.232098 }, { "epoch": 3.6995960057931248, "grad_norm": 2.41813325881958, "learning_rate": 1.5781897768324183e-05, "loss": 0.07666466236114503, "memory(GiB)": 122.96, "step": 48535, "token_acc": 0.9740932642487047, "train_speed(iter/s)": 0.232107 }, { "epoch": 3.6999771324033843, "grad_norm": 2.337859869003296, "learning_rate": 1.5773168403070344e-05, "loss": 0.051493358612060544, "memory(GiB)": 122.96, "step": 48540, "token_acc": 0.9833178869323448, "train_speed(iter/s)": 0.232115 }, { "epoch": 3.7003582590136443, "grad_norm": 1.8415247201919556, "learning_rate": 1.5764441000566472e-05, "loss": 0.051285314559936526, "memory(GiB)": 122.96, "step": 48545, "token_acc": 0.9841149773071104, "train_speed(iter/s)": 0.232123 }, { "epoch": 3.7007393856239044, "grad_norm": 1.3859657049179077, "learning_rate": 1.5755715561313044e-05, "loss": 0.09091821312904358, "memory(GiB)": 122.96, "step": 48550, "token_acc": 0.9717271051014137, "train_speed(iter/s)": 0.23213 }, { "epoch": 3.701120512234164, "grad_norm": 0.5118578672409058, "learning_rate": 1.574699208581041e-05, "loss": 0.05651033520698547, "memory(GiB)": 122.96, "step": 48555, "token_acc": 0.9835989190196627, "train_speed(iter/s)": 0.232129 }, { "epoch": 3.701501638844424, "grad_norm": 0.5589068531990051, "learning_rate": 1.5738270574558816e-05, "loss": 0.03484681248664856, "memory(GiB)": 122.96, "step": 48560, "token_acc": 0.982740021574973, "train_speed(iter/s)": 0.232128 }, { "epoch": 3.701882765454684, "grad_norm": 0.9829323291778564, "learning_rate": 1.5729551028058416e-05, "loss": 0.04082508683204651, "memory(GiB)": 122.96, "step": 48565, "token_acc": 0.9813641446142378, "train_speed(iter/s)": 0.232128 }, { "epoch": 3.702263892064944, "grad_norm": 1.311590313911438, "learning_rate": 1.572083344680923e-05, "loss": 0.03534108996391296, "memory(GiB)": 122.96, "step": 48570, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.232136 }, { "epoch": 3.702645018675204, "grad_norm": 1.2769992351531982, "learning_rate": 1.5712117831311184e-05, "loss": 0.06841142177581787, "memory(GiB)": 122.96, "step": 48575, "token_acc": 0.9767116091752758, "train_speed(iter/s)": 0.23214 }, { "epoch": 3.703026145285464, "grad_norm": 1.2119688987731934, "learning_rate": 1.570340418206405e-05, "loss": 0.05462930202484131, "memory(GiB)": 122.96, "step": 48580, "token_acc": 0.9797375851187511, "train_speed(iter/s)": 0.232144 }, { "epoch": 3.7034072718957236, "grad_norm": 0.69859778881073, "learning_rate": 1.5694692499567536e-05, "loss": 0.08060967922210693, "memory(GiB)": 122.96, "step": 48585, "token_acc": 0.9753040914117214, "train_speed(iter/s)": 0.232148 }, { "epoch": 3.7037883985059836, "grad_norm": 1.3780245780944824, "learning_rate": 1.5685982784321222e-05, "loss": 0.09155261516571045, "memory(GiB)": 122.96, "step": 48590, "token_acc": 0.9645913743916764, "train_speed(iter/s)": 0.232152 }, { "epoch": 3.7041695251162436, "grad_norm": 2.224315643310547, "learning_rate": 1.5677275036824545e-05, "loss": 0.03502265214920044, "memory(GiB)": 122.96, "step": 48595, "token_acc": 0.9842200180342651, "train_speed(iter/s)": 0.232158 }, { "epoch": 3.7045506517265037, "grad_norm": 1.0141119956970215, "learning_rate": 1.5668569257576864e-05, "loss": 0.043917950987815854, "memory(GiB)": 122.96, "step": 48600, "token_acc": 0.9837054918527459, "train_speed(iter/s)": 0.232162 }, { "epoch": 3.7045506517265037, "eval_loss": 0.05847746133804321, "eval_runtime": 219.9746, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9763041383049214, "step": 48600 }, { "epoch": 3.7049317783367632, "grad_norm": 1.41921067237854, "learning_rate": 1.5659865447077444e-05, "loss": 0.07246212363243103, "memory(GiB)": 122.96, "step": 48605, "token_acc": 0.9763993438973668, "train_speed(iter/s)": 0.231924 }, { "epoch": 3.7053129049470233, "grad_norm": 0.5053153038024902, "learning_rate": 1.565116360582536e-05, "loss": 0.0285037100315094, "memory(GiB)": 122.96, "step": 48610, "token_acc": 0.9877467665078284, "train_speed(iter/s)": 0.231931 }, { "epoch": 3.7056940315572833, "grad_norm": 0.16861605644226074, "learning_rate": 1.564246373431964e-05, "loss": 0.021909546852111817, "memory(GiB)": 122.96, "step": 48615, "token_acc": 0.9866030881017257, "train_speed(iter/s)": 0.231937 }, { "epoch": 3.7060751581675433, "grad_norm": 0.5869128704071045, "learning_rate": 1.563376583305921e-05, "loss": 0.06947723627090455, "memory(GiB)": 122.96, "step": 48620, "token_acc": 0.9796276782578153, "train_speed(iter/s)": 0.231941 }, { "epoch": 3.7064562847778033, "grad_norm": 1.1670022010803223, "learning_rate": 1.5625069902542817e-05, "loss": 0.06591216921806335, "memory(GiB)": 122.96, "step": 48625, "token_acc": 0.9660478749389351, "train_speed(iter/s)": 0.231947 }, { "epoch": 3.7068374113880633, "grad_norm": 1.151626467704773, "learning_rate": 1.561637594326914e-05, "loss": 0.0645624816417694, "memory(GiB)": 122.96, "step": 48630, "token_acc": 0.9797464665598135, "train_speed(iter/s)": 0.23195 }, { "epoch": 3.707218537998323, "grad_norm": 0.7558711767196655, "learning_rate": 1.5607683955736758e-05, "loss": 0.061853927373886106, "memory(GiB)": 122.96, "step": 48635, "token_acc": 0.9847454512038228, "train_speed(iter/s)": 0.231956 }, { "epoch": 3.707599664608583, "grad_norm": 2.431062936782837, "learning_rate": 1.559899394044409e-05, "loss": 0.07868604063987732, "memory(GiB)": 122.96, "step": 48640, "token_acc": 0.9739084132055378, "train_speed(iter/s)": 0.23196 }, { "epoch": 3.707980791218843, "grad_norm": 4.60565710067749, "learning_rate": 1.559030589788948e-05, "loss": 0.04711296260356903, "memory(GiB)": 122.96, "step": 48645, "token_acc": 0.975213927412216, "train_speed(iter/s)": 0.231966 }, { "epoch": 3.708361917829103, "grad_norm": 1.2648577690124512, "learning_rate": 1.5581619828571158e-05, "loss": 0.05967831611633301, "memory(GiB)": 122.96, "step": 48650, "token_acc": 0.9775086505190311, "train_speed(iter/s)": 0.231975 }, { "epoch": 3.7087430444393625, "grad_norm": 1.837645411491394, "learning_rate": 1.5572935732987205e-05, "loss": 0.06603884696960449, "memory(GiB)": 122.96, "step": 48655, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.231984 }, { "epoch": 3.7091241710496226, "grad_norm": 0.6567006707191467, "learning_rate": 1.556425361163562e-05, "loss": 0.05600627660751343, "memory(GiB)": 122.96, "step": 48660, "token_acc": 0.9777350192413414, "train_speed(iter/s)": 0.231984 }, { "epoch": 3.7095052976598826, "grad_norm": 0.47729143500328064, "learning_rate": 1.5555573465014312e-05, "loss": 0.04251473248004913, "memory(GiB)": 122.96, "step": 48665, "token_acc": 0.9876112371313907, "train_speed(iter/s)": 0.231988 }, { "epoch": 3.7098864242701426, "grad_norm": 1.1041089296340942, "learning_rate": 1.5546895293621005e-05, "loss": 0.05678543448448181, "memory(GiB)": 122.96, "step": 48670, "token_acc": 0.9777002133023075, "train_speed(iter/s)": 0.231992 }, { "epoch": 3.7102675508804026, "grad_norm": 1.3040434122085571, "learning_rate": 1.553821909795338e-05, "loss": 0.04241708517074585, "memory(GiB)": 122.96, "step": 48675, "token_acc": 0.9830429732868757, "train_speed(iter/s)": 0.231998 }, { "epoch": 3.7106486774906626, "grad_norm": 1.4824509620666504, "learning_rate": 1.5529544878508974e-05, "loss": 0.046774637699127194, "memory(GiB)": 122.96, "step": 48680, "token_acc": 0.9839791356184798, "train_speed(iter/s)": 0.232006 }, { "epoch": 3.711029804100922, "grad_norm": 2.039973497390747, "learning_rate": 1.552087263578519e-05, "loss": 0.05612778067588806, "memory(GiB)": 122.96, "step": 48685, "token_acc": 0.9832359274069518, "train_speed(iter/s)": 0.232009 }, { "epoch": 3.711410930711182, "grad_norm": 0.8245639801025391, "learning_rate": 1.5512202370279378e-05, "loss": 0.053490346670150755, "memory(GiB)": 122.96, "step": 48690, "token_acc": 0.9702053079619429, "train_speed(iter/s)": 0.232015 }, { "epoch": 3.7117920573214422, "grad_norm": 0.18117906153202057, "learning_rate": 1.5503534082488698e-05, "loss": 0.027326157689094542, "memory(GiB)": 122.96, "step": 48695, "token_acc": 0.987606963706108, "train_speed(iter/s)": 0.232022 }, { "epoch": 3.7121731839317023, "grad_norm": 0.6940703988075256, "learning_rate": 1.5494867772910242e-05, "loss": 0.05360459685325623, "memory(GiB)": 122.96, "step": 48700, "token_acc": 0.9771323652144884, "train_speed(iter/s)": 0.232026 }, { "epoch": 3.712554310541962, "grad_norm": 1.626269817352295, "learning_rate": 1.5486203442041026e-05, "loss": 0.04373975992202759, "memory(GiB)": 122.96, "step": 48705, "token_acc": 0.9832134292565947, "train_speed(iter/s)": 0.23203 }, { "epoch": 3.712935437152222, "grad_norm": 0.900880753993988, "learning_rate": 1.547754109037786e-05, "loss": 0.04266671538352966, "memory(GiB)": 122.96, "step": 48710, "token_acc": 0.983764705882353, "train_speed(iter/s)": 0.232036 }, { "epoch": 3.713316563762482, "grad_norm": 1.058880090713501, "learning_rate": 1.5468880718417515e-05, "loss": 0.04690352976322174, "memory(GiB)": 122.96, "step": 48715, "token_acc": 0.9843155893536122, "train_speed(iter/s)": 0.232038 }, { "epoch": 3.713697690372742, "grad_norm": 2.736949920654297, "learning_rate": 1.546022232665663e-05, "loss": 0.06237162351608276, "memory(GiB)": 122.96, "step": 48720, "token_acc": 0.974937343358396, "train_speed(iter/s)": 0.232043 }, { "epoch": 3.714078816983002, "grad_norm": 0.6298825740814209, "learning_rate": 1.54515659155917e-05, "loss": 0.05568807125091553, "memory(GiB)": 122.96, "step": 48725, "token_acc": 0.9780242779405609, "train_speed(iter/s)": 0.232049 }, { "epoch": 3.714459943593262, "grad_norm": 1.2206292152404785, "learning_rate": 1.5442911485719132e-05, "loss": 0.04889516830444336, "memory(GiB)": 122.96, "step": 48730, "token_acc": 0.9799414960300877, "train_speed(iter/s)": 0.232054 }, { "epoch": 3.7148410702035215, "grad_norm": 1.0809626579284668, "learning_rate": 1.5434259037535242e-05, "loss": 0.03863396942615509, "memory(GiB)": 122.96, "step": 48735, "token_acc": 0.9841300940438872, "train_speed(iter/s)": 0.23206 }, { "epoch": 3.7152221968137815, "grad_norm": 0.8796589374542236, "learning_rate": 1.542560857153618e-05, "loss": 0.035721606016159056, "memory(GiB)": 122.96, "step": 48740, "token_acc": 0.9865426479319216, "train_speed(iter/s)": 0.232064 }, { "epoch": 3.7156033234240415, "grad_norm": 1.1689856052398682, "learning_rate": 1.5416960088218036e-05, "loss": 0.06162925958633423, "memory(GiB)": 122.96, "step": 48745, "token_acc": 0.9856495468277946, "train_speed(iter/s)": 0.232073 }, { "epoch": 3.7159844500343016, "grad_norm": 1.5225716829299927, "learning_rate": 1.540831358807673e-05, "loss": 0.0512096107006073, "memory(GiB)": 122.96, "step": 48750, "token_acc": 0.9799735499716606, "train_speed(iter/s)": 0.232076 }, { "epoch": 3.716365576644561, "grad_norm": 0.8682401776313782, "learning_rate": 1.5399669071608114e-05, "loss": 0.06712403893470764, "memory(GiB)": 122.96, "step": 48755, "token_acc": 0.9795721409039729, "train_speed(iter/s)": 0.232081 }, { "epoch": 3.716746703254821, "grad_norm": 1.4565927982330322, "learning_rate": 1.5391026539307927e-05, "loss": 0.05284461975097656, "memory(GiB)": 122.96, "step": 48760, "token_acc": 0.979905005480453, "train_speed(iter/s)": 0.232087 }, { "epoch": 3.717127829865081, "grad_norm": 0.9094082117080688, "learning_rate": 1.538238599167175e-05, "loss": 0.0832652509212494, "memory(GiB)": 122.96, "step": 48765, "token_acc": 0.9787716739588397, "train_speed(iter/s)": 0.23209 }, { "epoch": 3.717508956475341, "grad_norm": 0.9550985097885132, "learning_rate": 1.537374742919509e-05, "loss": 0.04529339075088501, "memory(GiB)": 122.96, "step": 48770, "token_acc": 0.980956411341515, "train_speed(iter/s)": 0.232095 }, { "epoch": 3.717890083085601, "grad_norm": 2.0775845050811768, "learning_rate": 1.5365110852373345e-05, "loss": 0.05505893230438232, "memory(GiB)": 122.96, "step": 48775, "token_acc": 0.979767014101778, "train_speed(iter/s)": 0.232101 }, { "epoch": 3.7182712096958612, "grad_norm": 1.3295663595199585, "learning_rate": 1.535647626170175e-05, "loss": 0.06241623759269714, "memory(GiB)": 122.96, "step": 48780, "token_acc": 0.9793291731669267, "train_speed(iter/s)": 0.232107 }, { "epoch": 3.718652336306121, "grad_norm": 0.767246425151825, "learning_rate": 1.5347843657675476e-05, "loss": 0.056697767972946164, "memory(GiB)": 122.96, "step": 48785, "token_acc": 0.9806635456950946, "train_speed(iter/s)": 0.232111 }, { "epoch": 3.719033462916381, "grad_norm": 1.235554814338684, "learning_rate": 1.533921304078958e-05, "loss": 0.080640310049057, "memory(GiB)": 122.96, "step": 48790, "token_acc": 0.9802391232148788, "train_speed(iter/s)": 0.232114 }, { "epoch": 3.719414589526641, "grad_norm": 0.4389704465866089, "learning_rate": 1.5330584411538955e-05, "loss": 0.05586666464805603, "memory(GiB)": 122.96, "step": 48795, "token_acc": 0.9806609547123623, "train_speed(iter/s)": 0.232121 }, { "epoch": 3.7197957161369004, "grad_norm": 0.7085950374603271, "learning_rate": 1.5321957770418427e-05, "loss": 0.07089146971702576, "memory(GiB)": 122.96, "step": 48800, "token_acc": 0.975984796129924, "train_speed(iter/s)": 0.232125 }, { "epoch": 3.7197957161369004, "eval_loss": 0.05823696032166481, "eval_runtime": 218.6123, "eval_samples_per_second": 2.424, "eval_steps_per_second": 2.424, "eval_token_acc": 0.9762363713029335, "step": 48800 }, { "epoch": 3.7201768427471604, "grad_norm": 0.9154613614082336, "learning_rate": 1.5313333117922712e-05, "loss": 0.05939228534698486, "memory(GiB)": 122.96, "step": 48805, "token_acc": 0.9761647745796235, "train_speed(iter/s)": 0.231888 }, { "epoch": 3.7205579693574204, "grad_norm": 1.0025330781936646, "learning_rate": 1.5304710454546357e-05, "loss": 0.03069949746131897, "memory(GiB)": 122.96, "step": 48810, "token_acc": 0.9858447488584475, "train_speed(iter/s)": 0.231894 }, { "epoch": 3.7209390959676805, "grad_norm": 0.9321764707565308, "learning_rate": 1.5296089780783855e-05, "loss": 0.05340126156806946, "memory(GiB)": 122.96, "step": 48815, "token_acc": 0.9762789368390968, "train_speed(iter/s)": 0.2319 }, { "epoch": 3.7213202225779405, "grad_norm": 1.1412880420684814, "learning_rate": 1.5287471097129573e-05, "loss": 0.041927629709243776, "memory(GiB)": 122.96, "step": 48820, "token_acc": 0.985516645049719, "train_speed(iter/s)": 0.231905 }, { "epoch": 3.7217013491882005, "grad_norm": 0.6852235198020935, "learning_rate": 1.5278854404077726e-05, "loss": 0.07262083888053894, "memory(GiB)": 122.96, "step": 48825, "token_acc": 0.9736566382278102, "train_speed(iter/s)": 0.231909 }, { "epoch": 3.7220824757984605, "grad_norm": 1.4083611965179443, "learning_rate": 1.5270239702122447e-05, "loss": 0.04360325932502747, "memory(GiB)": 122.96, "step": 48830, "token_acc": 0.9819881754434209, "train_speed(iter/s)": 0.231912 }, { "epoch": 3.72246360240872, "grad_norm": 0.8190171718597412, "learning_rate": 1.5261626991757756e-05, "loss": 0.03207942843437195, "memory(GiB)": 122.96, "step": 48835, "token_acc": 0.9864130434782609, "train_speed(iter/s)": 0.231917 }, { "epoch": 3.72284472901898, "grad_norm": 0.8472197651863098, "learning_rate": 1.5253016273477555e-05, "loss": 0.032633519172668456, "memory(GiB)": 122.96, "step": 48840, "token_acc": 0.988931460195828, "train_speed(iter/s)": 0.231923 }, { "epoch": 3.72322585562924, "grad_norm": 1.227094054222107, "learning_rate": 1.5244407547775641e-05, "loss": 0.0774518072605133, "memory(GiB)": 122.96, "step": 48845, "token_acc": 0.9765984890363, "train_speed(iter/s)": 0.231927 }, { "epoch": 3.7236069822394997, "grad_norm": 1.451130747795105, "learning_rate": 1.523580081514565e-05, "loss": 0.04026959836483002, "memory(GiB)": 122.96, "step": 48850, "token_acc": 0.9831819060506476, "train_speed(iter/s)": 0.231932 }, { "epoch": 3.7239881088497597, "grad_norm": 1.4702320098876953, "learning_rate": 1.5227196076081158e-05, "loss": 0.05356399416923523, "memory(GiB)": 122.96, "step": 48855, "token_acc": 0.97508038585209, "train_speed(iter/s)": 0.23194 }, { "epoch": 3.7243692354600197, "grad_norm": 1.2157304286956787, "learning_rate": 1.521859333107562e-05, "loss": 0.05711514949798584, "memory(GiB)": 122.96, "step": 48860, "token_acc": 0.9701216287678477, "train_speed(iter/s)": 0.231946 }, { "epoch": 3.7247503620702798, "grad_norm": 1.1650246381759644, "learning_rate": 1.5209992580622334e-05, "loss": 0.04022566676139831, "memory(GiB)": 122.96, "step": 48865, "token_acc": 0.983475951608144, "train_speed(iter/s)": 0.231949 }, { "epoch": 3.72513148868054, "grad_norm": 0.16441799700260162, "learning_rate": 1.5201393825214528e-05, "loss": 0.05265583992004395, "memory(GiB)": 122.96, "step": 48870, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.231958 }, { "epoch": 3.7255126152908, "grad_norm": 0.9477109909057617, "learning_rate": 1.5192797065345315e-05, "loss": 0.028566893935203553, "memory(GiB)": 122.96, "step": 48875, "token_acc": 0.9871391076115485, "train_speed(iter/s)": 0.231964 }, { "epoch": 3.7258937419010594, "grad_norm": 1.0979293584823608, "learning_rate": 1.5184202301507649e-05, "loss": 0.046809914708137515, "memory(GiB)": 122.96, "step": 48880, "token_acc": 0.9770047169811321, "train_speed(iter/s)": 0.231972 }, { "epoch": 3.7262748685113194, "grad_norm": 0.9250083565711975, "learning_rate": 1.5175609534194419e-05, "loss": 0.03163527846336365, "memory(GiB)": 122.96, "step": 48885, "token_acc": 0.9876556347770565, "train_speed(iter/s)": 0.231973 }, { "epoch": 3.7266559951215794, "grad_norm": 1.5738561153411865, "learning_rate": 1.5167018763898395e-05, "loss": 0.07032080292701721, "memory(GiB)": 122.96, "step": 48890, "token_acc": 0.9794190285781489, "train_speed(iter/s)": 0.231977 }, { "epoch": 3.7270371217318394, "grad_norm": 1.9003453254699707, "learning_rate": 1.5158429991112177e-05, "loss": 0.061708831787109376, "memory(GiB)": 122.96, "step": 48895, "token_acc": 0.9799082769163573, "train_speed(iter/s)": 0.231983 }, { "epoch": 3.727418248342099, "grad_norm": 1.8850990533828735, "learning_rate": 1.5149843216328325e-05, "loss": 0.053540974855422974, "memory(GiB)": 122.96, "step": 48900, "token_acc": 0.9785932721712538, "train_speed(iter/s)": 0.23199 }, { "epoch": 3.727799374952359, "grad_norm": 0.7754955291748047, "learning_rate": 1.5141258440039246e-05, "loss": 0.045809459686279294, "memory(GiB)": 122.96, "step": 48905, "token_acc": 0.9786795048143053, "train_speed(iter/s)": 0.231995 }, { "epoch": 3.728180501562619, "grad_norm": 0.9432021379470825, "learning_rate": 1.5132675662737223e-05, "loss": 0.04675309062004089, "memory(GiB)": 122.96, "step": 48910, "token_acc": 0.9845545977011494, "train_speed(iter/s)": 0.232002 }, { "epoch": 3.728561628172879, "grad_norm": 1.0406442880630493, "learning_rate": 1.5124094884914453e-05, "loss": 0.05537645816802979, "memory(GiB)": 122.96, "step": 48915, "token_acc": 0.9665358397025408, "train_speed(iter/s)": 0.232007 }, { "epoch": 3.728942754783139, "grad_norm": 1.1862291097640991, "learning_rate": 1.5115516107062988e-05, "loss": 0.04712205529212952, "memory(GiB)": 122.96, "step": 48920, "token_acc": 0.9845852703694642, "train_speed(iter/s)": 0.232014 }, { "epoch": 3.729323881393399, "grad_norm": 1.9611831903457642, "learning_rate": 1.510693932967479e-05, "loss": 0.08562690019607544, "memory(GiB)": 122.96, "step": 48925, "token_acc": 0.9679783950617284, "train_speed(iter/s)": 0.232021 }, { "epoch": 3.7297050080036587, "grad_norm": 0.6685274839401245, "learning_rate": 1.5098364553241712e-05, "loss": 0.058075320720672605, "memory(GiB)": 122.96, "step": 48930, "token_acc": 0.9765441751368257, "train_speed(iter/s)": 0.232026 }, { "epoch": 3.7300861346139187, "grad_norm": 0.7071112990379333, "learning_rate": 1.5089791778255452e-05, "loss": 0.04090987741947174, "memory(GiB)": 122.96, "step": 48935, "token_acc": 0.9805515239477504, "train_speed(iter/s)": 0.232032 }, { "epoch": 3.7304672612241787, "grad_norm": 0.3011295795440674, "learning_rate": 1.5081221005207624e-05, "loss": 0.03296991586685181, "memory(GiB)": 122.96, "step": 48940, "token_acc": 0.986940780967158, "train_speed(iter/s)": 0.232035 }, { "epoch": 3.7308483878344387, "grad_norm": 0.9027917981147766, "learning_rate": 1.5072652234589752e-05, "loss": 0.04842948317527771, "memory(GiB)": 122.96, "step": 48945, "token_acc": 0.9835882727852135, "train_speed(iter/s)": 0.232037 }, { "epoch": 3.7312295144446983, "grad_norm": 1.7066742181777954, "learning_rate": 1.5064085466893169e-05, "loss": 0.07208556532859803, "memory(GiB)": 122.96, "step": 48950, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.232044 }, { "epoch": 3.7316106410549583, "grad_norm": 0.6871095895767212, "learning_rate": 1.5055520702609166e-05, "loss": 0.070688796043396, "memory(GiB)": 122.96, "step": 48955, "token_acc": 0.9800268868830421, "train_speed(iter/s)": 0.232049 }, { "epoch": 3.7319917676652183, "grad_norm": 1.0584282875061035, "learning_rate": 1.50469579422289e-05, "loss": 0.06227047443389892, "memory(GiB)": 122.96, "step": 48960, "token_acc": 0.9738035264483628, "train_speed(iter/s)": 0.232055 }, { "epoch": 3.7323728942754784, "grad_norm": 0.7508059740066528, "learning_rate": 1.5038397186243363e-05, "loss": 0.04112919569015503, "memory(GiB)": 122.96, "step": 48965, "token_acc": 0.9830461750516885, "train_speed(iter/s)": 0.232058 }, { "epoch": 3.7327540208857384, "grad_norm": 0.7023333311080933, "learning_rate": 1.5029838435143544e-05, "loss": 0.06964746117591858, "memory(GiB)": 122.96, "step": 48970, "token_acc": 0.9748427672955975, "train_speed(iter/s)": 0.232062 }, { "epoch": 3.7331351474959984, "grad_norm": 0.11623256653547287, "learning_rate": 1.5021281689420186e-05, "loss": 0.05047510266304016, "memory(GiB)": 122.96, "step": 48975, "token_acc": 0.9724803431022159, "train_speed(iter/s)": 0.232066 }, { "epoch": 3.733516274106258, "grad_norm": 1.4545999765396118, "learning_rate": 1.5012726949564004e-05, "loss": 0.032633939385414125, "memory(GiB)": 122.96, "step": 48980, "token_acc": 0.9890438247011952, "train_speed(iter/s)": 0.232075 }, { "epoch": 3.733897400716518, "grad_norm": 1.1644946336746216, "learning_rate": 1.5004174216065592e-05, "loss": 0.04062047600746155, "memory(GiB)": 122.96, "step": 48985, "token_acc": 0.9838345864661654, "train_speed(iter/s)": 0.232079 }, { "epoch": 3.734278527326778, "grad_norm": 0.8848966956138611, "learning_rate": 1.4995623489415366e-05, "loss": 0.02991829514503479, "memory(GiB)": 122.96, "step": 48990, "token_acc": 0.9871981379109689, "train_speed(iter/s)": 0.232086 }, { "epoch": 3.734659653937038, "grad_norm": 1.7558376789093018, "learning_rate": 1.4987074770103694e-05, "loss": 0.06936917304992676, "memory(GiB)": 122.96, "step": 48995, "token_acc": 0.9732914375490966, "train_speed(iter/s)": 0.232091 }, { "epoch": 3.7350407805472976, "grad_norm": 3.1976816654205322, "learning_rate": 1.4978528058620822e-05, "loss": 0.07226569056510926, "memory(GiB)": 122.96, "step": 49000, "token_acc": 0.9713608532490619, "train_speed(iter/s)": 0.232097 }, { "epoch": 3.7350407805472976, "eval_loss": 0.057013314217329025, "eval_runtime": 220.1882, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "eval_token_acc": 0.9768236853201614, "step": 49000 }, { "epoch": 3.7354219071575576, "grad_norm": 0.5227184295654297, "learning_rate": 1.496998335545683e-05, "loss": 0.0424824595451355, "memory(GiB)": 122.96, "step": 49005, "token_acc": 0.9769970907438755, "train_speed(iter/s)": 0.231861 }, { "epoch": 3.7358030337678176, "grad_norm": 1.1877198219299316, "learning_rate": 1.4961440661101732e-05, "loss": 0.054729503393173215, "memory(GiB)": 122.96, "step": 49010, "token_acc": 0.9753593429158111, "train_speed(iter/s)": 0.231865 }, { "epoch": 3.7361841603780777, "grad_norm": 1.735164761543274, "learning_rate": 1.4952899976045426e-05, "loss": 0.06321409940719605, "memory(GiB)": 122.96, "step": 49015, "token_acc": 0.9730751062824752, "train_speed(iter/s)": 0.23187 }, { "epoch": 3.7365652869883377, "grad_norm": 0.9821336269378662, "learning_rate": 1.494436130077766e-05, "loss": 0.07832803726196289, "memory(GiB)": 122.96, "step": 49020, "token_acc": 0.9698080023450095, "train_speed(iter/s)": 0.231874 }, { "epoch": 3.7369464135985977, "grad_norm": 0.6912567615509033, "learning_rate": 1.4935824635788088e-05, "loss": 0.04839940667152405, "memory(GiB)": 122.96, "step": 49025, "token_acc": 0.9828662930344275, "train_speed(iter/s)": 0.231878 }, { "epoch": 3.7373275402088573, "grad_norm": 1.6851619482040405, "learning_rate": 1.4927289981566277e-05, "loss": 0.05730386972427368, "memory(GiB)": 122.96, "step": 49030, "token_acc": 0.9844626672421234, "train_speed(iter/s)": 0.231884 }, { "epoch": 3.7377086668191173, "grad_norm": 1.0678712129592896, "learning_rate": 1.4918757338601608e-05, "loss": 0.03656271696090698, "memory(GiB)": 122.96, "step": 49035, "token_acc": 0.9838403041825095, "train_speed(iter/s)": 0.231889 }, { "epoch": 3.7380897934293773, "grad_norm": 4.605990409851074, "learning_rate": 1.4910226707383412e-05, "loss": 0.06931053400039673, "memory(GiB)": 122.96, "step": 49040, "token_acc": 0.9786892758936755, "train_speed(iter/s)": 0.231895 }, { "epoch": 3.7384709200396373, "grad_norm": 3.1493732929229736, "learning_rate": 1.4901698088400895e-05, "loss": 0.07509937286376953, "memory(GiB)": 122.96, "step": 49045, "token_acc": 0.9782683093771389, "train_speed(iter/s)": 0.231898 }, { "epoch": 3.738852046649897, "grad_norm": 0.0008121732389554381, "learning_rate": 1.4893171482143097e-05, "loss": 0.04620593786239624, "memory(GiB)": 122.96, "step": 49050, "token_acc": 0.9803229998143679, "train_speed(iter/s)": 0.231903 }, { "epoch": 3.739233173260157, "grad_norm": 1.2293556928634644, "learning_rate": 1.4884646889098996e-05, "loss": 0.04439484477043152, "memory(GiB)": 122.96, "step": 49055, "token_acc": 0.9883527454242929, "train_speed(iter/s)": 0.231906 }, { "epoch": 3.739614299870417, "grad_norm": 1.7414129972457886, "learning_rate": 1.4876124309757466e-05, "loss": 0.07393454909324645, "memory(GiB)": 122.96, "step": 49060, "token_acc": 0.97143840330351, "train_speed(iter/s)": 0.231913 }, { "epoch": 3.739995426480677, "grad_norm": 1.1627411842346191, "learning_rate": 1.4867603744607189e-05, "loss": 0.05280392169952393, "memory(GiB)": 122.96, "step": 49065, "token_acc": 0.9816377906115801, "train_speed(iter/s)": 0.231916 }, { "epoch": 3.740376553090937, "grad_norm": 0.54014652967453, "learning_rate": 1.4859085194136808e-05, "loss": 0.046751323342323306, "memory(GiB)": 122.96, "step": 49070, "token_acc": 0.9836372720459655, "train_speed(iter/s)": 0.231918 }, { "epoch": 3.740757679701197, "grad_norm": 1.5349016189575195, "learning_rate": 1.4850568658834829e-05, "loss": 0.047150492668151855, "memory(GiB)": 122.96, "step": 49075, "token_acc": 0.9819477434679335, "train_speed(iter/s)": 0.231925 }, { "epoch": 3.7411388063114566, "grad_norm": 2.01247501373291, "learning_rate": 1.484205413918961e-05, "loss": 0.049671322107315063, "memory(GiB)": 122.96, "step": 49080, "token_acc": 0.9804847340258105, "train_speed(iter/s)": 0.231932 }, { "epoch": 3.7415199329217166, "grad_norm": 0.9363377094268799, "learning_rate": 1.4833541635689447e-05, "loss": 0.0807680368423462, "memory(GiB)": 122.96, "step": 49085, "token_acc": 0.9705454138201366, "train_speed(iter/s)": 0.231931 }, { "epoch": 3.7419010595319766, "grad_norm": 2.3982794284820557, "learning_rate": 1.4825031148822465e-05, "loss": 0.06936993598937988, "memory(GiB)": 122.96, "step": 49090, "token_acc": 0.981835264641403, "train_speed(iter/s)": 0.231938 }, { "epoch": 3.742282186142236, "grad_norm": 2.4877123832702637, "learning_rate": 1.4816522679076717e-05, "loss": 0.06697304248809814, "memory(GiB)": 122.96, "step": 49095, "token_acc": 0.9739592884650651, "train_speed(iter/s)": 0.231944 }, { "epoch": 3.742663312752496, "grad_norm": 1.0323090553283691, "learning_rate": 1.4808016226940118e-05, "loss": 0.05782181620597839, "memory(GiB)": 122.96, "step": 49100, "token_acc": 0.9733091474475253, "train_speed(iter/s)": 0.23195 }, { "epoch": 3.743044439362756, "grad_norm": 1.2188265323638916, "learning_rate": 1.4799511792900477e-05, "loss": 0.08327438235282898, "memory(GiB)": 122.96, "step": 49105, "token_acc": 0.9762724837351703, "train_speed(iter/s)": 0.231958 }, { "epoch": 3.7434255659730162, "grad_norm": 1.1094975471496582, "learning_rate": 1.4791009377445487e-05, "loss": 0.07091230154037476, "memory(GiB)": 122.96, "step": 49110, "token_acc": 0.9773341338937256, "train_speed(iter/s)": 0.23196 }, { "epoch": 3.7438066925832763, "grad_norm": 0.5982166528701782, "learning_rate": 1.4782508981062738e-05, "loss": 0.04426900744438171, "memory(GiB)": 122.96, "step": 49115, "token_acc": 0.9842637071393076, "train_speed(iter/s)": 0.231964 }, { "epoch": 3.7441878191935363, "grad_norm": 1.735591173171997, "learning_rate": 1.4774010604239652e-05, "loss": 0.03176187574863434, "memory(GiB)": 122.96, "step": 49120, "token_acc": 0.9852132049518569, "train_speed(iter/s)": 0.231971 }, { "epoch": 3.7445689458037963, "grad_norm": 2.5108978748321533, "learning_rate": 1.4765514247463602e-05, "loss": 0.07998481988906861, "memory(GiB)": 122.96, "step": 49125, "token_acc": 0.9730789802103762, "train_speed(iter/s)": 0.231975 }, { "epoch": 3.744950072414056, "grad_norm": 1.5338020324707031, "learning_rate": 1.4757019911221787e-05, "loss": 0.11357120275497437, "memory(GiB)": 122.96, "step": 49130, "token_acc": 0.9707252162341983, "train_speed(iter/s)": 0.231979 }, { "epoch": 3.745331199024316, "grad_norm": 1.333380937576294, "learning_rate": 1.4748527596001333e-05, "loss": 0.06728856563568116, "memory(GiB)": 122.96, "step": 49135, "token_acc": 0.9715528175394094, "train_speed(iter/s)": 0.231984 }, { "epoch": 3.745712325634576, "grad_norm": 1.9847711324691772, "learning_rate": 1.4740037302289256e-05, "loss": 0.03741539716720581, "memory(GiB)": 122.96, "step": 49140, "token_acc": 0.9848182181382341, "train_speed(iter/s)": 0.231992 }, { "epoch": 3.7460934522448355, "grad_norm": 0.6527532339096069, "learning_rate": 1.4731549030572389e-05, "loss": 0.038103365898132326, "memory(GiB)": 122.96, "step": 49145, "token_acc": 0.9882926829268293, "train_speed(iter/s)": 0.231998 }, { "epoch": 3.7464745788550955, "grad_norm": 0.9204108119010925, "learning_rate": 1.4723062781337527e-05, "loss": 0.053105777502059935, "memory(GiB)": 122.96, "step": 49150, "token_acc": 0.9795032776290278, "train_speed(iter/s)": 0.231998 }, { "epoch": 3.7468557054653555, "grad_norm": 1.6908094882965088, "learning_rate": 1.4714578555071318e-05, "loss": 0.08305364847183228, "memory(GiB)": 122.96, "step": 49155, "token_acc": 0.9648586707410237, "train_speed(iter/s)": 0.232006 }, { "epoch": 3.7472368320756155, "grad_norm": 0.6499575972557068, "learning_rate": 1.4706096352260273e-05, "loss": 0.06048931479454041, "memory(GiB)": 122.96, "step": 49160, "token_acc": 0.9812371134020619, "train_speed(iter/s)": 0.232012 }, { "epoch": 3.7476179586858755, "grad_norm": 1.4950976371765137, "learning_rate": 1.469761617339082e-05, "loss": 0.05918506383895874, "memory(GiB)": 122.96, "step": 49165, "token_acc": 0.97906209665854, "train_speed(iter/s)": 0.232015 }, { "epoch": 3.7479990852961356, "grad_norm": 1.3800337314605713, "learning_rate": 1.4689138018949273e-05, "loss": 0.03078022599220276, "memory(GiB)": 122.96, "step": 49170, "token_acc": 0.9867267124911117, "train_speed(iter/s)": 0.232021 }, { "epoch": 3.748380211906395, "grad_norm": 1.0513643026351929, "learning_rate": 1.4680661889421776e-05, "loss": 0.06265894174575806, "memory(GiB)": 122.96, "step": 49175, "token_acc": 0.9810151878497202, "train_speed(iter/s)": 0.232025 }, { "epoch": 3.748761338516655, "grad_norm": 1.272114872932434, "learning_rate": 1.4672187785294423e-05, "loss": 0.07429357767105102, "memory(GiB)": 122.96, "step": 49180, "token_acc": 0.9727784026996625, "train_speed(iter/s)": 0.232031 }, { "epoch": 3.749142465126915, "grad_norm": 2.0578062534332275, "learning_rate": 1.4663715707053171e-05, "loss": 0.05489050149917603, "memory(GiB)": 122.96, "step": 49185, "token_acc": 0.9815754841135552, "train_speed(iter/s)": 0.232034 }, { "epoch": 3.749523591737175, "grad_norm": 1.948913335800171, "learning_rate": 1.465524565518383e-05, "loss": 0.07350223064422608, "memory(GiB)": 122.96, "step": 49190, "token_acc": 0.9711379879054426, "train_speed(iter/s)": 0.23204 }, { "epoch": 3.7499047183474348, "grad_norm": 0.42155614495277405, "learning_rate": 1.4646777630172132e-05, "loss": 0.042332953214645384, "memory(GiB)": 122.96, "step": 49195, "token_acc": 0.9851257075161248, "train_speed(iter/s)": 0.232042 }, { "epoch": 3.750285844957695, "grad_norm": 2.1088130474090576, "learning_rate": 1.4638311632503693e-05, "loss": 0.05472139716148376, "memory(GiB)": 122.96, "step": 49200, "token_acc": 0.9774398395721925, "train_speed(iter/s)": 0.232047 }, { "epoch": 3.750285844957695, "eval_loss": 0.05730169266462326, "eval_runtime": 220.6745, "eval_samples_per_second": 2.402, "eval_steps_per_second": 2.402, "eval_token_acc": 0.976831214987049, "step": 49200 }, { "epoch": 3.750666971567955, "grad_norm": 3.634643077850342, "learning_rate": 1.462984766266397e-05, "loss": 0.04340276420116425, "memory(GiB)": 122.96, "step": 49205, "token_acc": 0.9771390015453484, "train_speed(iter/s)": 0.231812 }, { "epoch": 3.751048098178215, "grad_norm": 1.1849124431610107, "learning_rate": 1.4621385721138341e-05, "loss": 0.07287706136703491, "memory(GiB)": 122.96, "step": 49210, "token_acc": 0.9738219895287958, "train_speed(iter/s)": 0.231818 }, { "epoch": 3.751429224788475, "grad_norm": 1.2348345518112183, "learning_rate": 1.461292580841208e-05, "loss": 0.0424186110496521, "memory(GiB)": 122.96, "step": 49215, "token_acc": 0.9785320568610386, "train_speed(iter/s)": 0.231825 }, { "epoch": 3.751810351398735, "grad_norm": 1.1550379991531372, "learning_rate": 1.4604467924970294e-05, "loss": 0.040815478563308714, "memory(GiB)": 122.96, "step": 49220, "token_acc": 0.9877003354453969, "train_speed(iter/s)": 0.231828 }, { "epoch": 3.7521914780089944, "grad_norm": 2.527832269668579, "learning_rate": 1.4596012071298021e-05, "loss": 0.0467542827129364, "memory(GiB)": 122.96, "step": 49225, "token_acc": 0.9850236966824645, "train_speed(iter/s)": 0.231832 }, { "epoch": 3.7525726046192545, "grad_norm": 1.4194464683532715, "learning_rate": 1.4587558247880156e-05, "loss": 0.04871063530445099, "memory(GiB)": 122.96, "step": 49230, "token_acc": 0.9798439531859557, "train_speed(iter/s)": 0.231839 }, { "epoch": 3.7529537312295145, "grad_norm": 0.900928258895874, "learning_rate": 1.4579106455201491e-05, "loss": 0.04832919239997864, "memory(GiB)": 122.96, "step": 49235, "token_acc": 0.9788543897216274, "train_speed(iter/s)": 0.231841 }, { "epoch": 3.7533348578397745, "grad_norm": 1.3773133754730225, "learning_rate": 1.457065669374672e-05, "loss": 0.06057637929916382, "memory(GiB)": 122.96, "step": 49240, "token_acc": 0.9755902360944377, "train_speed(iter/s)": 0.231846 }, { "epoch": 3.753715984450034, "grad_norm": 0.9474126100540161, "learning_rate": 1.4562208964000352e-05, "loss": 0.04279967844486236, "memory(GiB)": 122.96, "step": 49245, "token_acc": 0.9776796664213883, "train_speed(iter/s)": 0.231853 }, { "epoch": 3.754097111060294, "grad_norm": 0.9218318462371826, "learning_rate": 1.4553763266446851e-05, "loss": 0.04951457977294922, "memory(GiB)": 122.96, "step": 49250, "token_acc": 0.9795557302929034, "train_speed(iter/s)": 0.231858 }, { "epoch": 3.754478237670554, "grad_norm": 0.5153103470802307, "learning_rate": 1.4545319601570556e-05, "loss": 0.08016550540924072, "memory(GiB)": 122.96, "step": 49255, "token_acc": 0.9577613516367476, "train_speed(iter/s)": 0.231866 }, { "epoch": 3.754859364280814, "grad_norm": 1.8108398914337158, "learning_rate": 1.4536877969855633e-05, "loss": 0.054098653793334964, "memory(GiB)": 122.96, "step": 49260, "token_acc": 0.9751895244658856, "train_speed(iter/s)": 0.231872 }, { "epoch": 3.755240490891074, "grad_norm": 0.6787075996398926, "learning_rate": 1.452843837178619e-05, "loss": 0.05782237648963928, "memory(GiB)": 122.96, "step": 49265, "token_acc": 0.9786532726416575, "train_speed(iter/s)": 0.231877 }, { "epoch": 3.755621617501334, "grad_norm": 3.2243194580078125, "learning_rate": 1.4520000807846213e-05, "loss": 0.0693245768547058, "memory(GiB)": 122.96, "step": 49270, "token_acc": 0.979259686014691, "train_speed(iter/s)": 0.231878 }, { "epoch": 3.7560027441115937, "grad_norm": 1.021813988685608, "learning_rate": 1.4511565278519523e-05, "loss": 0.08610450625419616, "memory(GiB)": 122.96, "step": 49275, "token_acc": 0.9642643242164105, "train_speed(iter/s)": 0.231884 }, { "epoch": 3.7563838707218538, "grad_norm": 0.956100583076477, "learning_rate": 1.4503131784289886e-05, "loss": 0.04096458554267883, "memory(GiB)": 122.96, "step": 49280, "token_acc": 0.9848999622499056, "train_speed(iter/s)": 0.231892 }, { "epoch": 3.7567649973321138, "grad_norm": 2.196608543395996, "learning_rate": 1.4494700325640926e-05, "loss": 0.06224585175514221, "memory(GiB)": 122.96, "step": 49285, "token_acc": 0.9734623015873016, "train_speed(iter/s)": 0.231897 }, { "epoch": 3.757146123942374, "grad_norm": 1.330375075340271, "learning_rate": 1.448627090305612e-05, "loss": 0.04344974756240845, "memory(GiB)": 122.96, "step": 49290, "token_acc": 0.9819456617002629, "train_speed(iter/s)": 0.231901 }, { "epoch": 3.7575272505526334, "grad_norm": 1.4644325971603394, "learning_rate": 1.4477843517018897e-05, "loss": 0.036493897438049316, "memory(GiB)": 122.96, "step": 49295, "token_acc": 0.9846077457795432, "train_speed(iter/s)": 0.231909 }, { "epoch": 3.7579083771628934, "grad_norm": 1.576104760169983, "learning_rate": 1.4469418168012483e-05, "loss": 0.0472178190946579, "memory(GiB)": 122.96, "step": 49300, "token_acc": 0.98001223740567, "train_speed(iter/s)": 0.231915 }, { "epoch": 3.7582895037731534, "grad_norm": 1.7189468145370483, "learning_rate": 1.4460994856520055e-05, "loss": 0.05116929411888123, "memory(GiB)": 122.96, "step": 49305, "token_acc": 0.9780120481927711, "train_speed(iter/s)": 0.231921 }, { "epoch": 3.7586706303834134, "grad_norm": 0.8280119895935059, "learning_rate": 1.445257358302467e-05, "loss": 0.03652292191982269, "memory(GiB)": 122.96, "step": 49310, "token_acc": 0.9861259338313767, "train_speed(iter/s)": 0.231927 }, { "epoch": 3.7590517569936734, "grad_norm": 1.176550030708313, "learning_rate": 1.4444154348009215e-05, "loss": 0.07219944000244141, "memory(GiB)": 122.96, "step": 49315, "token_acc": 0.970890785729919, "train_speed(iter/s)": 0.231933 }, { "epoch": 3.7594328836039335, "grad_norm": 1.1383305788040161, "learning_rate": 1.4435737151956507e-05, "loss": 0.063514244556427, "memory(GiB)": 122.96, "step": 49320, "token_acc": 0.9756181912502162, "train_speed(iter/s)": 0.231937 }, { "epoch": 3.759814010214193, "grad_norm": 2.173473834991455, "learning_rate": 1.4427321995349247e-05, "loss": 0.09212472438812255, "memory(GiB)": 122.96, "step": 49325, "token_acc": 0.9628571428571429, "train_speed(iter/s)": 0.231945 }, { "epoch": 3.760195136824453, "grad_norm": 1.7989131212234497, "learning_rate": 1.4418908878669984e-05, "loss": 0.0803191065788269, "memory(GiB)": 122.96, "step": 49330, "token_acc": 0.9717429357339334, "train_speed(iter/s)": 0.231951 }, { "epoch": 3.760576263434713, "grad_norm": 2.313945770263672, "learning_rate": 1.4410497802401174e-05, "loss": 0.0729818344116211, "memory(GiB)": 122.96, "step": 49335, "token_acc": 0.9670596393897365, "train_speed(iter/s)": 0.231959 }, { "epoch": 3.760957390044973, "grad_norm": 1.0201239585876465, "learning_rate": 1.4402088767025179e-05, "loss": 0.0304873526096344, "memory(GiB)": 122.96, "step": 49340, "token_acc": 0.9856566562079785, "train_speed(iter/s)": 0.231965 }, { "epoch": 3.7613385166552327, "grad_norm": 0.7694002389907837, "learning_rate": 1.4393681773024188e-05, "loss": 0.06002195477485657, "memory(GiB)": 122.96, "step": 49345, "token_acc": 0.9776220005392289, "train_speed(iter/s)": 0.231967 }, { "epoch": 3.7617196432654927, "grad_norm": 1.7079490423202515, "learning_rate": 1.4385276820880306e-05, "loss": 0.051539909839630124, "memory(GiB)": 122.96, "step": 49350, "token_acc": 0.9804618117229129, "train_speed(iter/s)": 0.231972 }, { "epoch": 3.7621007698757527, "grad_norm": 0.257594496011734, "learning_rate": 1.4376873911075544e-05, "loss": 0.04232952892780304, "memory(GiB)": 122.96, "step": 49355, "token_acc": 0.9780746831106544, "train_speed(iter/s)": 0.231978 }, { "epoch": 3.7624818964860127, "grad_norm": 1.9846243858337402, "learning_rate": 1.4368473044091735e-05, "loss": 0.05914462804794311, "memory(GiB)": 122.96, "step": 49360, "token_acc": 0.9829987709954936, "train_speed(iter/s)": 0.231984 }, { "epoch": 3.7628630230962727, "grad_norm": 1.3557640314102173, "learning_rate": 1.4360074220410647e-05, "loss": 0.044481754302978516, "memory(GiB)": 122.96, "step": 49365, "token_acc": 0.9779298168174796, "train_speed(iter/s)": 0.231991 }, { "epoch": 3.7632441497065328, "grad_norm": 1.5696619749069214, "learning_rate": 1.4351677440513911e-05, "loss": 0.064264315366745, "memory(GiB)": 122.96, "step": 49370, "token_acc": 0.9796945505032975, "train_speed(iter/s)": 0.231994 }, { "epoch": 3.7636252763167923, "grad_norm": 0.6067826747894287, "learning_rate": 1.4343282704883049e-05, "loss": 0.03702512681484223, "memory(GiB)": 122.96, "step": 49375, "token_acc": 0.9881849062176931, "train_speed(iter/s)": 0.231997 }, { "epoch": 3.7640064029270524, "grad_norm": 1.1257916688919067, "learning_rate": 1.4334890013999469e-05, "loss": 0.038163980841636656, "memory(GiB)": 122.96, "step": 49380, "token_acc": 0.9825904432646005, "train_speed(iter/s)": 0.231999 }, { "epoch": 3.7643875295373124, "grad_norm": 1.9523323774337769, "learning_rate": 1.4326499368344432e-05, "loss": 0.08483254909515381, "memory(GiB)": 122.96, "step": 49385, "token_acc": 0.9729570840681951, "train_speed(iter/s)": 0.232006 }, { "epoch": 3.7647686561475724, "grad_norm": 1.9997837543487549, "learning_rate": 1.4318110768399101e-05, "loss": 0.07311153411865234, "memory(GiB)": 122.96, "step": 49390, "token_acc": 0.9790611279972982, "train_speed(iter/s)": 0.232013 }, { "epoch": 3.765149782757832, "grad_norm": 1.5989100933074951, "learning_rate": 1.4309724214644554e-05, "loss": 0.04350111782550812, "memory(GiB)": 122.96, "step": 49395, "token_acc": 0.9850212089077413, "train_speed(iter/s)": 0.232017 }, { "epoch": 3.765530909368092, "grad_norm": 1.6443607807159424, "learning_rate": 1.4301339707561684e-05, "loss": 0.042566624283790586, "memory(GiB)": 122.96, "step": 49400, "token_acc": 0.9821005917159763, "train_speed(iter/s)": 0.232019 }, { "epoch": 3.765530909368092, "eval_loss": 0.05675838515162468, "eval_runtime": 222.0971, "eval_samples_per_second": 2.386, "eval_steps_per_second": 2.386, "eval_token_acc": 0.9771926389976507, "step": 49400 }, { "epoch": 3.765912035978352, "grad_norm": 0.6239147782325745, "learning_rate": 1.4292957247631323e-05, "loss": 0.04719987511634827, "memory(GiB)": 122.96, "step": 49405, "token_acc": 0.9775235787590424, "train_speed(iter/s)": 0.231778 }, { "epoch": 3.766293162588612, "grad_norm": 1.1191984415054321, "learning_rate": 1.4284576835334173e-05, "loss": 0.027698755264282227, "memory(GiB)": 122.96, "step": 49410, "token_acc": 0.9891572879494922, "train_speed(iter/s)": 0.231782 }, { "epoch": 3.766674289198872, "grad_norm": 1.0212314128875732, "learning_rate": 1.4276198471150786e-05, "loss": 0.04538663327693939, "memory(GiB)": 122.96, "step": 49415, "token_acc": 0.987409200968523, "train_speed(iter/s)": 0.23179 }, { "epoch": 3.767055415809132, "grad_norm": 0.5518254041671753, "learning_rate": 1.4267822155561644e-05, "loss": 0.040771621465682986, "memory(GiB)": 122.96, "step": 49420, "token_acc": 0.9877839691384483, "train_speed(iter/s)": 0.231795 }, { "epoch": 3.7674365424193916, "grad_norm": 1.2289819717407227, "learning_rate": 1.4259447889047096e-05, "loss": 0.05148004293441773, "memory(GiB)": 122.96, "step": 49425, "token_acc": 0.9774798005844937, "train_speed(iter/s)": 0.231799 }, { "epoch": 3.7678176690296517, "grad_norm": 1.1270025968551636, "learning_rate": 1.4251075672087338e-05, "loss": 0.03582529127597809, "memory(GiB)": 122.96, "step": 49430, "token_acc": 0.9820305480682839, "train_speed(iter/s)": 0.231803 }, { "epoch": 3.7681987956399117, "grad_norm": 1.6252589225769043, "learning_rate": 1.4242705505162496e-05, "loss": 0.05199323892593384, "memory(GiB)": 122.96, "step": 49435, "token_acc": 0.9783018867924528, "train_speed(iter/s)": 0.231809 }, { "epoch": 3.7685799222501712, "grad_norm": 2.368750810623169, "learning_rate": 1.4234337388752578e-05, "loss": 0.06835298538208008, "memory(GiB)": 122.96, "step": 49440, "token_acc": 0.9721964782205746, "train_speed(iter/s)": 0.231817 }, { "epoch": 3.7689610488604313, "grad_norm": 2.875704050064087, "learning_rate": 1.4225971323337417e-05, "loss": 0.08151073455810547, "memory(GiB)": 122.96, "step": 49445, "token_acc": 0.9708215297450425, "train_speed(iter/s)": 0.231824 }, { "epoch": 3.7693421754706913, "grad_norm": 0.9627692103385925, "learning_rate": 1.421760730939679e-05, "loss": 0.04797639846801758, "memory(GiB)": 122.96, "step": 49450, "token_acc": 0.980875691997987, "train_speed(iter/s)": 0.23183 }, { "epoch": 3.7697233020809513, "grad_norm": 1.330607533454895, "learning_rate": 1.4209245347410349e-05, "loss": 0.03799420297145843, "memory(GiB)": 122.96, "step": 49455, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.231837 }, { "epoch": 3.7701044286912113, "grad_norm": 1.0230084657669067, "learning_rate": 1.4200885437857586e-05, "loss": 0.03309817314147949, "memory(GiB)": 122.96, "step": 49460, "token_acc": 0.9821251241310824, "train_speed(iter/s)": 0.231844 }, { "epoch": 3.7704855553014713, "grad_norm": 1.5670970678329468, "learning_rate": 1.4192527581217912e-05, "loss": 0.059177052974700925, "memory(GiB)": 122.96, "step": 49465, "token_acc": 0.9773512476007677, "train_speed(iter/s)": 0.231849 }, { "epoch": 3.770866681911731, "grad_norm": 0.8811646699905396, "learning_rate": 1.4184171777970629e-05, "loss": 0.05547956824302673, "memory(GiB)": 122.96, "step": 49470, "token_acc": 0.9806275579809004, "train_speed(iter/s)": 0.231851 }, { "epoch": 3.771247808521991, "grad_norm": 1.314267635345459, "learning_rate": 1.4175818028594873e-05, "loss": 0.045135363936424255, "memory(GiB)": 122.96, "step": 49475, "token_acc": 0.9824069749338316, "train_speed(iter/s)": 0.231854 }, { "epoch": 3.771628935132251, "grad_norm": 0.7771315574645996, "learning_rate": 1.416746633356973e-05, "loss": 0.04235619902610779, "memory(GiB)": 122.96, "step": 49480, "token_acc": 0.9834331337325349, "train_speed(iter/s)": 0.231858 }, { "epoch": 3.772010061742511, "grad_norm": 1.3125030994415283, "learning_rate": 1.4159116693374086e-05, "loss": 0.044371843338012695, "memory(GiB)": 122.96, "step": 49485, "token_acc": 0.9798278644432491, "train_speed(iter/s)": 0.23186 }, { "epoch": 3.7723911883527705, "grad_norm": 1.270327091217041, "learning_rate": 1.4150769108486782e-05, "loss": 0.04144091308116913, "memory(GiB)": 122.96, "step": 49490, "token_acc": 0.9876638396299152, "train_speed(iter/s)": 0.231862 }, { "epoch": 3.7727723149630306, "grad_norm": 1.0295872688293457, "learning_rate": 1.4142423579386521e-05, "loss": 0.05821223258972168, "memory(GiB)": 122.96, "step": 49495, "token_acc": 0.9805124011992369, "train_speed(iter/s)": 0.231863 }, { "epoch": 3.7731534415732906, "grad_norm": 2.1801817417144775, "learning_rate": 1.413408010655184e-05, "loss": 0.07778084874153138, "memory(GiB)": 122.96, "step": 49500, "token_acc": 0.9728240910760191, "train_speed(iter/s)": 0.231871 }, { "epoch": 3.7735345681835506, "grad_norm": 2.154778003692627, "learning_rate": 1.4125738690461243e-05, "loss": 0.03599921464920044, "memory(GiB)": 122.96, "step": 49505, "token_acc": 0.982546608488695, "train_speed(iter/s)": 0.231877 }, { "epoch": 3.7739156947938106, "grad_norm": 1.1936566829681396, "learning_rate": 1.4117399331593067e-05, "loss": 0.055577391386032106, "memory(GiB)": 122.96, "step": 49510, "token_acc": 0.9765957446808511, "train_speed(iter/s)": 0.231883 }, { "epoch": 3.7742968214040706, "grad_norm": 1.3995211124420166, "learning_rate": 1.4109062030425513e-05, "loss": 0.06513239741325379, "memory(GiB)": 122.96, "step": 49515, "token_acc": 0.9710564399421129, "train_speed(iter/s)": 0.231889 }, { "epoch": 3.77467794801433, "grad_norm": 0.7792954444885254, "learning_rate": 1.4100726787436708e-05, "loss": 0.06409629583358764, "memory(GiB)": 122.96, "step": 49520, "token_acc": 0.9754355898314767, "train_speed(iter/s)": 0.231893 }, { "epoch": 3.7750590746245902, "grad_norm": 1.000656008720398, "learning_rate": 1.4092393603104614e-05, "loss": 0.03858259320259094, "memory(GiB)": 122.96, "step": 49525, "token_acc": 0.9846574690770694, "train_speed(iter/s)": 0.231896 }, { "epoch": 3.7754402012348502, "grad_norm": 1.1591041088104248, "learning_rate": 1.4084062477907118e-05, "loss": 0.07007834315299988, "memory(GiB)": 122.96, "step": 49530, "token_acc": 0.9701723376049491, "train_speed(iter/s)": 0.231901 }, { "epoch": 3.7758213278451103, "grad_norm": 2.4687910079956055, "learning_rate": 1.4075733412321985e-05, "loss": 0.08525997400283813, "memory(GiB)": 122.96, "step": 49535, "token_acc": 0.972382956338769, "train_speed(iter/s)": 0.231908 }, { "epoch": 3.77620245445537, "grad_norm": 0.9665534496307373, "learning_rate": 1.4067406406826816e-05, "loss": 0.1047576904296875, "memory(GiB)": 122.96, "step": 49540, "token_acc": 0.9691004236232246, "train_speed(iter/s)": 0.231913 }, { "epoch": 3.77658358106563, "grad_norm": 0.43288248777389526, "learning_rate": 1.4059081461899137e-05, "loss": 0.03984101414680481, "memory(GiB)": 122.96, "step": 49545, "token_acc": 0.9858035207268597, "train_speed(iter/s)": 0.231915 }, { "epoch": 3.77696470767589, "grad_norm": 1.1039576530456543, "learning_rate": 1.405075857801637e-05, "loss": 0.060120928287506106, "memory(GiB)": 122.96, "step": 49550, "token_acc": 0.9831362667183563, "train_speed(iter/s)": 0.23192 }, { "epoch": 3.77734583428615, "grad_norm": 1.1502867937088013, "learning_rate": 1.4042437755655757e-05, "loss": 0.052438211441040036, "memory(GiB)": 122.96, "step": 49555, "token_acc": 0.9731691919191919, "train_speed(iter/s)": 0.231928 }, { "epoch": 3.77772696089641, "grad_norm": 1.764073133468628, "learning_rate": 1.4034118995294477e-05, "loss": 0.05944451093673706, "memory(GiB)": 122.96, "step": 49560, "token_acc": 0.9805391894304588, "train_speed(iter/s)": 0.231931 }, { "epoch": 3.77810808750667, "grad_norm": 1.3907653093338013, "learning_rate": 1.4025802297409584e-05, "loss": 0.057460206747055056, "memory(GiB)": 122.96, "step": 49565, "token_acc": 0.9803743961352657, "train_speed(iter/s)": 0.231938 }, { "epoch": 3.7784892141169295, "grad_norm": 0.9711343050003052, "learning_rate": 1.4017487662477973e-05, "loss": 0.045544058084487915, "memory(GiB)": 122.96, "step": 49570, "token_acc": 0.9789092458457606, "train_speed(iter/s)": 0.231942 }, { "epoch": 3.7788703407271895, "grad_norm": 0.3694034516811371, "learning_rate": 1.4009175090976463e-05, "loss": 0.05266235470771789, "memory(GiB)": 122.96, "step": 49575, "token_acc": 0.9761362112883765, "train_speed(iter/s)": 0.231946 }, { "epoch": 3.7792514673374495, "grad_norm": 0.7384228706359863, "learning_rate": 1.4000864583381762e-05, "loss": 0.05924941301345825, "memory(GiB)": 122.96, "step": 49580, "token_acc": 0.9772528433945756, "train_speed(iter/s)": 0.231951 }, { "epoch": 3.7796325939477096, "grad_norm": 1.696331262588501, "learning_rate": 1.3992556140170404e-05, "loss": 0.06198549270629883, "memory(GiB)": 122.96, "step": 49585, "token_acc": 0.9722042663219134, "train_speed(iter/s)": 0.231957 }, { "epoch": 3.780013720557969, "grad_norm": 2.579899787902832, "learning_rate": 1.3984249761818858e-05, "loss": 0.06167426109313965, "memory(GiB)": 122.96, "step": 49590, "token_acc": 0.974195418962018, "train_speed(iter/s)": 0.231963 }, { "epoch": 3.780394847168229, "grad_norm": 1.5736339092254639, "learning_rate": 1.3975945448803474e-05, "loss": 0.03826078176498413, "memory(GiB)": 122.96, "step": 49595, "token_acc": 0.9810897435897435, "train_speed(iter/s)": 0.231971 }, { "epoch": 3.780775973778489, "grad_norm": 0.8955866098403931, "learning_rate": 1.3967643201600422e-05, "loss": 0.05459545254707336, "memory(GiB)": 122.96, "step": 49600, "token_acc": 0.979517271922055, "train_speed(iter/s)": 0.23197 }, { "epoch": 3.780775973778489, "eval_loss": 0.05507699400186539, "eval_runtime": 219.5756, "eval_samples_per_second": 2.414, "eval_steps_per_second": 2.414, "eval_token_acc": 0.9773658213360641, "step": 49600 }, { "epoch": 3.781157100388749, "grad_norm": 1.163751482963562, "learning_rate": 1.3959343020685828e-05, "loss": 0.06899356842041016, "memory(GiB)": 122.96, "step": 49605, "token_acc": 0.9772331028069972, "train_speed(iter/s)": 0.231737 }, { "epoch": 3.781538226999009, "grad_norm": 0.48156222701072693, "learning_rate": 1.3951044906535676e-05, "loss": 0.06851338744163513, "memory(GiB)": 122.96, "step": 49610, "token_acc": 0.9791817711030554, "train_speed(iter/s)": 0.231738 }, { "epoch": 3.7819193536092692, "grad_norm": 1.197718620300293, "learning_rate": 1.3942748859625799e-05, "loss": 0.0483797550201416, "memory(GiB)": 122.96, "step": 49615, "token_acc": 0.9779567613395507, "train_speed(iter/s)": 0.231746 }, { "epoch": 3.782300480219529, "grad_norm": 1.1512904167175293, "learning_rate": 1.393445488043194e-05, "loss": 0.030302512645721435, "memory(GiB)": 122.96, "step": 49620, "token_acc": 0.9869811320754717, "train_speed(iter/s)": 0.23175 }, { "epoch": 3.782681606829789, "grad_norm": 1.7169764041900635, "learning_rate": 1.3926162969429752e-05, "loss": 0.03919914960861206, "memory(GiB)": 122.96, "step": 49625, "token_acc": 0.9826109525045419, "train_speed(iter/s)": 0.231755 }, { "epoch": 3.783062733440049, "grad_norm": 1.241289496421814, "learning_rate": 1.3917873127094699e-05, "loss": 0.04687936007976532, "memory(GiB)": 122.96, "step": 49630, "token_acc": 0.9869791666666666, "train_speed(iter/s)": 0.231761 }, { "epoch": 3.783443860050309, "grad_norm": 1.361514925956726, "learning_rate": 1.3909585353902177e-05, "loss": 0.056233108043670654, "memory(GiB)": 122.96, "step": 49635, "token_acc": 0.9773282176491106, "train_speed(iter/s)": 0.231765 }, { "epoch": 3.7838249866605684, "grad_norm": 0.46829554438591003, "learning_rate": 1.3901299650327459e-05, "loss": 0.03161362707614899, "memory(GiB)": 122.96, "step": 49640, "token_acc": 0.9863072314933675, "train_speed(iter/s)": 0.23177 }, { "epoch": 3.7842061132708285, "grad_norm": 1.360326886177063, "learning_rate": 1.3893016016845689e-05, "loss": 0.0428810179233551, "memory(GiB)": 122.96, "step": 49645, "token_acc": 0.9865157717312786, "train_speed(iter/s)": 0.231776 }, { "epoch": 3.7845872398810885, "grad_norm": 1.6945706605911255, "learning_rate": 1.3884734453931903e-05, "loss": 0.06506238579750061, "memory(GiB)": 122.96, "step": 49650, "token_acc": 0.974511672224869, "train_speed(iter/s)": 0.231783 }, { "epoch": 3.7849683664913485, "grad_norm": 0.7082275152206421, "learning_rate": 1.3876454962060986e-05, "loss": 0.04251969456672668, "memory(GiB)": 122.96, "step": 49655, "token_acc": 0.9836516004945735, "train_speed(iter/s)": 0.231786 }, { "epoch": 3.7853494931016085, "grad_norm": 1.0649880170822144, "learning_rate": 1.386817754170775e-05, "loss": 0.06598026752471924, "memory(GiB)": 122.96, "step": 49660, "token_acc": 0.9796049806784027, "train_speed(iter/s)": 0.23179 }, { "epoch": 3.7857306197118685, "grad_norm": 1.4562768936157227, "learning_rate": 1.385990219334687e-05, "loss": 0.05464975237846374, "memory(GiB)": 122.96, "step": 49665, "token_acc": 0.9777310924369748, "train_speed(iter/s)": 0.231798 }, { "epoch": 3.786111746322128, "grad_norm": 1.3175218105316162, "learning_rate": 1.3851628917452874e-05, "loss": 0.05058342218399048, "memory(GiB)": 122.96, "step": 49670, "token_acc": 0.9796539961013645, "train_speed(iter/s)": 0.2318 }, { "epoch": 3.786492872932388, "grad_norm": 0.7989910840988159, "learning_rate": 1.384335771450021e-05, "loss": 0.054574775695800784, "memory(GiB)": 122.96, "step": 49675, "token_acc": 0.9794283239497618, "train_speed(iter/s)": 0.231805 }, { "epoch": 3.786873999542648, "grad_norm": 1.9526698589324951, "learning_rate": 1.3835088584963208e-05, "loss": 0.0649226188659668, "memory(GiB)": 122.96, "step": 49680, "token_acc": 0.970972097209721, "train_speed(iter/s)": 0.231811 }, { "epoch": 3.787255126152908, "grad_norm": 0.7702149748802185, "learning_rate": 1.3826821529316036e-05, "loss": 0.036125478148460385, "memory(GiB)": 122.96, "step": 49685, "token_acc": 0.9861546499477534, "train_speed(iter/s)": 0.231816 }, { "epoch": 3.7876362527631677, "grad_norm": 1.7919747829437256, "learning_rate": 1.3818556548032802e-05, "loss": 0.06690528392791747, "memory(GiB)": 122.96, "step": 49690, "token_acc": 0.9752083778585168, "train_speed(iter/s)": 0.231821 }, { "epoch": 3.7880173793734278, "grad_norm": 1.5046727657318115, "learning_rate": 1.381029364158743e-05, "loss": 0.028903329372406007, "memory(GiB)": 122.96, "step": 49695, "token_acc": 0.9873301785986872, "train_speed(iter/s)": 0.231825 }, { "epoch": 3.7883985059836878, "grad_norm": 0.5665990114212036, "learning_rate": 1.380203281045378e-05, "loss": 0.047303909063339235, "memory(GiB)": 122.96, "step": 49700, "token_acc": 0.9850460789427925, "train_speed(iter/s)": 0.231829 }, { "epoch": 3.788779632593948, "grad_norm": 0.8633244037628174, "learning_rate": 1.3793774055105579e-05, "loss": 0.0611092746257782, "memory(GiB)": 122.96, "step": 49705, "token_acc": 0.9763549823430063, "train_speed(iter/s)": 0.231832 }, { "epoch": 3.789160759204208, "grad_norm": 1.460031509399414, "learning_rate": 1.378551737601641e-05, "loss": 0.05788120031356812, "memory(GiB)": 122.96, "step": 49710, "token_acc": 0.9751618443651737, "train_speed(iter/s)": 0.231835 }, { "epoch": 3.789541885814468, "grad_norm": 0.9067478179931641, "learning_rate": 1.377726277365976e-05, "loss": 0.048944252729415896, "memory(GiB)": 122.96, "step": 49715, "token_acc": 0.9807493984187006, "train_speed(iter/s)": 0.231843 }, { "epoch": 3.7899230124247274, "grad_norm": 0.41892266273498535, "learning_rate": 1.3769010248509011e-05, "loss": 0.028213325142860412, "memory(GiB)": 122.96, "step": 49720, "token_acc": 0.9888849682427664, "train_speed(iter/s)": 0.231846 }, { "epoch": 3.7903041390349874, "grad_norm": 2.878065347671509, "learning_rate": 1.3760759801037376e-05, "loss": 0.06556029319763183, "memory(GiB)": 122.96, "step": 49725, "token_acc": 0.9861205145565335, "train_speed(iter/s)": 0.231853 }, { "epoch": 3.7906852656452474, "grad_norm": 4.586342811584473, "learning_rate": 1.3752511431718002e-05, "loss": 0.08724913597106934, "memory(GiB)": 122.96, "step": 49730, "token_acc": 0.9692168401991852, "train_speed(iter/s)": 0.23186 }, { "epoch": 3.791066392255507, "grad_norm": 0.8645709156990051, "learning_rate": 1.3744265141023899e-05, "loss": 0.051538360118865964, "memory(GiB)": 122.96, "step": 49735, "token_acc": 0.9793759915388683, "train_speed(iter/s)": 0.231861 }, { "epoch": 3.791447518865767, "grad_norm": 0.8297973871231079, "learning_rate": 1.3736020929427923e-05, "loss": 0.04985419809818268, "memory(GiB)": 122.96, "step": 49740, "token_acc": 0.9812402915521568, "train_speed(iter/s)": 0.231862 }, { "epoch": 3.791828645476027, "grad_norm": 0.7368699312210083, "learning_rate": 1.3727778797402869e-05, "loss": 0.03648805916309357, "memory(GiB)": 122.96, "step": 49745, "token_acc": 0.9883551673944687, "train_speed(iter/s)": 0.231867 }, { "epoch": 3.792209772086287, "grad_norm": 0.6735879182815552, "learning_rate": 1.371953874542139e-05, "loss": 0.09566297531127929, "memory(GiB)": 122.96, "step": 49750, "token_acc": 0.9699895615866388, "train_speed(iter/s)": 0.23187 }, { "epoch": 3.792590898696547, "grad_norm": 1.4900367259979248, "learning_rate": 1.3711300773955981e-05, "loss": 0.04835646748542786, "memory(GiB)": 122.96, "step": 49755, "token_acc": 0.9876574307304786, "train_speed(iter/s)": 0.231875 }, { "epoch": 3.792972025306807, "grad_norm": 1.1711376905441284, "learning_rate": 1.3703064883479083e-05, "loss": 0.038391613960266115, "memory(GiB)": 122.96, "step": 49760, "token_acc": 0.9884126762529666, "train_speed(iter/s)": 0.231877 }, { "epoch": 3.793353151917067, "grad_norm": 2.4208757877349854, "learning_rate": 1.3694831074462966e-05, "loss": 0.03946090936660766, "memory(GiB)": 122.96, "step": 49765, "token_acc": 0.986125385405961, "train_speed(iter/s)": 0.231883 }, { "epoch": 3.7937342785273267, "grad_norm": 0.8947399854660034, "learning_rate": 1.3686599347379819e-05, "loss": 0.04355248808860779, "memory(GiB)": 122.96, "step": 49770, "token_acc": 0.9843196762771876, "train_speed(iter/s)": 0.23189 }, { "epoch": 3.7941154051375867, "grad_norm": 0.46912530064582825, "learning_rate": 1.3678369702701694e-05, "loss": 0.03071017563343048, "memory(GiB)": 122.96, "step": 49775, "token_acc": 0.9875801282051282, "train_speed(iter/s)": 0.231897 }, { "epoch": 3.7944965317478467, "grad_norm": 1.2033329010009766, "learning_rate": 1.36701421409005e-05, "loss": 0.05704330205917359, "memory(GiB)": 122.96, "step": 49780, "token_acc": 0.9768563162970106, "train_speed(iter/s)": 0.231904 }, { "epoch": 3.7948776583581063, "grad_norm": 0.4610370695590973, "learning_rate": 1.366191666244806e-05, "loss": 0.026459187269210815, "memory(GiB)": 122.96, "step": 49785, "token_acc": 0.9902676399026764, "train_speed(iter/s)": 0.231913 }, { "epoch": 3.7952587849683663, "grad_norm": 0.7197744846343994, "learning_rate": 1.3653693267816092e-05, "loss": 0.03316831290721893, "memory(GiB)": 122.96, "step": 49790, "token_acc": 0.9831419851765731, "train_speed(iter/s)": 0.231916 }, { "epoch": 3.7956399115786263, "grad_norm": 0.6361710429191589, "learning_rate": 1.364547195747613e-05, "loss": 0.059509378671646115, "memory(GiB)": 122.96, "step": 49795, "token_acc": 0.9748412310698583, "train_speed(iter/s)": 0.231917 }, { "epoch": 3.7960210381888864, "grad_norm": 1.292320728302002, "learning_rate": 1.3637252731899641e-05, "loss": 0.05207591056823731, "memory(GiB)": 122.96, "step": 49800, "token_acc": 0.9833723044946739, "train_speed(iter/s)": 0.231923 }, { "epoch": 3.7960210381888864, "eval_loss": 0.05494461953639984, "eval_runtime": 220.123, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "eval_token_acc": 0.977252876332751, "step": 49800 }, { "epoch": 3.7964021647991464, "grad_norm": 1.0038409233093262, "learning_rate": 1.3629035591557982e-05, "loss": 0.050142991542816165, "memory(GiB)": 122.96, "step": 49805, "token_acc": 0.9771282526535281, "train_speed(iter/s)": 0.23169 }, { "epoch": 3.7967832914094064, "grad_norm": 1.495643138885498, "learning_rate": 1.3620820536922335e-05, "loss": 0.057119280099868774, "memory(GiB)": 122.96, "step": 49810, "token_acc": 0.9730193769928869, "train_speed(iter/s)": 0.231696 }, { "epoch": 3.797164418019666, "grad_norm": 1.7586190700531006, "learning_rate": 1.3612607568463814e-05, "loss": 0.03344021439552307, "memory(GiB)": 122.96, "step": 49815, "token_acc": 0.9831622176591376, "train_speed(iter/s)": 0.231703 }, { "epoch": 3.797545544629926, "grad_norm": 0.8871007561683655, "learning_rate": 1.3604396686653404e-05, "loss": 0.0762170672416687, "memory(GiB)": 122.96, "step": 49820, "token_acc": 0.9709302325581395, "train_speed(iter/s)": 0.231707 }, { "epoch": 3.797926671240186, "grad_norm": 1.6857911348342896, "learning_rate": 1.3596187891961926e-05, "loss": 0.05691769123077393, "memory(GiB)": 122.96, "step": 49825, "token_acc": 0.9787365813377374, "train_speed(iter/s)": 0.231714 }, { "epoch": 3.798307797850446, "grad_norm": 1.3018244504928589, "learning_rate": 1.3587981184860144e-05, "loss": 0.04956548810005188, "memory(GiB)": 122.96, "step": 49830, "token_acc": 0.9785942006983452, "train_speed(iter/s)": 0.231715 }, { "epoch": 3.7986889244607056, "grad_norm": 1.0196419954299927, "learning_rate": 1.3579776565818686e-05, "loss": 0.03993641436100006, "memory(GiB)": 122.96, "step": 49835, "token_acc": 0.9842215424247304, "train_speed(iter/s)": 0.231718 }, { "epoch": 3.7990700510709656, "grad_norm": 0.999539315700531, "learning_rate": 1.357157403530801e-05, "loss": 0.041711249947547914, "memory(GiB)": 122.96, "step": 49840, "token_acc": 0.9850723299476762, "train_speed(iter/s)": 0.23172 }, { "epoch": 3.7994511776812256, "grad_norm": 0.5577094554901123, "learning_rate": 1.3563373593798518e-05, "loss": 0.03190929293632507, "memory(GiB)": 122.96, "step": 49845, "token_acc": 0.9860248447204969, "train_speed(iter/s)": 0.231726 }, { "epoch": 3.7998323042914857, "grad_norm": 1.3178467750549316, "learning_rate": 1.3555175241760481e-05, "loss": 0.04737975597381592, "memory(GiB)": 122.96, "step": 49850, "token_acc": 0.9837288135593221, "train_speed(iter/s)": 0.231732 }, { "epoch": 3.8002134309017457, "grad_norm": 0.04409927502274513, "learning_rate": 1.3546978979664e-05, "loss": 0.03942444622516632, "memory(GiB)": 122.96, "step": 49855, "token_acc": 0.9792648444863337, "train_speed(iter/s)": 0.23174 }, { "epoch": 3.8005945575120057, "grad_norm": 3.179527997970581, "learning_rate": 1.3538784807979132e-05, "loss": 0.04871741533279419, "memory(GiB)": 122.96, "step": 49860, "token_acc": 0.9821717990275527, "train_speed(iter/s)": 0.231747 }, { "epoch": 3.8009756841222653, "grad_norm": 1.6907007694244385, "learning_rate": 1.3530592727175734e-05, "loss": 0.03760620057582855, "memory(GiB)": 122.96, "step": 49865, "token_acc": 0.9874421678783873, "train_speed(iter/s)": 0.231753 }, { "epoch": 3.8013568107325253, "grad_norm": 1.661289930343628, "learning_rate": 1.3522402737723605e-05, "loss": 0.079800283908844, "memory(GiB)": 122.96, "step": 49870, "token_acc": 0.9686132488305417, "train_speed(iter/s)": 0.231757 }, { "epoch": 3.8017379373427853, "grad_norm": 1.5284230709075928, "learning_rate": 1.351421484009242e-05, "loss": 0.03937745690345764, "memory(GiB)": 122.96, "step": 49875, "token_acc": 0.9893088015912481, "train_speed(iter/s)": 0.231762 }, { "epoch": 3.8021190639530453, "grad_norm": 0.5144834518432617, "learning_rate": 1.3506029034751683e-05, "loss": 0.04361860752105713, "memory(GiB)": 122.96, "step": 49880, "token_acc": 0.9833784306146115, "train_speed(iter/s)": 0.231767 }, { "epoch": 3.802500190563305, "grad_norm": 1.584444284439087, "learning_rate": 1.3497845322170833e-05, "loss": 0.06879715919494629, "memory(GiB)": 122.96, "step": 49885, "token_acc": 0.975603217158177, "train_speed(iter/s)": 0.231773 }, { "epoch": 3.802881317173565, "grad_norm": 0.9943462014198303, "learning_rate": 1.3489663702819172e-05, "loss": 0.06735858917236329, "memory(GiB)": 122.96, "step": 49890, "token_acc": 0.9841591453306318, "train_speed(iter/s)": 0.231776 }, { "epoch": 3.803262443783825, "grad_norm": 1.1590137481689453, "learning_rate": 1.3481484177165854e-05, "loss": 0.04314205348491669, "memory(GiB)": 122.96, "step": 49895, "token_acc": 0.9831428017826003, "train_speed(iter/s)": 0.231781 }, { "epoch": 3.803643570394085, "grad_norm": 0.7390767931938171, "learning_rate": 1.3473306745679936e-05, "loss": 0.024588119983673096, "memory(GiB)": 122.96, "step": 49900, "token_acc": 0.991775950211158, "train_speed(iter/s)": 0.231787 }, { "epoch": 3.804024697004345, "grad_norm": 1.6639763116836548, "learning_rate": 1.3465131408830405e-05, "loss": 0.054437047243118285, "memory(GiB)": 122.96, "step": 49905, "token_acc": 0.9824964131994262, "train_speed(iter/s)": 0.231793 }, { "epoch": 3.804405823614605, "grad_norm": 1.025344967842102, "learning_rate": 1.3456958167086031e-05, "loss": 0.039322075247764585, "memory(GiB)": 122.96, "step": 49910, "token_acc": 0.9860769860769861, "train_speed(iter/s)": 0.2318 }, { "epoch": 3.8047869502248646, "grad_norm": 0.869002103805542, "learning_rate": 1.3448787020915537e-05, "loss": 0.05250083208084107, "memory(GiB)": 122.96, "step": 49915, "token_acc": 0.9779375309866137, "train_speed(iter/s)": 0.231806 }, { "epoch": 3.8051680768351246, "grad_norm": 0.6119213104248047, "learning_rate": 1.3440617970787478e-05, "loss": 0.037960395216941833, "memory(GiB)": 122.96, "step": 49920, "token_acc": 0.987240356083086, "train_speed(iter/s)": 0.231811 }, { "epoch": 3.8055492034453846, "grad_norm": 0.7159050107002258, "learning_rate": 1.3432451017170317e-05, "loss": 0.05246716141700745, "memory(GiB)": 122.96, "step": 49925, "token_acc": 0.9811278364412491, "train_speed(iter/s)": 0.231816 }, { "epoch": 3.8059303300556446, "grad_norm": 2.2820262908935547, "learning_rate": 1.3424286160532418e-05, "loss": 0.06867601871490478, "memory(GiB)": 122.96, "step": 49930, "token_acc": 0.9779339972661589, "train_speed(iter/s)": 0.231819 }, { "epoch": 3.806311456665904, "grad_norm": 1.4799386262893677, "learning_rate": 1.341612340134195e-05, "loss": 0.06933952569961548, "memory(GiB)": 122.96, "step": 49935, "token_acc": 0.9763313609467456, "train_speed(iter/s)": 0.231823 }, { "epoch": 3.806692583276164, "grad_norm": 0.6729950904846191, "learning_rate": 1.3407962740067042e-05, "loss": 0.042849200963974, "memory(GiB)": 122.96, "step": 49940, "token_acc": 0.9832098765432099, "train_speed(iter/s)": 0.231823 }, { "epoch": 3.8070737098864242, "grad_norm": 1.329359769821167, "learning_rate": 1.3399804177175678e-05, "loss": 0.050516313314437865, "memory(GiB)": 122.96, "step": 49945, "token_acc": 0.9781243670245088, "train_speed(iter/s)": 0.231828 }, { "epoch": 3.8074548364966843, "grad_norm": 1.0247658491134644, "learning_rate": 1.3391647713135686e-05, "loss": 0.042109829187393186, "memory(GiB)": 122.96, "step": 49950, "token_acc": 0.9841199432221434, "train_speed(iter/s)": 0.231828 }, { "epoch": 3.8078359631069443, "grad_norm": 1.1890708208084106, "learning_rate": 1.3383493348414811e-05, "loss": 0.029956707358360292, "memory(GiB)": 122.96, "step": 49955, "token_acc": 0.9873116574147502, "train_speed(iter/s)": 0.231834 }, { "epoch": 3.8082170897172043, "grad_norm": 0.976090669631958, "learning_rate": 1.3375341083480685e-05, "loss": 0.03179272711277008, "memory(GiB)": 122.96, "step": 49960, "token_acc": 0.9836503169836504, "train_speed(iter/s)": 0.231839 }, { "epoch": 3.808598216327464, "grad_norm": 0.41821831464767456, "learning_rate": 1.3367190918800776e-05, "loss": 0.0809582769870758, "memory(GiB)": 122.96, "step": 49965, "token_acc": 0.9679293516151523, "train_speed(iter/s)": 0.231845 }, { "epoch": 3.808979342937724, "grad_norm": 2.1224887371063232, "learning_rate": 1.3359042854842474e-05, "loss": 0.06757027506828309, "memory(GiB)": 122.96, "step": 49970, "token_acc": 0.969311377245509, "train_speed(iter/s)": 0.231852 }, { "epoch": 3.809360469547984, "grad_norm": 0.3155832886695862, "learning_rate": 1.3350896892073038e-05, "loss": 0.019587448239326476, "memory(GiB)": 122.96, "step": 49975, "token_acc": 0.9920391916717698, "train_speed(iter/s)": 0.231859 }, { "epoch": 3.809741596158244, "grad_norm": 1.3085591793060303, "learning_rate": 1.3342753030959581e-05, "loss": 0.04049122929573059, "memory(GiB)": 122.96, "step": 49980, "token_acc": 0.9804265264387964, "train_speed(iter/s)": 0.231866 }, { "epoch": 3.8101227227685035, "grad_norm": 0.7166280150413513, "learning_rate": 1.3334611271969128e-05, "loss": 0.0388145923614502, "memory(GiB)": 122.96, "step": 49985, "token_acc": 0.9830970556161396, "train_speed(iter/s)": 0.231872 }, { "epoch": 3.8105038493787635, "grad_norm": 0.09663061797618866, "learning_rate": 1.3326471615568581e-05, "loss": 0.029179111123085022, "memory(GiB)": 122.96, "step": 49990, "token_acc": 0.9851936218678815, "train_speed(iter/s)": 0.231878 }, { "epoch": 3.8108849759890235, "grad_norm": 2.3255820274353027, "learning_rate": 1.3318334062224691e-05, "loss": 0.038890546560287474, "memory(GiB)": 122.96, "step": 49995, "token_acc": 0.9839883551673945, "train_speed(iter/s)": 0.231884 }, { "epoch": 3.8112661025992836, "grad_norm": 1.0345311164855957, "learning_rate": 1.3310198612404112e-05, "loss": 0.0442691445350647, "memory(GiB)": 122.96, "step": 50000, "token_acc": 0.9847473784556721, "train_speed(iter/s)": 0.231888 }, { "epoch": 3.8112661025992836, "eval_loss": 0.05524379014968872, "eval_runtime": 222.7516, "eval_samples_per_second": 2.379, "eval_steps_per_second": 2.379, "eval_token_acc": 0.9774561773387145, "step": 50000 }, { "epoch": 3.8116472292095436, "grad_norm": 2.6850016117095947, "learning_rate": 1.3302065266573405e-05, "loss": 0.07331587076187134, "memory(GiB)": 122.96, "step": 50005, "token_acc": 0.9774667268700682, "train_speed(iter/s)": 0.231651 }, { "epoch": 3.8120283558198036, "grad_norm": 1.2834588289260864, "learning_rate": 1.3293934025198935e-05, "loss": 0.05926549434661865, "memory(GiB)": 122.96, "step": 50010, "token_acc": 0.9771709937332139, "train_speed(iter/s)": 0.231655 }, { "epoch": 3.812409482430063, "grad_norm": 2.1027579307556152, "learning_rate": 1.3285804888747011e-05, "loss": 0.03558054864406586, "memory(GiB)": 122.96, "step": 50015, "token_acc": 0.988903115663679, "train_speed(iter/s)": 0.23166 }, { "epoch": 3.812790609040323, "grad_norm": 1.9584769010543823, "learning_rate": 1.3277677857683823e-05, "loss": 0.07080695629119874, "memory(GiB)": 122.96, "step": 50020, "token_acc": 0.9731100963977676, "train_speed(iter/s)": 0.231666 }, { "epoch": 3.813171735650583, "grad_norm": 1.242777943611145, "learning_rate": 1.3269552932475376e-05, "loss": 0.0422286331653595, "memory(GiB)": 122.96, "step": 50025, "token_acc": 0.9808205470313542, "train_speed(iter/s)": 0.23167 }, { "epoch": 3.813552862260843, "grad_norm": 0.5136341452598572, "learning_rate": 1.326143011358762e-05, "loss": 0.02251770794391632, "memory(GiB)": 122.96, "step": 50030, "token_acc": 0.9887061620764811, "train_speed(iter/s)": 0.231676 }, { "epoch": 3.813933988871103, "grad_norm": 1.0055211782455444, "learning_rate": 1.3253309401486363e-05, "loss": 0.05027662515640259, "memory(GiB)": 122.96, "step": 50035, "token_acc": 0.983366124128063, "train_speed(iter/s)": 0.231682 }, { "epoch": 3.814315115481363, "grad_norm": 1.600035548210144, "learning_rate": 1.324519079663728e-05, "loss": 0.03682913184165955, "memory(GiB)": 122.96, "step": 50040, "token_acc": 0.9871164604170198, "train_speed(iter/s)": 0.231686 }, { "epoch": 3.814696242091623, "grad_norm": 1.106581211090088, "learning_rate": 1.3237074299505964e-05, "loss": 0.08231832385063172, "memory(GiB)": 122.96, "step": 50045, "token_acc": 0.9784837362359196, "train_speed(iter/s)": 0.231689 }, { "epoch": 3.815077368701883, "grad_norm": 1.1743078231811523, "learning_rate": 1.3228959910557814e-05, "loss": 0.03619283437728882, "memory(GiB)": 122.96, "step": 50050, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.231697 }, { "epoch": 3.815458495312143, "grad_norm": 1.2885369062423706, "learning_rate": 1.3220847630258176e-05, "loss": 0.05162625908851624, "memory(GiB)": 122.96, "step": 50055, "token_acc": 0.9793706293706294, "train_speed(iter/s)": 0.231704 }, { "epoch": 3.815839621922403, "grad_norm": 1.2468609809875488, "learning_rate": 1.3212737459072272e-05, "loss": 0.02626628875732422, "memory(GiB)": 122.96, "step": 50060, "token_acc": 0.9852176293457432, "train_speed(iter/s)": 0.23171 }, { "epoch": 3.8162207485326625, "grad_norm": 1.3957470655441284, "learning_rate": 1.320462939746514e-05, "loss": 0.06692988872528076, "memory(GiB)": 122.96, "step": 50065, "token_acc": 0.9757402782732786, "train_speed(iter/s)": 0.231716 }, { "epoch": 3.8166018751429225, "grad_norm": 1.1560328006744385, "learning_rate": 1.319652344590176e-05, "loss": 0.06845216751098633, "memory(GiB)": 122.96, "step": 50070, "token_acc": 0.9749216300940439, "train_speed(iter/s)": 0.231721 }, { "epoch": 3.8169830017531825, "grad_norm": 0.979686975479126, "learning_rate": 1.3188419604846986e-05, "loss": 0.0760522186756134, "memory(GiB)": 122.96, "step": 50075, "token_acc": 0.9719154307352477, "train_speed(iter/s)": 0.231726 }, { "epoch": 3.817364128363442, "grad_norm": 1.4192699193954468, "learning_rate": 1.3180317874765507e-05, "loss": 0.0569574773311615, "memory(GiB)": 122.96, "step": 50080, "token_acc": 0.9736123748862603, "train_speed(iter/s)": 0.231733 }, { "epoch": 3.817745254973702, "grad_norm": 0.502214789390564, "learning_rate": 1.3172218256121955e-05, "loss": 0.03558612465858459, "memory(GiB)": 122.96, "step": 50085, "token_acc": 0.9832966226138032, "train_speed(iter/s)": 0.231737 }, { "epoch": 3.818126381583962, "grad_norm": 1.0302854776382446, "learning_rate": 1.316412074938076e-05, "loss": 0.04386135637760162, "memory(GiB)": 122.96, "step": 50090, "token_acc": 0.9827110538900129, "train_speed(iter/s)": 0.23174 }, { "epoch": 3.818507508194222, "grad_norm": 1.518978238105774, "learning_rate": 1.3156025355006307e-05, "loss": 0.05602890849113464, "memory(GiB)": 122.96, "step": 50095, "token_acc": 0.9782108332054626, "train_speed(iter/s)": 0.231744 }, { "epoch": 3.818888634804482, "grad_norm": 1.2702674865722656, "learning_rate": 1.3147932073462838e-05, "loss": 0.05863426923751831, "memory(GiB)": 122.96, "step": 50100, "token_acc": 0.982227696639099, "train_speed(iter/s)": 0.231746 }, { "epoch": 3.819269761414742, "grad_norm": 1.2628086805343628, "learning_rate": 1.313984090521443e-05, "loss": 0.04744421243667603, "memory(GiB)": 122.96, "step": 50105, "token_acc": 0.9742083758937692, "train_speed(iter/s)": 0.231752 }, { "epoch": 3.8196508880250017, "grad_norm": 1.8796530961990356, "learning_rate": 1.3131751850725099e-05, "loss": 0.041048911213874814, "memory(GiB)": 122.96, "step": 50110, "token_acc": 0.9840612049729041, "train_speed(iter/s)": 0.231758 }, { "epoch": 3.8200320146352618, "grad_norm": 1.7758311033248901, "learning_rate": 1.3123664910458721e-05, "loss": 0.049628537893295285, "memory(GiB)": 122.96, "step": 50115, "token_acc": 0.9815490461795059, "train_speed(iter/s)": 0.231759 }, { "epoch": 3.820413141245522, "grad_norm": 0.49286091327667236, "learning_rate": 1.311558008487902e-05, "loss": 0.03725181519985199, "memory(GiB)": 122.96, "step": 50120, "token_acc": 0.9857277501009829, "train_speed(iter/s)": 0.23176 }, { "epoch": 3.820794267855782, "grad_norm": 1.3215802907943726, "learning_rate": 1.3107497374449635e-05, "loss": 0.053812050819396974, "memory(GiB)": 122.96, "step": 50125, "token_acc": 0.9721788058768365, "train_speed(iter/s)": 0.231768 }, { "epoch": 3.8211753944660414, "grad_norm": 0.902652382850647, "learning_rate": 1.3099416779634087e-05, "loss": 0.06591414809226989, "memory(GiB)": 122.96, "step": 50130, "token_acc": 0.9741298212605832, "train_speed(iter/s)": 0.231771 }, { "epoch": 3.8215565210763014, "grad_norm": 1.4655357599258423, "learning_rate": 1.3091338300895739e-05, "loss": 0.05505663752555847, "memory(GiB)": 122.96, "step": 50135, "token_acc": 0.9773109243697479, "train_speed(iter/s)": 0.231775 }, { "epoch": 3.8219376476865614, "grad_norm": 1.7618012428283691, "learning_rate": 1.3083261938697856e-05, "loss": 0.06160370111465454, "memory(GiB)": 122.96, "step": 50140, "token_acc": 0.9724284199363733, "train_speed(iter/s)": 0.23178 }, { "epoch": 3.8223187742968214, "grad_norm": 1.2222771644592285, "learning_rate": 1.3075187693503605e-05, "loss": 0.046582365036010744, "memory(GiB)": 122.96, "step": 50145, "token_acc": 0.9793780687397708, "train_speed(iter/s)": 0.231787 }, { "epoch": 3.8226999009070814, "grad_norm": 1.2998663187026978, "learning_rate": 1.3067115565775972e-05, "loss": 0.03838630318641663, "memory(GiB)": 122.96, "step": 50150, "token_acc": 0.9771952817824378, "train_speed(iter/s)": 0.231793 }, { "epoch": 3.8230810275173415, "grad_norm": 0.6511032581329346, "learning_rate": 1.3059045555977872e-05, "loss": 0.040126532316207886, "memory(GiB)": 122.96, "step": 50155, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.231798 }, { "epoch": 3.823462154127601, "grad_norm": 1.7681851387023926, "learning_rate": 1.3050977664572096e-05, "loss": 0.040518027544021604, "memory(GiB)": 122.96, "step": 50160, "token_acc": 0.9868220983274202, "train_speed(iter/s)": 0.231806 }, { "epoch": 3.823843280737861, "grad_norm": 0.4970746338367462, "learning_rate": 1.3042911892021254e-05, "loss": 0.03782053291797638, "memory(GiB)": 122.96, "step": 50165, "token_acc": 0.9806823492755881, "train_speed(iter/s)": 0.231808 }, { "epoch": 3.824224407348121, "grad_norm": 1.6196839809417725, "learning_rate": 1.303484823878795e-05, "loss": 0.08099828958511353, "memory(GiB)": 122.96, "step": 50170, "token_acc": 0.9760578226170757, "train_speed(iter/s)": 0.231809 }, { "epoch": 3.824605533958381, "grad_norm": 1.2537882328033447, "learning_rate": 1.3026786705334537e-05, "loss": 0.04769757688045502, "memory(GiB)": 122.96, "step": 50175, "token_acc": 0.9828788839568802, "train_speed(iter/s)": 0.231816 }, { "epoch": 3.8249866605686407, "grad_norm": 2.3369550704956055, "learning_rate": 1.3018727292123334e-05, "loss": 0.059003764390945436, "memory(GiB)": 122.96, "step": 50180, "token_acc": 0.9778325123152709, "train_speed(iter/s)": 0.231823 }, { "epoch": 3.8253677871789007, "grad_norm": 1.4287701845169067, "learning_rate": 1.3010669999616526e-05, "loss": 0.08383334279060364, "memory(GiB)": 122.96, "step": 50185, "token_acc": 0.9652974504249292, "train_speed(iter/s)": 0.231829 }, { "epoch": 3.8257489137891607, "grad_norm": 1.0411577224731445, "learning_rate": 1.3002614828276122e-05, "loss": 0.05775566697120667, "memory(GiB)": 122.96, "step": 50190, "token_acc": 0.9832585949177878, "train_speed(iter/s)": 0.231832 }, { "epoch": 3.8261300403994207, "grad_norm": 1.0609766244888306, "learning_rate": 1.2994561778564068e-05, "loss": 0.044123786687850955, "memory(GiB)": 122.96, "step": 50195, "token_acc": 0.9819447465738526, "train_speed(iter/s)": 0.231836 }, { "epoch": 3.8265111670096807, "grad_norm": 0.6275467872619629, "learning_rate": 1.2986510850942185e-05, "loss": 0.041119879484176634, "memory(GiB)": 122.96, "step": 50200, "token_acc": 0.9836453868711933, "train_speed(iter/s)": 0.231837 }, { "epoch": 3.8265111670096807, "eval_loss": 0.054529547691345215, "eval_runtime": 223.4058, "eval_samples_per_second": 2.372, "eval_steps_per_second": 2.372, "eval_token_acc": 0.9774561773387145, "step": 50200 }, { "epoch": 3.8268922936199408, "grad_norm": 1.837175726890564, "learning_rate": 1.297846204587213e-05, "loss": 0.06607415080070496, "memory(GiB)": 122.96, "step": 50205, "token_acc": 0.9772941873998111, "train_speed(iter/s)": 0.231605 }, { "epoch": 3.8272734202302003, "grad_norm": 0.9577572345733643, "learning_rate": 1.2970415363815475e-05, "loss": 0.0311565637588501, "memory(GiB)": 122.96, "step": 50210, "token_acc": 0.9861846649781257, "train_speed(iter/s)": 0.23161 }, { "epoch": 3.8276545468404604, "grad_norm": 0.6684634685516357, "learning_rate": 1.296237080523367e-05, "loss": 0.05382999777793884, "memory(GiB)": 122.96, "step": 50215, "token_acc": 0.9743886000385134, "train_speed(iter/s)": 0.231615 }, { "epoch": 3.8280356734507204, "grad_norm": 1.3922325372695923, "learning_rate": 1.2954328370588015e-05, "loss": 0.03891924321651459, "memory(GiB)": 122.96, "step": 50220, "token_acc": 0.9843695727683224, "train_speed(iter/s)": 0.231623 }, { "epoch": 3.8284168000609804, "grad_norm": 1.273092269897461, "learning_rate": 1.2946288060339712e-05, "loss": 0.03998846709728241, "memory(GiB)": 122.96, "step": 50225, "token_acc": 0.9775040171397965, "train_speed(iter/s)": 0.23163 }, { "epoch": 3.82879792667124, "grad_norm": 2.2335104942321777, "learning_rate": 1.2938249874949854e-05, "loss": 0.054901331663131714, "memory(GiB)": 122.96, "step": 50230, "token_acc": 0.9794967381174278, "train_speed(iter/s)": 0.231635 }, { "epoch": 3.8291790532815, "grad_norm": 0.6577880382537842, "learning_rate": 1.293021381487936e-05, "loss": 0.050800710916519165, "memory(GiB)": 122.96, "step": 50235, "token_acc": 0.9808469250336675, "train_speed(iter/s)": 0.231638 }, { "epoch": 3.82956017989176, "grad_norm": 1.5287013053894043, "learning_rate": 1.2922179880589086e-05, "loss": 0.06396810412406921, "memory(GiB)": 122.96, "step": 50240, "token_acc": 0.9757854712827697, "train_speed(iter/s)": 0.231644 }, { "epoch": 3.82994130650202, "grad_norm": 3.622328758239746, "learning_rate": 1.2914148072539744e-05, "loss": 0.05395234227180481, "memory(GiB)": 122.96, "step": 50245, "token_acc": 0.9825057430641456, "train_speed(iter/s)": 0.231647 }, { "epoch": 3.83032243311228, "grad_norm": 1.4119079113006592, "learning_rate": 1.2906118391191896e-05, "loss": 0.052785396575927734, "memory(GiB)": 122.96, "step": 50250, "token_acc": 0.9737559645535105, "train_speed(iter/s)": 0.231654 }, { "epoch": 3.83070355972254, "grad_norm": 0.6147369742393494, "learning_rate": 1.2898090837006038e-05, "loss": 0.03671661615371704, "memory(GiB)": 122.96, "step": 50255, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.231656 }, { "epoch": 3.8310846863327996, "grad_norm": 1.0340203046798706, "learning_rate": 1.289006541044248e-05, "loss": 0.05837686657905579, "memory(GiB)": 122.96, "step": 50260, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.231661 }, { "epoch": 3.8314658129430597, "grad_norm": 0.7703333497047424, "learning_rate": 1.2882042111961462e-05, "loss": 0.05537484884262085, "memory(GiB)": 122.96, "step": 50265, "token_acc": 0.9755529685681025, "train_speed(iter/s)": 0.231667 }, { "epoch": 3.8318469395533197, "grad_norm": 2.469041347503662, "learning_rate": 1.2874020942023097e-05, "loss": 0.07648813724517822, "memory(GiB)": 122.96, "step": 50270, "token_acc": 0.9766025641025641, "train_speed(iter/s)": 0.231673 }, { "epoch": 3.8322280661635797, "grad_norm": 1.2361959218978882, "learning_rate": 1.2866001901087322e-05, "loss": 0.050579124689102174, "memory(GiB)": 122.96, "step": 50275, "token_acc": 0.9781583085794164, "train_speed(iter/s)": 0.231678 }, { "epoch": 3.8326091927738393, "grad_norm": 0.8420367240905762, "learning_rate": 1.2857984989614024e-05, "loss": 0.04226187467575073, "memory(GiB)": 122.96, "step": 50280, "token_acc": 0.9817920459990417, "train_speed(iter/s)": 0.231682 }, { "epoch": 3.8329903193840993, "grad_norm": 1.238662600517273, "learning_rate": 1.2849970208062939e-05, "loss": 0.03464614450931549, "memory(GiB)": 122.96, "step": 50285, "token_acc": 0.9823116064938212, "train_speed(iter/s)": 0.231688 }, { "epoch": 3.8333714459943593, "grad_norm": 1.0784966945648193, "learning_rate": 1.2841957556893647e-05, "loss": 0.0631529688835144, "memory(GiB)": 122.96, "step": 50290, "token_acc": 0.9718430034129693, "train_speed(iter/s)": 0.231695 }, { "epoch": 3.8337525726046193, "grad_norm": 1.1249167919158936, "learning_rate": 1.2833947036565658e-05, "loss": 0.049244260787963866, "memory(GiB)": 122.96, "step": 50295, "token_acc": 0.9753336029114436, "train_speed(iter/s)": 0.231703 }, { "epoch": 3.8341336992148793, "grad_norm": 0.712184727191925, "learning_rate": 1.2825938647538332e-05, "loss": 0.05893604159355163, "memory(GiB)": 122.96, "step": 50300, "token_acc": 0.9773138254150913, "train_speed(iter/s)": 0.231708 }, { "epoch": 3.8345148258251394, "grad_norm": 0.38783806562423706, "learning_rate": 1.281793239027092e-05, "loss": 0.017243072390556335, "memory(GiB)": 122.96, "step": 50305, "token_acc": 0.9950338600451467, "train_speed(iter/s)": 0.231716 }, { "epoch": 3.834895952435399, "grad_norm": 1.0081361532211304, "learning_rate": 1.2809928265222554e-05, "loss": 0.0383542537689209, "memory(GiB)": 122.96, "step": 50310, "token_acc": 0.9843729652298476, "train_speed(iter/s)": 0.231718 }, { "epoch": 3.835277079045659, "grad_norm": 1.1456215381622314, "learning_rate": 1.2801926272852199e-05, "loss": 0.03889622688293457, "memory(GiB)": 122.96, "step": 50315, "token_acc": 0.9866611087953314, "train_speed(iter/s)": 0.231722 }, { "epoch": 3.835658205655919, "grad_norm": 0.7214980721473694, "learning_rate": 1.2793926413618757e-05, "loss": 0.03886204957962036, "memory(GiB)": 122.96, "step": 50320, "token_acc": 0.9875389408099688, "train_speed(iter/s)": 0.231726 }, { "epoch": 3.8360393322661785, "grad_norm": 0.6265476942062378, "learning_rate": 1.278592868798099e-05, "loss": 0.0425072968006134, "memory(GiB)": 122.96, "step": 50325, "token_acc": 0.9834270944199484, "train_speed(iter/s)": 0.23173 }, { "epoch": 3.8364204588764386, "grad_norm": 0.5224049091339111, "learning_rate": 1.277793309639751e-05, "loss": 0.034013029932975766, "memory(GiB)": 122.96, "step": 50330, "token_acc": 0.986396126354623, "train_speed(iter/s)": 0.231735 }, { "epoch": 3.8368015854866986, "grad_norm": 2.4455761909484863, "learning_rate": 1.2769939639326827e-05, "loss": 0.05904126167297363, "memory(GiB)": 122.96, "step": 50335, "token_acc": 0.9786036036036037, "train_speed(iter/s)": 0.231738 }, { "epoch": 3.8371827120969586, "grad_norm": 1.0649834871292114, "learning_rate": 1.2761948317227358e-05, "loss": 0.048549768328666684, "memory(GiB)": 122.96, "step": 50340, "token_acc": 0.9821143404663047, "train_speed(iter/s)": 0.231745 }, { "epoch": 3.8375638387072186, "grad_norm": 1.1620073318481445, "learning_rate": 1.275395913055733e-05, "loss": 0.050873303413391115, "memory(GiB)": 122.96, "step": 50345, "token_acc": 0.9839534223232036, "train_speed(iter/s)": 0.231748 }, { "epoch": 3.8379449653174786, "grad_norm": 2.0194294452667236, "learning_rate": 1.2745972079774904e-05, "loss": 0.04831646978855133, "memory(GiB)": 122.96, "step": 50350, "token_acc": 0.9786804308797128, "train_speed(iter/s)": 0.231753 }, { "epoch": 3.8383260919277387, "grad_norm": 2.040729284286499, "learning_rate": 1.273798716533811e-05, "loss": 0.04964113235473633, "memory(GiB)": 122.96, "step": 50355, "token_acc": 0.9806317044100119, "train_speed(iter/s)": 0.231759 }, { "epoch": 3.8387072185379982, "grad_norm": 1.0265144109725952, "learning_rate": 1.2730004387704825e-05, "loss": 0.04545291662216187, "memory(GiB)": 122.96, "step": 50360, "token_acc": 0.9796147372358331, "train_speed(iter/s)": 0.231762 }, { "epoch": 3.8390883451482583, "grad_norm": 1.0641820430755615, "learning_rate": 1.2722023747332833e-05, "loss": 0.07213475108146668, "memory(GiB)": 122.96, "step": 50365, "token_acc": 0.9637069162291714, "train_speed(iter/s)": 0.231768 }, { "epoch": 3.8394694717585183, "grad_norm": 1.0204219818115234, "learning_rate": 1.2714045244679806e-05, "loss": 0.04391777515411377, "memory(GiB)": 122.96, "step": 50370, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.231773 }, { "epoch": 3.839850598368778, "grad_norm": 0.5755395293235779, "learning_rate": 1.2706068880203236e-05, "loss": 0.03606230914592743, "memory(GiB)": 122.96, "step": 50375, "token_acc": 0.9892818863879957, "train_speed(iter/s)": 0.231778 }, { "epoch": 3.840231724979038, "grad_norm": 1.0163038969039917, "learning_rate": 1.2698094654360555e-05, "loss": 0.047356894612312316, "memory(GiB)": 122.96, "step": 50380, "token_acc": 0.9740178431679449, "train_speed(iter/s)": 0.231782 }, { "epoch": 3.840612851589298, "grad_norm": 1.3595225811004639, "learning_rate": 1.2690122567609059e-05, "loss": 0.047880321741104126, "memory(GiB)": 122.96, "step": 50385, "token_acc": 0.9834502608382802, "train_speed(iter/s)": 0.231785 }, { "epoch": 3.840993978199558, "grad_norm": 0.5920113325119019, "learning_rate": 1.2682152620405874e-05, "loss": 0.057951098680496214, "memory(GiB)": 122.96, "step": 50390, "token_acc": 0.9804315775365232, "train_speed(iter/s)": 0.231788 }, { "epoch": 3.841375104809818, "grad_norm": 2.133126735687256, "learning_rate": 1.2674184813208068e-05, "loss": 0.05411117672920227, "memory(GiB)": 122.96, "step": 50395, "token_acc": 0.9793250950570342, "train_speed(iter/s)": 0.231794 }, { "epoch": 3.841756231420078, "grad_norm": 2.7965588569641113, "learning_rate": 1.2666219146472557e-05, "loss": 0.05299661755561828, "memory(GiB)": 122.96, "step": 50400, "token_acc": 0.9759270044651523, "train_speed(iter/s)": 0.2318 }, { "epoch": 3.841756231420078, "eval_loss": 0.054623786360025406, "eval_runtime": 220.0874, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "eval_token_acc": 0.9776971266791157, "step": 50400 }, { "epoch": 3.8421373580303375, "grad_norm": 0.5320430994033813, "learning_rate": 1.2658255620656117e-05, "loss": 0.04121063351631164, "memory(GiB)": 122.96, "step": 50405, "token_acc": 0.9782739545415461, "train_speed(iter/s)": 0.231569 }, { "epoch": 3.8425184846405975, "grad_norm": 1.309600591659546, "learning_rate": 1.2650294236215432e-05, "loss": 0.0697929322719574, "memory(GiB)": 122.96, "step": 50410, "token_acc": 0.9733137213700357, "train_speed(iter/s)": 0.231575 }, { "epoch": 3.8428996112508576, "grad_norm": 0.7350257635116577, "learning_rate": 1.2642334993607063e-05, "loss": 0.04842133224010468, "memory(GiB)": 122.96, "step": 50415, "token_acc": 0.9818415784958536, "train_speed(iter/s)": 0.231578 }, { "epoch": 3.8432807378611176, "grad_norm": 0.8174954652786255, "learning_rate": 1.2634377893287403e-05, "loss": 0.03530073761940002, "memory(GiB)": 122.96, "step": 50420, "token_acc": 0.9884720184447705, "train_speed(iter/s)": 0.231582 }, { "epoch": 3.843661864471377, "grad_norm": 0.9605221152305603, "learning_rate": 1.2626422935712789e-05, "loss": 0.05299175977706909, "memory(GiB)": 122.96, "step": 50425, "token_acc": 0.9764921946740128, "train_speed(iter/s)": 0.231586 }, { "epoch": 3.844042991081637, "grad_norm": 1.2505054473876953, "learning_rate": 1.2618470121339376e-05, "loss": 0.03995031714439392, "memory(GiB)": 122.96, "step": 50430, "token_acc": 0.9841605068637803, "train_speed(iter/s)": 0.231591 }, { "epoch": 3.844424117691897, "grad_norm": 0.6629741787910461, "learning_rate": 1.261051945062321e-05, "loss": 0.04857286810874939, "memory(GiB)": 122.96, "step": 50435, "token_acc": 0.981544140264534, "train_speed(iter/s)": 0.231593 }, { "epoch": 3.844805244302157, "grad_norm": 0.8947486877441406, "learning_rate": 1.2602570924020273e-05, "loss": 0.03470602631568909, "memory(GiB)": 122.96, "step": 50440, "token_acc": 0.9855044074436826, "train_speed(iter/s)": 0.231598 }, { "epoch": 3.845186370912417, "grad_norm": 0.47205042839050293, "learning_rate": 1.2594624541986339e-05, "loss": 0.031058016419410705, "memory(GiB)": 122.96, "step": 50445, "token_acc": 0.9891803764636135, "train_speed(iter/s)": 0.231599 }, { "epoch": 3.8455674975226772, "grad_norm": 1.2140752077102661, "learning_rate": 1.25866803049771e-05, "loss": 0.04694036841392517, "memory(GiB)": 122.96, "step": 50450, "token_acc": 0.9803682848881449, "train_speed(iter/s)": 0.231604 }, { "epoch": 3.845948624132937, "grad_norm": 0.003274702001363039, "learning_rate": 1.2578738213448143e-05, "loss": 0.036681875586509705, "memory(GiB)": 122.96, "step": 50455, "token_acc": 0.9833829753879918, "train_speed(iter/s)": 0.231606 }, { "epoch": 3.846329750743197, "grad_norm": 0.10779248923063278, "learning_rate": 1.2570798267854884e-05, "loss": 0.03365403413772583, "memory(GiB)": 122.96, "step": 50460, "token_acc": 0.9860966284323949, "train_speed(iter/s)": 0.231609 }, { "epoch": 3.846710877353457, "grad_norm": 2.4534904956817627, "learning_rate": 1.2562860468652644e-05, "loss": 0.0636795163154602, "memory(GiB)": 122.96, "step": 50465, "token_acc": 0.968167701863354, "train_speed(iter/s)": 0.231615 }, { "epoch": 3.847092003963717, "grad_norm": 0.9723173379898071, "learning_rate": 1.2554924816296649e-05, "loss": 0.043748652935028075, "memory(GiB)": 122.96, "step": 50470, "token_acc": 0.9859293193717278, "train_speed(iter/s)": 0.231622 }, { "epoch": 3.8474731305739764, "grad_norm": 1.9841580390930176, "learning_rate": 1.2546991311241929e-05, "loss": 0.03537192940711975, "memory(GiB)": 122.96, "step": 50475, "token_acc": 0.9830567081604425, "train_speed(iter/s)": 0.231628 }, { "epoch": 3.8478542571842365, "grad_norm": 1.233746886253357, "learning_rate": 1.2539059953943467e-05, "loss": 0.0454510897397995, "memory(GiB)": 122.96, "step": 50480, "token_acc": 0.9842476914720261, "train_speed(iter/s)": 0.231636 }, { "epoch": 3.8482353837944965, "grad_norm": 1.1865620613098145, "learning_rate": 1.2531130744856067e-05, "loss": 0.042274254560470584, "memory(GiB)": 122.96, "step": 50485, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.231641 }, { "epoch": 3.8486165104047565, "grad_norm": 1.7750579118728638, "learning_rate": 1.2523203684434436e-05, "loss": 0.04155125319957733, "memory(GiB)": 122.96, "step": 50490, "token_acc": 0.9820042925540696, "train_speed(iter/s)": 0.231647 }, { "epoch": 3.8489976370150165, "grad_norm": 2.894789934158325, "learning_rate": 1.2515278773133182e-05, "loss": 0.08918528556823731, "memory(GiB)": 122.96, "step": 50495, "token_acc": 0.9704907161803713, "train_speed(iter/s)": 0.231654 }, { "epoch": 3.8493787636252765, "grad_norm": 0.7696786522865295, "learning_rate": 1.2507356011406723e-05, "loss": 0.051129841804504396, "memory(GiB)": 122.96, "step": 50500, "token_acc": 0.9799225931301403, "train_speed(iter/s)": 0.231659 }, { "epoch": 3.849759890235536, "grad_norm": 1.1662195920944214, "learning_rate": 1.2499435399709408e-05, "loss": 0.06380079984664917, "memory(GiB)": 122.96, "step": 50505, "token_acc": 0.9777494331065759, "train_speed(iter/s)": 0.23166 }, { "epoch": 3.850141016845796, "grad_norm": 0.8223883509635925, "learning_rate": 1.2491516938495463e-05, "loss": 0.061128705739974976, "memory(GiB)": 122.96, "step": 50510, "token_acc": 0.9751833051325437, "train_speed(iter/s)": 0.231665 }, { "epoch": 3.850522143456056, "grad_norm": 1.410601258277893, "learning_rate": 1.248360062821895e-05, "loss": 0.06262748241424561, "memory(GiB)": 122.96, "step": 50515, "token_acc": 0.9693586698337292, "train_speed(iter/s)": 0.231671 }, { "epoch": 3.850903270066316, "grad_norm": 1.096531867980957, "learning_rate": 1.2475686469333841e-05, "loss": 0.03176690340042114, "memory(GiB)": 122.96, "step": 50520, "token_acc": 0.9865277071682765, "train_speed(iter/s)": 0.231677 }, { "epoch": 3.8512843966765757, "grad_norm": 1.2234106063842773, "learning_rate": 1.2467774462293991e-05, "loss": 0.03707926869392395, "memory(GiB)": 122.96, "step": 50525, "token_acc": 0.9806747461513265, "train_speed(iter/s)": 0.231684 }, { "epoch": 3.8516655232868358, "grad_norm": 0.5723353028297424, "learning_rate": 1.2459864607553096e-05, "loss": 0.029794687032699586, "memory(GiB)": 122.96, "step": 50530, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.231689 }, { "epoch": 3.8520466498970958, "grad_norm": 1.2064529657363892, "learning_rate": 1.2451956905564755e-05, "loss": 0.03475046753883362, "memory(GiB)": 122.96, "step": 50535, "token_acc": 0.9834203254528707, "train_speed(iter/s)": 0.231695 }, { "epoch": 3.852427776507356, "grad_norm": 1.2407851219177246, "learning_rate": 1.2444051356782455e-05, "loss": 0.07004474401473999, "memory(GiB)": 122.96, "step": 50540, "token_acc": 0.9701001644490955, "train_speed(iter/s)": 0.231697 }, { "epoch": 3.852808903117616, "grad_norm": 0.719999372959137, "learning_rate": 1.2436147961659517e-05, "loss": 0.06073113083839417, "memory(GiB)": 122.96, "step": 50545, "token_acc": 0.9776651651651652, "train_speed(iter/s)": 0.231697 }, { "epoch": 3.853190029727876, "grad_norm": 0.8959406614303589, "learning_rate": 1.2428246720649172e-05, "loss": 0.05611024498939514, "memory(GiB)": 122.96, "step": 50550, "token_acc": 0.9811676082862524, "train_speed(iter/s)": 0.231701 }, { "epoch": 3.8535711563381354, "grad_norm": 1.5606818199157715, "learning_rate": 1.2420347634204537e-05, "loss": 0.05178274512290955, "memory(GiB)": 122.96, "step": 50555, "token_acc": 0.97809475292919, "train_speed(iter/s)": 0.231707 }, { "epoch": 3.8539522829483954, "grad_norm": 1.0501048564910889, "learning_rate": 1.2412450702778566e-05, "loss": 0.04377864897251129, "memory(GiB)": 122.96, "step": 50560, "token_acc": 0.9810113931641016, "train_speed(iter/s)": 0.231713 }, { "epoch": 3.8543334095586554, "grad_norm": 0.630512535572052, "learning_rate": 1.2404555926824118e-05, "loss": 0.03580373525619507, "memory(GiB)": 122.96, "step": 50565, "token_acc": 0.9839164317691925, "train_speed(iter/s)": 0.231716 }, { "epoch": 3.8547145361689155, "grad_norm": 2.592733860015869, "learning_rate": 1.239666330679392e-05, "loss": 0.05785633325576782, "memory(GiB)": 122.96, "step": 50570, "token_acc": 0.9810201660735468, "train_speed(iter/s)": 0.231722 }, { "epoch": 3.855095662779175, "grad_norm": 0.7212926149368286, "learning_rate": 1.2388772843140584e-05, "loss": 0.041939917206764224, "memory(GiB)": 122.96, "step": 50575, "token_acc": 0.9870197300103842, "train_speed(iter/s)": 0.231724 }, { "epoch": 3.855476789389435, "grad_norm": 0.7076135873794556, "learning_rate": 1.23808845363166e-05, "loss": 0.05555415153503418, "memory(GiB)": 122.96, "step": 50580, "token_acc": 0.9797221179121292, "train_speed(iter/s)": 0.231731 }, { "epoch": 3.855857915999695, "grad_norm": 1.2575916051864624, "learning_rate": 1.2372998386774298e-05, "loss": 0.042618751525878906, "memory(GiB)": 122.96, "step": 50585, "token_acc": 0.9807219807219807, "train_speed(iter/s)": 0.231736 }, { "epoch": 3.856239042609955, "grad_norm": 0.8107590675354004, "learning_rate": 1.2365114394965932e-05, "loss": 0.038021552562713626, "memory(GiB)": 122.96, "step": 50590, "token_acc": 0.9815285153544216, "train_speed(iter/s)": 0.231742 }, { "epoch": 3.856620169220215, "grad_norm": 1.006014347076416, "learning_rate": 1.2357232561343618e-05, "loss": 0.04489204287528992, "memory(GiB)": 122.96, "step": 50595, "token_acc": 0.9780734170978074, "train_speed(iter/s)": 0.231748 }, { "epoch": 3.857001295830475, "grad_norm": 2.5384278297424316, "learning_rate": 1.2349352886359323e-05, "loss": 0.07172273397445679, "memory(GiB)": 122.96, "step": 50600, "token_acc": 0.9739359947212142, "train_speed(iter/s)": 0.231755 }, { "epoch": 3.857001295830475, "eval_loss": 0.053873609751462936, "eval_runtime": 221.2011, "eval_samples_per_second": 2.396, "eval_steps_per_second": 2.396, "eval_token_acc": 0.977870309017529, "step": 50600 }, { "epoch": 3.8573824224407347, "grad_norm": 0.8978815078735352, "learning_rate": 1.2341475370464917e-05, "loss": 0.042777815461158754, "memory(GiB)": 122.96, "step": 50605, "token_acc": 0.9780546364887418, "train_speed(iter/s)": 0.231526 }, { "epoch": 3.8577635490509947, "grad_norm": 1.1799349784851074, "learning_rate": 1.2333600014112157e-05, "loss": 0.07047960758209229, "memory(GiB)": 122.96, "step": 50610, "token_acc": 0.9777422170497527, "train_speed(iter/s)": 0.23153 }, { "epoch": 3.8581446756612547, "grad_norm": 1.5920661687850952, "learning_rate": 1.232572681775263e-05, "loss": 0.04615117311477661, "memory(GiB)": 122.96, "step": 50615, "token_acc": 0.9820350675481461, "train_speed(iter/s)": 0.231532 }, { "epoch": 3.8585258022715148, "grad_norm": 0.6277801394462585, "learning_rate": 1.2317855781837839e-05, "loss": 0.0571999728679657, "memory(GiB)": 122.96, "step": 50620, "token_acc": 0.9767636397434706, "train_speed(iter/s)": 0.231532 }, { "epoch": 3.8589069288817743, "grad_norm": 0.7721825838088989, "learning_rate": 1.2309986906819166e-05, "loss": 0.05024533271789551, "memory(GiB)": 122.96, "step": 50625, "token_acc": 0.9787753568745304, "train_speed(iter/s)": 0.231536 }, { "epoch": 3.8592880554920344, "grad_norm": 1.871994972229004, "learning_rate": 1.2302120193147825e-05, "loss": 0.07870106101036071, "memory(GiB)": 122.96, "step": 50630, "token_acc": 0.9649631190727082, "train_speed(iter/s)": 0.231542 }, { "epoch": 3.8596691821022944, "grad_norm": 1.865006685256958, "learning_rate": 1.2294255641274955e-05, "loss": 0.04648490250110626, "memory(GiB)": 122.96, "step": 50635, "token_acc": 0.9835255354200988, "train_speed(iter/s)": 0.23155 }, { "epoch": 3.8600503087125544, "grad_norm": 0.7912378907203674, "learning_rate": 1.2286393251651556e-05, "loss": 0.026169970631599426, "memory(GiB)": 122.96, "step": 50640, "token_acc": 0.9845094664371773, "train_speed(iter/s)": 0.231557 }, { "epoch": 3.8604314353228144, "grad_norm": 1.1479448080062866, "learning_rate": 1.2278533024728483e-05, "loss": 0.05565265417098999, "memory(GiB)": 122.96, "step": 50645, "token_acc": 0.9776688453159041, "train_speed(iter/s)": 0.231564 }, { "epoch": 3.8608125619330744, "grad_norm": 0.6022094488143921, "learning_rate": 1.2270674960956507e-05, "loss": 0.06507146954536439, "memory(GiB)": 122.96, "step": 50650, "token_acc": 0.9775878748790713, "train_speed(iter/s)": 0.231566 }, { "epoch": 3.861193688543334, "grad_norm": 0.8231168389320374, "learning_rate": 1.2262819060786218e-05, "loss": 0.03653512299060822, "memory(GiB)": 122.96, "step": 50655, "token_acc": 0.9841918294849024, "train_speed(iter/s)": 0.23157 }, { "epoch": 3.861574815153594, "grad_norm": 2.8083908557891846, "learning_rate": 1.2254965324668138e-05, "loss": 0.06345218420028687, "memory(GiB)": 122.96, "step": 50660, "token_acc": 0.9820491109229467, "train_speed(iter/s)": 0.231574 }, { "epoch": 3.861955941763854, "grad_norm": 1.2472611665725708, "learning_rate": 1.2247113753052647e-05, "loss": 0.03618188202381134, "memory(GiB)": 122.96, "step": 50665, "token_acc": 0.981965734896303, "train_speed(iter/s)": 0.23158 }, { "epoch": 3.8623370683741136, "grad_norm": 1.028941035270691, "learning_rate": 1.2239264346389978e-05, "loss": 0.02652047574520111, "memory(GiB)": 122.96, "step": 50670, "token_acc": 0.9903677758318739, "train_speed(iter/s)": 0.231586 }, { "epoch": 3.8627181949843736, "grad_norm": 0.7729822397232056, "learning_rate": 1.2231417105130266e-05, "loss": 0.028298863768577577, "memory(GiB)": 122.96, "step": 50675, "token_acc": 0.9871970736168267, "train_speed(iter/s)": 0.231592 }, { "epoch": 3.8630993215946337, "grad_norm": 0.704942524433136, "learning_rate": 1.2223572029723529e-05, "loss": 0.04896172285079956, "memory(GiB)": 122.96, "step": 50680, "token_acc": 0.9805664668182758, "train_speed(iter/s)": 0.231592 }, { "epoch": 3.8634804482048937, "grad_norm": 0.6823765635490417, "learning_rate": 1.2215729120619618e-05, "loss": 0.040435203909873964, "memory(GiB)": 122.96, "step": 50685, "token_acc": 0.9852296705080344, "train_speed(iter/s)": 0.231595 }, { "epoch": 3.8638615748151537, "grad_norm": 1.554443359375, "learning_rate": 1.2207888378268307e-05, "loss": 0.03195061683654785, "memory(GiB)": 122.96, "step": 50690, "token_acc": 0.9808342728297632, "train_speed(iter/s)": 0.2316 }, { "epoch": 3.8642427014254137, "grad_norm": 1.6847525835037231, "learning_rate": 1.220004980311923e-05, "loss": 0.0860534131526947, "memory(GiB)": 122.96, "step": 50695, "token_acc": 0.9693783434790629, "train_speed(iter/s)": 0.231604 }, { "epoch": 3.8646238280356737, "grad_norm": 1.5640285015106201, "learning_rate": 1.2192213395621855e-05, "loss": 0.04797317087650299, "memory(GiB)": 122.96, "step": 50700, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.231612 }, { "epoch": 3.8650049546459333, "grad_norm": 0.6152635812759399, "learning_rate": 1.2184379156225617e-05, "loss": 0.05932672023773193, "memory(GiB)": 122.96, "step": 50705, "token_acc": 0.9789653212052303, "train_speed(iter/s)": 0.231615 }, { "epoch": 3.8653860812561933, "grad_norm": 1.6026955842971802, "learning_rate": 1.2176547085379742e-05, "loss": 0.03722996711730957, "memory(GiB)": 122.96, "step": 50710, "token_acc": 0.9832649194821598, "train_speed(iter/s)": 0.231621 }, { "epoch": 3.8657672078664533, "grad_norm": 1.8729532957077026, "learning_rate": 1.2168717183533362e-05, "loss": 0.05179585218429565, "memory(GiB)": 122.96, "step": 50715, "token_acc": 0.9789410348977136, "train_speed(iter/s)": 0.231628 }, { "epoch": 3.866148334476713, "grad_norm": 1.4397335052490234, "learning_rate": 1.216088945113551e-05, "loss": 0.044823139905929565, "memory(GiB)": 122.96, "step": 50720, "token_acc": 0.9774155995343422, "train_speed(iter/s)": 0.231633 }, { "epoch": 3.866529461086973, "grad_norm": 1.4942536354064941, "learning_rate": 1.2153063888635041e-05, "loss": 0.040431654453277587, "memory(GiB)": 122.96, "step": 50725, "token_acc": 0.9797270955165692, "train_speed(iter/s)": 0.231641 }, { "epoch": 3.866910587697233, "grad_norm": 2.241635322570801, "learning_rate": 1.2145240496480725e-05, "loss": 0.033487001061439516, "memory(GiB)": 122.96, "step": 50730, "token_acc": 0.985979381443299, "train_speed(iter/s)": 0.231649 }, { "epoch": 3.867291714307493, "grad_norm": 0.06870398670434952, "learning_rate": 1.2137419275121214e-05, "loss": 0.027337872982025148, "memory(GiB)": 122.96, "step": 50735, "token_acc": 0.9891846921797005, "train_speed(iter/s)": 0.231657 }, { "epoch": 3.867672840917753, "grad_norm": 3.028458833694458, "learning_rate": 1.2129600225004988e-05, "loss": 0.07602840662002563, "memory(GiB)": 122.96, "step": 50740, "token_acc": 0.9785114280132838, "train_speed(iter/s)": 0.231661 }, { "epoch": 3.868053967528013, "grad_norm": 0.7322419285774231, "learning_rate": 1.212178334658045e-05, "loss": 0.03140731155872345, "memory(GiB)": 122.96, "step": 50745, "token_acc": 0.9865601162368326, "train_speed(iter/s)": 0.231665 }, { "epoch": 3.8684350941382726, "grad_norm": 0.9080843925476074, "learning_rate": 1.2113968640295875e-05, "loss": 0.04693405330181122, "memory(GiB)": 122.96, "step": 50750, "token_acc": 0.9813499111900533, "train_speed(iter/s)": 0.23167 }, { "epoch": 3.8688162207485326, "grad_norm": 1.4741376638412476, "learning_rate": 1.210615610659937e-05, "loss": 0.06899999976158142, "memory(GiB)": 122.96, "step": 50755, "token_acc": 0.9712202609363009, "train_speed(iter/s)": 0.231677 }, { "epoch": 3.8691973473587926, "grad_norm": 0.6803414225578308, "learning_rate": 1.2098345745938966e-05, "loss": 0.054275786876678465, "memory(GiB)": 122.96, "step": 50760, "token_acc": 0.9799502642213243, "train_speed(iter/s)": 0.23168 }, { "epoch": 3.8695784739690526, "grad_norm": 0.913201093673706, "learning_rate": 1.209053755876256e-05, "loss": 0.045914024114608765, "memory(GiB)": 122.96, "step": 50765, "token_acc": 0.9799873604381715, "train_speed(iter/s)": 0.231684 }, { "epoch": 3.869959600579312, "grad_norm": 0.6526231169700623, "learning_rate": 1.2082731545517895e-05, "loss": 0.05198081135749817, "memory(GiB)": 122.96, "step": 50770, "token_acc": 0.9824052240159623, "train_speed(iter/s)": 0.231687 }, { "epoch": 3.8703407271895722, "grad_norm": 1.2886043787002563, "learning_rate": 1.207492770665261e-05, "loss": 0.03176849484443665, "memory(GiB)": 122.96, "step": 50775, "token_acc": 0.9872262773722628, "train_speed(iter/s)": 0.231691 }, { "epoch": 3.8707218537998322, "grad_norm": 2.394361972808838, "learning_rate": 1.2067126042614246e-05, "loss": 0.03694923520088196, "memory(GiB)": 122.96, "step": 50780, "token_acc": 0.9858724704085529, "train_speed(iter/s)": 0.231698 }, { "epoch": 3.8711029804100923, "grad_norm": 1.184927225112915, "learning_rate": 1.205932655385016e-05, "loss": 0.07671515941619873, "memory(GiB)": 122.96, "step": 50785, "token_acc": 0.9794961136424551, "train_speed(iter/s)": 0.231703 }, { "epoch": 3.8714841070203523, "grad_norm": 1.7796183824539185, "learning_rate": 1.2051529240807629e-05, "loss": 0.057587307691574094, "memory(GiB)": 122.96, "step": 50790, "token_acc": 0.9838199739631764, "train_speed(iter/s)": 0.231707 }, { "epoch": 3.8718652336306123, "grad_norm": 0.7957043051719666, "learning_rate": 1.2043734103933807e-05, "loss": 0.03489675521850586, "memory(GiB)": 122.96, "step": 50795, "token_acc": 0.9839008142116951, "train_speed(iter/s)": 0.231712 }, { "epoch": 3.872246360240872, "grad_norm": 0.5677091479301453, "learning_rate": 1.2035941143675683e-05, "loss": 0.03947655260562897, "memory(GiB)": 122.96, "step": 50800, "token_acc": 0.9883002497699487, "train_speed(iter/s)": 0.231716 }, { "epoch": 3.872246360240872, "eval_loss": 0.05374591425061226, "eval_runtime": 222.1801, "eval_samples_per_second": 2.385, "eval_steps_per_second": 2.385, "eval_token_acc": 0.9776745376784531, "step": 50800 }, { "epoch": 3.872627486851132, "grad_norm": 0.7231343388557434, "learning_rate": 1.2028150360480156e-05, "loss": 0.05138644576072693, "memory(GiB)": 122.96, "step": 50805, "token_acc": 0.9778390717139053, "train_speed(iter/s)": 0.231488 }, { "epoch": 3.873008613461392, "grad_norm": 1.0189954042434692, "learning_rate": 1.2020361754794013e-05, "loss": 0.08195708990097046, "memory(GiB)": 122.96, "step": 50810, "token_acc": 0.9795379537953796, "train_speed(iter/s)": 0.231494 }, { "epoch": 3.873389740071652, "grad_norm": 2.8948137760162354, "learning_rate": 1.2012575327063857e-05, "loss": 0.07244129776954651, "memory(GiB)": 122.96, "step": 50815, "token_acc": 0.9780294759825328, "train_speed(iter/s)": 0.231497 }, { "epoch": 3.8737708666819115, "grad_norm": 1.1089445352554321, "learning_rate": 1.2004791077736243e-05, "loss": 0.043086308240890506, "memory(GiB)": 122.96, "step": 50820, "token_acc": 0.9824317272569143, "train_speed(iter/s)": 0.231501 }, { "epoch": 3.8741519932921715, "grad_norm": 1.0985496044158936, "learning_rate": 1.1997009007257526e-05, "loss": 0.058171427249908446, "memory(GiB)": 122.96, "step": 50825, "token_acc": 0.9741723409250997, "train_speed(iter/s)": 0.231507 }, { "epoch": 3.8745331199024315, "grad_norm": 0.9506089687347412, "learning_rate": 1.1989229116073986e-05, "loss": 0.060480648279190065, "memory(GiB)": 122.96, "step": 50830, "token_acc": 0.9791027327195675, "train_speed(iter/s)": 0.231509 }, { "epoch": 3.8749142465126916, "grad_norm": 0.8551661372184753, "learning_rate": 1.198145140463176e-05, "loss": 0.035020798444747925, "memory(GiB)": 122.96, "step": 50835, "token_acc": 0.9866131191432396, "train_speed(iter/s)": 0.231517 }, { "epoch": 3.8752953731229516, "grad_norm": 0.9135546088218689, "learning_rate": 1.1973675873376877e-05, "loss": 0.058391904830932616, "memory(GiB)": 122.96, "step": 50840, "token_acc": 0.9714219330855018, "train_speed(iter/s)": 0.231523 }, { "epoch": 3.8756764997332116, "grad_norm": 1.0209782123565674, "learning_rate": 1.1965902522755212e-05, "loss": 0.04699152708053589, "memory(GiB)": 122.96, "step": 50845, "token_acc": 0.9839417889850709, "train_speed(iter/s)": 0.231526 }, { "epoch": 3.876057626343471, "grad_norm": 2.583638906478882, "learning_rate": 1.1958131353212554e-05, "loss": 0.05000759363174438, "memory(GiB)": 122.96, "step": 50850, "token_acc": 0.9833546734955185, "train_speed(iter/s)": 0.231529 }, { "epoch": 3.876438752953731, "grad_norm": 0.8220499157905579, "learning_rate": 1.1950362365194517e-05, "loss": 0.04222618043422699, "memory(GiB)": 122.96, "step": 50855, "token_acc": 0.9863523573200993, "train_speed(iter/s)": 0.231533 }, { "epoch": 3.876819879563991, "grad_norm": 1.7526018619537354, "learning_rate": 1.1942595559146636e-05, "loss": 0.04008788764476776, "memory(GiB)": 122.96, "step": 50860, "token_acc": 0.9837894369879728, "train_speed(iter/s)": 0.231538 }, { "epoch": 3.8772010061742512, "grad_norm": 0.9718064069747925, "learning_rate": 1.193483093551428e-05, "loss": 0.0301837682723999, "memory(GiB)": 122.96, "step": 50865, "token_acc": 0.9886557005104935, "train_speed(iter/s)": 0.231544 }, { "epoch": 3.877582132784511, "grad_norm": 2.552469491958618, "learning_rate": 1.192706849474272e-05, "loss": 0.06250406503677368, "memory(GiB)": 122.96, "step": 50870, "token_acc": 0.9765353418308227, "train_speed(iter/s)": 0.231551 }, { "epoch": 3.877963259394771, "grad_norm": 2.704773426055908, "learning_rate": 1.1919308237277122e-05, "loss": 0.05207157731056213, "memory(GiB)": 122.96, "step": 50875, "token_acc": 0.9823705926481621, "train_speed(iter/s)": 0.231558 }, { "epoch": 3.878344386005031, "grad_norm": 1.0245305299758911, "learning_rate": 1.1911550163562463e-05, "loss": 0.030077582597732543, "memory(GiB)": 122.96, "step": 50880, "token_acc": 0.98408005458267, "train_speed(iter/s)": 0.231563 }, { "epoch": 3.878725512615291, "grad_norm": 1.131682276725769, "learning_rate": 1.1903794274043656e-05, "loss": 0.03201265931129456, "memory(GiB)": 122.96, "step": 50885, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.231567 }, { "epoch": 3.879106639225551, "grad_norm": 0.7311287522315979, "learning_rate": 1.1896040569165468e-05, "loss": 0.05604674220085144, "memory(GiB)": 122.96, "step": 50890, "token_acc": 0.9813181712694361, "train_speed(iter/s)": 0.23157 }, { "epoch": 3.879487765835811, "grad_norm": 0.9750102758407593, "learning_rate": 1.1888289049372515e-05, "loss": 0.04089726805686951, "memory(GiB)": 122.96, "step": 50895, "token_acc": 0.9834084391642769, "train_speed(iter/s)": 0.231575 }, { "epoch": 3.8798688924460705, "grad_norm": 0.00010794557601911947, "learning_rate": 1.1880539715109328e-05, "loss": 0.036645087599754336, "memory(GiB)": 122.96, "step": 50900, "token_acc": 0.9798614118666089, "train_speed(iter/s)": 0.231579 }, { "epoch": 3.8802500190563305, "grad_norm": 0.7706751227378845, "learning_rate": 1.1872792566820307e-05, "loss": 0.04651977419853211, "memory(GiB)": 122.96, "step": 50905, "token_acc": 0.9851309889072457, "train_speed(iter/s)": 0.231584 }, { "epoch": 3.8806311456665905, "grad_norm": 0.8303752541542053, "learning_rate": 1.1865047604949687e-05, "loss": 0.042199867963790896, "memory(GiB)": 122.96, "step": 50910, "token_acc": 0.9824328803447133, "train_speed(iter/s)": 0.231591 }, { "epoch": 3.8810122722768505, "grad_norm": 0.624332845211029, "learning_rate": 1.1857304829941613e-05, "loss": 0.04063507318496704, "memory(GiB)": 122.96, "step": 50915, "token_acc": 0.9839857651245552, "train_speed(iter/s)": 0.231596 }, { "epoch": 3.88139339888711, "grad_norm": 0.8333251476287842, "learning_rate": 1.1849564242240124e-05, "loss": 0.05069692730903626, "memory(GiB)": 122.96, "step": 50920, "token_acc": 0.9796718972895863, "train_speed(iter/s)": 0.231601 }, { "epoch": 3.88177452549737, "grad_norm": 1.3167705535888672, "learning_rate": 1.1841825842289067e-05, "loss": 0.0485095739364624, "memory(GiB)": 122.96, "step": 50925, "token_acc": 0.9779980657640233, "train_speed(iter/s)": 0.231607 }, { "epoch": 3.88215565210763, "grad_norm": 0.3416613042354584, "learning_rate": 1.1834089630532224e-05, "loss": 0.04263681173324585, "memory(GiB)": 122.96, "step": 50930, "token_acc": 0.9803331326510134, "train_speed(iter/s)": 0.231612 }, { "epoch": 3.88253677871789, "grad_norm": 1.8198941946029663, "learning_rate": 1.1826355607413242e-05, "loss": 0.04147031903266907, "memory(GiB)": 122.96, "step": 50935, "token_acc": 0.9848997656860192, "train_speed(iter/s)": 0.231618 }, { "epoch": 3.88291790532815, "grad_norm": 1.8757859468460083, "learning_rate": 1.181862377337561e-05, "loss": 0.04862704873085022, "memory(GiB)": 122.96, "step": 50940, "token_acc": 0.9775888717156105, "train_speed(iter/s)": 0.231625 }, { "epoch": 3.88329903193841, "grad_norm": 0.8514116406440735, "learning_rate": 1.1810894128862715e-05, "loss": 0.0254100501537323, "memory(GiB)": 122.96, "step": 50945, "token_acc": 0.9910567621828801, "train_speed(iter/s)": 0.23163 }, { "epoch": 3.8836801585486698, "grad_norm": 1.5140398740768433, "learning_rate": 1.180316667431784e-05, "loss": 0.05464458465576172, "memory(GiB)": 122.96, "step": 50950, "token_acc": 0.9800593276203032, "train_speed(iter/s)": 0.231631 }, { "epoch": 3.88406128515893, "grad_norm": 1.453948736190796, "learning_rate": 1.1795441410184088e-05, "loss": 0.06644083261489868, "memory(GiB)": 122.96, "step": 50955, "token_acc": 0.9717439293598233, "train_speed(iter/s)": 0.231638 }, { "epoch": 3.88444241176919, "grad_norm": 0.9811369180679321, "learning_rate": 1.178771833690448e-05, "loss": 0.042607882618904115, "memory(GiB)": 122.96, "step": 50960, "token_acc": 0.9776806258628624, "train_speed(iter/s)": 0.231644 }, { "epoch": 3.8848235383794494, "grad_norm": 1.1525174379348755, "learning_rate": 1.177999745492191e-05, "loss": 0.045253926515579225, "memory(GiB)": 122.96, "step": 50965, "token_acc": 0.9813874788494078, "train_speed(iter/s)": 0.231648 }, { "epoch": 3.8852046649897094, "grad_norm": 1.4365228414535522, "learning_rate": 1.1772278764679096e-05, "loss": 0.03565104007720947, "memory(GiB)": 122.96, "step": 50970, "token_acc": 0.9833655705996132, "train_speed(iter/s)": 0.231656 }, { "epoch": 3.8855857915999694, "grad_norm": 0.9353759288787842, "learning_rate": 1.1764562266618728e-05, "loss": 0.0359194278717041, "memory(GiB)": 122.96, "step": 50975, "token_acc": 0.9859062602425435, "train_speed(iter/s)": 0.231663 }, { "epoch": 3.8859669182102294, "grad_norm": 1.4310956001281738, "learning_rate": 1.1756847961183265e-05, "loss": 0.04315637350082398, "memory(GiB)": 122.96, "step": 50980, "token_acc": 0.9836605794315529, "train_speed(iter/s)": 0.231666 }, { "epoch": 3.8863480448204895, "grad_norm": 1.1176003217697144, "learning_rate": 1.1749135848815096e-05, "loss": 0.049867621064186095, "memory(GiB)": 122.96, "step": 50985, "token_acc": 0.9773111612175873, "train_speed(iter/s)": 0.231669 }, { "epoch": 3.8867291714307495, "grad_norm": 0.2627639174461365, "learning_rate": 1.1741425929956501e-05, "loss": 0.035209599137306216, "memory(GiB)": 122.96, "step": 50990, "token_acc": 0.9851244687310261, "train_speed(iter/s)": 0.231676 }, { "epoch": 3.8871102980410095, "grad_norm": 1.8006067276000977, "learning_rate": 1.1733718205049572e-05, "loss": 0.05008125901222229, "memory(GiB)": 122.96, "step": 50995, "token_acc": 0.9791744142804016, "train_speed(iter/s)": 0.231681 }, { "epoch": 3.887491424651269, "grad_norm": 1.288939356803894, "learning_rate": 1.1726012674536324e-05, "loss": 0.05130969882011414, "memory(GiB)": 122.96, "step": 51000, "token_acc": 0.9749578617866602, "train_speed(iter/s)": 0.231687 }, { "epoch": 3.887491424651269, "eval_loss": 0.05311594903469086, "eval_runtime": 218.5585, "eval_samples_per_second": 2.425, "eval_steps_per_second": 2.425, "eval_token_acc": 0.9778176013493163, "step": 51000 }, { "epoch": 3.887872551261529, "grad_norm": 0.9098899960517883, "learning_rate": 1.1718309338858652e-05, "loss": 0.029017600417137145, "memory(GiB)": 122.96, "step": 51005, "token_acc": 0.9782115133736476, "train_speed(iter/s)": 0.231459 }, { "epoch": 3.888253677871789, "grad_norm": 1.2698729038238525, "learning_rate": 1.1710608198458277e-05, "loss": 0.033893316984176636, "memory(GiB)": 122.96, "step": 51010, "token_acc": 0.9896769896769897, "train_speed(iter/s)": 0.231463 }, { "epoch": 3.8886348044820487, "grad_norm": 1.236098289489746, "learning_rate": 1.1702909253776833e-05, "loss": 0.07047884464263916, "memory(GiB)": 122.96, "step": 51015, "token_acc": 0.9754925516578568, "train_speed(iter/s)": 0.231472 }, { "epoch": 3.8890159310923087, "grad_norm": 1.316372275352478, "learning_rate": 1.1695212505255843e-05, "loss": 0.052772504091262815, "memory(GiB)": 122.96, "step": 51020, "token_acc": 0.9811197916666666, "train_speed(iter/s)": 0.231476 }, { "epoch": 3.8893970577025687, "grad_norm": 0.8841883540153503, "learning_rate": 1.1687517953336647e-05, "loss": 0.05011090040206909, "memory(GiB)": 122.96, "step": 51025, "token_acc": 0.9772043691625771, "train_speed(iter/s)": 0.231481 }, { "epoch": 3.8897781843128287, "grad_norm": 0.8805028200149536, "learning_rate": 1.1679825598460498e-05, "loss": 0.062026846408844, "memory(GiB)": 122.96, "step": 51030, "token_acc": 0.9765540976554098, "train_speed(iter/s)": 0.231486 }, { "epoch": 3.8901593109230888, "grad_norm": 1.5380381345748901, "learning_rate": 1.1672135441068543e-05, "loss": 0.05953688621520996, "memory(GiB)": 122.96, "step": 51035, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.231494 }, { "epoch": 3.8905404375333488, "grad_norm": 0.9594656825065613, "learning_rate": 1.1664447481601743e-05, "loss": 0.04035902619361877, "memory(GiB)": 122.96, "step": 51040, "token_acc": 0.9852348993288591, "train_speed(iter/s)": 0.231496 }, { "epoch": 3.8909215641436083, "grad_norm": 1.4335561990737915, "learning_rate": 1.1656761720500992e-05, "loss": 0.052445799112319946, "memory(GiB)": 122.96, "step": 51045, "token_acc": 0.9752593774940144, "train_speed(iter/s)": 0.231501 }, { "epoch": 3.8913026907538684, "grad_norm": 1.507562279701233, "learning_rate": 1.1649078158207011e-05, "loss": 0.051173895597457886, "memory(GiB)": 122.96, "step": 51050, "token_acc": 0.9805108798486282, "train_speed(iter/s)": 0.231506 }, { "epoch": 3.8916838173641284, "grad_norm": 1.8570027351379395, "learning_rate": 1.1641396795160425e-05, "loss": 0.04765793085098267, "memory(GiB)": 122.96, "step": 51055, "token_acc": 0.9880581516095535, "train_speed(iter/s)": 0.23151 }, { "epoch": 3.8920649439743884, "grad_norm": 2.058049201965332, "learning_rate": 1.1633717631801743e-05, "loss": 0.05437748432159424, "memory(GiB)": 122.96, "step": 51060, "token_acc": 0.9668587896253602, "train_speed(iter/s)": 0.231517 }, { "epoch": 3.892446070584648, "grad_norm": 1.4154777526855469, "learning_rate": 1.1626040668571297e-05, "loss": 0.05362914204597473, "memory(GiB)": 122.96, "step": 51065, "token_acc": 0.9801548886737658, "train_speed(iter/s)": 0.231521 }, { "epoch": 3.892827197194908, "grad_norm": 2.4187586307525635, "learning_rate": 1.1618365905909345e-05, "loss": 0.04689092934131622, "memory(GiB)": 122.96, "step": 51070, "token_acc": 0.9808984789529537, "train_speed(iter/s)": 0.231526 }, { "epoch": 3.893208323805168, "grad_norm": 1.185403823852539, "learning_rate": 1.1610693344256007e-05, "loss": 0.0476244330406189, "memory(GiB)": 122.96, "step": 51075, "token_acc": 0.9771041599484037, "train_speed(iter/s)": 0.23153 }, { "epoch": 3.893589450415428, "grad_norm": 2.115180015563965, "learning_rate": 1.1603022984051249e-05, "loss": 0.040831688046455386, "memory(GiB)": 122.96, "step": 51080, "token_acc": 0.9845846417356552, "train_speed(iter/s)": 0.231536 }, { "epoch": 3.893970577025688, "grad_norm": 1.393379807472229, "learning_rate": 1.1595354825734934e-05, "loss": 0.054538500308990476, "memory(GiB)": 122.96, "step": 51085, "token_acc": 0.9801023308698124, "train_speed(iter/s)": 0.231542 }, { "epoch": 3.894351703635948, "grad_norm": 0.9205566048622131, "learning_rate": 1.1587688869746815e-05, "loss": 0.03399237096309662, "memory(GiB)": 122.96, "step": 51090, "token_acc": 0.984282506634007, "train_speed(iter/s)": 0.231548 }, { "epoch": 3.8947328302462076, "grad_norm": 1.4186513423919678, "learning_rate": 1.1580025116526471e-05, "loss": 0.03806195855140686, "memory(GiB)": 122.96, "step": 51095, "token_acc": 0.9779830941615884, "train_speed(iter/s)": 0.231553 }, { "epoch": 3.8951139568564677, "grad_norm": 3.177903413772583, "learning_rate": 1.1572363566513394e-05, "loss": 0.07277161478996277, "memory(GiB)": 122.96, "step": 51100, "token_acc": 0.971900826446281, "train_speed(iter/s)": 0.231559 }, { "epoch": 3.8954950834667277, "grad_norm": 1.3590837717056274, "learning_rate": 1.1564704220146943e-05, "loss": 0.06425299048423767, "memory(GiB)": 122.96, "step": 51105, "token_acc": 0.9743295897318359, "train_speed(iter/s)": 0.231566 }, { "epoch": 3.8958762100769877, "grad_norm": 1.6509969234466553, "learning_rate": 1.1557047077866344e-05, "loss": 0.03801352679729462, "memory(GiB)": 122.96, "step": 51110, "token_acc": 0.9839482510781026, "train_speed(iter/s)": 0.231572 }, { "epoch": 3.8962573366872473, "grad_norm": 1.049997091293335, "learning_rate": 1.154939214011071e-05, "loss": 0.028181520104408265, "memory(GiB)": 122.96, "step": 51115, "token_acc": 0.9876962926987211, "train_speed(iter/s)": 0.231576 }, { "epoch": 3.8966384632975073, "grad_norm": 0.8251073956489563, "learning_rate": 1.1541739407318991e-05, "loss": 0.05747435688972473, "memory(GiB)": 122.96, "step": 51120, "token_acc": 0.979788593332559, "train_speed(iter/s)": 0.231579 }, { "epoch": 3.8970195899077673, "grad_norm": 1.4332162141799927, "learning_rate": 1.153408887993005e-05, "loss": 0.054938048124313354, "memory(GiB)": 122.96, "step": 51125, "token_acc": 0.984936268829664, "train_speed(iter/s)": 0.231582 }, { "epoch": 3.8974007165180273, "grad_norm": 1.4718397855758667, "learning_rate": 1.1526440558382623e-05, "loss": 0.044423246383666994, "memory(GiB)": 122.96, "step": 51130, "token_acc": 0.9827111984282908, "train_speed(iter/s)": 0.231587 }, { "epoch": 3.8977818431282873, "grad_norm": 0.5809102654457092, "learning_rate": 1.1518794443115272e-05, "loss": 0.05169561505317688, "memory(GiB)": 122.96, "step": 51135, "token_acc": 0.9852177387135438, "train_speed(iter/s)": 0.231594 }, { "epoch": 3.8981629697385474, "grad_norm": 1.5529612302780151, "learning_rate": 1.151115053456649e-05, "loss": 0.05654230713844299, "memory(GiB)": 122.96, "step": 51140, "token_acc": 0.9722552516845026, "train_speed(iter/s)": 0.231601 }, { "epoch": 3.898544096348807, "grad_norm": 1.0147632360458374, "learning_rate": 1.1503508833174625e-05, "loss": 0.03676256239414215, "memory(GiB)": 122.96, "step": 51145, "token_acc": 0.9871441689623508, "train_speed(iter/s)": 0.231602 }, { "epoch": 3.898925222959067, "grad_norm": 2.011085033416748, "learning_rate": 1.1495869339377873e-05, "loss": 0.04628669023513794, "memory(GiB)": 122.96, "step": 51150, "token_acc": 0.9777275100052201, "train_speed(iter/s)": 0.231607 }, { "epoch": 3.899306349569327, "grad_norm": 1.2669404745101929, "learning_rate": 1.1488232053614328e-05, "loss": 0.03952302634716034, "memory(GiB)": 122.96, "step": 51155, "token_acc": 0.9845679012345679, "train_speed(iter/s)": 0.231614 }, { "epoch": 3.899687476179587, "grad_norm": 0.8548936247825623, "learning_rate": 1.1480596976321978e-05, "loss": 0.044951322674751285, "memory(GiB)": 122.96, "step": 51160, "token_acc": 0.9772612430520465, "train_speed(iter/s)": 0.23162 }, { "epoch": 3.9000686027898466, "grad_norm": 1.1155694723129272, "learning_rate": 1.1472964107938621e-05, "loss": 0.06704039573669433, "memory(GiB)": 122.96, "step": 51165, "token_acc": 0.9804910127137221, "train_speed(iter/s)": 0.231625 }, { "epoch": 3.9004497294001066, "grad_norm": 1.451248049736023, "learning_rate": 1.1465333448901989e-05, "loss": 0.043098649382591246, "memory(GiB)": 122.96, "step": 51170, "token_acc": 0.9807157057654076, "train_speed(iter/s)": 0.23163 }, { "epoch": 3.9008308560103666, "grad_norm": 0.5247163772583008, "learning_rate": 1.1457704999649671e-05, "loss": 0.04352582097053528, "memory(GiB)": 122.96, "step": 51175, "token_acc": 0.9858263730701088, "train_speed(iter/s)": 0.231632 }, { "epoch": 3.9012119826206266, "grad_norm": 1.5003371238708496, "learning_rate": 1.1450078760619104e-05, "loss": 0.04322465360164642, "memory(GiB)": 122.96, "step": 51180, "token_acc": 0.9830682401231401, "train_speed(iter/s)": 0.231638 }, { "epoch": 3.9015931092308866, "grad_norm": 2.6836729049682617, "learning_rate": 1.144245473224762e-05, "loss": 0.038314545154571535, "memory(GiB)": 122.96, "step": 51185, "token_acc": 0.9871647509578544, "train_speed(iter/s)": 0.231642 }, { "epoch": 3.9019742358411467, "grad_norm": 0.5687052607536316, "learning_rate": 1.1434832914972449e-05, "loss": 0.051983559131622316, "memory(GiB)": 122.96, "step": 51190, "token_acc": 0.9785714285714285, "train_speed(iter/s)": 0.231649 }, { "epoch": 3.9023553624514062, "grad_norm": 1.2287739515304565, "learning_rate": 1.1427213309230628e-05, "loss": 0.07160993814468383, "memory(GiB)": 122.96, "step": 51195, "token_acc": 0.9756167527251864, "train_speed(iter/s)": 0.231656 }, { "epoch": 3.9027364890616663, "grad_norm": 1.0907610654830933, "learning_rate": 1.1419595915459124e-05, "loss": 0.053579843044281004, "memory(GiB)": 122.96, "step": 51200, "token_acc": 0.9790276453765491, "train_speed(iter/s)": 0.231662 }, { "epoch": 3.9027364890616663, "eval_loss": 0.053008563816547394, "eval_runtime": 219.7162, "eval_samples_per_second": 2.412, "eval_steps_per_second": 2.412, "eval_token_acc": 0.9779154870188543, "step": 51200 }, { "epoch": 3.9031176156719263, "grad_norm": 0.7461091876029968, "learning_rate": 1.1411980734094774e-05, "loss": 0.08436903357505798, "memory(GiB)": 122.96, "step": 51205, "token_acc": 0.9778990068541055, "train_speed(iter/s)": 0.231437 }, { "epoch": 3.9034987422821863, "grad_norm": 1.4322257041931152, "learning_rate": 1.1404367765574248e-05, "loss": 0.05356778502464295, "memory(GiB)": 122.96, "step": 51210, "token_acc": 0.9805668016194332, "train_speed(iter/s)": 0.23144 }, { "epoch": 3.903879868892446, "grad_norm": 2.317495107650757, "learning_rate": 1.1396757010334135e-05, "loss": 0.04214376509189606, "memory(GiB)": 122.96, "step": 51215, "token_acc": 0.9843426203085425, "train_speed(iter/s)": 0.231446 }, { "epoch": 3.904260995502706, "grad_norm": 1.395789384841919, "learning_rate": 1.1389148468810856e-05, "loss": 0.03748279511928558, "memory(GiB)": 122.96, "step": 51220, "token_acc": 0.9852841906304847, "train_speed(iter/s)": 0.231447 }, { "epoch": 3.904642122112966, "grad_norm": 2.0328028202056885, "learning_rate": 1.1381542141440732e-05, "loss": 0.04581472873687744, "memory(GiB)": 122.96, "step": 51225, "token_acc": 0.9837948194462167, "train_speed(iter/s)": 0.231451 }, { "epoch": 3.905023248723226, "grad_norm": 0.7982304692268372, "learning_rate": 1.137393802865997e-05, "loss": 0.04009149670600891, "memory(GiB)": 122.96, "step": 51230, "token_acc": 0.9839935163610577, "train_speed(iter/s)": 0.231449 }, { "epoch": 3.905404375333486, "grad_norm": 1.2761579751968384, "learning_rate": 1.1366336130904587e-05, "loss": 0.028140330314636232, "memory(GiB)": 122.96, "step": 51235, "token_acc": 0.9893265565438374, "train_speed(iter/s)": 0.231454 }, { "epoch": 3.905785501943746, "grad_norm": 1.148417592048645, "learning_rate": 1.1358736448610564e-05, "loss": 0.046469300985336304, "memory(GiB)": 122.96, "step": 51240, "token_acc": 0.9844533600802408, "train_speed(iter/s)": 0.231461 }, { "epoch": 3.9061666285540055, "grad_norm": 2.168182849884033, "learning_rate": 1.1351138982213694e-05, "loss": 0.058275991678237916, "memory(GiB)": 122.96, "step": 51245, "token_acc": 0.9797449362340586, "train_speed(iter/s)": 0.231467 }, { "epoch": 3.9065477551642656, "grad_norm": 1.2552767992019653, "learning_rate": 1.1343543732149642e-05, "loss": 0.0684902548789978, "memory(GiB)": 122.96, "step": 51250, "token_acc": 0.9672008387000149, "train_speed(iter/s)": 0.231471 }, { "epoch": 3.9069288817745256, "grad_norm": 1.5456522703170776, "learning_rate": 1.133595069885398e-05, "loss": 0.02990352511405945, "memory(GiB)": 122.96, "step": 51255, "token_acc": 0.9873743880443184, "train_speed(iter/s)": 0.231477 }, { "epoch": 3.907310008384785, "grad_norm": 1.1134836673736572, "learning_rate": 1.1328359882762113e-05, "loss": 0.054629212617874144, "memory(GiB)": 122.96, "step": 51260, "token_acc": 0.9793735676088617, "train_speed(iter/s)": 0.231483 }, { "epoch": 3.907691134995045, "grad_norm": 1.4255874156951904, "learning_rate": 1.1320771284309345e-05, "loss": 0.03724370300769806, "memory(GiB)": 122.96, "step": 51265, "token_acc": 0.9844789356984479, "train_speed(iter/s)": 0.231489 }, { "epoch": 3.908072261605305, "grad_norm": 5.712593078613281, "learning_rate": 1.1313184903930862e-05, "loss": 0.05358842611312866, "memory(GiB)": 122.96, "step": 51270, "token_acc": 0.9751209398756047, "train_speed(iter/s)": 0.231498 }, { "epoch": 3.908453388215565, "grad_norm": 2.1062941551208496, "learning_rate": 1.1305600742061684e-05, "loss": 0.051426428556442264, "memory(GiB)": 122.96, "step": 51275, "token_acc": 0.9827362969356928, "train_speed(iter/s)": 0.231503 }, { "epoch": 3.9088345148258252, "grad_norm": 1.4906612634658813, "learning_rate": 1.129801879913674e-05, "loss": 0.04892503023147583, "memory(GiB)": 122.96, "step": 51280, "token_acc": 0.9764038231780168, "train_speed(iter/s)": 0.231509 }, { "epoch": 3.9092156414360852, "grad_norm": 0.10082317888736725, "learning_rate": 1.1290439075590836e-05, "loss": 0.04102218151092529, "memory(GiB)": 122.96, "step": 51285, "token_acc": 0.9801025641025641, "train_speed(iter/s)": 0.231514 }, { "epoch": 3.9095967680463453, "grad_norm": 1.5681967735290527, "learning_rate": 1.1282861571858599e-05, "loss": 0.048112761974334714, "memory(GiB)": 122.96, "step": 51290, "token_acc": 0.9796057104010877, "train_speed(iter/s)": 0.23152 }, { "epoch": 3.909977894656605, "grad_norm": 1.4931796789169312, "learning_rate": 1.1275286288374581e-05, "loss": 0.0785984754562378, "memory(GiB)": 122.96, "step": 51295, "token_acc": 0.9702881152460985, "train_speed(iter/s)": 0.231526 }, { "epoch": 3.910359021266865, "grad_norm": 0.976799488067627, "learning_rate": 1.1267713225573206e-05, "loss": 0.04576157927513123, "memory(GiB)": 122.96, "step": 51300, "token_acc": 0.9844399938376214, "train_speed(iter/s)": 0.231529 }, { "epoch": 3.910740147877125, "grad_norm": 0.919019341468811, "learning_rate": 1.1260142383888722e-05, "loss": 0.06883474588394164, "memory(GiB)": 122.96, "step": 51305, "token_acc": 0.9673684210526315, "train_speed(iter/s)": 0.231534 }, { "epoch": 3.9111212744873844, "grad_norm": 1.0799696445465088, "learning_rate": 1.1252573763755298e-05, "loss": 0.052004379034042356, "memory(GiB)": 122.96, "step": 51310, "token_acc": 0.9814642228435332, "train_speed(iter/s)": 0.231534 }, { "epoch": 3.9115024010976445, "grad_norm": 1.803115963935852, "learning_rate": 1.1245007365606968e-05, "loss": 0.060817569494247437, "memory(GiB)": 122.96, "step": 51315, "token_acc": 0.9795879435330027, "train_speed(iter/s)": 0.23154 }, { "epoch": 3.9118835277079045, "grad_norm": 0.7082263231277466, "learning_rate": 1.1237443189877617e-05, "loss": 0.04532873034477234, "memory(GiB)": 122.96, "step": 51320, "token_acc": 0.9793753682969947, "train_speed(iter/s)": 0.231545 }, { "epoch": 3.9122646543181645, "grad_norm": 3.6490819454193115, "learning_rate": 1.1229881237001012e-05, "loss": 0.0660049319267273, "memory(GiB)": 122.96, "step": 51325, "token_acc": 0.9727291367644043, "train_speed(iter/s)": 0.23155 }, { "epoch": 3.9126457809284245, "grad_norm": 0.8894729018211365, "learning_rate": 1.1222321507410816e-05, "loss": 0.06788381338119506, "memory(GiB)": 122.96, "step": 51330, "token_acc": 0.9756934088875634, "train_speed(iter/s)": 0.231553 }, { "epoch": 3.9130269075386845, "grad_norm": 1.0234318971633911, "learning_rate": 1.1214764001540517e-05, "loss": 0.04652920663356781, "memory(GiB)": 122.96, "step": 51335, "token_acc": 0.9821498626912515, "train_speed(iter/s)": 0.231558 }, { "epoch": 3.913408034148944, "grad_norm": 0.9867760539054871, "learning_rate": 1.120720871982352e-05, "loss": 0.03425142168998718, "memory(GiB)": 122.96, "step": 51340, "token_acc": 0.9834963325183375, "train_speed(iter/s)": 0.231564 }, { "epoch": 3.913789160759204, "grad_norm": 1.7054564952850342, "learning_rate": 1.1199655662693093e-05, "loss": 0.06629308462142944, "memory(GiB)": 122.96, "step": 51345, "token_acc": 0.9703968770331816, "train_speed(iter/s)": 0.231571 }, { "epoch": 3.914170287369464, "grad_norm": 0.6134839057922363, "learning_rate": 1.1192104830582351e-05, "loss": 0.027672985196113588, "memory(GiB)": 122.96, "step": 51350, "token_acc": 0.9899355877616747, "train_speed(iter/s)": 0.231578 }, { "epoch": 3.914551413979724, "grad_norm": 2.081172227859497, "learning_rate": 1.1184556223924297e-05, "loss": 0.056738758087158205, "memory(GiB)": 122.96, "step": 51355, "token_acc": 0.9758735440931781, "train_speed(iter/s)": 0.231583 }, { "epoch": 3.9149325405899837, "grad_norm": 1.5878934860229492, "learning_rate": 1.1177009843151837e-05, "loss": 0.05511375665664673, "memory(GiB)": 122.96, "step": 51360, "token_acc": 0.9795795795795795, "train_speed(iter/s)": 0.231588 }, { "epoch": 3.9153136672002438, "grad_norm": 1.1041063070297241, "learning_rate": 1.116946568869769e-05, "loss": 0.038293454051017764, "memory(GiB)": 122.96, "step": 51365, "token_acc": 0.9828975820719481, "train_speed(iter/s)": 0.231594 }, { "epoch": 3.915694793810504, "grad_norm": 1.291707992553711, "learning_rate": 1.1161923760994487e-05, "loss": 0.05265974998474121, "memory(GiB)": 122.96, "step": 51370, "token_acc": 0.9811268387454899, "train_speed(iter/s)": 0.231599 }, { "epoch": 3.916075920420764, "grad_norm": 1.2475241422653198, "learning_rate": 1.1154384060474726e-05, "loss": 0.04529010951519012, "memory(GiB)": 122.96, "step": 51375, "token_acc": 0.9808182590749059, "train_speed(iter/s)": 0.231602 }, { "epoch": 3.916457047031024, "grad_norm": 1.1012896299362183, "learning_rate": 1.114684658757077e-05, "loss": 0.03453406095504761, "memory(GiB)": 122.96, "step": 51380, "token_acc": 0.9849942913064753, "train_speed(iter/s)": 0.231605 }, { "epoch": 3.916838173641284, "grad_norm": 0.8994232416152954, "learning_rate": 1.113931134271488e-05, "loss": 0.03687165379524231, "memory(GiB)": 122.96, "step": 51385, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.231608 }, { "epoch": 3.9172193002515434, "grad_norm": 2.0076653957366943, "learning_rate": 1.1131778326339137e-05, "loss": 0.03025192618370056, "memory(GiB)": 122.96, "step": 51390, "token_acc": 0.9866814650388457, "train_speed(iter/s)": 0.231615 }, { "epoch": 3.9176004268618034, "grad_norm": 1.0626367330551147, "learning_rate": 1.1124247538875532e-05, "loss": 0.046755677461624144, "memory(GiB)": 122.96, "step": 51395, "token_acc": 0.9794344473007712, "train_speed(iter/s)": 0.231622 }, { "epoch": 3.9179815534720634, "grad_norm": 2.0421643257141113, "learning_rate": 1.1116718980755942e-05, "loss": 0.06878650188446045, "memory(GiB)": 122.96, "step": 51400, "token_acc": 0.969429747207525, "train_speed(iter/s)": 0.231629 }, { "epoch": 3.9179815534720634, "eval_loss": 0.05257737636566162, "eval_runtime": 220.9896, "eval_samples_per_second": 2.398, "eval_steps_per_second": 2.398, "eval_token_acc": 0.9780133726883923, "step": 51400 }, { "epoch": 3.9183626800823235, "grad_norm": 1.408014178276062, "learning_rate": 1.1109192652412059e-05, "loss": 0.04086217284202576, "memory(GiB)": 122.96, "step": 51405, "token_acc": 0.9784263959390863, "train_speed(iter/s)": 0.231402 }, { "epoch": 3.918743806692583, "grad_norm": 1.550033450126648, "learning_rate": 1.1101668554275508e-05, "loss": 0.06661374568939209, "memory(GiB)": 122.96, "step": 51410, "token_acc": 0.9768330546930977, "train_speed(iter/s)": 0.231407 }, { "epoch": 3.919124933302843, "grad_norm": 1.3014525175094604, "learning_rate": 1.1094146686777763e-05, "loss": 0.0392835259437561, "memory(GiB)": 122.96, "step": 51415, "token_acc": 0.9867689069312885, "train_speed(iter/s)": 0.231411 }, { "epoch": 3.919506059913103, "grad_norm": 0.6724214553833008, "learning_rate": 1.1086627050350151e-05, "loss": 0.039171481132507326, "memory(GiB)": 122.96, "step": 51420, "token_acc": 0.983147297001532, "train_speed(iter/s)": 0.231412 }, { "epoch": 3.919887186523363, "grad_norm": 0.6644142270088196, "learning_rate": 1.1079109645423907e-05, "loss": 0.07114088535308838, "memory(GiB)": 122.96, "step": 51425, "token_acc": 0.9791381148165068, "train_speed(iter/s)": 0.231416 }, { "epoch": 3.920268313133623, "grad_norm": 1.8837106227874756, "learning_rate": 1.1071594472430102e-05, "loss": 0.06058574318885803, "memory(GiB)": 122.96, "step": 51430, "token_acc": 0.9778796870785001, "train_speed(iter/s)": 0.231421 }, { "epoch": 3.920649439743883, "grad_norm": 0.7438315153121948, "learning_rate": 1.1064081531799703e-05, "loss": 0.03802376091480255, "memory(GiB)": 122.96, "step": 51435, "token_acc": 0.9829614604462474, "train_speed(iter/s)": 0.231429 }, { "epoch": 3.9210305663541427, "grad_norm": 1.2031162977218628, "learning_rate": 1.1056570823963552e-05, "loss": 0.05174833536148071, "memory(GiB)": 122.96, "step": 51440, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.231435 }, { "epoch": 3.9214116929644027, "grad_norm": 0.10071277618408203, "learning_rate": 1.1049062349352336e-05, "loss": 0.04820125699043274, "memory(GiB)": 122.96, "step": 51445, "token_acc": 0.9766600920447074, "train_speed(iter/s)": 0.231442 }, { "epoch": 3.9217928195746627, "grad_norm": 1.1043941974639893, "learning_rate": 1.1041556108396638e-05, "loss": 0.044291707873344424, "memory(GiB)": 122.96, "step": 51450, "token_acc": 0.9827140329386144, "train_speed(iter/s)": 0.231443 }, { "epoch": 3.9221739461849228, "grad_norm": 0.8910601139068604, "learning_rate": 1.1034052101526921e-05, "loss": 0.054172462224960326, "memory(GiB)": 122.96, "step": 51455, "token_acc": 0.9819680577022154, "train_speed(iter/s)": 0.231445 }, { "epoch": 3.9225550727951823, "grad_norm": 1.6487476825714111, "learning_rate": 1.102655032917348e-05, "loss": 0.03422315120697021, "memory(GiB)": 122.96, "step": 51460, "token_acc": 0.9862989941033645, "train_speed(iter/s)": 0.231449 }, { "epoch": 3.9229361994054424, "grad_norm": 1.0805476903915405, "learning_rate": 1.1019050791766517e-05, "loss": 0.0621816098690033, "memory(GiB)": 122.96, "step": 51465, "token_acc": 0.9814537840263237, "train_speed(iter/s)": 0.231455 }, { "epoch": 3.9233173260157024, "grad_norm": 1.8110241889953613, "learning_rate": 1.1011553489736115e-05, "loss": 0.036056673526763915, "memory(GiB)": 122.96, "step": 51470, "token_acc": 0.9872237569060773, "train_speed(iter/s)": 0.231462 }, { "epoch": 3.9236984526259624, "grad_norm": 0.46087411046028137, "learning_rate": 1.1004058423512176e-05, "loss": 0.07609274983406067, "memory(GiB)": 122.96, "step": 51475, "token_acc": 0.9721831536177528, "train_speed(iter/s)": 0.231465 }, { "epoch": 3.9240795792362224, "grad_norm": 0.8722853660583496, "learning_rate": 1.099656559352452e-05, "loss": 0.022237707674503327, "memory(GiB)": 122.96, "step": 51480, "token_acc": 0.9883369330453564, "train_speed(iter/s)": 0.231473 }, { "epoch": 3.9244607058464824, "grad_norm": 2.878053665161133, "learning_rate": 1.0989075000202842e-05, "loss": 0.06658769845962524, "memory(GiB)": 122.96, "step": 51485, "token_acc": 0.9856304391823517, "train_speed(iter/s)": 0.231478 }, { "epoch": 3.924841832456742, "grad_norm": 0.7139894962310791, "learning_rate": 1.0981586643976671e-05, "loss": 0.025000414252281188, "memory(GiB)": 122.96, "step": 51490, "token_acc": 0.98999648999649, "train_speed(iter/s)": 0.231481 }, { "epoch": 3.925222959067002, "grad_norm": 0.6842798590660095, "learning_rate": 1.0974100525275438e-05, "loss": 0.03789767920970917, "memory(GiB)": 122.96, "step": 51495, "token_acc": 0.9830178666194941, "train_speed(iter/s)": 0.231484 }, { "epoch": 3.925604085677262, "grad_norm": 1.6741456985473633, "learning_rate": 1.0966616644528449e-05, "loss": 0.06948559880256652, "memory(GiB)": 122.96, "step": 51500, "token_acc": 0.9748723989006675, "train_speed(iter/s)": 0.231492 }, { "epoch": 3.925985212287522, "grad_norm": 0.7664462327957153, "learning_rate": 1.0959135002164834e-05, "loss": 0.043907192349433896, "memory(GiB)": 122.96, "step": 51505, "token_acc": 0.9808660624370594, "train_speed(iter/s)": 0.231496 }, { "epoch": 3.9263663388977816, "grad_norm": 0.7637376189231873, "learning_rate": 1.095165559861368e-05, "loss": 0.04117330014705658, "memory(GiB)": 122.96, "step": 51510, "token_acc": 0.984737707914458, "train_speed(iter/s)": 0.231495 }, { "epoch": 3.9267474655080417, "grad_norm": 0.6985693573951721, "learning_rate": 1.094417843430386e-05, "loss": 0.049929994344711306, "memory(GiB)": 122.96, "step": 51515, "token_acc": 0.9834744054816607, "train_speed(iter/s)": 0.231497 }, { "epoch": 3.9271285921183017, "grad_norm": 0.5670239925384521, "learning_rate": 1.0936703509664159e-05, "loss": 0.034011447429656984, "memory(GiB)": 122.96, "step": 51520, "token_acc": 0.985117618819011, "train_speed(iter/s)": 0.231502 }, { "epoch": 3.9275097187285617, "grad_norm": 2.0490834712982178, "learning_rate": 1.0929230825123255e-05, "loss": 0.0678870677947998, "memory(GiB)": 122.96, "step": 51525, "token_acc": 0.9740484429065744, "train_speed(iter/s)": 0.231504 }, { "epoch": 3.9278908453388217, "grad_norm": 1.2141438722610474, "learning_rate": 1.0921760381109635e-05, "loss": 0.04360925555229187, "memory(GiB)": 122.96, "step": 51530, "token_acc": 0.9836007758772704, "train_speed(iter/s)": 0.231508 }, { "epoch": 3.9282719719490817, "grad_norm": 1.3209927082061768, "learning_rate": 1.0914292178051716e-05, "loss": 0.0237454354763031, "memory(GiB)": 122.96, "step": 51535, "token_acc": 0.9881118881118881, "train_speed(iter/s)": 0.231514 }, { "epoch": 3.9286530985593413, "grad_norm": 2.245532512664795, "learning_rate": 1.0906826216377775e-05, "loss": 0.03431702852249145, "memory(GiB)": 122.96, "step": 51540, "token_acc": 0.9859564164648911, "train_speed(iter/s)": 0.231522 }, { "epoch": 3.9290342251696013, "grad_norm": 0.9160019755363464, "learning_rate": 1.089936249651592e-05, "loss": 0.04398788511753082, "memory(GiB)": 122.96, "step": 51545, "token_acc": 0.9842435367905767, "train_speed(iter/s)": 0.231525 }, { "epoch": 3.9294153517798613, "grad_norm": 1.311905026435852, "learning_rate": 1.0891901018894174e-05, "loss": 0.021248626708984374, "memory(GiB)": 122.96, "step": 51550, "token_acc": 0.9887155658811816, "train_speed(iter/s)": 0.231529 }, { "epoch": 3.9297964783901214, "grad_norm": 0.8772106170654297, "learning_rate": 1.088444178394044e-05, "loss": 0.0502490758895874, "memory(GiB)": 122.96, "step": 51555, "token_acc": 0.9834892680242158, "train_speed(iter/s)": 0.231535 }, { "epoch": 3.930177605000381, "grad_norm": 0.9306280612945557, "learning_rate": 1.0876984792082434e-05, "loss": 0.040133881568908694, "memory(GiB)": 122.96, "step": 51560, "token_acc": 0.9794646131279795, "train_speed(iter/s)": 0.231542 }, { "epoch": 3.930558731610641, "grad_norm": 1.2149715423583984, "learning_rate": 1.08695300437478e-05, "loss": 0.06817114353179932, "memory(GiB)": 122.96, "step": 51565, "token_acc": 0.9660724554341575, "train_speed(iter/s)": 0.231548 }, { "epoch": 3.930939858220901, "grad_norm": 1.9617668390274048, "learning_rate": 1.0862077539364041e-05, "loss": 0.06333566308021546, "memory(GiB)": 122.96, "step": 51570, "token_acc": 0.9688249400479616, "train_speed(iter/s)": 0.231554 }, { "epoch": 3.931320984831161, "grad_norm": 0.9159767031669617, "learning_rate": 1.0854627279358503e-05, "loss": 0.048711493611335754, "memory(GiB)": 122.96, "step": 51575, "token_acc": 0.9846067415730337, "train_speed(iter/s)": 0.231554 }, { "epoch": 3.931702111441421, "grad_norm": 1.0922186374664307, "learning_rate": 1.084717926415843e-05, "loss": 0.04200442135334015, "memory(GiB)": 122.96, "step": 51580, "token_acc": 0.9856648541769649, "train_speed(iter/s)": 0.231559 }, { "epoch": 3.932083238051681, "grad_norm": 1.1889369487762451, "learning_rate": 1.083973349419095e-05, "loss": 0.029459795355796813, "memory(GiB)": 122.96, "step": 51585, "token_acc": 0.9848959817611855, "train_speed(iter/s)": 0.231565 }, { "epoch": 3.9324643646619406, "grad_norm": 1.732568383216858, "learning_rate": 1.0832289969883014e-05, "loss": 0.051018184423446654, "memory(GiB)": 122.96, "step": 51590, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.231571 }, { "epoch": 3.9328454912722006, "grad_norm": 0.6986112594604492, "learning_rate": 1.0824848691661504e-05, "loss": 0.05173635482788086, "memory(GiB)": 122.96, "step": 51595, "token_acc": 0.9799977008851593, "train_speed(iter/s)": 0.231572 }, { "epoch": 3.9332266178824606, "grad_norm": 1.0006242990493774, "learning_rate": 1.0817409659953116e-05, "loss": 0.05944143533706665, "memory(GiB)": 122.96, "step": 51600, "token_acc": 0.9763522012578616, "train_speed(iter/s)": 0.231577 }, { "epoch": 3.9332266178824606, "eval_loss": 0.05234023556113243, "eval_runtime": 219.6408, "eval_samples_per_second": 2.413, "eval_steps_per_second": 2.413, "eval_token_acc": 0.9784124450334317, "step": 51600 }, { "epoch": 3.93360774449272, "grad_norm": 2.0854735374450684, "learning_rate": 1.0809972875184448e-05, "loss": 0.061411821842193605, "memory(GiB)": 122.96, "step": 51605, "token_acc": 0.9782702631444035, "train_speed(iter/s)": 0.231355 }, { "epoch": 3.9339888711029802, "grad_norm": 1.075013518333435, "learning_rate": 1.0802538337781987e-05, "loss": 0.05167187452316284, "memory(GiB)": 122.96, "step": 51610, "token_acc": 0.9753770390889505, "train_speed(iter/s)": 0.231359 }, { "epoch": 3.9343699977132403, "grad_norm": 0.7805740237236023, "learning_rate": 1.0795106048172038e-05, "loss": 0.034136056900024414, "memory(GiB)": 122.96, "step": 51615, "token_acc": 0.98614913834756, "train_speed(iter/s)": 0.231363 }, { "epoch": 3.9347511243235003, "grad_norm": 1.0014081001281738, "learning_rate": 1.0787676006780828e-05, "loss": 0.035671192407608035, "memory(GiB)": 122.96, "step": 51620, "token_acc": 0.9840146430750457, "train_speed(iter/s)": 0.231365 }, { "epoch": 3.9351322509337603, "grad_norm": 1.7826968431472778, "learning_rate": 1.0780248214034443e-05, "loss": 0.04408842325210571, "memory(GiB)": 122.96, "step": 51625, "token_acc": 0.9836267605633803, "train_speed(iter/s)": 0.231368 }, { "epoch": 3.9355133775440203, "grad_norm": 0.838721752166748, "learning_rate": 1.0772822670358806e-05, "loss": 0.04615304470062256, "memory(GiB)": 122.96, "step": 51630, "token_acc": 0.981547064305685, "train_speed(iter/s)": 0.231372 }, { "epoch": 3.9358945041542803, "grad_norm": 0.7518568634986877, "learning_rate": 1.0765399376179747e-05, "loss": 0.0484409749507904, "memory(GiB)": 122.96, "step": 51635, "token_acc": 0.9815809669992326, "train_speed(iter/s)": 0.231377 }, { "epoch": 3.93627563076454, "grad_norm": 2.641598701477051, "learning_rate": 1.075797833192298e-05, "loss": 0.05204703807830811, "memory(GiB)": 122.96, "step": 51640, "token_acc": 0.9860476305027664, "train_speed(iter/s)": 0.231383 }, { "epoch": 3.9366567573748, "grad_norm": 1.3421415090560913, "learning_rate": 1.0750559538014043e-05, "loss": 0.049748319387435916, "memory(GiB)": 122.96, "step": 51645, "token_acc": 0.9767100678525582, "train_speed(iter/s)": 0.231388 }, { "epoch": 3.93703788398506, "grad_norm": 1.5012662410736084, "learning_rate": 1.0743142994878391e-05, "loss": 0.03908743560314178, "memory(GiB)": 122.96, "step": 51650, "token_acc": 0.9846119536128457, "train_speed(iter/s)": 0.231394 }, { "epoch": 3.9374190105953195, "grad_norm": 0.8360908031463623, "learning_rate": 1.0735728702941294e-05, "loss": 0.03732075095176697, "memory(GiB)": 122.96, "step": 51655, "token_acc": 0.9856353591160221, "train_speed(iter/s)": 0.231401 }, { "epoch": 3.9378001372055795, "grad_norm": 1.7296096086502075, "learning_rate": 1.0728316662627951e-05, "loss": 0.04428608417510986, "memory(GiB)": 122.96, "step": 51660, "token_acc": 0.9805068226120858, "train_speed(iter/s)": 0.231407 }, { "epoch": 3.9381812638158396, "grad_norm": 0.5920084714889526, "learning_rate": 1.0720906874363423e-05, "loss": 0.03089710772037506, "memory(GiB)": 122.96, "step": 51665, "token_acc": 0.9865253595760787, "train_speed(iter/s)": 0.231409 }, { "epoch": 3.9385623904260996, "grad_norm": 1.2517850399017334, "learning_rate": 1.0713499338572592e-05, "loss": 0.061361676454544066, "memory(GiB)": 122.96, "step": 51670, "token_acc": 0.9872192099147947, "train_speed(iter/s)": 0.231412 }, { "epoch": 3.9389435170363596, "grad_norm": 0.7913892865180969, "learning_rate": 1.070609405568026e-05, "loss": 0.03922918140888214, "memory(GiB)": 122.96, "step": 51675, "token_acc": 0.9854309285588907, "train_speed(iter/s)": 0.231418 }, { "epoch": 3.9393246436466196, "grad_norm": 0.5058150887489319, "learning_rate": 1.0698691026111102e-05, "loss": 0.039030084013938905, "memory(GiB)": 122.96, "step": 51680, "token_acc": 0.9865194505071254, "train_speed(iter/s)": 0.231417 }, { "epoch": 3.939705770256879, "grad_norm": 1.0301601886749268, "learning_rate": 1.0691290250289621e-05, "loss": 0.05009523034095764, "memory(GiB)": 122.96, "step": 51685, "token_acc": 0.970620239390642, "train_speed(iter/s)": 0.231422 }, { "epoch": 3.940086896867139, "grad_norm": 1.6405121088027954, "learning_rate": 1.0683891728640228e-05, "loss": 0.03854672610759735, "memory(GiB)": 122.96, "step": 51690, "token_acc": 0.9847130457313124, "train_speed(iter/s)": 0.231425 }, { "epoch": 3.940468023477399, "grad_norm": 1.2215406894683838, "learning_rate": 1.067649546158721e-05, "loss": 0.0485937237739563, "memory(GiB)": 122.96, "step": 51695, "token_acc": 0.9816784869976359, "train_speed(iter/s)": 0.231432 }, { "epoch": 3.9408491500876592, "grad_norm": 1.7016386985778809, "learning_rate": 1.066910144955468e-05, "loss": 0.07070796489715576, "memory(GiB)": 122.96, "step": 51700, "token_acc": 0.974581166955517, "train_speed(iter/s)": 0.231437 }, { "epoch": 3.941230276697919, "grad_norm": 0.7805156707763672, "learning_rate": 1.0661709692966664e-05, "loss": 0.03360549807548523, "memory(GiB)": 122.96, "step": 51705, "token_acc": 0.9873598855234915, "train_speed(iter/s)": 0.231442 }, { "epoch": 3.941611403308179, "grad_norm": 1.9319251775741577, "learning_rate": 1.0654320192247059e-05, "loss": 0.034000718593597413, "memory(GiB)": 122.96, "step": 51710, "token_acc": 0.9879955773179593, "train_speed(iter/s)": 0.231445 }, { "epoch": 3.941992529918439, "grad_norm": 0.9692011475563049, "learning_rate": 1.0646932947819587e-05, "loss": 0.033000203967094424, "memory(GiB)": 122.96, "step": 51715, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.231451 }, { "epoch": 3.942373656528699, "grad_norm": 1.5332063436508179, "learning_rate": 1.0639547960107899e-05, "loss": 0.03381035327911377, "memory(GiB)": 122.96, "step": 51720, "token_acc": 0.9863835305560058, "train_speed(iter/s)": 0.231454 }, { "epoch": 3.942754783138959, "grad_norm": 0.6465722918510437, "learning_rate": 1.063216522953549e-05, "loss": 0.03223088681697846, "memory(GiB)": 122.96, "step": 51725, "token_acc": 0.9871428571428571, "train_speed(iter/s)": 0.231461 }, { "epoch": 3.943135909749219, "grad_norm": 0.7959275841712952, "learning_rate": 1.0624784756525701e-05, "loss": 0.031005209684371947, "memory(GiB)": 122.96, "step": 51730, "token_acc": 0.9843235260706578, "train_speed(iter/s)": 0.231462 }, { "epoch": 3.9435170363594785, "grad_norm": 1.3401457071304321, "learning_rate": 1.0617406541501784e-05, "loss": 0.06455446481704712, "memory(GiB)": 122.96, "step": 51735, "token_acc": 0.9746252958190902, "train_speed(iter/s)": 0.231463 }, { "epoch": 3.9438981629697385, "grad_norm": 2.1830432415008545, "learning_rate": 1.0610030584886854e-05, "loss": 0.04050070643424988, "memory(GiB)": 122.96, "step": 51740, "token_acc": 0.9845585324006862, "train_speed(iter/s)": 0.231465 }, { "epoch": 3.9442792895799985, "grad_norm": 1.2484813928604126, "learning_rate": 1.0602656887103868e-05, "loss": 0.03611060976982117, "memory(GiB)": 122.96, "step": 51745, "token_acc": 0.9864511916421809, "train_speed(iter/s)": 0.231469 }, { "epoch": 3.9446604161902585, "grad_norm": 0.4281339645385742, "learning_rate": 1.0595285448575687e-05, "loss": 0.039570951461791994, "memory(GiB)": 122.96, "step": 51750, "token_acc": 0.9879718947243064, "train_speed(iter/s)": 0.231472 }, { "epoch": 3.945041542800518, "grad_norm": 1.1534343957901, "learning_rate": 1.0587916269725034e-05, "loss": 0.06450716257095337, "memory(GiB)": 122.96, "step": 51755, "token_acc": 0.977027027027027, "train_speed(iter/s)": 0.231475 }, { "epoch": 3.945422669410778, "grad_norm": 0.9747397303581238, "learning_rate": 1.0580549350974479e-05, "loss": 0.06625234484672546, "memory(GiB)": 122.96, "step": 51760, "token_acc": 0.9686985172981878, "train_speed(iter/s)": 0.231483 }, { "epoch": 3.945803796021038, "grad_norm": 1.5018455982208252, "learning_rate": 1.0573184692746486e-05, "loss": 0.06825854182243347, "memory(GiB)": 122.96, "step": 51765, "token_acc": 0.9766905737704918, "train_speed(iter/s)": 0.231488 }, { "epoch": 3.946184922631298, "grad_norm": 1.1772207021713257, "learning_rate": 1.056582229546339e-05, "loss": 0.04838410019874573, "memory(GiB)": 122.96, "step": 51770, "token_acc": 0.9789833822091887, "train_speed(iter/s)": 0.231494 }, { "epoch": 3.946566049241558, "grad_norm": 1.1979572772979736, "learning_rate": 1.0558462159547389e-05, "loss": 0.030829030275344848, "memory(GiB)": 122.96, "step": 51775, "token_acc": 0.9857964152857626, "train_speed(iter/s)": 0.231497 }, { "epoch": 3.946947175851818, "grad_norm": 1.2399595975875854, "learning_rate": 1.055110428542056e-05, "loss": 0.04400015771389008, "memory(GiB)": 122.96, "step": 51780, "token_acc": 0.9807721888940163, "train_speed(iter/s)": 0.231502 }, { "epoch": 3.9473283024620778, "grad_norm": 1.0038831233978271, "learning_rate": 1.0543748673504828e-05, "loss": 0.0615730345249176, "memory(GiB)": 122.96, "step": 51785, "token_acc": 0.975879854368932, "train_speed(iter/s)": 0.231505 }, { "epoch": 3.947709429072338, "grad_norm": 0.7944331169128418, "learning_rate": 1.0536395324222009e-05, "loss": 0.030491346120834352, "memory(GiB)": 122.96, "step": 51790, "token_acc": 0.9812946616225304, "train_speed(iter/s)": 0.231511 }, { "epoch": 3.948090555682598, "grad_norm": 1.9916739463806152, "learning_rate": 1.0529044237993796e-05, "loss": 0.06050439476966858, "memory(GiB)": 122.96, "step": 51795, "token_acc": 0.9773513139695712, "train_speed(iter/s)": 0.231515 }, { "epoch": 3.948471682292858, "grad_norm": 0.6426489949226379, "learning_rate": 1.0521695415241717e-05, "loss": 0.05459884405136108, "memory(GiB)": 122.96, "step": 51800, "token_acc": 0.9812302125734962, "train_speed(iter/s)": 0.231521 }, { "epoch": 3.948471682292858, "eval_loss": 0.051885321736335754, "eval_runtime": 220.269, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "eval_token_acc": 0.9780510210228299, "step": 51800 }, { "epoch": 3.9488528089031174, "grad_norm": 0.7423007488250732, "learning_rate": 1.0514348856387201e-05, "loss": 0.03791236877441406, "memory(GiB)": 122.96, "step": 51805, "token_acc": 0.9781844898196711, "train_speed(iter/s)": 0.231298 }, { "epoch": 3.9492339355133774, "grad_norm": 0.6932759881019592, "learning_rate": 1.0507004561851564e-05, "loss": 0.03876354694366455, "memory(GiB)": 122.96, "step": 51810, "token_acc": 0.9872565082832696, "train_speed(iter/s)": 0.231303 }, { "epoch": 3.9496150621236374, "grad_norm": 0.807320237159729, "learning_rate": 1.049966253205592e-05, "loss": 0.05160326361656189, "memory(GiB)": 122.96, "step": 51815, "token_acc": 0.9796631712742294, "train_speed(iter/s)": 0.231307 }, { "epoch": 3.9499961887338975, "grad_norm": 1.0938657522201538, "learning_rate": 1.0492322767421347e-05, "loss": 0.03879555761814117, "memory(GiB)": 122.96, "step": 51820, "token_acc": 0.9815989847715736, "train_speed(iter/s)": 0.231316 }, { "epoch": 3.9503773153441575, "grad_norm": 1.008934736251831, "learning_rate": 1.0484985268368713e-05, "loss": 0.06141721606254578, "memory(GiB)": 122.96, "step": 51825, "token_acc": 0.9819895287958115, "train_speed(iter/s)": 0.23132 }, { "epoch": 3.9507584419544175, "grad_norm": 1.0706039667129517, "learning_rate": 1.0477650035318798e-05, "loss": 0.026409924030303955, "memory(GiB)": 122.96, "step": 51830, "token_acc": 0.9861849096705633, "train_speed(iter/s)": 0.231328 }, { "epoch": 3.951139568564677, "grad_norm": 1.1000930070877075, "learning_rate": 1.0470317068692265e-05, "loss": 0.039687278866767886, "memory(GiB)": 122.96, "step": 51835, "token_acc": 0.9825641025641025, "train_speed(iter/s)": 0.231333 }, { "epoch": 3.951520695174937, "grad_norm": 0.6287328600883484, "learning_rate": 1.0462986368909589e-05, "loss": 0.035385462641716006, "memory(GiB)": 122.96, "step": 51840, "token_acc": 0.9845719661335842, "train_speed(iter/s)": 0.231337 }, { "epoch": 3.951901821785197, "grad_norm": 0.9958593249320984, "learning_rate": 1.0455657936391172e-05, "loss": 0.03518474400043488, "memory(GiB)": 122.96, "step": 51845, "token_acc": 0.981680353758686, "train_speed(iter/s)": 0.231343 }, { "epoch": 3.952282948395457, "grad_norm": 1.5637940168380737, "learning_rate": 1.044833177155728e-05, "loss": 0.05206428170204162, "memory(GiB)": 122.96, "step": 51850, "token_acc": 0.976867151354924, "train_speed(iter/s)": 0.231346 }, { "epoch": 3.9526640750057167, "grad_norm": 0.9323400259017944, "learning_rate": 1.0441007874828001e-05, "loss": 0.03954291641712189, "memory(GiB)": 122.96, "step": 51855, "token_acc": 0.9789562289562289, "train_speed(iter/s)": 0.231349 }, { "epoch": 3.9530452016159767, "grad_norm": 0.8563870787620544, "learning_rate": 1.0433686246623353e-05, "loss": 0.0673100471496582, "memory(GiB)": 122.96, "step": 51860, "token_acc": 0.9746709434797036, "train_speed(iter/s)": 0.23135 }, { "epoch": 3.9534263282262367, "grad_norm": 0.9309912323951721, "learning_rate": 1.0426366887363192e-05, "loss": 0.04936817288398743, "memory(GiB)": 122.96, "step": 51865, "token_acc": 0.9783573060895282, "train_speed(iter/s)": 0.231353 }, { "epoch": 3.9538074548364968, "grad_norm": 0.6812136173248291, "learning_rate": 1.041904979746724e-05, "loss": 0.028842097520828246, "memory(GiB)": 122.96, "step": 51870, "token_acc": 0.9886769964243146, "train_speed(iter/s)": 0.231358 }, { "epoch": 3.954188581446757, "grad_norm": 0.8967809081077576, "learning_rate": 1.0411734977355097e-05, "loss": 0.046016883850097653, "memory(GiB)": 122.96, "step": 51875, "token_acc": 0.9751443635450665, "train_speed(iter/s)": 0.231364 }, { "epoch": 3.954569708057017, "grad_norm": 1.5125707387924194, "learning_rate": 1.040442242744626e-05, "loss": 0.048521846532821655, "memory(GiB)": 122.96, "step": 51880, "token_acc": 0.9807381029459372, "train_speed(iter/s)": 0.231368 }, { "epoch": 3.9549508346672764, "grad_norm": 1.210307240486145, "learning_rate": 1.0397112148160037e-05, "loss": 0.043768495321273804, "memory(GiB)": 122.96, "step": 51885, "token_acc": 0.9799051704673741, "train_speed(iter/s)": 0.231374 }, { "epoch": 3.9553319612775364, "grad_norm": 0.8649747371673584, "learning_rate": 1.038980413991565e-05, "loss": 0.049902024865150454, "memory(GiB)": 122.96, "step": 51890, "token_acc": 0.9800505050505051, "train_speed(iter/s)": 0.23138 }, { "epoch": 3.9557130878877964, "grad_norm": 0.8206137418746948, "learning_rate": 1.0382498403132196e-05, "loss": 0.03775314688682556, "memory(GiB)": 122.96, "step": 51895, "token_acc": 0.9850016302575807, "train_speed(iter/s)": 0.231384 }, { "epoch": 3.956094214498056, "grad_norm": 0.5441038012504578, "learning_rate": 1.03751949382286e-05, "loss": 0.05034189820289612, "memory(GiB)": 122.96, "step": 51900, "token_acc": 0.9784817692767483, "train_speed(iter/s)": 0.231388 }, { "epoch": 3.956475341108316, "grad_norm": 1.1318910121917725, "learning_rate": 1.0367893745623691e-05, "loss": 0.04422245621681213, "memory(GiB)": 122.96, "step": 51905, "token_acc": 0.9798616761594793, "train_speed(iter/s)": 0.231393 }, { "epoch": 3.956856467718576, "grad_norm": 1.0274479389190674, "learning_rate": 1.0360594825736158e-05, "loss": 0.036324572563171384, "memory(GiB)": 122.96, "step": 51910, "token_acc": 0.9834061135371179, "train_speed(iter/s)": 0.231396 }, { "epoch": 3.957237594328836, "grad_norm": 0.5659840106964111, "learning_rate": 1.0353298178984566e-05, "loss": 0.05979503989219666, "memory(GiB)": 122.96, "step": 51915, "token_acc": 0.9805676855895197, "train_speed(iter/s)": 0.231403 }, { "epoch": 3.957618720939096, "grad_norm": 1.268605351448059, "learning_rate": 1.0346003805787353e-05, "loss": 0.037833505868911745, "memory(GiB)": 122.96, "step": 51920, "token_acc": 0.9815778066753359, "train_speed(iter/s)": 0.231408 }, { "epoch": 3.957999847549356, "grad_norm": 0.6005387306213379, "learning_rate": 1.0338711706562792e-05, "loss": 0.03937556743621826, "memory(GiB)": 122.96, "step": 51925, "token_acc": 0.9804994868286008, "train_speed(iter/s)": 0.231411 }, { "epoch": 3.958380974159616, "grad_norm": 1.0626050233840942, "learning_rate": 1.0331421881729058e-05, "loss": 0.039437723159790036, "memory(GiB)": 122.96, "step": 51930, "token_acc": 0.9829151094500801, "train_speed(iter/s)": 0.231419 }, { "epoch": 3.9587621007698757, "grad_norm": 1.4974427223205566, "learning_rate": 1.0324134331704216e-05, "loss": 0.053370773792266846, "memory(GiB)": 122.96, "step": 51935, "token_acc": 0.9813343923749007, "train_speed(iter/s)": 0.231422 }, { "epoch": 3.9591432273801357, "grad_norm": 0.387844979763031, "learning_rate": 1.031684905690613e-05, "loss": 0.038925981521606444, "memory(GiB)": 122.96, "step": 51940, "token_acc": 0.9840442852491045, "train_speed(iter/s)": 0.231428 }, { "epoch": 3.9595243539903957, "grad_norm": 0.5271110534667969, "learning_rate": 1.0309566057752606e-05, "loss": 0.02247331291437149, "memory(GiB)": 122.96, "step": 51945, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.231436 }, { "epoch": 3.9599054806006553, "grad_norm": 1.087931752204895, "learning_rate": 1.0302285334661293e-05, "loss": 0.05033569931983948, "memory(GiB)": 122.96, "step": 51950, "token_acc": 0.9820239680426098, "train_speed(iter/s)": 0.231441 }, { "epoch": 3.9602866072109153, "grad_norm": 1.1501922607421875, "learning_rate": 1.029500688804968e-05, "loss": 0.03238977491855621, "memory(GiB)": 122.96, "step": 51955, "token_acc": 0.9886613021214338, "train_speed(iter/s)": 0.231444 }, { "epoch": 3.9606677338211753, "grad_norm": 0.6964325904846191, "learning_rate": 1.028773071833517e-05, "loss": 0.05202380418777466, "memory(GiB)": 122.96, "step": 51960, "token_acc": 0.9831127339114559, "train_speed(iter/s)": 0.231449 }, { "epoch": 3.9610488604314353, "grad_norm": 1.5785268545150757, "learning_rate": 1.028045682593503e-05, "loss": 0.06012837290763855, "memory(GiB)": 122.96, "step": 51965, "token_acc": 0.9805589307411907, "train_speed(iter/s)": 0.231456 }, { "epoch": 3.9614299870416954, "grad_norm": 0.774085521697998, "learning_rate": 1.0273185211266355e-05, "loss": 0.04425489604473114, "memory(GiB)": 122.96, "step": 51970, "token_acc": 0.9854721549636803, "train_speed(iter/s)": 0.23146 }, { "epoch": 3.9618111136519554, "grad_norm": 0.6649735569953918, "learning_rate": 1.0265915874746156e-05, "loss": 0.038875934481620786, "memory(GiB)": 122.96, "step": 51975, "token_acc": 0.9839111281363724, "train_speed(iter/s)": 0.231466 }, { "epoch": 3.962192240262215, "grad_norm": 0.8553614020347595, "learning_rate": 1.0258648816791304e-05, "loss": 0.04392178952693939, "memory(GiB)": 122.96, "step": 51980, "token_acc": 0.9830888697152718, "train_speed(iter/s)": 0.231468 }, { "epoch": 3.962573366872475, "grad_norm": 1.1966474056243896, "learning_rate": 1.0251384037818506e-05, "loss": 0.04258418083190918, "memory(GiB)": 122.96, "step": 51985, "token_acc": 0.9788732394366197, "train_speed(iter/s)": 0.231474 }, { "epoch": 3.962954493482735, "grad_norm": 0.07907971739768982, "learning_rate": 1.0244121538244394e-05, "loss": 0.03567745089530945, "memory(GiB)": 122.96, "step": 51990, "token_acc": 0.9826319305277221, "train_speed(iter/s)": 0.231478 }, { "epoch": 3.963335620092995, "grad_norm": 0.9896934032440186, "learning_rate": 1.0236861318485408e-05, "loss": 0.026393452286720277, "memory(GiB)": 122.96, "step": 51995, "token_acc": 0.9918319719953326, "train_speed(iter/s)": 0.231483 }, { "epoch": 3.9637167467032546, "grad_norm": 2.1618473529815674, "learning_rate": 1.0229603378957896e-05, "loss": 0.04906064569950104, "memory(GiB)": 122.96, "step": 52000, "token_acc": 0.9786839666357738, "train_speed(iter/s)": 0.231491 }, { "epoch": 3.9637167467032546, "eval_loss": 0.052068110555410385, "eval_runtime": 220.3243, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "eval_token_acc": 0.9781037286910427, "step": 52000 }, { "epoch": 3.9640978733135146, "grad_norm": 0.8084425330162048, "learning_rate": 1.0222347720078091e-05, "loss": 0.03789832293987274, "memory(GiB)": 122.96, "step": 52005, "token_acc": 0.978345701357466, "train_speed(iter/s)": 0.231267 }, { "epoch": 3.9644789999237746, "grad_norm": 1.9557433128356934, "learning_rate": 1.0215094342262043e-05, "loss": 0.05342034101486206, "memory(GiB)": 122.96, "step": 52010, "token_acc": 0.9774703557312253, "train_speed(iter/s)": 0.231274 }, { "epoch": 3.9648601265340346, "grad_norm": 1.251144289970398, "learning_rate": 1.0207843245925708e-05, "loss": 0.04574134051799774, "memory(GiB)": 122.96, "step": 52015, "token_acc": 0.9801115692456949, "train_speed(iter/s)": 0.231279 }, { "epoch": 3.9652412531442947, "grad_norm": 0.7612956762313843, "learning_rate": 1.0200594431484916e-05, "loss": 0.029428154230117798, "memory(GiB)": 122.96, "step": 52020, "token_acc": 0.9861690034103827, "train_speed(iter/s)": 0.231283 }, { "epoch": 3.9656223797545547, "grad_norm": 1.0472052097320557, "learning_rate": 1.0193347899355327e-05, "loss": 0.053851211071014406, "memory(GiB)": 122.96, "step": 52025, "token_acc": 0.9768511648612553, "train_speed(iter/s)": 0.231287 }, { "epoch": 3.9660035063648142, "grad_norm": 2.2214648723602295, "learning_rate": 1.0186103649952511e-05, "loss": 0.04001335799694061, "memory(GiB)": 122.96, "step": 52030, "token_acc": 0.9871814671814672, "train_speed(iter/s)": 0.23129 }, { "epoch": 3.9663846329750743, "grad_norm": 0.04819333553314209, "learning_rate": 1.017886168369191e-05, "loss": 0.014670448005199432, "memory(GiB)": 122.96, "step": 52035, "token_acc": 0.9941205291523763, "train_speed(iter/s)": 0.231295 }, { "epoch": 3.9667657595853343, "grad_norm": 1.8379377126693726, "learning_rate": 1.0171622000988768e-05, "loss": 0.05781666040420532, "memory(GiB)": 122.96, "step": 52040, "token_acc": 0.974083264405845, "train_speed(iter/s)": 0.231302 }, { "epoch": 3.9671468861955943, "grad_norm": 0.8667842745780945, "learning_rate": 1.0164384602258303e-05, "loss": 0.06383656859397888, "memory(GiB)": 122.96, "step": 52045, "token_acc": 0.9777622132726318, "train_speed(iter/s)": 0.231305 }, { "epoch": 3.967528012805854, "grad_norm": 1.0237388610839844, "learning_rate": 1.0157149487915513e-05, "loss": 0.04119675159454346, "memory(GiB)": 122.96, "step": 52050, "token_acc": 0.9826230864708316, "train_speed(iter/s)": 0.231312 }, { "epoch": 3.967909139416114, "grad_norm": 0.6886779069900513, "learning_rate": 1.0149916658375303e-05, "loss": 0.03821527063846588, "memory(GiB)": 122.96, "step": 52055, "token_acc": 0.9841860465116279, "train_speed(iter/s)": 0.231315 }, { "epoch": 3.968290266026374, "grad_norm": 0.8850218057632446, "learning_rate": 1.0142686114052458e-05, "loss": 0.040331804752349855, "memory(GiB)": 122.96, "step": 52060, "token_acc": 0.9840174966352625, "train_speed(iter/s)": 0.231319 }, { "epoch": 3.968671392636634, "grad_norm": 2.8142495155334473, "learning_rate": 1.013545785536159e-05, "loss": 0.050896954536437986, "memory(GiB)": 122.96, "step": 52065, "token_acc": 0.9835343672219031, "train_speed(iter/s)": 0.231324 }, { "epoch": 3.969052519246894, "grad_norm": 1.0718529224395752, "learning_rate": 1.0128231882717226e-05, "loss": 0.06547519564628601, "memory(GiB)": 122.96, "step": 52070, "token_acc": 0.9754990925589837, "train_speed(iter/s)": 0.23133 }, { "epoch": 3.969433645857154, "grad_norm": 0.5022848844528198, "learning_rate": 1.0121008196533743e-05, "loss": 0.048174649477005005, "memory(GiB)": 122.96, "step": 52075, "token_acc": 0.9839489126682776, "train_speed(iter/s)": 0.231335 }, { "epoch": 3.9698147724674135, "grad_norm": 0.7463897466659546, "learning_rate": 1.0113786797225367e-05, "loss": 0.0500415027141571, "memory(GiB)": 122.96, "step": 52080, "token_acc": 0.9840240796480667, "train_speed(iter/s)": 0.231341 }, { "epoch": 3.9701958990776736, "grad_norm": 0.8613776564598083, "learning_rate": 1.0106567685206226e-05, "loss": 0.03578961789608002, "memory(GiB)": 122.96, "step": 52085, "token_acc": 0.9858042217010246, "train_speed(iter/s)": 0.231342 }, { "epoch": 3.9705770256879336, "grad_norm": 0.7708094716072083, "learning_rate": 1.0099350860890312e-05, "loss": 0.035759395360946654, "memory(GiB)": 122.96, "step": 52090, "token_acc": 0.9853862212943633, "train_speed(iter/s)": 0.231348 }, { "epoch": 3.9709581522981936, "grad_norm": 0.9581434726715088, "learning_rate": 1.0092136324691449e-05, "loss": 0.0493901401758194, "memory(GiB)": 122.96, "step": 52095, "token_acc": 0.9781362007168459, "train_speed(iter/s)": 0.231353 }, { "epoch": 3.971339278908453, "grad_norm": 1.3640735149383545, "learning_rate": 1.0084924077023377e-05, "loss": 0.029024749994277954, "memory(GiB)": 122.96, "step": 52100, "token_acc": 0.9868686868686869, "train_speed(iter/s)": 0.231359 }, { "epoch": 3.971720405518713, "grad_norm": 1.0566426515579224, "learning_rate": 1.0077714118299691e-05, "loss": 0.05831056237220764, "memory(GiB)": 122.96, "step": 52105, "token_acc": 0.9759211376858435, "train_speed(iter/s)": 0.231363 }, { "epoch": 3.972101532128973, "grad_norm": 0.635669469833374, "learning_rate": 1.0070506448933826e-05, "loss": 0.037541437149047854, "memory(GiB)": 122.96, "step": 52110, "token_acc": 0.9814621409921671, "train_speed(iter/s)": 0.231369 }, { "epoch": 3.9724826587392332, "grad_norm": 1.0232336521148682, "learning_rate": 1.006330106933912e-05, "loss": 0.05409092307090759, "memory(GiB)": 122.96, "step": 52115, "token_acc": 0.982108626198083, "train_speed(iter/s)": 0.231372 }, { "epoch": 3.9728637853494932, "grad_norm": 1.3512825965881348, "learning_rate": 1.005609797992878e-05, "loss": 0.04729689359664917, "memory(GiB)": 122.96, "step": 52120, "token_acc": 0.98229939312205, "train_speed(iter/s)": 0.231375 }, { "epoch": 3.9732449119597533, "grad_norm": 0.8771897554397583, "learning_rate": 1.0048897181115852e-05, "loss": 0.04070386588573456, "memory(GiB)": 122.96, "step": 52125, "token_acc": 0.9869383490073145, "train_speed(iter/s)": 0.231381 }, { "epoch": 3.973626038570013, "grad_norm": 1.3175917863845825, "learning_rate": 1.0041698673313266e-05, "loss": 0.030161169171333314, "memory(GiB)": 122.96, "step": 52130, "token_acc": 0.9877750611246944, "train_speed(iter/s)": 0.231387 }, { "epoch": 3.974007165180273, "grad_norm": 1.3232738971710205, "learning_rate": 1.0034502456933854e-05, "loss": 0.034717094898223874, "memory(GiB)": 122.96, "step": 52135, "token_acc": 0.983675094565001, "train_speed(iter/s)": 0.231392 }, { "epoch": 3.974388291790533, "grad_norm": 0.08836917579174042, "learning_rate": 1.0027308532390245e-05, "loss": 0.05440279245376587, "memory(GiB)": 122.96, "step": 52140, "token_acc": 0.9781134999031571, "train_speed(iter/s)": 0.231395 }, { "epoch": 3.974769418400793, "grad_norm": 1.4353477954864502, "learning_rate": 1.0020116900094994e-05, "loss": 0.03706251382827759, "memory(GiB)": 122.96, "step": 52145, "token_acc": 0.985494880546075, "train_speed(iter/s)": 0.231402 }, { "epoch": 3.9751505450110525, "grad_norm": 1.1868644952774048, "learning_rate": 1.0012927560460528e-05, "loss": 0.06395893096923828, "memory(GiB)": 122.96, "step": 52150, "token_acc": 0.9809230769230769, "train_speed(iter/s)": 0.231407 }, { "epoch": 3.9755316716213125, "grad_norm": 1.139275312423706, "learning_rate": 1.0005740513899086e-05, "loss": 0.04365535378456116, "memory(GiB)": 122.96, "step": 52155, "token_acc": 0.9838541666666667, "train_speed(iter/s)": 0.23141 }, { "epoch": 3.9759127982315725, "grad_norm": 0.5248676538467407, "learning_rate": 9.998555760822842e-06, "loss": 0.04032517075538635, "memory(GiB)": 122.96, "step": 52160, "token_acc": 0.984739121468344, "train_speed(iter/s)": 0.231415 }, { "epoch": 3.9762939248418325, "grad_norm": 0.5345430970191956, "learning_rate": 9.991373301643786e-06, "loss": 0.04140601754188537, "memory(GiB)": 122.96, "step": 52165, "token_acc": 0.9861563517915309, "train_speed(iter/s)": 0.231419 }, { "epoch": 3.9766750514520925, "grad_norm": 0.6809677481651306, "learning_rate": 9.984193136773796e-06, "loss": 0.05934844613075256, "memory(GiB)": 122.96, "step": 52170, "token_acc": 0.9791883454734651, "train_speed(iter/s)": 0.231421 }, { "epoch": 3.9770561780623526, "grad_norm": 0.6744194030761719, "learning_rate": 9.977015266624656e-06, "loss": 0.05623188614845276, "memory(GiB)": 122.96, "step": 52175, "token_acc": 0.9790658276863504, "train_speed(iter/s)": 0.231422 }, { "epoch": 3.977437304672612, "grad_norm": 1.3594963550567627, "learning_rate": 9.969839691607952e-06, "loss": 0.08316723704338073, "memory(GiB)": 122.96, "step": 52180, "token_acc": 0.9680522780903975, "train_speed(iter/s)": 0.231426 }, { "epoch": 3.977818431282872, "grad_norm": 1.1234586238861084, "learning_rate": 9.962666412135174e-06, "loss": 0.03742876648902893, "memory(GiB)": 122.96, "step": 52185, "token_acc": 0.9808052434456929, "train_speed(iter/s)": 0.231432 }, { "epoch": 3.978199557893132, "grad_norm": 1.0699101686477661, "learning_rate": 9.9554954286177e-06, "loss": 0.052267223596572876, "memory(GiB)": 122.96, "step": 52190, "token_acc": 0.9736766398158804, "train_speed(iter/s)": 0.231435 }, { "epoch": 3.9785806845033918, "grad_norm": 1.2316360473632812, "learning_rate": 9.948326741466718e-06, "loss": 0.03406568765640259, "memory(GiB)": 122.96, "step": 52195, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.231439 }, { "epoch": 3.9789618111136518, "grad_norm": 0.9024779796600342, "learning_rate": 9.941160351093337e-06, "loss": 0.028819751739501954, "memory(GiB)": 122.96, "step": 52200, "token_acc": 0.9879897913226242, "train_speed(iter/s)": 0.231443 }, { "epoch": 3.9789618111136518, "eval_loss": 0.0524248369038105, "eval_runtime": 220.1603, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "eval_token_acc": 0.9782543220287935, "step": 52200 }, { "epoch": 3.979342937723912, "grad_norm": 3.3543455600738525, "learning_rate": 9.933996257908523e-06, "loss": 0.04976873993873596, "memory(GiB)": 122.96, "step": 52205, "token_acc": 0.9785043722399321, "train_speed(iter/s)": 0.231223 }, { "epoch": 3.979724064334172, "grad_norm": 1.0124200582504272, "learning_rate": 9.926834462323087e-06, "loss": 0.04307752251625061, "memory(GiB)": 122.96, "step": 52210, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.231227 }, { "epoch": 3.980105190944432, "grad_norm": 1.0676136016845703, "learning_rate": 9.919674964747738e-06, "loss": 0.0473435640335083, "memory(GiB)": 122.96, "step": 52215, "token_acc": 0.9802547770700637, "train_speed(iter/s)": 0.231232 }, { "epoch": 3.980486317554692, "grad_norm": 0.7769216299057007, "learning_rate": 9.912517765593027e-06, "loss": 0.02982659637928009, "memory(GiB)": 122.96, "step": 52220, "token_acc": 0.9865390367932995, "train_speed(iter/s)": 0.231238 }, { "epoch": 3.980867444164952, "grad_norm": 0.9657455682754517, "learning_rate": 9.905362865269397e-06, "loss": 0.035853061079978946, "memory(GiB)": 122.96, "step": 52225, "token_acc": 0.983389504092441, "train_speed(iter/s)": 0.231243 }, { "epoch": 3.9812485707752114, "grad_norm": 1.7645196914672852, "learning_rate": 9.898210264187152e-06, "loss": 0.06978020668029786, "memory(GiB)": 122.96, "step": 52230, "token_acc": 0.9780477408354646, "train_speed(iter/s)": 0.231248 }, { "epoch": 3.9816296973854715, "grad_norm": 0.6699644327163696, "learning_rate": 9.891059962756439e-06, "loss": 0.037666457891464236, "memory(GiB)": 122.96, "step": 52235, "token_acc": 0.9847144006436042, "train_speed(iter/s)": 0.231251 }, { "epoch": 3.9820108239957315, "grad_norm": 1.351041316986084, "learning_rate": 9.88391196138731e-06, "loss": 0.06290179491043091, "memory(GiB)": 122.96, "step": 52240, "token_acc": 0.9740786457414918, "train_speed(iter/s)": 0.231256 }, { "epoch": 3.982391950605991, "grad_norm": 1.368553638458252, "learning_rate": 9.876766260489684e-06, "loss": 0.05096173882484436, "memory(GiB)": 122.96, "step": 52245, "token_acc": 0.9804602692140686, "train_speed(iter/s)": 0.231264 }, { "epoch": 3.982773077216251, "grad_norm": 0.9188441038131714, "learning_rate": 9.869622860473305e-06, "loss": 0.0750480055809021, "memory(GiB)": 122.96, "step": 52250, "token_acc": 0.9730122231634999, "train_speed(iter/s)": 0.231266 }, { "epoch": 3.983154203826511, "grad_norm": 0.7882497310638428, "learning_rate": 9.862481761747828e-06, "loss": 0.0332461416721344, "memory(GiB)": 122.96, "step": 52255, "token_acc": 0.9852650494159928, "train_speed(iter/s)": 0.231269 }, { "epoch": 3.983535330436771, "grad_norm": 1.2733711004257202, "learning_rate": 9.855342964722775e-06, "loss": 0.05221565365791321, "memory(GiB)": 122.96, "step": 52260, "token_acc": 0.9777397260273972, "train_speed(iter/s)": 0.231274 }, { "epoch": 3.983916457047031, "grad_norm": 0.9677417874336243, "learning_rate": 9.848206469807491e-06, "loss": 0.02698550224304199, "memory(GiB)": 122.96, "step": 52265, "token_acc": 0.9857775705292608, "train_speed(iter/s)": 0.23128 }, { "epoch": 3.984297583657291, "grad_norm": 0.9833138585090637, "learning_rate": 9.84107227741124e-06, "loss": 0.05232709050178528, "memory(GiB)": 122.96, "step": 52270, "token_acc": 0.9837690318873887, "train_speed(iter/s)": 0.231282 }, { "epoch": 3.9846787102675507, "grad_norm": 1.6058319807052612, "learning_rate": 9.833940387943152e-06, "loss": 0.061308807134628295, "memory(GiB)": 122.96, "step": 52275, "token_acc": 0.9804423748544819, "train_speed(iter/s)": 0.231288 }, { "epoch": 3.9850598368778107, "grad_norm": 0.9021174311637878, "learning_rate": 9.82681080181217e-06, "loss": 0.051849716901779176, "memory(GiB)": 122.96, "step": 52280, "token_acc": 0.9789603960396039, "train_speed(iter/s)": 0.231293 }, { "epoch": 3.9854409634880708, "grad_norm": 0.8129973411560059, "learning_rate": 9.819683519427165e-06, "loss": 0.04443131983280182, "memory(GiB)": 122.96, "step": 52285, "token_acc": 0.9827216140802747, "train_speed(iter/s)": 0.231293 }, { "epoch": 3.9858220900983308, "grad_norm": 0.78498375415802, "learning_rate": 9.812558541196865e-06, "loss": 0.02956903576850891, "memory(GiB)": 122.96, "step": 52290, "token_acc": 0.9895620603585206, "train_speed(iter/s)": 0.231298 }, { "epoch": 3.9862032167085903, "grad_norm": 1.4261513948440552, "learning_rate": 9.805435867529827e-06, "loss": 0.040596958994865415, "memory(GiB)": 122.96, "step": 52295, "token_acc": 0.9839261285909713, "train_speed(iter/s)": 0.2313 }, { "epoch": 3.9865843433188504, "grad_norm": 1.0671530961990356, "learning_rate": 9.798315498834515e-06, "loss": 0.055164217948913574, "memory(GiB)": 122.96, "step": 52300, "token_acc": 0.9772944877581726, "train_speed(iter/s)": 0.231302 }, { "epoch": 3.9869654699291104, "grad_norm": 0.18717144429683685, "learning_rate": 9.791197435519251e-06, "loss": 0.03828292489051819, "memory(GiB)": 122.96, "step": 52305, "token_acc": 0.9788947514579284, "train_speed(iter/s)": 0.231308 }, { "epoch": 3.9873465965393704, "grad_norm": 1.0860241651535034, "learning_rate": 9.784081677992223e-06, "loss": 0.035008400678634644, "memory(GiB)": 122.96, "step": 52310, "token_acc": 0.9824528998891762, "train_speed(iter/s)": 0.231312 }, { "epoch": 3.9877277231496304, "grad_norm": 0.7023165822029114, "learning_rate": 9.776968226661497e-06, "loss": 0.04426567256450653, "memory(GiB)": 122.96, "step": 52315, "token_acc": 0.9829512051734274, "train_speed(iter/s)": 0.231319 }, { "epoch": 3.9881088497598904, "grad_norm": 1.2209738492965698, "learning_rate": 9.769857081934974e-06, "loss": 0.025460779666900635, "memory(GiB)": 122.96, "step": 52320, "token_acc": 0.9885024840312279, "train_speed(iter/s)": 0.231322 }, { "epoch": 3.98848997637015, "grad_norm": 0.8744321465492249, "learning_rate": 9.76274824422046e-06, "loss": 0.03195506632328034, "memory(GiB)": 122.96, "step": 52325, "token_acc": 0.9872068230277186, "train_speed(iter/s)": 0.231323 }, { "epoch": 3.98887110298041, "grad_norm": 2.0720646381378174, "learning_rate": 9.755641713925617e-06, "loss": 0.049174898862838747, "memory(GiB)": 122.96, "step": 52330, "token_acc": 0.9800718719372754, "train_speed(iter/s)": 0.23133 }, { "epoch": 3.98925222959067, "grad_norm": 0.9131937026977539, "learning_rate": 9.748537491457955e-06, "loss": 0.062190836668014525, "memory(GiB)": 122.96, "step": 52335, "token_acc": 0.9833497954000282, "train_speed(iter/s)": 0.231332 }, { "epoch": 3.98963335620093, "grad_norm": 2.9445643424987793, "learning_rate": 9.741435577224878e-06, "loss": 0.0502646803855896, "memory(GiB)": 122.96, "step": 52340, "token_acc": 0.9845947756195579, "train_speed(iter/s)": 0.231336 }, { "epoch": 3.9900144828111896, "grad_norm": 0.7924624085426331, "learning_rate": 9.734335971633662e-06, "loss": 0.02771223783493042, "memory(GiB)": 122.96, "step": 52345, "token_acc": 0.9893369523070957, "train_speed(iter/s)": 0.23134 }, { "epoch": 3.9903956094214497, "grad_norm": 1.7174787521362305, "learning_rate": 9.72723867509141e-06, "loss": 0.046906685829162596, "memory(GiB)": 122.96, "step": 52350, "token_acc": 0.9803171131765992, "train_speed(iter/s)": 0.231345 }, { "epoch": 3.9907767360317097, "grad_norm": 0.8441598415374756, "learning_rate": 9.720143688005128e-06, "loss": 0.0658511221408844, "memory(GiB)": 122.96, "step": 52355, "token_acc": 0.9753224901850813, "train_speed(iter/s)": 0.231351 }, { "epoch": 3.9911578626419697, "grad_norm": 1.2509552240371704, "learning_rate": 9.713051010781704e-06, "loss": 0.06748469471931458, "memory(GiB)": 122.96, "step": 52360, "token_acc": 0.9716504153123872, "train_speed(iter/s)": 0.231354 }, { "epoch": 3.9915389892522297, "grad_norm": 0.5328667163848877, "learning_rate": 9.705960643827833e-06, "loss": 0.04807915687561035, "memory(GiB)": 122.96, "step": 52365, "token_acc": 0.9817704310211648, "train_speed(iter/s)": 0.231356 }, { "epoch": 3.9919201158624897, "grad_norm": 1.6722440719604492, "learning_rate": 9.698872587550128e-06, "loss": 0.04576430320739746, "memory(GiB)": 122.96, "step": 52370, "token_acc": 0.9806167400881057, "train_speed(iter/s)": 0.231363 }, { "epoch": 3.9923012424727493, "grad_norm": 0.8203909397125244, "learning_rate": 9.691786842355083e-06, "loss": 0.05106406807899475, "memory(GiB)": 122.96, "step": 52375, "token_acc": 0.9808843406062947, "train_speed(iter/s)": 0.231367 }, { "epoch": 3.9926823690830093, "grad_norm": 1.065798282623291, "learning_rate": 9.684703408648988e-06, "loss": 0.04249335825443268, "memory(GiB)": 122.96, "step": 52380, "token_acc": 0.9825626959247649, "train_speed(iter/s)": 0.231371 }, { "epoch": 3.9930634956932693, "grad_norm": 2.0104737281799316, "learning_rate": 9.677622286838084e-06, "loss": 0.036745432019233706, "memory(GiB)": 122.96, "step": 52385, "token_acc": 0.9813534464697256, "train_speed(iter/s)": 0.231376 }, { "epoch": 3.9934446223035294, "grad_norm": 0.6745442748069763, "learning_rate": 9.670543477328408e-06, "loss": 0.031999999284744264, "memory(GiB)": 122.96, "step": 52390, "token_acc": 0.9876748834110592, "train_speed(iter/s)": 0.231382 }, { "epoch": 3.993825748913789, "grad_norm": 1.8348498344421387, "learning_rate": 9.66346698052591e-06, "loss": 0.06186319589614868, "memory(GiB)": 122.96, "step": 52395, "token_acc": 0.9740684793554885, "train_speed(iter/s)": 0.231387 }, { "epoch": 3.994206875524049, "grad_norm": 1.2472491264343262, "learning_rate": 9.65639279683641e-06, "loss": 0.04645383059978485, "memory(GiB)": 122.96, "step": 52400, "token_acc": 0.9853675945753033, "train_speed(iter/s)": 0.231394 }, { "epoch": 3.994206875524049, "eval_loss": 0.05157297104597092, "eval_runtime": 220.0832, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "eval_token_acc": 0.9785856273718451, "step": 52400 }, { "epoch": 3.994588002134309, "grad_norm": 0.7042508125305176, "learning_rate": 9.649320926665556e-06, "loss": 0.05233796238899231, "memory(GiB)": 122.96, "step": 52405, "token_acc": 0.978604177279674, "train_speed(iter/s)": 0.231173 }, { "epoch": 3.994969128744569, "grad_norm": 1.1837080717086792, "learning_rate": 9.642251370418897e-06, "loss": 0.03458074927330017, "memory(GiB)": 122.96, "step": 52410, "token_acc": 0.9832095576364224, "train_speed(iter/s)": 0.231177 }, { "epoch": 3.995350255354829, "grad_norm": 0.5049640536308289, "learning_rate": 9.63518412850185e-06, "loss": 0.022986891865730285, "memory(GiB)": 122.96, "step": 52415, "token_acc": 0.9906925814399123, "train_speed(iter/s)": 0.231182 }, { "epoch": 3.995731381965089, "grad_norm": 0.5596652030944824, "learning_rate": 9.62811920131967e-06, "loss": 0.04311366379261017, "memory(GiB)": 122.96, "step": 52420, "token_acc": 0.9823844779167731, "train_speed(iter/s)": 0.231185 }, { "epoch": 3.9961125085753486, "grad_norm": 1.036960482597351, "learning_rate": 9.621056589277499e-06, "loss": 0.03890994191169739, "memory(GiB)": 122.96, "step": 52425, "token_acc": 0.9862756157172401, "train_speed(iter/s)": 0.231189 }, { "epoch": 3.9964936351856086, "grad_norm": 0.7521655559539795, "learning_rate": 9.613996292780364e-06, "loss": 0.036190399527549745, "memory(GiB)": 122.96, "step": 52430, "token_acc": 0.9857539315448659, "train_speed(iter/s)": 0.231194 }, { "epoch": 3.9968747617958686, "grad_norm": 2.9914731979370117, "learning_rate": 9.606938312233116e-06, "loss": 0.05431786775588989, "memory(GiB)": 122.96, "step": 52435, "token_acc": 0.9772627896808046, "train_speed(iter/s)": 0.2312 }, { "epoch": 3.9972558884061287, "grad_norm": 1.124068260192871, "learning_rate": 9.599882648040508e-06, "loss": 0.0526716411113739, "memory(GiB)": 122.96, "step": 52440, "token_acc": 0.9807429664026223, "train_speed(iter/s)": 0.231202 }, { "epoch": 3.9976370150163882, "grad_norm": 1.107591152191162, "learning_rate": 9.592829300607153e-06, "loss": 0.04529925584793091, "memory(GiB)": 122.96, "step": 52445, "token_acc": 0.9821804943475761, "train_speed(iter/s)": 0.231206 }, { "epoch": 3.9980181416266483, "grad_norm": 1.6571974754333496, "learning_rate": 9.585778270337525e-06, "loss": 0.044786930084228516, "memory(GiB)": 122.96, "step": 52450, "token_acc": 0.9856658848058379, "train_speed(iter/s)": 0.231211 }, { "epoch": 3.9983992682369083, "grad_norm": 0.7616850137710571, "learning_rate": 9.578729557635985e-06, "loss": 0.04959052801132202, "memory(GiB)": 122.96, "step": 52455, "token_acc": 0.9773944080904223, "train_speed(iter/s)": 0.231216 }, { "epoch": 3.9987803948471683, "grad_norm": 1.9440596103668213, "learning_rate": 9.571683162906708e-06, "loss": 0.051061820983886716, "memory(GiB)": 122.96, "step": 52460, "token_acc": 0.9807450816241106, "train_speed(iter/s)": 0.231223 }, { "epoch": 3.9991615214574283, "grad_norm": 2.2631521224975586, "learning_rate": 9.564639086553796e-06, "loss": 0.06869486570358277, "memory(GiB)": 122.96, "step": 52465, "token_acc": 0.973489932885906, "train_speed(iter/s)": 0.231229 }, { "epoch": 3.9995426480676883, "grad_norm": 0.7974331974983215, "learning_rate": 9.5575973289812e-06, "loss": 0.03535140454769135, "memory(GiB)": 122.96, "step": 52470, "token_acc": 0.9858429858429858, "train_speed(iter/s)": 0.231236 }, { "epoch": 3.999923774677948, "grad_norm": 0.9307478070259094, "learning_rate": 9.55055789059271e-06, "loss": 0.07691041231155396, "memory(GiB)": 122.96, "step": 52475, "token_acc": 0.9736927573887626, "train_speed(iter/s)": 0.23124 }, { "epoch": 4.000304901288208, "grad_norm": 1.0575389862060547, "learning_rate": 9.543520771792014e-06, "loss": 0.04244367480278015, "memory(GiB)": 122.96, "step": 52480, "token_acc": 0.9822140447715425, "train_speed(iter/s)": 0.231246 }, { "epoch": 4.0006860278984675, "grad_norm": 0.8628977537155151, "learning_rate": 9.536485972982672e-06, "loss": 0.033648896217346194, "memory(GiB)": 122.96, "step": 52485, "token_acc": 0.9908561928512053, "train_speed(iter/s)": 0.231254 }, { "epoch": 4.0010671545087275, "grad_norm": 0.8735003471374512, "learning_rate": 9.52945349456808e-06, "loss": 0.040229016542434694, "memory(GiB)": 122.96, "step": 52490, "token_acc": 0.9832869080779945, "train_speed(iter/s)": 0.231257 }, { "epoch": 4.0014482811189875, "grad_norm": 1.905502200126648, "learning_rate": 9.52242333695152e-06, "loss": 0.049483183026313785, "memory(GiB)": 122.96, "step": 52495, "token_acc": 0.983601579107197, "train_speed(iter/s)": 0.231261 }, { "epoch": 4.001829407729248, "grad_norm": 0.6449739933013916, "learning_rate": 9.515395500536151e-06, "loss": 0.04007861912250519, "memory(GiB)": 122.96, "step": 52500, "token_acc": 0.9848050458715596, "train_speed(iter/s)": 0.231267 }, { "epoch": 4.002210534339508, "grad_norm": 0.7868517637252808, "learning_rate": 9.508369985724974e-06, "loss": 0.037059095501899716, "memory(GiB)": 122.96, "step": 52505, "token_acc": 0.9825988273122754, "train_speed(iter/s)": 0.231271 }, { "epoch": 4.002591660949768, "grad_norm": 0.8574061393737793, "learning_rate": 9.501346792920868e-06, "loss": 0.023155105113983155, "memory(GiB)": 122.96, "step": 52510, "token_acc": 0.9927774130006566, "train_speed(iter/s)": 0.231279 }, { "epoch": 4.002972787560028, "grad_norm": 0.9241961240768433, "learning_rate": 9.494325922526603e-06, "loss": 0.027729725837707518, "memory(GiB)": 122.96, "step": 52515, "token_acc": 0.9887567567567568, "train_speed(iter/s)": 0.231283 }, { "epoch": 4.003353914170288, "grad_norm": 2.3248424530029297, "learning_rate": 9.487307374944759e-06, "loss": 0.03582266867160797, "memory(GiB)": 122.96, "step": 52520, "token_acc": 0.9867307692307692, "train_speed(iter/s)": 0.231289 }, { "epoch": 4.003735040780548, "grad_norm": 0.6458215713500977, "learning_rate": 9.480291150577842e-06, "loss": 0.04143195152282715, "memory(GiB)": 122.96, "step": 52525, "token_acc": 0.9836156315731208, "train_speed(iter/s)": 0.23129 }, { "epoch": 4.004116167390808, "grad_norm": 1.8587514162063599, "learning_rate": 9.473277249828205e-06, "loss": 0.05110546350479126, "memory(GiB)": 122.96, "step": 52530, "token_acc": 0.9810704071905216, "train_speed(iter/s)": 0.231293 }, { "epoch": 4.004497294001067, "grad_norm": 0.9258203506469727, "learning_rate": 9.466265673098035e-06, "loss": 0.03483415544033051, "memory(GiB)": 122.96, "step": 52535, "token_acc": 0.987059122050241, "train_speed(iter/s)": 0.231299 }, { "epoch": 4.004878420611327, "grad_norm": 0.756892204284668, "learning_rate": 9.459256420789431e-06, "loss": 0.04018638730049133, "memory(GiB)": 122.96, "step": 52540, "token_acc": 0.9865370231862378, "train_speed(iter/s)": 0.231303 }, { "epoch": 4.005259547221587, "grad_norm": 1.9269832372665405, "learning_rate": 9.45224949330435e-06, "loss": 0.027907297015190125, "memory(GiB)": 122.96, "step": 52545, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.231307 }, { "epoch": 4.005640673831847, "grad_norm": 0.42333465814590454, "learning_rate": 9.445244891044585e-06, "loss": 0.014520229399204254, "memory(GiB)": 122.96, "step": 52550, "token_acc": 0.9940789473684211, "train_speed(iter/s)": 0.231312 }, { "epoch": 4.006021800442107, "grad_norm": 0.36797037720680237, "learning_rate": 9.43824261441184e-06, "loss": 0.03380066752433777, "memory(GiB)": 122.96, "step": 52555, "token_acc": 0.985322625311548, "train_speed(iter/s)": 0.231318 }, { "epoch": 4.006402927052367, "grad_norm": 1.61696457862854, "learning_rate": 9.431242663807637e-06, "loss": 0.026766780018806457, "memory(GiB)": 122.96, "step": 52560, "token_acc": 0.9855899945622622, "train_speed(iter/s)": 0.231324 }, { "epoch": 4.006784053662627, "grad_norm": 2.2351014614105225, "learning_rate": 9.424245039633412e-06, "loss": 0.044380050897598264, "memory(GiB)": 122.96, "step": 52565, "token_acc": 0.9849445324881141, "train_speed(iter/s)": 0.23133 }, { "epoch": 4.007165180272887, "grad_norm": 0.7728040814399719, "learning_rate": 9.417249742290435e-06, "loss": 0.029748895764350893, "memory(GiB)": 122.96, "step": 52570, "token_acc": 0.988805373420758, "train_speed(iter/s)": 0.231334 }, { "epoch": 4.007546306883147, "grad_norm": 1.172136664390564, "learning_rate": 9.410256772179855e-06, "loss": 0.028595021367073058, "memory(GiB)": 122.96, "step": 52575, "token_acc": 0.9888755261575466, "train_speed(iter/s)": 0.231341 }, { "epoch": 4.007927433493406, "grad_norm": 0.769741952419281, "learning_rate": 9.403266129702693e-06, "loss": 0.02559528350830078, "memory(GiB)": 122.96, "step": 52580, "token_acc": 0.9901795735129069, "train_speed(iter/s)": 0.231346 }, { "epoch": 4.008308560103666, "grad_norm": 1.5403199195861816, "learning_rate": 9.39627781525984e-06, "loss": 0.047275102138519286, "memory(GiB)": 122.96, "step": 52585, "token_acc": 0.9802631578947368, "train_speed(iter/s)": 0.231352 }, { "epoch": 4.008689686713926, "grad_norm": 1.063826084136963, "learning_rate": 9.389291829252017e-06, "loss": 0.026291093230247496, "memory(GiB)": 122.96, "step": 52590, "token_acc": 0.9876520457058606, "train_speed(iter/s)": 0.231355 }, { "epoch": 4.009070813324186, "grad_norm": 2.3337764739990234, "learning_rate": 9.382308172079863e-06, "loss": 0.03373496830463409, "memory(GiB)": 122.96, "step": 52595, "token_acc": 0.987216180118298, "train_speed(iter/s)": 0.231361 }, { "epoch": 4.009451939934446, "grad_norm": 1.2811906337738037, "learning_rate": 9.375326844143834e-06, "loss": 0.02250351458787918, "memory(GiB)": 122.96, "step": 52600, "token_acc": 0.9919583727530747, "train_speed(iter/s)": 0.231368 }, { "epoch": 4.009451939934446, "eval_loss": 0.05211297422647476, "eval_runtime": 220.6454, "eval_samples_per_second": 2.402, "eval_steps_per_second": 2.402, "eval_token_acc": 0.9785253900367448, "step": 52600 }, { "epoch": 4.009833066544706, "grad_norm": 1.6037710905075073, "learning_rate": 9.368347845844289e-06, "loss": 0.03164151012897491, "memory(GiB)": 122.96, "step": 52605, "token_acc": 0.978727790826958, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.010214193154966, "grad_norm": 0.621146559715271, "learning_rate": 9.361371177581452e-06, "loss": 0.02856963872909546, "memory(GiB)": 122.96, "step": 52610, "token_acc": 0.9898305084745763, "train_speed(iter/s)": 0.231153 }, { "epoch": 4.010595319765226, "grad_norm": 0.9620033502578735, "learning_rate": 9.354396839755381e-06, "loss": 0.05135803818702698, "memory(GiB)": 122.96, "step": 52615, "token_acc": 0.9801283378182571, "train_speed(iter/s)": 0.231158 }, { "epoch": 4.010976446375486, "grad_norm": 1.2322100400924683, "learning_rate": 9.347424832766033e-06, "loss": 0.04842477738857269, "memory(GiB)": 122.96, "step": 52620, "token_acc": 0.9825161535537819, "train_speed(iter/s)": 0.231162 }, { "epoch": 4.011357572985746, "grad_norm": 0.17829051613807678, "learning_rate": 9.340455157013234e-06, "loss": 0.042334234714508055, "memory(GiB)": 122.96, "step": 52625, "token_acc": 0.9886122077133314, "train_speed(iter/s)": 0.231165 }, { "epoch": 4.011738699596005, "grad_norm": 1.175032377243042, "learning_rate": 9.33348781289663e-06, "loss": 0.03622499108314514, "memory(GiB)": 122.96, "step": 52630, "token_acc": 0.9854323308270677, "train_speed(iter/s)": 0.231169 }, { "epoch": 4.012119826206265, "grad_norm": 1.075997233390808, "learning_rate": 9.32652280081579e-06, "loss": 0.037969177961349486, "memory(GiB)": 122.96, "step": 52635, "token_acc": 0.9876444798724592, "train_speed(iter/s)": 0.231176 }, { "epoch": 4.012500952816525, "grad_norm": 0.8281236290931702, "learning_rate": 9.319560121170128e-06, "loss": 0.031634360551834106, "memory(GiB)": 122.96, "step": 52640, "token_acc": 0.9906311637080868, "train_speed(iter/s)": 0.231182 }, { "epoch": 4.012882079426785, "grad_norm": 0.5977849364280701, "learning_rate": 9.312599774358905e-06, "loss": 0.04577980041503906, "memory(GiB)": 122.96, "step": 52645, "token_acc": 0.9895551257253384, "train_speed(iter/s)": 0.231189 }, { "epoch": 4.0132632060370454, "grad_norm": 1.2979042530059814, "learning_rate": 9.30564176078127e-06, "loss": 0.018539050221443178, "memory(GiB)": 122.96, "step": 52650, "token_acc": 0.9931818181818182, "train_speed(iter/s)": 0.231196 }, { "epoch": 4.0136443326473055, "grad_norm": 1.9136297702789307, "learning_rate": 9.298686080836243e-06, "loss": 0.05117917060852051, "memory(GiB)": 122.96, "step": 52655, "token_acc": 0.9786075457020614, "train_speed(iter/s)": 0.231201 }, { "epoch": 4.0140254592575655, "grad_norm": 0.8746251463890076, "learning_rate": 9.291732734922687e-06, "loss": 0.031728506088256836, "memory(GiB)": 122.96, "step": 52660, "token_acc": 0.991012789491877, "train_speed(iter/s)": 0.231205 }, { "epoch": 4.0144065858678255, "grad_norm": 1.1615842580795288, "learning_rate": 9.284781723439345e-06, "loss": 0.03233384490013123, "memory(GiB)": 122.96, "step": 52665, "token_acc": 0.9819482288828338, "train_speed(iter/s)": 0.231212 }, { "epoch": 4.0147877124780855, "grad_norm": 0.8384132981300354, "learning_rate": 9.27783304678484e-06, "loss": 0.03689364194869995, "memory(GiB)": 122.96, "step": 52670, "token_acc": 0.9854414823218, "train_speed(iter/s)": 0.231215 }, { "epoch": 4.0151688390883455, "grad_norm": 2.815319776535034, "learning_rate": 9.270886705357628e-06, "loss": 0.04921550154685974, "memory(GiB)": 122.96, "step": 52675, "token_acc": 0.9835459419486042, "train_speed(iter/s)": 0.231219 }, { "epoch": 4.015549965698605, "grad_norm": 1.7892332077026367, "learning_rate": 9.263942699556055e-06, "loss": 0.029552119970321655, "memory(GiB)": 122.96, "step": 52680, "token_acc": 0.9868268113134444, "train_speed(iter/s)": 0.231224 }, { "epoch": 4.015931092308865, "grad_norm": 0.6579239964485168, "learning_rate": 9.257001029778345e-06, "loss": 0.045057058334350586, "memory(GiB)": 122.96, "step": 52685, "token_acc": 0.9823104693140794, "train_speed(iter/s)": 0.231231 }, { "epoch": 4.016312218919125, "grad_norm": 1.3548557758331299, "learning_rate": 9.25006169642254e-06, "loss": 0.04920731484889984, "memory(GiB)": 122.96, "step": 52690, "token_acc": 0.9800494350282486, "train_speed(iter/s)": 0.231236 }, { "epoch": 4.016693345529385, "grad_norm": 2.1456661224365234, "learning_rate": 9.24312469988659e-06, "loss": 0.040825659036636354, "memory(GiB)": 122.96, "step": 52695, "token_acc": 0.98503861003861, "train_speed(iter/s)": 0.231241 }, { "epoch": 4.017074472139645, "grad_norm": 1.2055604457855225, "learning_rate": 9.23619004056832e-06, "loss": 0.03416224718093872, "memory(GiB)": 122.96, "step": 52700, "token_acc": 0.9877020379479972, "train_speed(iter/s)": 0.231246 }, { "epoch": 4.017455598749905, "grad_norm": 2.144286632537842, "learning_rate": 9.229257718865364e-06, "loss": 0.025059786438941956, "memory(GiB)": 122.96, "step": 52705, "token_acc": 0.9892793329362716, "train_speed(iter/s)": 0.231251 }, { "epoch": 4.017836725360165, "grad_norm": 1.7947875261306763, "learning_rate": 9.222327735175296e-06, "loss": 0.04777849018573761, "memory(GiB)": 122.96, "step": 52710, "token_acc": 0.9774738535800482, "train_speed(iter/s)": 0.231256 }, { "epoch": 4.018217851970425, "grad_norm": 0.9490551352500916, "learning_rate": 9.215400089895493e-06, "loss": 0.024596858024597167, "memory(GiB)": 122.96, "step": 52715, "token_acc": 0.9857943925233645, "train_speed(iter/s)": 0.231262 }, { "epoch": 4.018598978580685, "grad_norm": 0.7532852292060852, "learning_rate": 9.208474783423226e-06, "loss": 0.04006710350513458, "memory(GiB)": 122.96, "step": 52720, "token_acc": 0.985188724319159, "train_speed(iter/s)": 0.231267 }, { "epoch": 4.018980105190945, "grad_norm": 0.9161026477813721, "learning_rate": 9.201551816155651e-06, "loss": 0.03552758097648621, "memory(GiB)": 122.96, "step": 52725, "token_acc": 0.98580375782881, "train_speed(iter/s)": 0.231273 }, { "epoch": 4.019361231801204, "grad_norm": 2.725898504257202, "learning_rate": 9.194631188489738e-06, "loss": 0.037670907378196714, "memory(GiB)": 122.96, "step": 52730, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.231279 }, { "epoch": 4.019742358411464, "grad_norm": 1.4505983591079712, "learning_rate": 9.187712900822365e-06, "loss": 0.05828405618667602, "memory(GiB)": 122.96, "step": 52735, "token_acc": 0.9860671310956302, "train_speed(iter/s)": 0.231285 }, { "epoch": 4.020123485021724, "grad_norm": 0.42199647426605225, "learning_rate": 9.18079695355028e-06, "loss": 0.05228818655014038, "memory(GiB)": 122.96, "step": 52740, "token_acc": 0.9856442577030813, "train_speed(iter/s)": 0.231289 }, { "epoch": 4.020504611631984, "grad_norm": 1.055625081062317, "learning_rate": 9.173883347070057e-06, "loss": 0.029672500491142274, "memory(GiB)": 122.96, "step": 52745, "token_acc": 0.9882874327318771, "train_speed(iter/s)": 0.231295 }, { "epoch": 4.020885738242244, "grad_norm": 0.857467532157898, "learning_rate": 9.166972081778158e-06, "loss": 0.036816665530204774, "memory(GiB)": 122.96, "step": 52750, "token_acc": 0.985521392549211, "train_speed(iter/s)": 0.231297 }, { "epoch": 4.021266864852504, "grad_norm": 0.9585351943969727, "learning_rate": 9.160063158070942e-06, "loss": 0.030226004123687745, "memory(GiB)": 122.96, "step": 52755, "token_acc": 0.9898465171192444, "train_speed(iter/s)": 0.231303 }, { "epoch": 4.021647991462764, "grad_norm": 2.1639039516448975, "learning_rate": 9.153156576344569e-06, "loss": 0.04881899058818817, "memory(GiB)": 122.96, "step": 52760, "token_acc": 0.9809983411250188, "train_speed(iter/s)": 0.231306 }, { "epoch": 4.022029118073024, "grad_norm": 0.7601252794265747, "learning_rate": 9.146252336995109e-06, "loss": 0.0409344345331192, "memory(GiB)": 122.96, "step": 52765, "token_acc": 0.9866200082338411, "train_speed(iter/s)": 0.23131 }, { "epoch": 4.022410244683284, "grad_norm": 0.7425379753112793, "learning_rate": 9.139350440418509e-06, "loss": 0.02981950342655182, "memory(GiB)": 122.96, "step": 52770, "token_acc": 0.9881993467495522, "train_speed(iter/s)": 0.231309 }, { "epoch": 4.022791371293544, "grad_norm": 0.780762791633606, "learning_rate": 9.13245088701053e-06, "loss": 0.04118178188800812, "memory(GiB)": 122.96, "step": 52775, "token_acc": 0.9866484080794249, "train_speed(iter/s)": 0.231313 }, { "epoch": 4.023172497903803, "grad_norm": 0.5330446362495422, "learning_rate": 9.125553677166859e-06, "loss": 0.032666510343551634, "memory(GiB)": 122.96, "step": 52780, "token_acc": 0.9894561598224195, "train_speed(iter/s)": 0.231312 }, { "epoch": 4.023553624514063, "grad_norm": 1.3616729974746704, "learning_rate": 9.118658811282993e-06, "loss": 0.025646737217903136, "memory(GiB)": 122.96, "step": 52785, "token_acc": 0.9900178253119429, "train_speed(iter/s)": 0.231316 }, { "epoch": 4.023934751124323, "grad_norm": 0.8428962230682373, "learning_rate": 9.111766289754332e-06, "loss": 0.0417491465806961, "memory(GiB)": 122.96, "step": 52790, "token_acc": 0.9841964502796012, "train_speed(iter/s)": 0.231321 }, { "epoch": 4.024315877734583, "grad_norm": 0.49344757199287415, "learning_rate": 9.10487611297614e-06, "loss": 0.02027701139450073, "memory(GiB)": 122.96, "step": 52795, "token_acc": 0.988759367194005, "train_speed(iter/s)": 0.231327 }, { "epoch": 4.024697004344843, "grad_norm": 1.8129373788833618, "learning_rate": 9.097988281343512e-06, "loss": 0.04298911094665527, "memory(GiB)": 122.96, "step": 52800, "token_acc": 0.983631812054689, "train_speed(iter/s)": 0.231331 }, { "epoch": 4.024697004344843, "eval_loss": 0.05186690390110016, "eval_runtime": 220.2868, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "eval_token_acc": 0.9786533943738329, "step": 52800 }, { "epoch": 4.025078130955103, "grad_norm": 1.1553587913513184, "learning_rate": 9.091102795251449e-06, "loss": 0.02953561842441559, "memory(GiB)": 122.96, "step": 52805, "token_acc": 0.9789085918595832, "train_speed(iter/s)": 0.231112 }, { "epoch": 4.025459257565363, "grad_norm": 0.5298164486885071, "learning_rate": 9.084219655094811e-06, "loss": 0.04932044148445129, "memory(GiB)": 122.96, "step": 52810, "token_acc": 0.9797297297297297, "train_speed(iter/s)": 0.231116 }, { "epoch": 4.025840384175623, "grad_norm": 0.8931410908699036, "learning_rate": 9.077338861268297e-06, "loss": 0.03824707567691803, "memory(GiB)": 122.96, "step": 52815, "token_acc": 0.9827879704936637, "train_speed(iter/s)": 0.231121 }, { "epoch": 4.026221510785883, "grad_norm": 1.2847628593444824, "learning_rate": 9.070460414166488e-06, "loss": 0.03925137221813202, "memory(GiB)": 122.96, "step": 52820, "token_acc": 0.9829104091144485, "train_speed(iter/s)": 0.231126 }, { "epoch": 4.026602637396143, "grad_norm": 1.064353346824646, "learning_rate": 9.063584314183853e-06, "loss": 0.041063731908798216, "memory(GiB)": 122.96, "step": 52825, "token_acc": 0.9848240232483048, "train_speed(iter/s)": 0.231129 }, { "epoch": 4.026983764006403, "grad_norm": 0.736677348613739, "learning_rate": 9.056710561714676e-06, "loss": 0.02412194311618805, "memory(GiB)": 122.96, "step": 52830, "token_acc": 0.9892397248191921, "train_speed(iter/s)": 0.231133 }, { "epoch": 4.027364890616663, "grad_norm": 0.5586956739425659, "learning_rate": 9.049839157153151e-06, "loss": 0.028969168663024902, "memory(GiB)": 122.96, "step": 52835, "token_acc": 0.9873760987469609, "train_speed(iter/s)": 0.231133 }, { "epoch": 4.027746017226923, "grad_norm": 1.1764249801635742, "learning_rate": 9.042970100893316e-06, "loss": 0.017596770823001862, "memory(GiB)": 122.96, "step": 52840, "token_acc": 0.9917555771096024, "train_speed(iter/s)": 0.231136 }, { "epoch": 4.028127143837183, "grad_norm": 0.44791701436042786, "learning_rate": 9.036103393329088e-06, "loss": 0.030722448229789735, "memory(GiB)": 122.96, "step": 52845, "token_acc": 0.9882449989688595, "train_speed(iter/s)": 0.231141 }, { "epoch": 4.028508270447443, "grad_norm": 0.9655575156211853, "learning_rate": 9.029239034854237e-06, "loss": 0.027278134226799013, "memory(GiB)": 122.96, "step": 52850, "token_acc": 0.9891623391284714, "train_speed(iter/s)": 0.231146 }, { "epoch": 4.028889397057703, "grad_norm": 1.5153733491897583, "learning_rate": 9.022377025862393e-06, "loss": 0.025722548365592957, "memory(GiB)": 122.96, "step": 52855, "token_acc": 0.989084293511219, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.029270523667963, "grad_norm": 1.8711496591567993, "learning_rate": 9.015517366747067e-06, "loss": 0.05787222981452942, "memory(GiB)": 122.96, "step": 52860, "token_acc": 0.9813407049067036, "train_speed(iter/s)": 0.231155 }, { "epoch": 4.029651650278223, "grad_norm": 1.5364397764205933, "learning_rate": 9.00866005790164e-06, "loss": 0.020999494194984435, "memory(GiB)": 122.96, "step": 52865, "token_acc": 0.9904789382573572, "train_speed(iter/s)": 0.231162 }, { "epoch": 4.030032776888483, "grad_norm": 1.2968932390213013, "learning_rate": 9.001805099719323e-06, "loss": 0.0368925541639328, "memory(GiB)": 122.96, "step": 52870, "token_acc": 0.9848094009744912, "train_speed(iter/s)": 0.231168 }, { "epoch": 4.030413903498742, "grad_norm": 0.9926802515983582, "learning_rate": 8.994952492593233e-06, "loss": 0.0196207731962204, "memory(GiB)": 122.96, "step": 52875, "token_acc": 0.9891135303265941, "train_speed(iter/s)": 0.231174 }, { "epoch": 4.030795030109002, "grad_norm": 1.4596511125564575, "learning_rate": 8.98810223691634e-06, "loss": 0.04368800222873688, "memory(GiB)": 122.96, "step": 52880, "token_acc": 0.9848629700446144, "train_speed(iter/s)": 0.231177 }, { "epoch": 4.031176156719262, "grad_norm": 0.6576618552207947, "learning_rate": 8.981254333081452e-06, "loss": 0.026919472217559814, "memory(GiB)": 122.96, "step": 52885, "token_acc": 0.9865916955017301, "train_speed(iter/s)": 0.231184 }, { "epoch": 4.031557283329522, "grad_norm": 0.49424320459365845, "learning_rate": 8.974408781481281e-06, "loss": 0.033433538675308225, "memory(GiB)": 122.96, "step": 52890, "token_acc": 0.9882284832111154, "train_speed(iter/s)": 0.231189 }, { "epoch": 4.031938409939782, "grad_norm": 2.1049458980560303, "learning_rate": 8.967565582508391e-06, "loss": 0.046914076805114745, "memory(GiB)": 122.96, "step": 52895, "token_acc": 0.9871071716357775, "train_speed(iter/s)": 0.231193 }, { "epoch": 4.032319536550042, "grad_norm": 1.4354501962661743, "learning_rate": 8.960724736555193e-06, "loss": 0.035204410552978516, "memory(GiB)": 122.96, "step": 52900, "token_acc": 0.984281971898071, "train_speed(iter/s)": 0.231198 }, { "epoch": 4.032700663160302, "grad_norm": 0.6713777780532837, "learning_rate": 8.95388624401398e-06, "loss": 0.035275721549987794, "memory(GiB)": 122.96, "step": 52905, "token_acc": 0.9882102272727272, "train_speed(iter/s)": 0.231201 }, { "epoch": 4.033081789770562, "grad_norm": 1.236171841621399, "learning_rate": 8.947050105276933e-06, "loss": 0.030578497052192687, "memory(GiB)": 122.96, "step": 52910, "token_acc": 0.9885337039610841, "train_speed(iter/s)": 0.231207 }, { "epoch": 4.033462916380822, "grad_norm": 1.452757716178894, "learning_rate": 8.940216320736039e-06, "loss": 0.02444092631340027, "memory(GiB)": 122.96, "step": 52915, "token_acc": 0.9901057659501876, "train_speed(iter/s)": 0.231215 }, { "epoch": 4.033844042991082, "grad_norm": 0.6699374318122864, "learning_rate": 8.933384890783203e-06, "loss": 0.05692403316497803, "memory(GiB)": 122.96, "step": 52920, "token_acc": 0.9814514259216323, "train_speed(iter/s)": 0.23122 }, { "epoch": 4.034225169601341, "grad_norm": 0.6010960936546326, "learning_rate": 8.926555815810178e-06, "loss": 0.04884012341499329, "memory(GiB)": 122.96, "step": 52925, "token_acc": 0.9855351976856316, "train_speed(iter/s)": 0.231227 }, { "epoch": 4.034606296211601, "grad_norm": 0.8342729806900024, "learning_rate": 8.919729096208562e-06, "loss": 0.02285350263118744, "memory(GiB)": 122.96, "step": 52930, "token_acc": 0.9936693450206964, "train_speed(iter/s)": 0.231233 }, { "epoch": 4.034987422821861, "grad_norm": 0.803972065448761, "learning_rate": 8.912904732369843e-06, "loss": 0.03307653665542602, "memory(GiB)": 122.96, "step": 52935, "token_acc": 0.9874330068379228, "train_speed(iter/s)": 0.231238 }, { "epoch": 4.035368549432121, "grad_norm": 0.9190524816513062, "learning_rate": 8.90608272468539e-06, "loss": 0.03402678668498993, "memory(GiB)": 122.96, "step": 52940, "token_acc": 0.9827973074046372, "train_speed(iter/s)": 0.231244 }, { "epoch": 4.035749676042381, "grad_norm": 1.2139225006103516, "learning_rate": 8.899263073546372e-06, "loss": 0.029283356666564942, "memory(GiB)": 122.96, "step": 52945, "token_acc": 0.9840883564208162, "train_speed(iter/s)": 0.231249 }, { "epoch": 4.036130802652641, "grad_norm": 1.508159875869751, "learning_rate": 8.892445779343905e-06, "loss": 0.037607702612876895, "memory(GiB)": 122.96, "step": 52950, "token_acc": 0.9819684447783621, "train_speed(iter/s)": 0.231253 }, { "epoch": 4.036511929262901, "grad_norm": 1.4129163026809692, "learning_rate": 8.885630842468895e-06, "loss": 0.03434442281723023, "memory(GiB)": 122.96, "step": 52955, "token_acc": 0.9890354952611039, "train_speed(iter/s)": 0.231257 }, { "epoch": 4.036893055873161, "grad_norm": 1.096596360206604, "learning_rate": 8.878818263312162e-06, "loss": 0.021105292439460754, "memory(GiB)": 122.96, "step": 52960, "token_acc": 0.9895769466584917, "train_speed(iter/s)": 0.231263 }, { "epoch": 4.037274182483421, "grad_norm": 1.415649652481079, "learning_rate": 8.87200804226439e-06, "loss": 0.02561710774898529, "memory(GiB)": 122.96, "step": 52965, "token_acc": 0.9901348240710293, "train_speed(iter/s)": 0.231267 }, { "epoch": 4.037655309093681, "grad_norm": 0.21034996211528778, "learning_rate": 8.865200179716088e-06, "loss": 0.019358985126018524, "memory(GiB)": 122.96, "step": 52970, "token_acc": 0.9905743095133713, "train_speed(iter/s)": 0.231273 }, { "epoch": 4.03803643570394, "grad_norm": 1.1209895610809326, "learning_rate": 8.858394676057651e-06, "loss": 0.053246313333511354, "memory(GiB)": 122.96, "step": 52975, "token_acc": 0.977765799757445, "train_speed(iter/s)": 0.231278 }, { "epoch": 4.0384175623142005, "grad_norm": 1.2059059143066406, "learning_rate": 8.851591531679388e-06, "loss": 0.05748300552368164, "memory(GiB)": 122.96, "step": 52980, "token_acc": 0.9787327662070988, "train_speed(iter/s)": 0.23128 }, { "epoch": 4.0387986889244605, "grad_norm": 1.4252771139144897, "learning_rate": 8.844790746971381e-06, "loss": 0.04434276819229126, "memory(GiB)": 122.96, "step": 52985, "token_acc": 0.9810040705563093, "train_speed(iter/s)": 0.231287 }, { "epoch": 4.0391798155347205, "grad_norm": 1.4893980026245117, "learning_rate": 8.837992322323662e-06, "loss": 0.04305590391159057, "memory(GiB)": 122.96, "step": 52990, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.231291 }, { "epoch": 4.0395609421449805, "grad_norm": 1.1045618057250977, "learning_rate": 8.83119625812605e-06, "loss": 0.03009539544582367, "memory(GiB)": 122.96, "step": 52995, "token_acc": 0.9858236461582081, "train_speed(iter/s)": 0.231296 }, { "epoch": 4.0399420687552405, "grad_norm": 1.0398067235946655, "learning_rate": 8.824402554768285e-06, "loss": 0.0452409565448761, "memory(GiB)": 122.96, "step": 53000, "token_acc": 0.9841392649903288, "train_speed(iter/s)": 0.231303 }, { "epoch": 4.0399420687552405, "eval_loss": 0.05224674195051193, "eval_runtime": 219.9296, "eval_samples_per_second": 2.41, "eval_steps_per_second": 2.41, "eval_token_acc": 0.9785856273718451, "step": 53000 }, { "epoch": 4.0403231953655006, "grad_norm": 1.356574296951294, "learning_rate": 8.81761121263997e-06, "loss": 0.03864647448062897, "memory(GiB)": 122.96, "step": 53005, "token_acc": 0.978709658473556, "train_speed(iter/s)": 0.231086 }, { "epoch": 4.040704321975761, "grad_norm": 0.5915005207061768, "learning_rate": 8.810822232130528e-06, "loss": 0.010471545904874802, "memory(GiB)": 122.96, "step": 53010, "token_acc": 0.9960604070912672, "train_speed(iter/s)": 0.231094 }, { "epoch": 4.041085448586021, "grad_norm": 1.3761918544769287, "learning_rate": 8.804035613629292e-06, "loss": 0.03934223651885986, "memory(GiB)": 122.96, "step": 53015, "token_acc": 0.9815262430939227, "train_speed(iter/s)": 0.231099 }, { "epoch": 4.041466575196281, "grad_norm": 0.7884155511856079, "learning_rate": 8.797251357525455e-06, "loss": 0.033416959643363955, "memory(GiB)": 122.96, "step": 53020, "token_acc": 0.9858726752503576, "train_speed(iter/s)": 0.231103 }, { "epoch": 4.04184770180654, "grad_norm": 1.1501128673553467, "learning_rate": 8.790469464208035e-06, "loss": 0.04065491259098053, "memory(GiB)": 122.96, "step": 53025, "token_acc": 0.9816585365853658, "train_speed(iter/s)": 0.231108 }, { "epoch": 4.0422288284168, "grad_norm": 0.6345053911209106, "learning_rate": 8.783689934065952e-06, "loss": 0.04552753269672394, "memory(GiB)": 122.96, "step": 53030, "token_acc": 0.9810536044362292, "train_speed(iter/s)": 0.231112 }, { "epoch": 4.04260995502706, "grad_norm": 0.43020302057266235, "learning_rate": 8.776912767487999e-06, "loss": 0.017389115691184998, "memory(GiB)": 122.96, "step": 53035, "token_acc": 0.9907949790794979, "train_speed(iter/s)": 0.231119 }, { "epoch": 4.04299108163732, "grad_norm": 0.5120699405670166, "learning_rate": 8.77013796486279e-06, "loss": 0.02597871422767639, "memory(GiB)": 122.96, "step": 53040, "token_acc": 0.9887359198998749, "train_speed(iter/s)": 0.231122 }, { "epoch": 4.04337220824758, "grad_norm": 0.7692918181419373, "learning_rate": 8.763365526578837e-06, "loss": 0.024728354811668397, "memory(GiB)": 122.96, "step": 53045, "token_acc": 0.9900629347466048, "train_speed(iter/s)": 0.231127 }, { "epoch": 4.04375333485784, "grad_norm": 0.658352255821228, "learning_rate": 8.756595453024518e-06, "loss": 0.032284015417099, "memory(GiB)": 122.96, "step": 53050, "token_acc": 0.9880047505938242, "train_speed(iter/s)": 0.23113 }, { "epoch": 4.0441344614681, "grad_norm": 1.997514009475708, "learning_rate": 8.749827744588052e-06, "loss": 0.02693096101284027, "memory(GiB)": 122.96, "step": 53055, "token_acc": 0.9934608468203368, "train_speed(iter/s)": 0.231133 }, { "epoch": 4.04451558807836, "grad_norm": 3.6627326011657715, "learning_rate": 8.743062401657537e-06, "loss": 0.03569806218147278, "memory(GiB)": 122.96, "step": 53060, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.231139 }, { "epoch": 4.04489671468862, "grad_norm": 1.630851149559021, "learning_rate": 8.736299424620947e-06, "loss": 0.04225233793258667, "memory(GiB)": 122.96, "step": 53065, "token_acc": 0.9857464366091523, "train_speed(iter/s)": 0.231145 }, { "epoch": 4.04527784129888, "grad_norm": 0.6519677639007568, "learning_rate": 8.729538813866089e-06, "loss": 0.026925182342529295, "memory(GiB)": 122.96, "step": 53070, "token_acc": 0.9847694554558731, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.045658967909139, "grad_norm": 1.7253156900405884, "learning_rate": 8.722780569780664e-06, "loss": 0.05754987001419067, "memory(GiB)": 122.96, "step": 53075, "token_acc": 0.9784200385356455, "train_speed(iter/s)": 0.231155 }, { "epoch": 4.046040094519399, "grad_norm": 0.16876214742660522, "learning_rate": 8.716024692752233e-06, "loss": 0.023372572660446168, "memory(GiB)": 122.96, "step": 53080, "token_acc": 0.9883981274170568, "train_speed(iter/s)": 0.231159 }, { "epoch": 4.046421221129659, "grad_norm": 1.674110770225525, "learning_rate": 8.709271183168194e-06, "loss": 0.026538684964179993, "memory(GiB)": 122.96, "step": 53085, "token_acc": 0.9892844797787763, "train_speed(iter/s)": 0.231163 }, { "epoch": 4.046802347739919, "grad_norm": 1.0725077390670776, "learning_rate": 8.70252004141584e-06, "loss": 0.023627695441246033, "memory(GiB)": 122.96, "step": 53090, "token_acc": 0.9928111056023797, "train_speed(iter/s)": 0.231169 }, { "epoch": 4.047183474350179, "grad_norm": 1.0882399082183838, "learning_rate": 8.69577126788233e-06, "loss": 0.03949707746505737, "memory(GiB)": 122.96, "step": 53095, "token_acc": 0.9871692060946271, "train_speed(iter/s)": 0.231177 }, { "epoch": 4.047564600960439, "grad_norm": 0.6568900346755981, "learning_rate": 8.689024862954648e-06, "loss": 0.03811323642730713, "memory(GiB)": 122.96, "step": 53100, "token_acc": 0.9849766443630855, "train_speed(iter/s)": 0.231179 }, { "epoch": 4.047945727570699, "grad_norm": 0.9856979846954346, "learning_rate": 8.682280827019685e-06, "loss": 0.01258133351802826, "memory(GiB)": 122.96, "step": 53105, "token_acc": 0.9931159420289855, "train_speed(iter/s)": 0.231186 }, { "epoch": 4.048326854180959, "grad_norm": 0.5124707818031311, "learning_rate": 8.67553916046418e-06, "loss": 0.013389815390110017, "memory(GiB)": 122.96, "step": 53110, "token_acc": 0.9936235098419739, "train_speed(iter/s)": 0.231192 }, { "epoch": 4.048707980791219, "grad_norm": 0.8272969126701355, "learning_rate": 8.668799863674737e-06, "loss": 0.03569466471672058, "memory(GiB)": 122.96, "step": 53115, "token_acc": 0.9864029666254636, "train_speed(iter/s)": 0.231197 }, { "epoch": 4.049089107401479, "grad_norm": 1.3075565099716187, "learning_rate": 8.662062937037829e-06, "loss": 0.022727158665657044, "memory(GiB)": 122.96, "step": 53120, "token_acc": 0.9920264488525865, "train_speed(iter/s)": 0.231203 }, { "epoch": 4.049470234011738, "grad_norm": 0.5649165511131287, "learning_rate": 8.65532838093977e-06, "loss": 0.027509018778800964, "memory(GiB)": 122.96, "step": 53125, "token_acc": 0.9894255050505051, "train_speed(iter/s)": 0.231208 }, { "epoch": 4.049851360621998, "grad_norm": 0.6870656609535217, "learning_rate": 8.648596195766768e-06, "loss": 0.028896409273147582, "memory(GiB)": 122.96, "step": 53130, "token_acc": 0.9861337683523654, "train_speed(iter/s)": 0.231209 }, { "epoch": 4.050232487232258, "grad_norm": 0.6646307706832886, "learning_rate": 8.641866381904889e-06, "loss": 0.047933027148246765, "memory(GiB)": 122.96, "step": 53135, "token_acc": 0.9862548384548543, "train_speed(iter/s)": 0.231204 }, { "epoch": 4.050613613842518, "grad_norm": 1.2498652935028076, "learning_rate": 8.63513893974004e-06, "loss": 0.04202830195426941, "memory(GiB)": 122.96, "step": 53140, "token_acc": 0.9820998278829605, "train_speed(iter/s)": 0.231209 }, { "epoch": 4.050994740452778, "grad_norm": 1.074800729751587, "learning_rate": 8.628413869658015e-06, "loss": 0.03059517443180084, "memory(GiB)": 122.96, "step": 53145, "token_acc": 0.9900693273374555, "train_speed(iter/s)": 0.231213 }, { "epoch": 4.051375867063038, "grad_norm": 0.8095026016235352, "learning_rate": 8.621691172044477e-06, "loss": 0.03114011287689209, "memory(GiB)": 122.96, "step": 53150, "token_acc": 0.9886008457437029, "train_speed(iter/s)": 0.231218 }, { "epoch": 4.0517569936732984, "grad_norm": 2.2080652713775635, "learning_rate": 8.61497084728492e-06, "loss": 0.03933379054069519, "memory(GiB)": 122.96, "step": 53155, "token_acc": 0.9846775316896503, "train_speed(iter/s)": 0.231221 }, { "epoch": 4.0521381202835585, "grad_norm": 2.1587748527526855, "learning_rate": 8.608252895764746e-06, "loss": 0.03951936364173889, "memory(GiB)": 122.96, "step": 53160, "token_acc": 0.9870598218073823, "train_speed(iter/s)": 0.231225 }, { "epoch": 4.0525192468938185, "grad_norm": 0.42987459897994995, "learning_rate": 8.60153731786918e-06, "loss": 0.021104463934898378, "memory(GiB)": 122.96, "step": 53165, "token_acc": 0.988896442329481, "train_speed(iter/s)": 0.23123 }, { "epoch": 4.0529003735040785, "grad_norm": 0.5726818442344666, "learning_rate": 8.59482411398333e-06, "loss": 0.025954097509384155, "memory(GiB)": 122.96, "step": 53170, "token_acc": 0.9878072013716899, "train_speed(iter/s)": 0.231234 }, { "epoch": 4.053281500114338, "grad_norm": 2.883495330810547, "learning_rate": 8.588113284492189e-06, "loss": 0.04424841105937958, "memory(GiB)": 122.96, "step": 53175, "token_acc": 0.9875248086192231, "train_speed(iter/s)": 0.23124 }, { "epoch": 4.053662626724598, "grad_norm": 1.4618080854415894, "learning_rate": 8.581404829780565e-06, "loss": 0.04005302786827088, "memory(GiB)": 122.96, "step": 53180, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.231246 }, { "epoch": 4.054043753334858, "grad_norm": 1.9108951091766357, "learning_rate": 8.574698750233167e-06, "loss": 0.03989262580871582, "memory(GiB)": 122.96, "step": 53185, "token_acc": 0.9827988957315779, "train_speed(iter/s)": 0.23125 }, { "epoch": 4.054424879945118, "grad_norm": 1.3731409311294556, "learning_rate": 8.567995046234573e-06, "loss": 0.030484318733215332, "memory(GiB)": 122.96, "step": 53190, "token_acc": 0.989200579481101, "train_speed(iter/s)": 0.231253 }, { "epoch": 4.054806006555378, "grad_norm": 0.591969907283783, "learning_rate": 8.561293718169178e-06, "loss": 0.044208687543869016, "memory(GiB)": 122.96, "step": 53195, "token_acc": 0.9897214360196633, "train_speed(iter/s)": 0.231257 }, { "epoch": 4.055187133165638, "grad_norm": 0.8106030225753784, "learning_rate": 8.554594766421292e-06, "loss": 0.03007746934890747, "memory(GiB)": 122.96, "step": 53200, "token_acc": 0.9892086330935251, "train_speed(iter/s)": 0.231262 }, { "epoch": 4.055187133165638, "eval_loss": 0.05240277573466301, "eval_runtime": 218.1698, "eval_samples_per_second": 2.429, "eval_steps_per_second": 2.429, "eval_token_acc": 0.9788868140473466, "step": 53200 }, { "epoch": 4.055568259775898, "grad_norm": 1.2380765676498413, "learning_rate": 8.547898191375081e-06, "loss": 0.03104727864265442, "memory(GiB)": 122.96, "step": 53205, "token_acc": 0.9790889326321356, "train_speed(iter/s)": 0.231049 }, { "epoch": 4.055949386386158, "grad_norm": 0.8753826022148132, "learning_rate": 8.541203993414532e-06, "loss": 0.03162429332733154, "memory(GiB)": 122.96, "step": 53210, "token_acc": 0.9813278008298755, "train_speed(iter/s)": 0.231057 }, { "epoch": 4.056330512996418, "grad_norm": 2.360689401626587, "learning_rate": 8.534512172923542e-06, "loss": 0.03210289180278778, "memory(GiB)": 122.96, "step": 53215, "token_acc": 0.9890025575447571, "train_speed(iter/s)": 0.231063 }, { "epoch": 4.056711639606677, "grad_norm": 0.8924784064292908, "learning_rate": 8.527822730285868e-06, "loss": 0.016955195367336272, "memory(GiB)": 122.96, "step": 53220, "token_acc": 0.9931192660550459, "train_speed(iter/s)": 0.23107 }, { "epoch": 4.057092766216937, "grad_norm": 0.5738916993141174, "learning_rate": 8.521135665885093e-06, "loss": 0.06728230714797974, "memory(GiB)": 122.96, "step": 53225, "token_acc": 0.981806506849315, "train_speed(iter/s)": 0.231074 }, { "epoch": 4.057473892827197, "grad_norm": 1.5071732997894287, "learning_rate": 8.514450980104704e-06, "loss": 0.03531339764595032, "memory(GiB)": 122.96, "step": 53230, "token_acc": 0.9840168243953733, "train_speed(iter/s)": 0.23108 }, { "epoch": 4.057855019437457, "grad_norm": 1.4378939867019653, "learning_rate": 8.507768673328043e-06, "loss": 0.041410624980926514, "memory(GiB)": 122.96, "step": 53235, "token_acc": 0.9856072555205048, "train_speed(iter/s)": 0.231085 }, { "epoch": 4.058236146047717, "grad_norm": 1.4626617431640625, "learning_rate": 8.501088745938279e-06, "loss": 0.044384732842445374, "memory(GiB)": 122.96, "step": 53240, "token_acc": 0.9866817795409465, "train_speed(iter/s)": 0.231091 }, { "epoch": 4.058617272657977, "grad_norm": 0.7525442242622375, "learning_rate": 8.494411198318526e-06, "loss": 0.023798835277557374, "memory(GiB)": 122.96, "step": 53245, "token_acc": 0.9924912397797431, "train_speed(iter/s)": 0.231094 }, { "epoch": 4.058998399268237, "grad_norm": 0.5148133635520935, "learning_rate": 8.487736030851663e-06, "loss": 0.031070145964622497, "memory(GiB)": 122.96, "step": 53250, "token_acc": 0.9906524227394125, "train_speed(iter/s)": 0.2311 }, { "epoch": 4.059379525878497, "grad_norm": 0.7038024067878723, "learning_rate": 8.481063243920501e-06, "loss": 0.04555312395095825, "memory(GiB)": 122.96, "step": 53255, "token_acc": 0.9835575485799701, "train_speed(iter/s)": 0.231103 }, { "epoch": 4.059760652488757, "grad_norm": 1.9048726558685303, "learning_rate": 8.474392837907702e-06, "loss": 0.02903330624103546, "memory(GiB)": 122.96, "step": 53260, "token_acc": 0.9904717275851297, "train_speed(iter/s)": 0.231108 }, { "epoch": 4.060141779099017, "grad_norm": 0.6985657215118408, "learning_rate": 8.467724813195759e-06, "loss": 0.03546475172042847, "memory(GiB)": 122.96, "step": 53265, "token_acc": 0.98701504354711, "train_speed(iter/s)": 0.23111 }, { "epoch": 4.060522905709276, "grad_norm": 1.0427576303482056, "learning_rate": 8.461059170167068e-06, "loss": 0.033675378561019896, "memory(GiB)": 122.96, "step": 53270, "token_acc": 0.9856874706710465, "train_speed(iter/s)": 0.231116 }, { "epoch": 4.060904032319536, "grad_norm": 0.665774405002594, "learning_rate": 8.454395909203878e-06, "loss": 0.032632702589035036, "memory(GiB)": 122.96, "step": 53275, "token_acc": 0.9883374689826303, "train_speed(iter/s)": 0.231119 }, { "epoch": 4.061285158929796, "grad_norm": 1.0452651977539062, "learning_rate": 8.447735030688276e-06, "loss": 0.03675309419631958, "memory(GiB)": 122.96, "step": 53280, "token_acc": 0.9826580724370779, "train_speed(iter/s)": 0.231122 }, { "epoch": 4.061666285540056, "grad_norm": 0.8064104318618774, "learning_rate": 8.441076535002241e-06, "loss": 0.023718342185020447, "memory(GiB)": 122.96, "step": 53285, "token_acc": 0.9897844948222781, "train_speed(iter/s)": 0.231126 }, { "epoch": 4.062047412150316, "grad_norm": 1.2988775968551636, "learning_rate": 8.434420422527629e-06, "loss": 0.02847830057144165, "memory(GiB)": 122.96, "step": 53290, "token_acc": 0.9829692706405035, "train_speed(iter/s)": 0.231133 }, { "epoch": 4.062428538760576, "grad_norm": 0.4753057360649109, "learning_rate": 8.4277666936461e-06, "loss": 0.049191564321517944, "memory(GiB)": 122.96, "step": 53295, "token_acc": 0.9882489241972857, "train_speed(iter/s)": 0.231136 }, { "epoch": 4.062809665370836, "grad_norm": 0.5510156750679016, "learning_rate": 8.421115348739234e-06, "loss": 0.02386294901371002, "memory(GiB)": 122.96, "step": 53300, "token_acc": 0.9875717017208413, "train_speed(iter/s)": 0.23114 }, { "epoch": 4.063190791981096, "grad_norm": 2.717634677886963, "learning_rate": 8.414466388188463e-06, "loss": 0.019723328948020934, "memory(GiB)": 122.96, "step": 53305, "token_acc": 0.9934691745036572, "train_speed(iter/s)": 0.231145 }, { "epoch": 4.063571918591356, "grad_norm": 0.6676408052444458, "learning_rate": 8.407819812375056e-06, "loss": 0.018288759887218474, "memory(GiB)": 122.96, "step": 53310, "token_acc": 0.9907114991640349, "train_speed(iter/s)": 0.231149 }, { "epoch": 4.063953045201616, "grad_norm": 1.963147521018982, "learning_rate": 8.401175621680169e-06, "loss": 0.05622974634170532, "memory(GiB)": 122.96, "step": 53315, "token_acc": 0.9844789356984479, "train_speed(iter/s)": 0.231154 }, { "epoch": 4.0643341718118755, "grad_norm": 0.5998033881187439, "learning_rate": 8.394533816484829e-06, "loss": 0.018751341104507446, "memory(GiB)": 122.96, "step": 53320, "token_acc": 0.9933895921237693, "train_speed(iter/s)": 0.231157 }, { "epoch": 4.0647152984221355, "grad_norm": 0.8937817215919495, "learning_rate": 8.387894397169893e-06, "loss": 0.026675844192504884, "memory(GiB)": 122.96, "step": 53325, "token_acc": 0.988036047234307, "train_speed(iter/s)": 0.231161 }, { "epoch": 4.0650964250323955, "grad_norm": 1.4597535133361816, "learning_rate": 8.381257364116108e-06, "loss": 0.029454955458641054, "memory(GiB)": 122.96, "step": 53330, "token_acc": 0.9879072892173328, "train_speed(iter/s)": 0.231168 }, { "epoch": 4.065477551642656, "grad_norm": 0.5223109126091003, "learning_rate": 8.374622717704089e-06, "loss": 0.022602570056915284, "memory(GiB)": 122.96, "step": 53335, "token_acc": 0.99, "train_speed(iter/s)": 0.231175 }, { "epoch": 4.065858678252916, "grad_norm": 0.717375636100769, "learning_rate": 8.367990458314284e-06, "loss": 0.03715379238128662, "memory(GiB)": 122.96, "step": 53340, "token_acc": 0.9838964319545311, "train_speed(iter/s)": 0.231182 }, { "epoch": 4.066239804863176, "grad_norm": 0.6229118704795837, "learning_rate": 8.361360586327038e-06, "loss": 0.04933002591133118, "memory(GiB)": 122.96, "step": 53345, "token_acc": 0.985711410746629, "train_speed(iter/s)": 0.231187 }, { "epoch": 4.066620931473436, "grad_norm": 1.1808123588562012, "learning_rate": 8.354733102122526e-06, "loss": 0.03992566764354706, "memory(GiB)": 122.96, "step": 53350, "token_acc": 0.982779827798278, "train_speed(iter/s)": 0.231195 }, { "epoch": 4.067002058083696, "grad_norm": 3.0427260398864746, "learning_rate": 8.348108006080813e-06, "loss": 0.02659507393836975, "memory(GiB)": 122.96, "step": 53355, "token_acc": 0.992021834977955, "train_speed(iter/s)": 0.2312 }, { "epoch": 4.067383184693956, "grad_norm": 0.9793557524681091, "learning_rate": 8.341485298581825e-06, "loss": 0.027014097571372984, "memory(GiB)": 122.96, "step": 53360, "token_acc": 0.9877400295420975, "train_speed(iter/s)": 0.231202 }, { "epoch": 4.067764311304216, "grad_norm": 0.8872125744819641, "learning_rate": 8.33486498000533e-06, "loss": 0.03523969948291779, "memory(GiB)": 122.96, "step": 53365, "token_acc": 0.9872296952153925, "train_speed(iter/s)": 0.231206 }, { "epoch": 4.068145437914475, "grad_norm": 1.3278154134750366, "learning_rate": 8.328247050730975e-06, "loss": 0.036752229928970336, "memory(GiB)": 122.96, "step": 53370, "token_acc": 0.9873129472999349, "train_speed(iter/s)": 0.231213 }, { "epoch": 4.068526564524735, "grad_norm": 1.6728018522262573, "learning_rate": 8.321631511138273e-06, "loss": 0.03112640678882599, "memory(GiB)": 122.96, "step": 53375, "token_acc": 0.9854912474373128, "train_speed(iter/s)": 0.231216 }, { "epoch": 4.068907691134995, "grad_norm": 0.9841048717498779, "learning_rate": 8.315018361606592e-06, "loss": 0.01882137656211853, "memory(GiB)": 122.96, "step": 53380, "token_acc": 0.9873392282958199, "train_speed(iter/s)": 0.23122 }, { "epoch": 4.069288817745255, "grad_norm": 0.26417994499206543, "learning_rate": 8.308407602515178e-06, "loss": 0.02473374754190445, "memory(GiB)": 122.96, "step": 53385, "token_acc": 0.9876850207223209, "train_speed(iter/s)": 0.231223 }, { "epoch": 4.069669944355515, "grad_norm": 0.7888450622558594, "learning_rate": 8.301799234243102e-06, "loss": 0.031085497140884398, "memory(GiB)": 122.96, "step": 53390, "token_acc": 0.989125142022399, "train_speed(iter/s)": 0.231225 }, { "epoch": 4.070051070965775, "grad_norm": 1.1668246984481812, "learning_rate": 8.295193257169337e-06, "loss": 0.03586641550064087, "memory(GiB)": 122.96, "step": 53395, "token_acc": 0.9869653767820774, "train_speed(iter/s)": 0.231228 }, { "epoch": 4.070432197576035, "grad_norm": 1.795162320137024, "learning_rate": 8.288589671672714e-06, "loss": 0.03670423328876495, "memory(GiB)": 122.96, "step": 53400, "token_acc": 0.9897619047619047, "train_speed(iter/s)": 0.231233 }, { "epoch": 4.070432197576035, "eval_loss": 0.053026169538497925, "eval_runtime": 219.5226, "eval_samples_per_second": 2.414, "eval_steps_per_second": 2.414, "eval_token_acc": 0.9788341063791338, "step": 53400 }, { "epoch": 4.070813324186295, "grad_norm": 0.8011142611503601, "learning_rate": 8.281988478131903e-06, "loss": 0.02527101933956146, "memory(GiB)": 122.96, "step": 53405, "token_acc": 0.9795536672124668, "train_speed(iter/s)": 0.231015 }, { "epoch": 4.071194450796555, "grad_norm": 0.6893752217292786, "learning_rate": 8.275389676925455e-06, "loss": 0.054383504390716556, "memory(GiB)": 122.96, "step": 53410, "token_acc": 0.9824312836497592, "train_speed(iter/s)": 0.23102 }, { "epoch": 4.071575577406815, "grad_norm": 1.7618376016616821, "learning_rate": 8.268793268431795e-06, "loss": 0.020154780149459837, "memory(GiB)": 122.96, "step": 53415, "token_acc": 0.9882842025699169, "train_speed(iter/s)": 0.231026 }, { "epoch": 4.071956704017074, "grad_norm": 3.4751136302948, "learning_rate": 8.262199253029174e-06, "loss": 0.04073074758052826, "memory(GiB)": 122.96, "step": 53420, "token_acc": 0.9843470483005367, "train_speed(iter/s)": 0.231031 }, { "epoch": 4.072337830627334, "grad_norm": 0.5998041033744812, "learning_rate": 8.255607631095735e-06, "loss": 0.04213403463363648, "memory(GiB)": 122.96, "step": 53425, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.231037 }, { "epoch": 4.072718957237594, "grad_norm": 0.7269887328147888, "learning_rate": 8.249018403009495e-06, "loss": 0.0281173437833786, "memory(GiB)": 122.96, "step": 53430, "token_acc": 0.9862674470959027, "train_speed(iter/s)": 0.231037 }, { "epoch": 4.073100083847854, "grad_norm": 0.6967656016349792, "learning_rate": 8.24243156914829e-06, "loss": 0.039331698417663576, "memory(GiB)": 122.96, "step": 53435, "token_acc": 0.9856181860357226, "train_speed(iter/s)": 0.231043 }, { "epoch": 4.073481210458114, "grad_norm": 0.882792592048645, "learning_rate": 8.235847129889851e-06, "loss": 0.04511374831199646, "memory(GiB)": 122.96, "step": 53440, "token_acc": 0.9866262482168331, "train_speed(iter/s)": 0.231047 }, { "epoch": 4.073862337068374, "grad_norm": 0.8683498501777649, "learning_rate": 8.229265085611787e-06, "loss": 0.03662592470645905, "memory(GiB)": 122.96, "step": 53445, "token_acc": 0.9836501124054772, "train_speed(iter/s)": 0.231051 }, { "epoch": 4.074243463678634, "grad_norm": 0.8746306300163269, "learning_rate": 8.222685436691518e-06, "loss": 0.032275530695915225, "memory(GiB)": 122.96, "step": 53450, "token_acc": 0.984635761589404, "train_speed(iter/s)": 0.231057 }, { "epoch": 4.074624590288894, "grad_norm": 1.040813684463501, "learning_rate": 8.216108183506365e-06, "loss": 0.04014216065406799, "memory(GiB)": 122.96, "step": 53455, "token_acc": 0.9790165324289953, "train_speed(iter/s)": 0.231062 }, { "epoch": 4.075005716899154, "grad_norm": 1.6044697761535645, "learning_rate": 8.209533326433527e-06, "loss": 0.030856555700302123, "memory(GiB)": 122.96, "step": 53460, "token_acc": 0.9863238512035011, "train_speed(iter/s)": 0.231067 }, { "epoch": 4.075386843509413, "grad_norm": 1.2756956815719604, "learning_rate": 8.20296086585e-06, "loss": 0.02630636990070343, "memory(GiB)": 122.96, "step": 53465, "token_acc": 0.9905540417801998, "train_speed(iter/s)": 0.231069 }, { "epoch": 4.075767970119673, "grad_norm": 0.7860857248306274, "learning_rate": 8.196390802132714e-06, "loss": 0.030326515436172485, "memory(GiB)": 122.96, "step": 53470, "token_acc": 0.9841991341991342, "train_speed(iter/s)": 0.231074 }, { "epoch": 4.076149096729933, "grad_norm": 1.081770658493042, "learning_rate": 8.189823135658425e-06, "loss": 0.04276342391967773, "memory(GiB)": 122.96, "step": 53475, "token_acc": 0.9858140549978176, "train_speed(iter/s)": 0.231079 }, { "epoch": 4.076530223340193, "grad_norm": 0.7403837442398071, "learning_rate": 8.18325786680375e-06, "loss": 0.021331661939620973, "memory(GiB)": 122.96, "step": 53480, "token_acc": 0.9912241224122412, "train_speed(iter/s)": 0.231085 }, { "epoch": 4.0769113499504535, "grad_norm": 2.343557834625244, "learning_rate": 8.176694995945183e-06, "loss": 0.038110336661338805, "memory(GiB)": 122.96, "step": 53485, "token_acc": 0.9896237029628704, "train_speed(iter/s)": 0.231087 }, { "epoch": 4.0772924765607135, "grad_norm": 0.7029040455818176, "learning_rate": 8.170134523459088e-06, "loss": 0.013844710588455201, "memory(GiB)": 122.96, "step": 53490, "token_acc": 0.9944289693593314, "train_speed(iter/s)": 0.231093 }, { "epoch": 4.0776736031709735, "grad_norm": 0.7062538266181946, "learning_rate": 8.163576449721649e-06, "loss": 0.027145785093307496, "memory(GiB)": 122.96, "step": 53495, "token_acc": 0.9883460331689825, "train_speed(iter/s)": 0.231095 }, { "epoch": 4.0780547297812335, "grad_norm": 1.0219961404800415, "learning_rate": 8.157020775108959e-06, "loss": 0.036024004220962524, "memory(GiB)": 122.96, "step": 53500, "token_acc": 0.988713607797871, "train_speed(iter/s)": 0.231098 }, { "epoch": 4.0784358563914935, "grad_norm": 0.681365430355072, "learning_rate": 8.15046749999695e-06, "loss": 0.03481810688972473, "memory(GiB)": 122.96, "step": 53505, "token_acc": 0.9867382380802021, "train_speed(iter/s)": 0.231099 }, { "epoch": 4.0788169830017535, "grad_norm": 1.1691502332687378, "learning_rate": 8.143916624761421e-06, "loss": 0.03670762181282043, "memory(GiB)": 122.96, "step": 53510, "token_acc": 0.9853900709219858, "train_speed(iter/s)": 0.231102 }, { "epoch": 4.079198109612013, "grad_norm": 0.7338285446166992, "learning_rate": 8.137368149778051e-06, "loss": 0.03153424859046936, "memory(GiB)": 122.96, "step": 53515, "token_acc": 0.9891254049051366, "train_speed(iter/s)": 0.231105 }, { "epoch": 4.079579236222273, "grad_norm": 0.8982493877410889, "learning_rate": 8.130822075422345e-06, "loss": 0.026994779706001282, "memory(GiB)": 122.96, "step": 53520, "token_acc": 0.988280369619112, "train_speed(iter/s)": 0.231111 }, { "epoch": 4.079960362832533, "grad_norm": 0.738635241985321, "learning_rate": 8.12427840206969e-06, "loss": 0.03664886653423309, "memory(GiB)": 122.96, "step": 53525, "token_acc": 0.988009592326139, "train_speed(iter/s)": 0.231116 }, { "epoch": 4.080341489442793, "grad_norm": 0.9056589007377625, "learning_rate": 8.11773713009536e-06, "loss": 0.04072945415973663, "memory(GiB)": 122.96, "step": 53530, "token_acc": 0.9833752174753528, "train_speed(iter/s)": 0.231121 }, { "epoch": 4.080722616053053, "grad_norm": 0.6400207877159119, "learning_rate": 8.11119825987443e-06, "loss": 0.03644640445709228, "memory(GiB)": 122.96, "step": 53535, "token_acc": 0.9864948773672773, "train_speed(iter/s)": 0.231124 }, { "epoch": 4.081103742663313, "grad_norm": 0.5714262127876282, "learning_rate": 8.104661791781892e-06, "loss": 0.03788108229637146, "memory(GiB)": 122.96, "step": 53540, "token_acc": 0.9837526205450734, "train_speed(iter/s)": 0.231129 }, { "epoch": 4.081484869273573, "grad_norm": 2.682532548904419, "learning_rate": 8.0981277261926e-06, "loss": 0.039311110973358154, "memory(GiB)": 122.96, "step": 53545, "token_acc": 0.9867713004484305, "train_speed(iter/s)": 0.231134 }, { "epoch": 4.081865995883833, "grad_norm": 1.2106550931930542, "learning_rate": 8.091596063481216e-06, "loss": 0.03971518576145172, "memory(GiB)": 122.96, "step": 53550, "token_acc": 0.9803563270899954, "train_speed(iter/s)": 0.231141 }, { "epoch": 4.082247122494093, "grad_norm": 0.41928523778915405, "learning_rate": 8.085066804022334e-06, "loss": 0.03414474725723267, "memory(GiB)": 122.96, "step": 53555, "token_acc": 0.9856658848058379, "train_speed(iter/s)": 0.231148 }, { "epoch": 4.082628249104353, "grad_norm": 0.8104530572891235, "learning_rate": 8.07853994819035e-06, "loss": 0.022182604670524596, "memory(GiB)": 122.96, "step": 53560, "token_acc": 0.9910821884791516, "train_speed(iter/s)": 0.231154 }, { "epoch": 4.083009375714612, "grad_norm": 0.5085285902023315, "learning_rate": 8.072015496359558e-06, "loss": 0.047208189964294434, "memory(GiB)": 122.96, "step": 53565, "token_acc": 0.9860254829428688, "train_speed(iter/s)": 0.231155 }, { "epoch": 4.083390502324872, "grad_norm": 1.3026195764541626, "learning_rate": 8.065493448904121e-06, "loss": 0.025645425915718077, "memory(GiB)": 122.96, "step": 53570, "token_acc": 0.9909407665505227, "train_speed(iter/s)": 0.231161 }, { "epoch": 4.083771628935132, "grad_norm": 2.5795958042144775, "learning_rate": 8.058973806198027e-06, "loss": 0.02001785933971405, "memory(GiB)": 122.96, "step": 53575, "token_acc": 0.9914122137404581, "train_speed(iter/s)": 0.231168 }, { "epoch": 4.084152755545392, "grad_norm": 0.7809562683105469, "learning_rate": 8.052456568615151e-06, "loss": 0.06772407293319702, "memory(GiB)": 122.96, "step": 53580, "token_acc": 0.9761285748050106, "train_speed(iter/s)": 0.231174 }, { "epoch": 4.084533882155652, "grad_norm": 0.7935867309570312, "learning_rate": 8.045941736529245e-06, "loss": 0.02488071918487549, "memory(GiB)": 122.96, "step": 53585, "token_acc": 0.9893731711073463, "train_speed(iter/s)": 0.231177 }, { "epoch": 4.084915008765912, "grad_norm": 0.675284206867218, "learning_rate": 8.039429310313873e-06, "loss": 0.028624522686004638, "memory(GiB)": 122.96, "step": 53590, "token_acc": 0.9886768036234228, "train_speed(iter/s)": 0.231181 }, { "epoch": 4.085296135376172, "grad_norm": 1.3259645700454712, "learning_rate": 8.03291929034251e-06, "loss": 0.015855300426483154, "memory(GiB)": 122.96, "step": 53595, "token_acc": 0.9930579072130037, "train_speed(iter/s)": 0.231185 }, { "epoch": 4.085677261986432, "grad_norm": 0.9246994853019714, "learning_rate": 8.026411676988493e-06, "loss": 0.02930956780910492, "memory(GiB)": 122.96, "step": 53600, "token_acc": 0.9882024216081962, "train_speed(iter/s)": 0.231185 }, { "epoch": 4.085677261986432, "eval_loss": 0.05279500037431717, "eval_runtime": 218.6534, "eval_samples_per_second": 2.424, "eval_steps_per_second": 2.424, "eval_token_acc": 0.9790072887175472, "step": 53600 }, { "epoch": 4.086058388596692, "grad_norm": 0.694756805896759, "learning_rate": 8.019906470624966e-06, "loss": 0.02873707711696625, "memory(GiB)": 122.96, "step": 53605, "token_acc": 0.9795518445940189, "train_speed(iter/s)": 0.230971 }, { "epoch": 4.086439515206952, "grad_norm": 1.1546965837478638, "learning_rate": 8.013403671624997e-06, "loss": 0.028007540106773376, "memory(GiB)": 122.96, "step": 53610, "token_acc": 0.9897056167599784, "train_speed(iter/s)": 0.230975 }, { "epoch": 4.086820641817211, "grad_norm": 0.9607802033424377, "learning_rate": 8.006903280361495e-06, "loss": 0.02544459104537964, "memory(GiB)": 122.96, "step": 53615, "token_acc": 0.9903123192596877, "train_speed(iter/s)": 0.230977 }, { "epoch": 4.087201768427471, "grad_norm": 1.1476333141326904, "learning_rate": 8.000405297207203e-06, "loss": 0.016901905834674835, "memory(GiB)": 122.96, "step": 53620, "token_acc": 0.9903753609239654, "train_speed(iter/s)": 0.230983 }, { "epoch": 4.087582895037731, "grad_norm": 1.364572286605835, "learning_rate": 7.99390972253477e-06, "loss": 0.05038233995437622, "memory(GiB)": 122.96, "step": 53625, "token_acc": 0.9853039412157648, "train_speed(iter/s)": 0.23099 }, { "epoch": 4.087964021647991, "grad_norm": 0.8113613128662109, "learning_rate": 7.987416556716692e-06, "loss": 0.04407536685466766, "memory(GiB)": 122.96, "step": 53630, "token_acc": 0.9785977859778597, "train_speed(iter/s)": 0.230996 }, { "epoch": 4.088345148258251, "grad_norm": 0.12593957781791687, "learning_rate": 7.9809258001253e-06, "loss": 0.03821252584457398, "memory(GiB)": 122.96, "step": 53635, "token_acc": 0.9838220424671386, "train_speed(iter/s)": 0.231001 }, { "epoch": 4.088726274868511, "grad_norm": 0.7749133706092834, "learning_rate": 7.974437453132822e-06, "loss": 0.03481042981147766, "memory(GiB)": 122.96, "step": 53640, "token_acc": 0.979050279329609, "train_speed(iter/s)": 0.231008 }, { "epoch": 4.089107401478771, "grad_norm": 1.2731785774230957, "learning_rate": 7.967951516111332e-06, "loss": 0.03387185335159302, "memory(GiB)": 122.96, "step": 53645, "token_acc": 0.9784817692767483, "train_speed(iter/s)": 0.231015 }, { "epoch": 4.089488528089031, "grad_norm": 0.22433635592460632, "learning_rate": 7.961467989432775e-06, "loss": 0.036557677388191226, "memory(GiB)": 122.96, "step": 53650, "token_acc": 0.9780168381665107, "train_speed(iter/s)": 0.231023 }, { "epoch": 4.089869654699291, "grad_norm": 0.7858422994613647, "learning_rate": 7.954986873468957e-06, "loss": 0.03944132328033447, "memory(GiB)": 122.96, "step": 53655, "token_acc": 0.9828466408004901, "train_speed(iter/s)": 0.231027 }, { "epoch": 4.090250781309551, "grad_norm": 0.6636209487915039, "learning_rate": 7.948508168591512e-06, "loss": 0.051181286573410034, "memory(GiB)": 122.96, "step": 53660, "token_acc": 0.9790314528207689, "train_speed(iter/s)": 0.23103 }, { "epoch": 4.090631907919811, "grad_norm": 0.7979147434234619, "learning_rate": 7.942031875171984e-06, "loss": 0.03244549036026001, "memory(GiB)": 122.96, "step": 53665, "token_acc": 0.9906331959535406, "train_speed(iter/s)": 0.231036 }, { "epoch": 4.091013034530071, "grad_norm": 1.002500295639038, "learning_rate": 7.935557993581766e-06, "loss": 0.03304027318954468, "memory(GiB)": 122.96, "step": 53670, "token_acc": 0.9861469828598263, "train_speed(iter/s)": 0.231042 }, { "epoch": 4.091394161140331, "grad_norm": 1.3841630220413208, "learning_rate": 7.929086524192086e-06, "loss": 0.03809280395507812, "memory(GiB)": 122.96, "step": 53675, "token_acc": 0.9895370128171593, "train_speed(iter/s)": 0.231049 }, { "epoch": 4.091775287750591, "grad_norm": 1.7976726293563843, "learning_rate": 7.922617467374055e-06, "loss": 0.036685997247695924, "memory(GiB)": 122.96, "step": 53680, "token_acc": 0.9858712715855573, "train_speed(iter/s)": 0.231054 }, { "epoch": 4.092156414360851, "grad_norm": 0.43824440240859985, "learning_rate": 7.916150823498664e-06, "loss": 0.018627819418907166, "memory(GiB)": 122.96, "step": 53685, "token_acc": 0.9936672423719056, "train_speed(iter/s)": 0.23106 }, { "epoch": 4.092537540971111, "grad_norm": 0.527538537979126, "learning_rate": 7.909686592936722e-06, "loss": 0.026407596468925477, "memory(GiB)": 122.96, "step": 53690, "token_acc": 0.9893559444344218, "train_speed(iter/s)": 0.231065 }, { "epoch": 4.092918667581371, "grad_norm": 0.6890683770179749, "learning_rate": 7.903224776058926e-06, "loss": 0.03354440033435822, "memory(GiB)": 122.96, "step": 53695, "token_acc": 0.9902272727272727, "train_speed(iter/s)": 0.23107 }, { "epoch": 4.093299794191631, "grad_norm": 1.2874864339828491, "learning_rate": 7.896765373235854e-06, "loss": 0.036010253429412845, "memory(GiB)": 122.96, "step": 53700, "token_acc": 0.9846342962507683, "train_speed(iter/s)": 0.231075 }, { "epoch": 4.093680920801891, "grad_norm": 7.508488178253174, "learning_rate": 7.890308384837886e-06, "loss": 0.031852534413337706, "memory(GiB)": 122.96, "step": 53705, "token_acc": 0.9921341337197268, "train_speed(iter/s)": 0.23108 }, { "epoch": 4.094062047412151, "grad_norm": 0.7551389932632446, "learning_rate": 7.88385381123532e-06, "loss": 0.03546325266361237, "memory(GiB)": 122.96, "step": 53710, "token_acc": 0.9867848025229013, "train_speed(iter/s)": 0.231082 }, { "epoch": 4.09444317402241, "grad_norm": 1.030689001083374, "learning_rate": 7.87740165279831e-06, "loss": 0.053396481275558474, "memory(GiB)": 122.96, "step": 53715, "token_acc": 0.9789336801040313, "train_speed(iter/s)": 0.231088 }, { "epoch": 4.09482430063267, "grad_norm": 0.4042191207408905, "learning_rate": 7.870951909896835e-06, "loss": 0.027251020073890686, "memory(GiB)": 122.96, "step": 53720, "token_acc": 0.9860637968411273, "train_speed(iter/s)": 0.231094 }, { "epoch": 4.09520542724293, "grad_norm": 0.8790942430496216, "learning_rate": 7.864504582900768e-06, "loss": 0.03986426293849945, "memory(GiB)": 122.96, "step": 53725, "token_acc": 0.9886334933063905, "train_speed(iter/s)": 0.2311 }, { "epoch": 4.09558655385319, "grad_norm": 1.1553378105163574, "learning_rate": 7.858059672179824e-06, "loss": 0.030221787095069886, "memory(GiB)": 122.96, "step": 53730, "token_acc": 0.9846534653465346, "train_speed(iter/s)": 0.231106 }, { "epoch": 4.09596768046345, "grad_norm": 0.5655634999275208, "learning_rate": 7.851617178103593e-06, "loss": 0.018299080431461334, "memory(GiB)": 122.96, "step": 53735, "token_acc": 0.991754336081888, "train_speed(iter/s)": 0.231113 }, { "epoch": 4.09634880707371, "grad_norm": 2.504856824874878, "learning_rate": 7.84517710104154e-06, "loss": 0.04715578556060791, "memory(GiB)": 122.96, "step": 53740, "token_acc": 0.9852231604342582, "train_speed(iter/s)": 0.231119 }, { "epoch": 4.09672993368397, "grad_norm": 1.7584342956542969, "learning_rate": 7.838739441362941e-06, "loss": 0.026138490438461302, "memory(GiB)": 122.96, "step": 53745, "token_acc": 0.9893428063943162, "train_speed(iter/s)": 0.231126 }, { "epoch": 4.09711106029423, "grad_norm": 1.1171385049819946, "learning_rate": 7.832304199436984e-06, "loss": 0.025195032358169556, "memory(GiB)": 122.96, "step": 53750, "token_acc": 0.9902516436182272, "train_speed(iter/s)": 0.231131 }, { "epoch": 4.09749218690449, "grad_norm": 1.6106984615325928, "learning_rate": 7.825871375632715e-06, "loss": 0.05708370804786682, "memory(GiB)": 122.96, "step": 53755, "token_acc": 0.9770830573585906, "train_speed(iter/s)": 0.231133 }, { "epoch": 4.09787331351475, "grad_norm": 1.3594685792922974, "learning_rate": 7.819440970318997e-06, "loss": 0.043789854645729064, "memory(GiB)": 122.96, "step": 53760, "token_acc": 0.9802110817941952, "train_speed(iter/s)": 0.23114 }, { "epoch": 4.098254440125009, "grad_norm": 0.8795149922370911, "learning_rate": 7.813012983864598e-06, "loss": 0.024878399074077608, "memory(GiB)": 122.96, "step": 53765, "token_acc": 0.9892294946147473, "train_speed(iter/s)": 0.231145 }, { "epoch": 4.098635566735269, "grad_norm": 0.6909263730049133, "learning_rate": 7.80658741663814e-06, "loss": 0.0688150703907013, "memory(GiB)": 122.96, "step": 53770, "token_acc": 0.9755425979915776, "train_speed(iter/s)": 0.231149 }, { "epoch": 4.099016693345529, "grad_norm": 0.6710997223854065, "learning_rate": 7.800164269008076e-06, "loss": 0.04026476740837097, "memory(GiB)": 122.96, "step": 53775, "token_acc": 0.9861405197305101, "train_speed(iter/s)": 0.231154 }, { "epoch": 4.099397819955789, "grad_norm": 1.270875096321106, "learning_rate": 7.793743541342779e-06, "loss": 0.03635070323944092, "memory(GiB)": 122.96, "step": 53780, "token_acc": 0.9841129744042365, "train_speed(iter/s)": 0.23116 }, { "epoch": 4.099778946566049, "grad_norm": 0.5816462635993958, "learning_rate": 7.787325234010418e-06, "loss": 0.028926479816436767, "memory(GiB)": 122.96, "step": 53785, "token_acc": 0.9853892715508245, "train_speed(iter/s)": 0.231164 }, { "epoch": 4.100160073176309, "grad_norm": 1.7469606399536133, "learning_rate": 7.780909347379062e-06, "loss": 0.03998096585273743, "memory(GiB)": 122.96, "step": 53790, "token_acc": 0.983073798239675, "train_speed(iter/s)": 0.231169 }, { "epoch": 4.100541199786569, "grad_norm": 1.1025497913360596, "learning_rate": 7.774495881816651e-06, "loss": 0.03090856075286865, "memory(GiB)": 122.96, "step": 53795, "token_acc": 0.9904015141273489, "train_speed(iter/s)": 0.231172 }, { "epoch": 4.100922326396829, "grad_norm": 1.6296333074569702, "learning_rate": 7.768084837690937e-06, "loss": 0.04579300284385681, "memory(GiB)": 122.96, "step": 53800, "token_acc": 0.9818616703209089, "train_speed(iter/s)": 0.231176 }, { "epoch": 4.100922326396829, "eval_loss": 0.05166026949882507, "eval_runtime": 218.5188, "eval_samples_per_second": 2.425, "eval_steps_per_second": 2.425, "eval_token_acc": 0.9789846997168845, "step": 53800 }, { "epoch": 4.101303453007089, "grad_norm": 1.020552396774292, "learning_rate": 7.761676215369574e-06, "loss": 0.04452953338623047, "memory(GiB)": 122.96, "step": 53805, "token_acc": 0.9790921132553221, "train_speed(iter/s)": 0.230965 }, { "epoch": 4.1016845796173484, "grad_norm": 1.2697105407714844, "learning_rate": 7.755270015220084e-06, "loss": 0.047821244597434996, "memory(GiB)": 122.96, "step": 53810, "token_acc": 0.9867411025819958, "train_speed(iter/s)": 0.230972 }, { "epoch": 4.1020657062276085, "grad_norm": 0.7875304818153381, "learning_rate": 7.74886623760981e-06, "loss": 0.030826157331466673, "memory(GiB)": 122.96, "step": 53815, "token_acc": 0.9866425992779784, "train_speed(iter/s)": 0.230975 }, { "epoch": 4.1024468328378685, "grad_norm": 0.8725568056106567, "learning_rate": 7.742464882905986e-06, "loss": 0.025532180070877077, "memory(GiB)": 122.96, "step": 53820, "token_acc": 0.9876095497129042, "train_speed(iter/s)": 0.230979 }, { "epoch": 4.1028279594481285, "grad_norm": 1.5663493871688843, "learning_rate": 7.736065951475718e-06, "loss": 0.03532339334487915, "memory(GiB)": 122.96, "step": 53825, "token_acc": 0.9861145549218944, "train_speed(iter/s)": 0.230984 }, { "epoch": 4.1032090860583885, "grad_norm": 1.0225768089294434, "learning_rate": 7.729669443685922e-06, "loss": 0.0271576851606369, "memory(GiB)": 122.96, "step": 53830, "token_acc": 0.9888327979195349, "train_speed(iter/s)": 0.230988 }, { "epoch": 4.1035902126686485, "grad_norm": 2.8433539867401123, "learning_rate": 7.723275359903426e-06, "loss": 0.029180806875228883, "memory(GiB)": 122.96, "step": 53835, "token_acc": 0.9884551116512228, "train_speed(iter/s)": 0.230992 }, { "epoch": 4.103971339278909, "grad_norm": 0.8787396550178528, "learning_rate": 7.716883700494915e-06, "loss": 0.06217254996299744, "memory(GiB)": 122.96, "step": 53840, "token_acc": 0.9817637840148169, "train_speed(iter/s)": 0.230995 }, { "epoch": 4.104352465889169, "grad_norm": 3.0368027687072754, "learning_rate": 7.71049446582689e-06, "loss": 0.04986308217048645, "memory(GiB)": 122.96, "step": 53845, "token_acc": 0.9843357184297041, "train_speed(iter/s)": 0.231 }, { "epoch": 4.104733592499429, "grad_norm": 1.3620330095291138, "learning_rate": 7.704107656265763e-06, "loss": 0.03122726082801819, "memory(GiB)": 122.96, "step": 53850, "token_acc": 0.9921457744266415, "train_speed(iter/s)": 0.231006 }, { "epoch": 4.105114719109689, "grad_norm": 0.9558029770851135, "learning_rate": 7.697723272177799e-06, "loss": 0.04136313199996948, "memory(GiB)": 122.96, "step": 53855, "token_acc": 0.9860292512551845, "train_speed(iter/s)": 0.231012 }, { "epoch": 4.105495845719948, "grad_norm": 1.2579799890518188, "learning_rate": 7.69134131392908e-06, "loss": 0.035866305232048035, "memory(GiB)": 122.96, "step": 53860, "token_acc": 0.9869125520523498, "train_speed(iter/s)": 0.231015 }, { "epoch": 4.105876972330208, "grad_norm": 0.9869985580444336, "learning_rate": 7.684961781885602e-06, "loss": 0.026746928691864014, "memory(GiB)": 122.96, "step": 53865, "token_acc": 0.9867509172441908, "train_speed(iter/s)": 0.23102 }, { "epoch": 4.106258098940468, "grad_norm": 0.5660719275474548, "learning_rate": 7.67858467641322e-06, "loss": 0.029116681218147276, "memory(GiB)": 122.96, "step": 53870, "token_acc": 0.9880917377241988, "train_speed(iter/s)": 0.231023 }, { "epoch": 4.106639225550728, "grad_norm": 1.6719486713409424, "learning_rate": 7.672209997877588e-06, "loss": 0.03430209457874298, "memory(GiB)": 122.96, "step": 53875, "token_acc": 0.9852783725910065, "train_speed(iter/s)": 0.231028 }, { "epoch": 4.107020352160988, "grad_norm": 0.4053094983100891, "learning_rate": 7.665837746644295e-06, "loss": 0.02699875831604004, "memory(GiB)": 122.96, "step": 53880, "token_acc": 0.9935691318327974, "train_speed(iter/s)": 0.231035 }, { "epoch": 4.107401478771248, "grad_norm": 0.4183003008365631, "learning_rate": 7.659467923078767e-06, "loss": 0.01684492826461792, "memory(GiB)": 122.96, "step": 53885, "token_acc": 0.9933802234174597, "train_speed(iter/s)": 0.23104 }, { "epoch": 4.107782605381508, "grad_norm": 0.8496479988098145, "learning_rate": 7.653100527546253e-06, "loss": 0.046040239930152896, "memory(GiB)": 122.96, "step": 53890, "token_acc": 0.9830028328611898, "train_speed(iter/s)": 0.231043 }, { "epoch": 4.108163731991768, "grad_norm": 0.6237499117851257, "learning_rate": 7.646735560411923e-06, "loss": 0.023174448311328887, "memory(GiB)": 122.96, "step": 53895, "token_acc": 0.9893882646691635, "train_speed(iter/s)": 0.231048 }, { "epoch": 4.108544858602028, "grad_norm": 0.8521115183830261, "learning_rate": 7.640373022040753e-06, "loss": 0.016917130351066588, "memory(GiB)": 122.96, "step": 53900, "token_acc": 0.9915094339622641, "train_speed(iter/s)": 0.231056 }, { "epoch": 4.108925985212288, "grad_norm": 0.571546196937561, "learning_rate": 7.634012912797617e-06, "loss": 0.0219974085688591, "memory(GiB)": 122.96, "step": 53905, "token_acc": 0.9889088729016786, "train_speed(iter/s)": 0.231058 }, { "epoch": 4.109307111822547, "grad_norm": 0.8993967771530151, "learning_rate": 7.627655233047237e-06, "loss": 0.027530187368392946, "memory(GiB)": 122.96, "step": 53910, "token_acc": 0.9872155458961902, "train_speed(iter/s)": 0.231063 }, { "epoch": 4.109688238432807, "grad_norm": 0.7479774951934814, "learning_rate": 7.621299983154201e-06, "loss": 0.04079717993736267, "memory(GiB)": 122.96, "step": 53915, "token_acc": 0.985969387755102, "train_speed(iter/s)": 0.231065 }, { "epoch": 4.110069365043067, "grad_norm": 0.8657191395759583, "learning_rate": 7.614947163482949e-06, "loss": 0.02245510518550873, "memory(GiB)": 122.96, "step": 53920, "token_acc": 0.9910504003768252, "train_speed(iter/s)": 0.23107 }, { "epoch": 4.110450491653327, "grad_norm": 0.888566255569458, "learning_rate": 7.608596774397797e-06, "loss": 0.031218892335891722, "memory(GiB)": 122.96, "step": 53925, "token_acc": 0.9862454655380894, "train_speed(iter/s)": 0.231072 }, { "epoch": 4.110831618263587, "grad_norm": 1.1260439157485962, "learning_rate": 7.602248816262891e-06, "loss": 0.017646652460098267, "memory(GiB)": 122.96, "step": 53930, "token_acc": 0.9916083916083916, "train_speed(iter/s)": 0.231075 }, { "epoch": 4.111212744873847, "grad_norm": 0.44401630759239197, "learning_rate": 7.595903289442263e-06, "loss": 0.02998282015323639, "memory(GiB)": 122.96, "step": 53935, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.231083 }, { "epoch": 4.111593871484107, "grad_norm": 1.7398289442062378, "learning_rate": 7.589560194299816e-06, "loss": 0.04052813947200775, "memory(GiB)": 122.96, "step": 53940, "token_acc": 0.9829326923076923, "train_speed(iter/s)": 0.231088 }, { "epoch": 4.111974998094367, "grad_norm": 0.6649186611175537, "learning_rate": 7.583219531199271e-06, "loss": 0.036061665415763854, "memory(GiB)": 122.96, "step": 53945, "token_acc": 0.9877430262045647, "train_speed(iter/s)": 0.231091 }, { "epoch": 4.112356124704627, "grad_norm": 2.760192632675171, "learning_rate": 7.576881300504257e-06, "loss": 0.032847973704338077, "memory(GiB)": 122.96, "step": 53950, "token_acc": 0.9886547811993517, "train_speed(iter/s)": 0.231097 }, { "epoch": 4.112737251314887, "grad_norm": 2.153844118118286, "learning_rate": 7.570545502578224e-06, "loss": 0.03666276931762695, "memory(GiB)": 122.96, "step": 53955, "token_acc": 0.9839968774395004, "train_speed(iter/s)": 0.231104 }, { "epoch": 4.113118377925146, "grad_norm": 1.1987278461456299, "learning_rate": 7.56421213778451e-06, "loss": 0.03232462704181671, "memory(GiB)": 122.96, "step": 53960, "token_acc": 0.9875551987153753, "train_speed(iter/s)": 0.231107 }, { "epoch": 4.113499504535406, "grad_norm": 0.6834537982940674, "learning_rate": 7.557881206486317e-06, "loss": 0.02597138285636902, "memory(GiB)": 122.96, "step": 53965, "token_acc": 0.9885100537030098, "train_speed(iter/s)": 0.231109 }, { "epoch": 4.113880631145666, "grad_norm": 1.0592851638793945, "learning_rate": 7.5515527090466666e-06, "loss": 0.026524096727371216, "memory(GiB)": 122.96, "step": 53970, "token_acc": 0.9906962785114045, "train_speed(iter/s)": 0.231115 }, { "epoch": 4.114261757755926, "grad_norm": 0.88997882604599, "learning_rate": 7.545226645828485e-06, "loss": 0.032077175378799436, "memory(GiB)": 122.96, "step": 53975, "token_acc": 0.9844182321246542, "train_speed(iter/s)": 0.231119 }, { "epoch": 4.114642884366186, "grad_norm": 0.4282940626144409, "learning_rate": 7.538903017194548e-06, "loss": 0.043998444080352785, "memory(GiB)": 122.96, "step": 53980, "token_acc": 0.988351776354106, "train_speed(iter/s)": 0.231122 }, { "epoch": 4.115024010976446, "grad_norm": 2.0474283695220947, "learning_rate": 7.532581823507473e-06, "loss": 0.055157136917114255, "memory(GiB)": 122.96, "step": 53985, "token_acc": 0.9817813765182186, "train_speed(iter/s)": 0.231127 }, { "epoch": 4.1154051375867065, "grad_norm": 1.3407340049743652, "learning_rate": 7.526263065129757e-06, "loss": 0.04826726317405701, "memory(GiB)": 122.96, "step": 53990, "token_acc": 0.9766847652507186, "train_speed(iter/s)": 0.231134 }, { "epoch": 4.1157862641969665, "grad_norm": 1.2001365423202515, "learning_rate": 7.519946742423761e-06, "loss": 0.03424661159515381, "memory(GiB)": 122.96, "step": 53995, "token_acc": 0.9889994761655317, "train_speed(iter/s)": 0.23114 }, { "epoch": 4.1161673908072265, "grad_norm": 1.344584345817566, "learning_rate": 7.513632855751679e-06, "loss": 0.03057123124599457, "memory(GiB)": 122.96, "step": 54000, "token_acc": 0.9875996457041629, "train_speed(iter/s)": 0.231146 }, { "epoch": 4.1161673908072265, "eval_loss": 0.05280463770031929, "eval_runtime": 195.5238, "eval_samples_per_second": 2.711, "eval_steps_per_second": 2.711, "eval_token_acc": 0.9787286910427083, "step": 54000 }, { "epoch": 4.1165485174174865, "grad_norm": 1.1232130527496338, "learning_rate": 7.507321405475593e-06, "loss": 0.02602834105491638, "memory(GiB)": 122.96, "step": 54005, "token_acc": 0.9791785572372936, "train_speed(iter/s)": 0.230956 }, { "epoch": 4.116929644027746, "grad_norm": 1.1146057844161987, "learning_rate": 7.501012391957446e-06, "loss": 0.03814407587051392, "memory(GiB)": 122.96, "step": 54010, "token_acc": 0.9869550858652576, "train_speed(iter/s)": 0.230961 }, { "epoch": 4.117310770638006, "grad_norm": 0.8726955652236938, "learning_rate": 7.494705815559005e-06, "loss": 0.048566070199012754, "memory(GiB)": 122.96, "step": 54015, "token_acc": 0.9827439646378783, "train_speed(iter/s)": 0.230963 }, { "epoch": 4.117691897248266, "grad_norm": 1.1162950992584229, "learning_rate": 7.488401676641937e-06, "loss": 0.03790769577026367, "memory(GiB)": 122.96, "step": 54020, "token_acc": 0.98385934572705, "train_speed(iter/s)": 0.230965 }, { "epoch": 4.118073023858526, "grad_norm": 0.887414813041687, "learning_rate": 7.482099975567763e-06, "loss": 0.024767863750457763, "memory(GiB)": 122.96, "step": 54025, "token_acc": 0.991093326448948, "train_speed(iter/s)": 0.230968 }, { "epoch": 4.118454150468786, "grad_norm": 1.1256111860275269, "learning_rate": 7.475800712697845e-06, "loss": 0.03337015509605408, "memory(GiB)": 122.96, "step": 54030, "token_acc": 0.9882667286245354, "train_speed(iter/s)": 0.23097 }, { "epoch": 4.118835277079046, "grad_norm": 0.9180206656455994, "learning_rate": 7.4695038883934145e-06, "loss": 0.019051040709018707, "memory(GiB)": 122.96, "step": 54035, "token_acc": 0.9912795871151451, "train_speed(iter/s)": 0.230975 }, { "epoch": 4.119216403689306, "grad_norm": 1.0820484161376953, "learning_rate": 7.463209503015567e-06, "loss": 0.049337157607078554, "memory(GiB)": 122.96, "step": 54040, "token_acc": 0.9833729216152018, "train_speed(iter/s)": 0.230981 }, { "epoch": 4.119597530299566, "grad_norm": 0.7786101698875427, "learning_rate": 7.4569175569252635e-06, "loss": 0.021539323031902313, "memory(GiB)": 122.96, "step": 54045, "token_acc": 0.9906485671191554, "train_speed(iter/s)": 0.230987 }, { "epoch": 4.119978656909826, "grad_norm": 2.0482399463653564, "learning_rate": 7.450628050483327e-06, "loss": 0.02544601559638977, "memory(GiB)": 122.96, "step": 54050, "token_acc": 0.9887024991441288, "train_speed(iter/s)": 0.230994 }, { "epoch": 4.120359783520086, "grad_norm": 0.7718355059623718, "learning_rate": 7.444340984050407e-06, "loss": 0.031000304222106933, "memory(GiB)": 122.96, "step": 54055, "token_acc": 0.9897552646556631, "train_speed(iter/s)": 0.231001 }, { "epoch": 4.120740910130345, "grad_norm": 0.9473495483398438, "learning_rate": 7.438056357987044e-06, "loss": 0.024207308888435364, "memory(GiB)": 122.96, "step": 54060, "token_acc": 0.9922547332185886, "train_speed(iter/s)": 0.231007 }, { "epoch": 4.121122036740605, "grad_norm": 0.958797037601471, "learning_rate": 7.431774172653655e-06, "loss": 0.042241919040679934, "memory(GiB)": 122.96, "step": 54065, "token_acc": 0.9837784696051123, "train_speed(iter/s)": 0.231013 }, { "epoch": 4.121503163350865, "grad_norm": 0.6725132465362549, "learning_rate": 7.425494428410462e-06, "loss": 0.043928790092468264, "memory(GiB)": 122.96, "step": 54070, "token_acc": 0.9849296718017415, "train_speed(iter/s)": 0.231013 }, { "epoch": 4.121884289961125, "grad_norm": 1.009594202041626, "learning_rate": 7.419217125617595e-06, "loss": 0.022415055334568022, "memory(GiB)": 122.96, "step": 54075, "token_acc": 0.9887420460107684, "train_speed(iter/s)": 0.231021 }, { "epoch": 4.122265416571385, "grad_norm": 1.5367140769958496, "learning_rate": 7.4129422646350365e-06, "loss": 0.04991730749607086, "memory(GiB)": 122.96, "step": 54080, "token_acc": 0.974937343358396, "train_speed(iter/s)": 0.231027 }, { "epoch": 4.122646543181645, "grad_norm": 0.8611154556274414, "learning_rate": 7.406669845822606e-06, "loss": 0.0196261391043663, "memory(GiB)": 122.96, "step": 54085, "token_acc": 0.991674595623216, "train_speed(iter/s)": 0.231032 }, { "epoch": 4.123027669791905, "grad_norm": 0.6574738621711731, "learning_rate": 7.4003998695399926e-06, "loss": 0.028351446986198424, "memory(GiB)": 122.96, "step": 54090, "token_acc": 0.9905350589144292, "train_speed(iter/s)": 0.231038 }, { "epoch": 4.123408796402165, "grad_norm": 1.796790361404419, "learning_rate": 7.39413233614678e-06, "loss": 0.044778579473495485, "memory(GiB)": 122.96, "step": 54095, "token_acc": 0.9848731294729993, "train_speed(iter/s)": 0.231043 }, { "epoch": 4.123789923012425, "grad_norm": 0.4545847773551941, "learning_rate": 7.387867246002345e-06, "loss": 0.05924106240272522, "memory(GiB)": 122.96, "step": 54100, "token_acc": 0.9868727503705272, "train_speed(iter/s)": 0.231049 }, { "epoch": 4.124171049622685, "grad_norm": 2.6126229763031006, "learning_rate": 7.381604599465986e-06, "loss": 0.0516531229019165, "memory(GiB)": 122.96, "step": 54105, "token_acc": 0.9864066193853428, "train_speed(iter/s)": 0.231055 }, { "epoch": 4.124552176232944, "grad_norm": 1.7979004383087158, "learning_rate": 7.375344396896833e-06, "loss": 0.0358661025762558, "memory(GiB)": 122.96, "step": 54110, "token_acc": 0.9854935351624093, "train_speed(iter/s)": 0.231063 }, { "epoch": 4.124933302843204, "grad_norm": 0.5016605854034424, "learning_rate": 7.369086638653872e-06, "loss": 0.016060033440589906, "memory(GiB)": 122.96, "step": 54115, "token_acc": 0.9943490054249547, "train_speed(iter/s)": 0.231068 }, { "epoch": 4.125314429453464, "grad_norm": 0.6997796893119812, "learning_rate": 7.362831325095971e-06, "loss": 0.03062770962715149, "memory(GiB)": 122.96, "step": 54120, "token_acc": 0.9860043063672717, "train_speed(iter/s)": 0.231072 }, { "epoch": 4.125695556063724, "grad_norm": 1.515255331993103, "learning_rate": 7.356578456581814e-06, "loss": 0.04019281566143036, "memory(GiB)": 122.96, "step": 54125, "token_acc": 0.9855960772295433, "train_speed(iter/s)": 0.231078 }, { "epoch": 4.126076682673984, "grad_norm": 0.49438560009002686, "learning_rate": 7.35032803347e-06, "loss": 0.023660190403461456, "memory(GiB)": 122.96, "step": 54130, "token_acc": 0.9901048589573179, "train_speed(iter/s)": 0.231082 }, { "epoch": 4.126457809284244, "grad_norm": 0.9157177805900574, "learning_rate": 7.344080056118963e-06, "loss": 0.032107359170913695, "memory(GiB)": 122.96, "step": 54135, "token_acc": 0.987489574645538, "train_speed(iter/s)": 0.231088 }, { "epoch": 4.126838935894504, "grad_norm": 0.9151263236999512, "learning_rate": 7.337834524886977e-06, "loss": 0.04277883172035217, "memory(GiB)": 122.96, "step": 54140, "token_acc": 0.9780693533270853, "train_speed(iter/s)": 0.231095 }, { "epoch": 4.127220062504764, "grad_norm": 0.6187189221382141, "learning_rate": 7.331591440132207e-06, "loss": 0.029391607642173766, "memory(GiB)": 122.96, "step": 54145, "token_acc": 0.9881470730527334, "train_speed(iter/s)": 0.231098 }, { "epoch": 4.127601189115024, "grad_norm": 1.037352204322815, "learning_rate": 7.325350802212672e-06, "loss": 0.04067699313163757, "memory(GiB)": 122.96, "step": 54150, "token_acc": 0.9811373092926491, "train_speed(iter/s)": 0.231106 }, { "epoch": 4.1279823157252835, "grad_norm": 1.283240556716919, "learning_rate": 7.319112611486217e-06, "loss": 0.046956110000610354, "memory(GiB)": 122.96, "step": 54155, "token_acc": 0.9811616954474097, "train_speed(iter/s)": 0.23111 }, { "epoch": 4.1283634423355435, "grad_norm": 1.8908188343048096, "learning_rate": 7.312876868310597e-06, "loss": 0.046372953057289126, "memory(GiB)": 122.96, "step": 54160, "token_acc": 0.9832912988650694, "train_speed(iter/s)": 0.231117 }, { "epoch": 4.1287445689458036, "grad_norm": 1.1729605197906494, "learning_rate": 7.306643573043409e-06, "loss": 0.02255253493785858, "memory(GiB)": 122.96, "step": 54165, "token_acc": 0.9904327027614699, "train_speed(iter/s)": 0.231122 }, { "epoch": 4.129125695556064, "grad_norm": 1.2502931356430054, "learning_rate": 7.3004127260420715e-06, "loss": 0.049557891488075254, "memory(GiB)": 122.96, "step": 54170, "token_acc": 0.980883735506111, "train_speed(iter/s)": 0.231127 }, { "epoch": 4.129506822166324, "grad_norm": 1.143985390663147, "learning_rate": 7.294184327663922e-06, "loss": 0.025493612885475157, "memory(GiB)": 122.96, "step": 54175, "token_acc": 0.9853202846975089, "train_speed(iter/s)": 0.231132 }, { "epoch": 4.129887948776584, "grad_norm": 1.2726123332977295, "learning_rate": 7.287958378266119e-06, "loss": 0.03591393232345581, "memory(GiB)": 122.96, "step": 54180, "token_acc": 0.9833822091886608, "train_speed(iter/s)": 0.231139 }, { "epoch": 4.130269075386844, "grad_norm": 0.586693286895752, "learning_rate": 7.281734878205692e-06, "loss": 0.024769186973571777, "memory(GiB)": 122.96, "step": 54185, "token_acc": 0.9910637566592198, "train_speed(iter/s)": 0.231145 }, { "epoch": 4.130650201997104, "grad_norm": 1.287660002708435, "learning_rate": 7.275513827839547e-06, "loss": 0.027529072761535645, "memory(GiB)": 122.96, "step": 54190, "token_acc": 0.9871852517985612, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.131031328607364, "grad_norm": 1.3025212287902832, "learning_rate": 7.269295227524403e-06, "loss": 0.02406333088874817, "memory(GiB)": 122.96, "step": 54195, "token_acc": 0.9913544668587896, "train_speed(iter/s)": 0.231157 }, { "epoch": 4.131412455217624, "grad_norm": 1.9282519817352295, "learning_rate": 7.263079077616886e-06, "loss": 0.03414790332317352, "memory(GiB)": 122.96, "step": 54200, "token_acc": 0.9884481255448997, "train_speed(iter/s)": 0.231162 }, { "epoch": 4.131412455217624, "eval_loss": 0.05176291614770889, "eval_runtime": 175.9374, "eval_samples_per_second": 3.012, "eval_steps_per_second": 3.012, "eval_token_acc": 0.9791503523884103, "step": 54200 }, { "epoch": 4.131793581827883, "grad_norm": 0.47848907113075256, "learning_rate": 7.2568653784734706e-06, "loss": 0.031965500116348265, "memory(GiB)": 122.96, "step": 54205, "token_acc": 0.9794315846947426, "train_speed(iter/s)": 0.230992 }, { "epoch": 4.132174708438143, "grad_norm": 1.4010511636734009, "learning_rate": 7.250654130450468e-06, "loss": 0.021611602604389192, "memory(GiB)": 122.96, "step": 54210, "token_acc": 0.9939485627836612, "train_speed(iter/s)": 0.230999 }, { "epoch": 4.132555835048403, "grad_norm": 0.5484176278114319, "learning_rate": 7.244445333904065e-06, "loss": 0.025626558065414428, "memory(GiB)": 122.96, "step": 54215, "token_acc": 0.9917563930013459, "train_speed(iter/s)": 0.231004 }, { "epoch": 4.132936961658663, "grad_norm": 0.08235243707895279, "learning_rate": 7.238238989190321e-06, "loss": 0.034170085191726686, "memory(GiB)": 122.96, "step": 54220, "token_acc": 0.9839158676152181, "train_speed(iter/s)": 0.231011 }, { "epoch": 4.133318088268923, "grad_norm": 1.7296148538589478, "learning_rate": 7.232035096665124e-06, "loss": 0.04423507750034332, "memory(GiB)": 122.96, "step": 54225, "token_acc": 0.9879920960632315, "train_speed(iter/s)": 0.231015 }, { "epoch": 4.133699214879183, "grad_norm": 1.0189650058746338, "learning_rate": 7.225833656684245e-06, "loss": 0.024519374966621398, "memory(GiB)": 122.96, "step": 54230, "token_acc": 0.9919177075679647, "train_speed(iter/s)": 0.231021 }, { "epoch": 4.134080341489443, "grad_norm": 1.2092605829238892, "learning_rate": 7.21963466960332e-06, "loss": 0.03509757220745087, "memory(GiB)": 122.96, "step": 54235, "token_acc": 0.9862784471218207, "train_speed(iter/s)": 0.231028 }, { "epoch": 4.134461468099703, "grad_norm": 0.9455986618995667, "learning_rate": 7.213438135777817e-06, "loss": 0.030011487007141114, "memory(GiB)": 122.96, "step": 54240, "token_acc": 0.9860681114551083, "train_speed(iter/s)": 0.231033 }, { "epoch": 4.134842594709963, "grad_norm": 1.5073325634002686, "learning_rate": 7.207244055563072e-06, "loss": 0.015232709050178529, "memory(GiB)": 122.96, "step": 54245, "token_acc": 0.9937810945273632, "train_speed(iter/s)": 0.231041 }, { "epoch": 4.135223721320223, "grad_norm": 1.2452137470245361, "learning_rate": 7.20105242931432e-06, "loss": 0.03927369117736816, "memory(GiB)": 122.96, "step": 54250, "token_acc": 0.9873046875, "train_speed(iter/s)": 0.231048 }, { "epoch": 4.135604847930482, "grad_norm": 1.4517422914505005, "learning_rate": 7.19486325738658e-06, "loss": 0.040998023748397824, "memory(GiB)": 122.96, "step": 54255, "token_acc": 0.9868593955321945, "train_speed(iter/s)": 0.231054 }, { "epoch": 4.135985974540742, "grad_norm": 0.7554478645324707, "learning_rate": 7.188676540134798e-06, "loss": 0.027133870124816894, "memory(GiB)": 122.96, "step": 54260, "token_acc": 0.9888261975882288, "train_speed(iter/s)": 0.231054 }, { "epoch": 4.136367101151002, "grad_norm": 1.12729811668396, "learning_rate": 7.182492277913755e-06, "loss": 0.03869348764419556, "memory(GiB)": 122.96, "step": 54265, "token_acc": 0.9841535686678691, "train_speed(iter/s)": 0.231056 }, { "epoch": 4.136748227761262, "grad_norm": 1.4737827777862549, "learning_rate": 7.176310471078073e-06, "loss": 0.0330334484577179, "memory(GiB)": 122.96, "step": 54270, "token_acc": 0.9871662181742911, "train_speed(iter/s)": 0.231062 }, { "epoch": 4.137129354371522, "grad_norm": 1.3278594017028809, "learning_rate": 7.170131119982259e-06, "loss": 0.029538795351982117, "memory(GiB)": 122.96, "step": 54275, "token_acc": 0.9859215941087286, "train_speed(iter/s)": 0.231069 }, { "epoch": 4.137510480981782, "grad_norm": 1.4173715114593506, "learning_rate": 7.163954224980679e-06, "loss": 0.03545851111412048, "memory(GiB)": 122.96, "step": 54280, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.231072 }, { "epoch": 4.137891607592042, "grad_norm": 0.0008639300358481705, "learning_rate": 7.157779786427532e-06, "loss": 0.02106224447488785, "memory(GiB)": 122.96, "step": 54285, "token_acc": 0.9904878797177048, "train_speed(iter/s)": 0.231076 }, { "epoch": 4.138272734202302, "grad_norm": 0.7476704716682434, "learning_rate": 7.151607804676913e-06, "loss": 0.03654558658599853, "memory(GiB)": 122.96, "step": 54290, "token_acc": 0.9849424269264836, "train_speed(iter/s)": 0.231082 }, { "epoch": 4.138653860812562, "grad_norm": 0.8115509748458862, "learning_rate": 7.145438280082739e-06, "loss": 0.021607619524002076, "memory(GiB)": 122.96, "step": 54295, "token_acc": 0.9888505980133793, "train_speed(iter/s)": 0.231088 }, { "epoch": 4.139034987422822, "grad_norm": 1.6072876453399658, "learning_rate": 7.139271212998805e-06, "loss": 0.031310233473777774, "memory(GiB)": 122.96, "step": 54300, "token_acc": 0.9843406593406593, "train_speed(iter/s)": 0.231094 }, { "epoch": 4.139416114033081, "grad_norm": 0.79038405418396, "learning_rate": 7.133106603778777e-06, "loss": 0.029432064294815062, "memory(GiB)": 122.96, "step": 54305, "token_acc": 0.985831729928284, "train_speed(iter/s)": 0.231098 }, { "epoch": 4.139797240643341, "grad_norm": 0.9858720898628235, "learning_rate": 7.126944452776141e-06, "loss": 0.0330751359462738, "memory(GiB)": 122.96, "step": 54310, "token_acc": 0.9854333576110706, "train_speed(iter/s)": 0.231102 }, { "epoch": 4.140178367253601, "grad_norm": 0.8208244442939758, "learning_rate": 7.1207847603442955e-06, "loss": 0.025164368748664855, "memory(GiB)": 122.96, "step": 54315, "token_acc": 0.9873486354599674, "train_speed(iter/s)": 0.231106 }, { "epoch": 4.1405594938638615, "grad_norm": 2.3016769886016846, "learning_rate": 7.11462752683647e-06, "loss": 0.024252060055732726, "memory(GiB)": 122.96, "step": 54320, "token_acc": 0.9920032807053516, "train_speed(iter/s)": 0.231112 }, { "epoch": 4.1409406204741215, "grad_norm": 0.8271681666374207, "learning_rate": 7.108472752605738e-06, "loss": 0.05321881175041199, "memory(GiB)": 122.96, "step": 54325, "token_acc": 0.9767736486486487, "train_speed(iter/s)": 0.231116 }, { "epoch": 4.1413217470843815, "grad_norm": 1.162315845489502, "learning_rate": 7.1023204380050445e-06, "loss": 0.02521146535873413, "memory(GiB)": 122.96, "step": 54330, "token_acc": 0.9847773279352227, "train_speed(iter/s)": 0.231122 }, { "epoch": 4.1417028736946415, "grad_norm": 1.1290749311447144, "learning_rate": 7.096170583387224e-06, "loss": 0.05417289733886719, "memory(GiB)": 122.96, "step": 54335, "token_acc": 0.9793575321813754, "train_speed(iter/s)": 0.231127 }, { "epoch": 4.1420840003049015, "grad_norm": 1.4262025356292725, "learning_rate": 7.090023189104905e-06, "loss": 0.02608681619167328, "memory(GiB)": 122.96, "step": 54340, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.231133 }, { "epoch": 4.1424651269151616, "grad_norm": 0.7504939436912537, "learning_rate": 7.083878255510639e-06, "loss": 0.03740245997905731, "memory(GiB)": 122.96, "step": 54345, "token_acc": 0.9861040929762507, "train_speed(iter/s)": 0.23114 }, { "epoch": 4.142846253525422, "grad_norm": 1.5690531730651855, "learning_rate": 7.077735782956796e-06, "loss": 0.019555152952671052, "memory(GiB)": 122.96, "step": 54350, "token_acc": 0.9946969696969697, "train_speed(iter/s)": 0.231148 }, { "epoch": 4.143227380135681, "grad_norm": 1.5397980213165283, "learning_rate": 7.071595771795614e-06, "loss": 0.027711158990859984, "memory(GiB)": 122.96, "step": 54355, "token_acc": 0.9876328165824769, "train_speed(iter/s)": 0.231152 }, { "epoch": 4.143608506745941, "grad_norm": 1.072394609451294, "learning_rate": 7.065458222379217e-06, "loss": 0.02970227599143982, "memory(GiB)": 122.96, "step": 54360, "token_acc": 0.9889454726120979, "train_speed(iter/s)": 0.231155 }, { "epoch": 4.143989633356201, "grad_norm": 0.9242489337921143, "learning_rate": 7.059323135059542e-06, "loss": 0.044575875997543334, "memory(GiB)": 122.96, "step": 54365, "token_acc": 0.9818773234200744, "train_speed(iter/s)": 0.231162 }, { "epoch": 4.144370759966461, "grad_norm": 1.4162697792053223, "learning_rate": 7.0531905101884125e-06, "loss": 0.058882874250411985, "memory(GiB)": 122.96, "step": 54370, "token_acc": 0.9786453119786453, "train_speed(iter/s)": 0.231166 }, { "epoch": 4.144751886576721, "grad_norm": 0.9566333889961243, "learning_rate": 7.047060348117524e-06, "loss": 0.0292243093252182, "memory(GiB)": 122.96, "step": 54375, "token_acc": 0.9910008181074448, "train_speed(iter/s)": 0.231171 }, { "epoch": 4.145133013186981, "grad_norm": 0.7626445889472961, "learning_rate": 7.040932649198384e-06, "loss": 0.032679778337478635, "memory(GiB)": 122.96, "step": 54380, "token_acc": 0.9908096280087527, "train_speed(iter/s)": 0.231176 }, { "epoch": 4.145514139797241, "grad_norm": 1.1416856050491333, "learning_rate": 7.034807413782407e-06, "loss": 0.013089582324028015, "memory(GiB)": 122.96, "step": 54385, "token_acc": 0.9936020473448497, "train_speed(iter/s)": 0.231185 }, { "epoch": 4.145895266407501, "grad_norm": 0.8329086303710938, "learning_rate": 7.02868464222085e-06, "loss": 0.03626088500022888, "memory(GiB)": 122.96, "step": 54390, "token_acc": 0.9837084313066846, "train_speed(iter/s)": 0.231189 }, { "epoch": 4.146276393017761, "grad_norm": 1.4032717943191528, "learning_rate": 7.022564334864806e-06, "loss": 0.031080877780914305, "memory(GiB)": 122.96, "step": 54395, "token_acc": 0.9886130296807915, "train_speed(iter/s)": 0.231193 }, { "epoch": 4.14665751962802, "grad_norm": 1.0799059867858887, "learning_rate": 7.01644649206526e-06, "loss": 0.037841585278511045, "memory(GiB)": 122.96, "step": 54400, "token_acc": 0.9827691369319914, "train_speed(iter/s)": 0.231198 }, { "epoch": 4.14665751962802, "eval_loss": 0.05173661559820175, "eval_runtime": 175.3335, "eval_samples_per_second": 3.023, "eval_steps_per_second": 3.023, "eval_token_acc": 0.9790675260526475, "step": 54400 }, { "epoch": 4.14703864623828, "grad_norm": 0.9243777990341187, "learning_rate": 7.010331114173047e-06, "loss": 0.016247971355915068, "memory(GiB)": 122.96, "step": 54405, "token_acc": 0.9793069336168202, "train_speed(iter/s)": 0.231035 }, { "epoch": 4.14741977284854, "grad_norm": 0.6836816072463989, "learning_rate": 7.004218201538837e-06, "loss": 0.03287810683250427, "memory(GiB)": 122.96, "step": 54410, "token_acc": 0.984984984984985, "train_speed(iter/s)": 0.231042 }, { "epoch": 4.1478008994588, "grad_norm": 1.482722282409668, "learning_rate": 6.998107754513189e-06, "loss": 0.03759737312793732, "memory(GiB)": 122.96, "step": 54415, "token_acc": 0.9842300556586271, "train_speed(iter/s)": 0.231049 }, { "epoch": 4.14818202606906, "grad_norm": 0.6378282904624939, "learning_rate": 6.991999773446523e-06, "loss": 0.03176992833614349, "memory(GiB)": 122.96, "step": 54420, "token_acc": 0.9894212818917237, "train_speed(iter/s)": 0.231056 }, { "epoch": 4.14856315267932, "grad_norm": 1.6748992204666138, "learning_rate": 6.9858942586890745e-06, "loss": 0.03512975573539734, "memory(GiB)": 122.96, "step": 54425, "token_acc": 0.9876365134968061, "train_speed(iter/s)": 0.231061 }, { "epoch": 4.14894427928958, "grad_norm": 1.337565302848816, "learning_rate": 6.9797912105909806e-06, "loss": 0.03512915372848511, "memory(GiB)": 122.96, "step": 54430, "token_acc": 0.9826200304149467, "train_speed(iter/s)": 0.231067 }, { "epoch": 4.14932540589984, "grad_norm": 1.323195219039917, "learning_rate": 6.973690629502233e-06, "loss": 0.02770017981529236, "memory(GiB)": 122.96, "step": 54435, "token_acc": 0.9881411206640972, "train_speed(iter/s)": 0.231073 }, { "epoch": 4.1497065325101, "grad_norm": 0.6644495129585266, "learning_rate": 6.9675925157726415e-06, "loss": 0.020978583395481108, "memory(GiB)": 122.96, "step": 54440, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.23108 }, { "epoch": 4.15008765912036, "grad_norm": 0.9146669507026672, "learning_rate": 6.961496869751943e-06, "loss": 0.05959935784339905, "memory(GiB)": 122.96, "step": 54445, "token_acc": 0.9796475552246215, "train_speed(iter/s)": 0.231087 }, { "epoch": 4.15046878573062, "grad_norm": 0.9155683517456055, "learning_rate": 6.95540369178967e-06, "loss": 0.03274225890636444, "memory(GiB)": 122.96, "step": 54450, "token_acc": 0.9856602270464051, "train_speed(iter/s)": 0.231092 }, { "epoch": 4.150849912340879, "grad_norm": 1.105959177017212, "learning_rate": 6.94931298223524e-06, "loss": 0.028896591067314147, "memory(GiB)": 122.96, "step": 54455, "token_acc": 0.9909701037217816, "train_speed(iter/s)": 0.231095 }, { "epoch": 4.151231038951139, "grad_norm": 0.6945962309837341, "learning_rate": 6.943224741437943e-06, "loss": 0.02106776535511017, "memory(GiB)": 122.96, "step": 54460, "token_acc": 0.9887921783712889, "train_speed(iter/s)": 0.231097 }, { "epoch": 4.151612165561399, "grad_norm": 1.1662766933441162, "learning_rate": 6.937138969746887e-06, "loss": 0.030979758501052855, "memory(GiB)": 122.96, "step": 54465, "token_acc": 0.9877338239803741, "train_speed(iter/s)": 0.231101 }, { "epoch": 4.151993292171659, "grad_norm": 2.59840989112854, "learning_rate": 6.9310556675110775e-06, "loss": 0.035133495926856995, "memory(GiB)": 122.96, "step": 54470, "token_acc": 0.9900714161295942, "train_speed(iter/s)": 0.231105 }, { "epoch": 4.152374418781919, "grad_norm": 0.9418238401412964, "learning_rate": 6.924974835079368e-06, "loss": 0.035858321189880374, "memory(GiB)": 122.96, "step": 54475, "token_acc": 0.9847741935483871, "train_speed(iter/s)": 0.23111 }, { "epoch": 4.152755545392179, "grad_norm": 3.405599594116211, "learning_rate": 6.918896472800451e-06, "loss": 0.04519959092140198, "memory(GiB)": 122.96, "step": 54480, "token_acc": 0.9881386861313869, "train_speed(iter/s)": 0.231116 }, { "epoch": 4.153136672002439, "grad_norm": 0.9627317786216736, "learning_rate": 6.9128205810229e-06, "loss": 0.032913050055503844, "memory(GiB)": 122.96, "step": 54485, "token_acc": 0.988956587966489, "train_speed(iter/s)": 0.231124 }, { "epoch": 4.153517798612699, "grad_norm": 0.8504566550254822, "learning_rate": 6.906747160095151e-06, "loss": 0.02143979221582413, "memory(GiB)": 122.96, "step": 54490, "token_acc": 0.9918062636562273, "train_speed(iter/s)": 0.231127 }, { "epoch": 4.1538989252229594, "grad_norm": 1.230493426322937, "learning_rate": 6.900676210365464e-06, "loss": 0.027364462614059448, "memory(GiB)": 122.96, "step": 54495, "token_acc": 0.9872262773722628, "train_speed(iter/s)": 0.231136 }, { "epoch": 4.154280051833219, "grad_norm": 0.7406442761421204, "learning_rate": 6.894607732181996e-06, "loss": 0.027772486209869385, "memory(GiB)": 122.96, "step": 54500, "token_acc": 0.9895908980876301, "train_speed(iter/s)": 0.231142 }, { "epoch": 4.154661178443479, "grad_norm": 0.4464856684207916, "learning_rate": 6.888541725892745e-06, "loss": 0.03033764362335205, "memory(GiB)": 122.96, "step": 54505, "token_acc": 0.9910087103118853, "train_speed(iter/s)": 0.231146 }, { "epoch": 4.155042305053739, "grad_norm": 1.106855869293213, "learning_rate": 6.882478191845559e-06, "loss": 0.03171315789222717, "memory(GiB)": 122.96, "step": 54510, "token_acc": 0.9839525553811268, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.155423431663999, "grad_norm": 0.9480795860290527, "learning_rate": 6.876417130388163e-06, "loss": 0.0354744017124176, "memory(GiB)": 122.96, "step": 54515, "token_acc": 0.9899637580150543, "train_speed(iter/s)": 0.231157 }, { "epoch": 4.155804558274259, "grad_norm": 1.1103415489196777, "learning_rate": 6.870358541868121e-06, "loss": 0.03855787217617035, "memory(GiB)": 122.96, "step": 54520, "token_acc": 0.9847274018824365, "train_speed(iter/s)": 0.23116 }, { "epoch": 4.156185684884519, "grad_norm": 2.3169593811035156, "learning_rate": 6.864302426632868e-06, "loss": 0.036573287844657895, "memory(GiB)": 122.96, "step": 54525, "token_acc": 0.9908355795148248, "train_speed(iter/s)": 0.231167 }, { "epoch": 4.156566811494779, "grad_norm": 1.2677106857299805, "learning_rate": 6.858248785029708e-06, "loss": 0.03983940482139588, "memory(GiB)": 122.96, "step": 54530, "token_acc": 0.9829835507657402, "train_speed(iter/s)": 0.231171 }, { "epoch": 4.156947938105039, "grad_norm": 1.3369802236557007, "learning_rate": 6.85219761740577e-06, "loss": 0.028627288341522217, "memory(GiB)": 122.96, "step": 54535, "token_acc": 0.9822569198012775, "train_speed(iter/s)": 0.231178 }, { "epoch": 4.157329064715299, "grad_norm": 2.0361244678497314, "learning_rate": 6.84614892410807e-06, "loss": 0.029270190000534057, "memory(GiB)": 122.96, "step": 54540, "token_acc": 0.9867790207758245, "train_speed(iter/s)": 0.231183 }, { "epoch": 4.157710191325559, "grad_norm": 0.16237960755825043, "learning_rate": 6.840102705483475e-06, "loss": 0.036707690358161925, "memory(GiB)": 122.96, "step": 54545, "token_acc": 0.9826440677966102, "train_speed(iter/s)": 0.231186 }, { "epoch": 4.158091317935818, "grad_norm": 0.8695746064186096, "learning_rate": 6.834058961878698e-06, "loss": 0.03798233270645142, "memory(GiB)": 122.96, "step": 54550, "token_acc": 0.9834278238116005, "train_speed(iter/s)": 0.231188 }, { "epoch": 4.158472444546078, "grad_norm": 0.6707727909088135, "learning_rate": 6.828017693640321e-06, "loss": 0.04469782710075378, "memory(GiB)": 122.96, "step": 54555, "token_acc": 0.9850470430107527, "train_speed(iter/s)": 0.231193 }, { "epoch": 4.158853571156338, "grad_norm": 1.0692318677902222, "learning_rate": 6.821978901114801e-06, "loss": 0.04296365976333618, "memory(GiB)": 122.96, "step": 54560, "token_acc": 0.9816525871172123, "train_speed(iter/s)": 0.231198 }, { "epoch": 4.159234697766598, "grad_norm": 0.7580469846725464, "learning_rate": 6.815942584648405e-06, "loss": 0.04258651733398437, "memory(GiB)": 122.96, "step": 54565, "token_acc": 0.9891678975873953, "train_speed(iter/s)": 0.231203 }, { "epoch": 4.159615824376858, "grad_norm": 2.616116762161255, "learning_rate": 6.8099087445873066e-06, "loss": 0.06607654690742493, "memory(GiB)": 122.96, "step": 54570, "token_acc": 0.9729023832843617, "train_speed(iter/s)": 0.231209 }, { "epoch": 4.159996950987118, "grad_norm": 1.5488783121109009, "learning_rate": 6.803877381277518e-06, "loss": 0.02296465039253235, "memory(GiB)": 122.96, "step": 54575, "token_acc": 0.9887540692512578, "train_speed(iter/s)": 0.231215 }, { "epoch": 4.160378077597378, "grad_norm": 0.585402250289917, "learning_rate": 6.797848495064901e-06, "loss": 0.01964803785085678, "memory(GiB)": 122.96, "step": 54580, "token_acc": 0.9887073203135379, "train_speed(iter/s)": 0.231217 }, { "epoch": 4.160759204207638, "grad_norm": 0.5063278675079346, "learning_rate": 6.791822086295208e-06, "loss": 0.02271225452423096, "memory(GiB)": 122.96, "step": 54585, "token_acc": 0.9908060067422617, "train_speed(iter/s)": 0.231223 }, { "epoch": 4.161140330817898, "grad_norm": 2.842499256134033, "learning_rate": 6.785798155313994e-06, "loss": 0.045292556285858154, "memory(GiB)": 122.96, "step": 54590, "token_acc": 0.9814222932481023, "train_speed(iter/s)": 0.231229 }, { "epoch": 4.161521457428158, "grad_norm": 1.359418272972107, "learning_rate": 6.779776702466717e-06, "loss": 0.03026381731033325, "memory(GiB)": 122.96, "step": 54595, "token_acc": 0.9881376037959668, "train_speed(iter/s)": 0.231232 }, { "epoch": 4.161902584038417, "grad_norm": 0.7124122381210327, "learning_rate": 6.77375772809869e-06, "loss": 0.059599530696868894, "memory(GiB)": 122.96, "step": 54600, "token_acc": 0.9865962632006499, "train_speed(iter/s)": 0.231234 }, { "epoch": 4.161902584038417, "eval_loss": 0.0517476387321949, "eval_runtime": 193.2245, "eval_samples_per_second": 2.743, "eval_steps_per_second": 2.743, "eval_token_acc": 0.9789319920486718, "step": 54600 }, { "epoch": 4.162283710648677, "grad_norm": 2.0789377689361572, "learning_rate": 6.767741232555052e-06, "loss": 0.027543526887893677, "memory(GiB)": 122.96, "step": 54605, "token_acc": 0.9793137211919272, "train_speed(iter/s)": 0.231052 }, { "epoch": 4.162664837258937, "grad_norm": 1.2224832773208618, "learning_rate": 6.76172721618083e-06, "loss": 0.026502731442451476, "memory(GiB)": 122.96, "step": 54610, "token_acc": 0.9875776397515528, "train_speed(iter/s)": 0.231059 }, { "epoch": 4.163045963869197, "grad_norm": 1.1668221950531006, "learning_rate": 6.755715679320917e-06, "loss": 0.03591226041316986, "memory(GiB)": 122.96, "step": 54615, "token_acc": 0.9860174781523097, "train_speed(iter/s)": 0.231063 }, { "epoch": 4.163427090479457, "grad_norm": 1.0242559909820557, "learning_rate": 6.749706622320018e-06, "loss": 0.03442449569702148, "memory(GiB)": 122.96, "step": 54620, "token_acc": 0.9855678446182897, "train_speed(iter/s)": 0.231066 }, { "epoch": 4.163808217089717, "grad_norm": 2.1899735927581787, "learning_rate": 6.743700045522744e-06, "loss": 0.046162641048431395, "memory(GiB)": 122.96, "step": 54625, "token_acc": 0.9858375039931849, "train_speed(iter/s)": 0.231066 }, { "epoch": 4.164189343699977, "grad_norm": 5.031675815582275, "learning_rate": 6.737695949273543e-06, "loss": 0.053904712200164795, "memory(GiB)": 122.96, "step": 54630, "token_acc": 0.9836927223719677, "train_speed(iter/s)": 0.231069 }, { "epoch": 4.164570470310237, "grad_norm": 0.9666539430618286, "learning_rate": 6.731694333916711e-06, "loss": 0.03001103699207306, "memory(GiB)": 122.96, "step": 54635, "token_acc": 0.9888027562446167, "train_speed(iter/s)": 0.231075 }, { "epoch": 4.164951596920497, "grad_norm": 0.9933744668960571, "learning_rate": 6.725695199796417e-06, "loss": 0.035966315865516664, "memory(GiB)": 122.96, "step": 54640, "token_acc": 0.9860041987403779, "train_speed(iter/s)": 0.231079 }, { "epoch": 4.165332723530757, "grad_norm": 0.8784541487693787, "learning_rate": 6.719698547256698e-06, "loss": 0.03507736027240753, "memory(GiB)": 122.96, "step": 54645, "token_acc": 0.9862238158143046, "train_speed(iter/s)": 0.231084 }, { "epoch": 4.1657138501410165, "grad_norm": 0.5878980755805969, "learning_rate": 6.713704376641411e-06, "loss": 0.033890119194984435, "memory(GiB)": 122.96, "step": 54650, "token_acc": 0.9874522640480087, "train_speed(iter/s)": 0.23109 }, { "epoch": 4.1660949767512765, "grad_norm": 1.054348111152649, "learning_rate": 6.707712688294305e-06, "loss": 0.029639309644699095, "memory(GiB)": 122.96, "step": 54655, "token_acc": 0.9911330049261083, "train_speed(iter/s)": 0.231098 }, { "epoch": 4.1664761033615365, "grad_norm": 0.6264936327934265, "learning_rate": 6.701723482558986e-06, "loss": 0.020953820645809175, "memory(GiB)": 122.96, "step": 54660, "token_acc": 0.9897172236503856, "train_speed(iter/s)": 0.231104 }, { "epoch": 4.1668572299717965, "grad_norm": 1.2187527418136597, "learning_rate": 6.695736759778887e-06, "loss": 0.033331677317619324, "memory(GiB)": 122.96, "step": 54665, "token_acc": 0.9867345855884538, "train_speed(iter/s)": 0.231106 }, { "epoch": 4.1672383565820565, "grad_norm": 0.8396475911140442, "learning_rate": 6.689752520297327e-06, "loss": 0.031467437744140625, "memory(GiB)": 122.96, "step": 54670, "token_acc": 0.9875283446712018, "train_speed(iter/s)": 0.231111 }, { "epoch": 4.167619483192317, "grad_norm": 2.053088903427124, "learning_rate": 6.683770764457486e-06, "loss": 0.0423812985420227, "memory(GiB)": 122.96, "step": 54675, "token_acc": 0.9834219580857053, "train_speed(iter/s)": 0.231117 }, { "epoch": 4.168000609802577, "grad_norm": 1.0360718965530396, "learning_rate": 6.67779149260237e-06, "loss": 0.020427481830120088, "memory(GiB)": 122.96, "step": 54680, "token_acc": 0.9897033586663397, "train_speed(iter/s)": 0.231123 }, { "epoch": 4.168381736412837, "grad_norm": 1.8894308805465698, "learning_rate": 6.671814705074886e-06, "loss": 0.04196591079235077, "memory(GiB)": 122.96, "step": 54685, "token_acc": 0.9903951102379394, "train_speed(iter/s)": 0.231129 }, { "epoch": 4.168762863023097, "grad_norm": 1.1052526235580444, "learning_rate": 6.665840402217749e-06, "loss": 0.03107260465621948, "memory(GiB)": 122.96, "step": 54690, "token_acc": 0.988279554333671, "train_speed(iter/s)": 0.231132 }, { "epoch": 4.169143989633357, "grad_norm": 1.0564314126968384, "learning_rate": 6.6598685843735686e-06, "loss": 0.017465430498123168, "memory(GiB)": 122.96, "step": 54695, "token_acc": 0.9932633247473747, "train_speed(iter/s)": 0.231136 }, { "epoch": 4.169525116243616, "grad_norm": 0.47773563861846924, "learning_rate": 6.6538992518848156e-06, "loss": 0.034402850270271304, "memory(GiB)": 122.96, "step": 54700, "token_acc": 0.984597904305012, "train_speed(iter/s)": 0.231139 }, { "epoch": 4.169906242853876, "grad_norm": 0.5369814038276672, "learning_rate": 6.647932405093777e-06, "loss": 0.02724473476409912, "memory(GiB)": 122.96, "step": 54705, "token_acc": 0.9905542869005605, "train_speed(iter/s)": 0.231139 }, { "epoch": 4.170287369464136, "grad_norm": 0.8509847521781921, "learning_rate": 6.641968044342622e-06, "loss": 0.024917681515216828, "memory(GiB)": 122.96, "step": 54710, "token_acc": 0.98755432635322, "train_speed(iter/s)": 0.231145 }, { "epoch": 4.170668496074396, "grad_norm": 0.5657125115394592, "learning_rate": 6.636006169973419e-06, "loss": 0.03420963287353516, "memory(GiB)": 122.96, "step": 54715, "token_acc": 0.9878618113912232, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.171049622684656, "grad_norm": 1.2188407182693481, "learning_rate": 6.63004678232802e-06, "loss": 0.026455551385879517, "memory(GiB)": 122.96, "step": 54720, "token_acc": 0.9839291078401923, "train_speed(iter/s)": 0.231155 }, { "epoch": 4.171430749294916, "grad_norm": 1.5461846590042114, "learning_rate": 6.624089881748186e-06, "loss": 0.033232647180557254, "memory(GiB)": 122.96, "step": 54725, "token_acc": 0.9853240929474113, "train_speed(iter/s)": 0.23116 }, { "epoch": 4.171811875905176, "grad_norm": 1.133164405822754, "learning_rate": 6.6181354685754984e-06, "loss": 0.04893189072608948, "memory(GiB)": 122.96, "step": 54730, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.231165 }, { "epoch": 4.172193002515436, "grad_norm": 3.197770595550537, "learning_rate": 6.612183543151423e-06, "loss": 0.03853365182876587, "memory(GiB)": 122.96, "step": 54735, "token_acc": 0.9832944832944833, "train_speed(iter/s)": 0.231173 }, { "epoch": 4.172574129125696, "grad_norm": 1.0084818601608276, "learning_rate": 6.6062341058172884e-06, "loss": 0.04893515110015869, "memory(GiB)": 122.96, "step": 54740, "token_acc": 0.984768812330009, "train_speed(iter/s)": 0.231178 }, { "epoch": 4.172955255735955, "grad_norm": 0.5432860851287842, "learning_rate": 6.600287156914248e-06, "loss": 0.025204026699066163, "memory(GiB)": 122.96, "step": 54745, "token_acc": 0.9892497453887066, "train_speed(iter/s)": 0.231182 }, { "epoch": 4.173336382346215, "grad_norm": 0.9724192023277283, "learning_rate": 6.59434269678334e-06, "loss": 0.02753123939037323, "memory(GiB)": 122.96, "step": 54750, "token_acc": 0.9915655214976342, "train_speed(iter/s)": 0.231188 }, { "epoch": 4.173717508956475, "grad_norm": 1.2906368970870972, "learning_rate": 6.588400725765459e-06, "loss": 0.03821151852607727, "memory(GiB)": 122.96, "step": 54755, "token_acc": 0.9848706686188384, "train_speed(iter/s)": 0.231195 }, { "epoch": 4.174098635566735, "grad_norm": 3.309770345687866, "learning_rate": 6.582461244201333e-06, "loss": 0.03382004797458649, "memory(GiB)": 122.96, "step": 54760, "token_acc": 0.9832285115303984, "train_speed(iter/s)": 0.2312 }, { "epoch": 4.174479762176995, "grad_norm": 1.2762328386306763, "learning_rate": 6.576524252431571e-06, "loss": 0.02366660684347153, "memory(GiB)": 122.96, "step": 54765, "token_acc": 0.9903560830860534, "train_speed(iter/s)": 0.231209 }, { "epoch": 4.174860888787255, "grad_norm": 0.8107351064682007, "learning_rate": 6.570589750796646e-06, "loss": 0.03071948289871216, "memory(GiB)": 122.96, "step": 54770, "token_acc": 0.9913984461709212, "train_speed(iter/s)": 0.231216 }, { "epoch": 4.175242015397515, "grad_norm": 1.5567125082015991, "learning_rate": 6.564657739636854e-06, "loss": 0.039465463161468504, "memory(GiB)": 122.96, "step": 54775, "token_acc": 0.9835245660488379, "train_speed(iter/s)": 0.231222 }, { "epoch": 4.175623142007775, "grad_norm": 1.2282888889312744, "learning_rate": 6.558728219292371e-06, "loss": 0.033269578218460084, "memory(GiB)": 122.96, "step": 54780, "token_acc": 0.9867354458364038, "train_speed(iter/s)": 0.231228 }, { "epoch": 4.176004268618035, "grad_norm": 0.6444400548934937, "learning_rate": 6.552801190103242e-06, "loss": 0.021567100286483766, "memory(GiB)": 122.96, "step": 54785, "token_acc": 0.9905825373335128, "train_speed(iter/s)": 0.231232 }, { "epoch": 4.176385395228295, "grad_norm": 0.056426819413900375, "learning_rate": 6.546876652409339e-06, "loss": 0.02855568528175354, "memory(GiB)": 122.96, "step": 54790, "token_acc": 0.9848704663212435, "train_speed(iter/s)": 0.231236 }, { "epoch": 4.176766521838554, "grad_norm": 0.8473888635635376, "learning_rate": 6.540954606550409e-06, "loss": 0.027732658386230468, "memory(GiB)": 122.96, "step": 54795, "token_acc": 0.9864352683024137, "train_speed(iter/s)": 0.231242 }, { "epoch": 4.177147648448814, "grad_norm": 0.7704639434814453, "learning_rate": 6.535035052866073e-06, "loss": 0.035870373249053955, "memory(GiB)": 122.96, "step": 54800, "token_acc": 0.9854957805907173, "train_speed(iter/s)": 0.231245 }, { "epoch": 4.177147648448814, "eval_loss": 0.051189910620450974, "eval_runtime": 190.3811, "eval_samples_per_second": 2.784, "eval_steps_per_second": 2.784, "eval_token_acc": 0.9790599963857599, "step": 54800 }, { "epoch": 4.177528775059074, "grad_norm": 2.586639881134033, "learning_rate": 6.52911799169576e-06, "loss": 0.029172462224960328, "memory(GiB)": 122.96, "step": 54805, "token_acc": 0.9794094594891554, "train_speed(iter/s)": 0.231065 }, { "epoch": 4.177909901669334, "grad_norm": 0.9541264176368713, "learning_rate": 6.523203423378804e-06, "loss": 0.03639890253543854, "memory(GiB)": 122.96, "step": 54810, "token_acc": 0.9824677912073316, "train_speed(iter/s)": 0.231068 }, { "epoch": 4.178291028279594, "grad_norm": 1.9563854932785034, "learning_rate": 6.517291348254384e-06, "loss": 0.041938316822052, "memory(GiB)": 122.96, "step": 54815, "token_acc": 0.9836552748885586, "train_speed(iter/s)": 0.231074 }, { "epoch": 4.178672154889854, "grad_norm": 1.45810866355896, "learning_rate": 6.511381766661512e-06, "loss": 0.035060420632362366, "memory(GiB)": 122.96, "step": 54820, "token_acc": 0.989520295202952, "train_speed(iter/s)": 0.231076 }, { "epoch": 4.1790532815001145, "grad_norm": 0.7520869374275208, "learning_rate": 6.505474678939077e-06, "loss": 0.0458139032125473, "memory(GiB)": 122.96, "step": 54825, "token_acc": 0.9840418020053665, "train_speed(iter/s)": 0.231078 }, { "epoch": 4.1794344081103745, "grad_norm": 0.9215095043182373, "learning_rate": 6.499570085425849e-06, "loss": 0.025092512369155884, "memory(GiB)": 122.96, "step": 54830, "token_acc": 0.9900363380611886, "train_speed(iter/s)": 0.23108 }, { "epoch": 4.1798155347206345, "grad_norm": 0.23009540140628815, "learning_rate": 6.4936679864603945e-06, "loss": 0.03135313391685486, "memory(GiB)": 122.96, "step": 54835, "token_acc": 0.983352144469526, "train_speed(iter/s)": 0.231087 }, { "epoch": 4.1801966613308945, "grad_norm": 2.343182325363159, "learning_rate": 6.4877683823811875e-06, "loss": 0.036047089099884036, "memory(GiB)": 122.96, "step": 54840, "token_acc": 0.9864901746724891, "train_speed(iter/s)": 0.23109 }, { "epoch": 4.180577787941154, "grad_norm": 0.7468942999839783, "learning_rate": 6.481871273526541e-06, "loss": 0.037367862462997434, "memory(GiB)": 122.96, "step": 54845, "token_acc": 0.9885003520300399, "train_speed(iter/s)": 0.231095 }, { "epoch": 4.180958914551414, "grad_norm": 1.840085744857788, "learning_rate": 6.475976660234628e-06, "loss": 0.03636122941970825, "memory(GiB)": 122.96, "step": 54850, "token_acc": 0.9857870173949936, "train_speed(iter/s)": 0.231099 }, { "epoch": 4.181340041161674, "grad_norm": 2.1490564346313477, "learning_rate": 6.470084542843491e-06, "loss": 0.048007601499557497, "memory(GiB)": 122.96, "step": 54855, "token_acc": 0.9838004628439188, "train_speed(iter/s)": 0.231104 }, { "epoch": 4.181721167771934, "grad_norm": 1.1590602397918701, "learning_rate": 6.464194921690991e-06, "loss": 0.031872743368148805, "memory(GiB)": 122.96, "step": 54860, "token_acc": 0.9833178869323448, "train_speed(iter/s)": 0.231112 }, { "epoch": 4.182102294382194, "grad_norm": 0.9347841739654541, "learning_rate": 6.458307797114882e-06, "loss": 0.023981976509094238, "memory(GiB)": 122.96, "step": 54865, "token_acc": 0.9904052376114686, "train_speed(iter/s)": 0.231113 }, { "epoch": 4.182483420992454, "grad_norm": 0.7410144805908203, "learning_rate": 6.45242316945277e-06, "loss": 0.018899552524089813, "memory(GiB)": 122.96, "step": 54870, "token_acc": 0.9916502946954814, "train_speed(iter/s)": 0.231121 }, { "epoch": 4.182864547602714, "grad_norm": 0.5737892985343933, "learning_rate": 6.446541039042098e-06, "loss": 0.023883605003356935, "memory(GiB)": 122.96, "step": 54875, "token_acc": 0.9899127061105722, "train_speed(iter/s)": 0.231126 }, { "epoch": 4.183245674212974, "grad_norm": 0.907258152961731, "learning_rate": 6.440661406220178e-06, "loss": 0.03671302199363709, "memory(GiB)": 122.96, "step": 54880, "token_acc": 0.9890096470875565, "train_speed(iter/s)": 0.231127 }, { "epoch": 4.183626800823234, "grad_norm": 1.6771100759506226, "learning_rate": 6.434784271324201e-06, "loss": 0.034452444314956664, "memory(GiB)": 122.96, "step": 54885, "token_acc": 0.9891511229539398, "train_speed(iter/s)": 0.231131 }, { "epoch": 4.184007927433494, "grad_norm": 0.9216017127037048, "learning_rate": 6.428909634691172e-06, "loss": 0.01819142997264862, "memory(GiB)": 122.96, "step": 54890, "token_acc": 0.9928587882976273, "train_speed(iter/s)": 0.231136 }, { "epoch": 4.184389054043753, "grad_norm": 0.02054188773036003, "learning_rate": 6.423037496657985e-06, "loss": 0.023944091796875, "memory(GiB)": 122.96, "step": 54895, "token_acc": 0.9921487603305785, "train_speed(iter/s)": 0.231144 }, { "epoch": 4.184770180654013, "grad_norm": 3.067049264907837, "learning_rate": 6.4171678575613685e-06, "loss": 0.03430234789848328, "memory(GiB)": 122.96, "step": 54900, "token_acc": 0.9884148064424979, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.185151307264273, "grad_norm": 0.8045777082443237, "learning_rate": 6.411300717737922e-06, "loss": 0.024421411752700805, "memory(GiB)": 122.96, "step": 54905, "token_acc": 0.9880862329803328, "train_speed(iter/s)": 0.231157 }, { "epoch": 4.185532433874533, "grad_norm": 0.6312374472618103, "learning_rate": 6.405436077524119e-06, "loss": 0.022075673937797545, "memory(GiB)": 122.96, "step": 54910, "token_acc": 0.9901690238013108, "train_speed(iter/s)": 0.23116 }, { "epoch": 4.185913560484793, "grad_norm": 1.7158222198486328, "learning_rate": 6.399573937256242e-06, "loss": 0.03671598136425018, "memory(GiB)": 122.96, "step": 54915, "token_acc": 0.9809286898839138, "train_speed(iter/s)": 0.231168 }, { "epoch": 4.186294687095053, "grad_norm": 0.5945406556129456, "learning_rate": 6.39371429727047e-06, "loss": 0.027919429540634155, "memory(GiB)": 122.96, "step": 54920, "token_acc": 0.9882866329811667, "train_speed(iter/s)": 0.231167 }, { "epoch": 4.186675813705313, "grad_norm": 1.8890838623046875, "learning_rate": 6.387857157902833e-06, "loss": 0.0436959832906723, "memory(GiB)": 122.96, "step": 54925, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.231175 }, { "epoch": 4.187056940315573, "grad_norm": 0.11291039735078812, "learning_rate": 6.382002519489194e-06, "loss": 0.026330041885375976, "memory(GiB)": 122.96, "step": 54930, "token_acc": 0.9910485933503836, "train_speed(iter/s)": 0.231182 }, { "epoch": 4.187438066925833, "grad_norm": 0.7063408493995667, "learning_rate": 6.376150382365292e-06, "loss": 0.045445045828819274, "memory(GiB)": 122.96, "step": 54935, "token_acc": 0.9884845693228926, "train_speed(iter/s)": 0.231189 }, { "epoch": 4.187819193536093, "grad_norm": 0.556076169013977, "learning_rate": 6.370300746866747e-06, "loss": 0.05207285284996033, "memory(GiB)": 122.96, "step": 54940, "token_acc": 0.9844033369604642, "train_speed(iter/s)": 0.231192 }, { "epoch": 4.188200320146352, "grad_norm": 0.8586413860321045, "learning_rate": 6.364453613328969e-06, "loss": 0.035344472527503966, "memory(GiB)": 122.96, "step": 54945, "token_acc": 0.9855210819411296, "train_speed(iter/s)": 0.231195 }, { "epoch": 4.188581446756612, "grad_norm": 0.8107700943946838, "learning_rate": 6.358608982087289e-06, "loss": 0.03208433985710144, "memory(GiB)": 122.96, "step": 54950, "token_acc": 0.9886874064215605, "train_speed(iter/s)": 0.231198 }, { "epoch": 4.188962573366872, "grad_norm": 0.9293810725212097, "learning_rate": 6.3527668534768655e-06, "loss": 0.025044241547584535, "memory(GiB)": 122.96, "step": 54955, "token_acc": 0.9884102339820686, "train_speed(iter/s)": 0.231204 }, { "epoch": 4.189343699977132, "grad_norm": 0.7914187908172607, "learning_rate": 6.34692722783271e-06, "loss": 0.039822322130203244, "memory(GiB)": 122.96, "step": 54960, "token_acc": 0.9862637362637363, "train_speed(iter/s)": 0.231212 }, { "epoch": 4.189724826587392, "grad_norm": 0.7814209461212158, "learning_rate": 6.3410901054897055e-06, "loss": 0.01835901141166687, "memory(GiB)": 122.96, "step": 54965, "token_acc": 0.9898389095415118, "train_speed(iter/s)": 0.231214 }, { "epoch": 4.190105953197652, "grad_norm": 1.356967806816101, "learning_rate": 6.335255486782587e-06, "loss": 0.028802937269210814, "memory(GiB)": 122.96, "step": 54970, "token_acc": 0.9886881647813608, "train_speed(iter/s)": 0.23122 }, { "epoch": 4.190487079807912, "grad_norm": 0.04713306948542595, "learning_rate": 6.329423372045917e-06, "loss": 0.04461093544960022, "memory(GiB)": 122.96, "step": 54975, "token_acc": 0.984071821604402, "train_speed(iter/s)": 0.231227 }, { "epoch": 4.190868206418172, "grad_norm": 0.9351802468299866, "learning_rate": 6.323593761614182e-06, "loss": 0.024878785014152527, "memory(GiB)": 122.96, "step": 54980, "token_acc": 0.9877646411877922, "train_speed(iter/s)": 0.23123 }, { "epoch": 4.191249333028432, "grad_norm": 2.1892201900482178, "learning_rate": 6.317766655821656e-06, "loss": 0.021574102342128754, "memory(GiB)": 122.96, "step": 54985, "token_acc": 0.9841666666666666, "train_speed(iter/s)": 0.231237 }, { "epoch": 4.1916304596386915, "grad_norm": 0.8994430303573608, "learning_rate": 6.311942055002496e-06, "loss": 0.03560173213481903, "memory(GiB)": 122.96, "step": 54990, "token_acc": 0.9841775029553514, "train_speed(iter/s)": 0.231238 }, { "epoch": 4.1920115862489515, "grad_norm": 1.5805320739746094, "learning_rate": 6.306119959490731e-06, "loss": 0.0353665828704834, "memory(GiB)": 122.96, "step": 54995, "token_acc": 0.9858839731451197, "train_speed(iter/s)": 0.231242 }, { "epoch": 4.192392712859212, "grad_norm": 1.027337670326233, "learning_rate": 6.300300369620216e-06, "loss": 0.0247516930103302, "memory(GiB)": 122.96, "step": 55000, "token_acc": 0.9891921102404756, "train_speed(iter/s)": 0.231249 }, { "epoch": 4.192392712859212, "eval_loss": 0.05095803737640381, "eval_runtime": 187.9616, "eval_samples_per_second": 2.82, "eval_steps_per_second": 2.82, "eval_token_acc": 0.9794364797301367, "step": 55000 }, { "epoch": 4.192773839469472, "grad_norm": 0.13975584506988525, "learning_rate": 6.294483285724678e-06, "loss": 0.02491881251335144, "memory(GiB)": 122.96, "step": 55005, "token_acc": 0.9795640427815403, "train_speed(iter/s)": 0.231073 }, { "epoch": 4.193154966079732, "grad_norm": 1.201179027557373, "learning_rate": 6.288668708137724e-06, "loss": 0.03555134236812592, "memory(GiB)": 122.96, "step": 55010, "token_acc": 0.9843164794007491, "train_speed(iter/s)": 0.231079 }, { "epoch": 4.193536092689992, "grad_norm": 1.5541774034500122, "learning_rate": 6.282856637192757e-06, "loss": 0.030637761950492857, "memory(GiB)": 122.96, "step": 55015, "token_acc": 0.988155668358714, "train_speed(iter/s)": 0.231084 }, { "epoch": 4.193917219300252, "grad_norm": 1.1531987190246582, "learning_rate": 6.277047073223091e-06, "loss": 0.03712287843227387, "memory(GiB)": 122.96, "step": 55020, "token_acc": 0.9820822331195775, "train_speed(iter/s)": 0.231089 }, { "epoch": 4.194298345910512, "grad_norm": 1.0331188440322876, "learning_rate": 6.271240016561886e-06, "loss": 0.026881766319274903, "memory(GiB)": 122.96, "step": 55025, "token_acc": 0.9877474081055608, "train_speed(iter/s)": 0.231093 }, { "epoch": 4.194679472520772, "grad_norm": 1.0743449926376343, "learning_rate": 6.26543546754213e-06, "loss": 0.029098203778266905, "memory(GiB)": 122.96, "step": 55030, "token_acc": 0.9860034991252187, "train_speed(iter/s)": 0.2311 }, { "epoch": 4.195060599131032, "grad_norm": 1.3197851181030273, "learning_rate": 6.259633426496697e-06, "loss": 0.035451951622962954, "memory(GiB)": 122.96, "step": 55035, "token_acc": 0.9904692082111437, "train_speed(iter/s)": 0.231105 }, { "epoch": 4.195441725741292, "grad_norm": 0.7696093320846558, "learning_rate": 6.253833893758321e-06, "loss": 0.0414112389087677, "memory(GiB)": 122.96, "step": 55040, "token_acc": 0.9862542955326461, "train_speed(iter/s)": 0.23111 }, { "epoch": 4.195822852351551, "grad_norm": 4.2469563484191895, "learning_rate": 6.2480368696595455e-06, "loss": 0.06911444067955017, "memory(GiB)": 122.96, "step": 55045, "token_acc": 0.9714605484051483, "train_speed(iter/s)": 0.231116 }, { "epoch": 4.196203978961811, "grad_norm": 0.7596127986907959, "learning_rate": 6.242242354532829e-06, "loss": 0.019406017661094666, "memory(GiB)": 122.96, "step": 55050, "token_acc": 0.990684575389948, "train_speed(iter/s)": 0.231121 }, { "epoch": 4.196585105572071, "grad_norm": 0.6448656916618347, "learning_rate": 6.23645034871046e-06, "loss": 0.022197966277599335, "memory(GiB)": 122.96, "step": 55055, "token_acc": 0.9920832039739211, "train_speed(iter/s)": 0.231123 }, { "epoch": 4.196966232182331, "grad_norm": 2.6731629371643066, "learning_rate": 6.230660852524567e-06, "loss": 0.04089439511299133, "memory(GiB)": 122.96, "step": 55060, "token_acc": 0.986031746031746, "train_speed(iter/s)": 0.23113 }, { "epoch": 4.197347358792591, "grad_norm": 2.1330859661102295, "learning_rate": 6.224873866307157e-06, "loss": 0.02771533727645874, "memory(GiB)": 122.96, "step": 55065, "token_acc": 0.9906943450250537, "train_speed(iter/s)": 0.231136 }, { "epoch": 4.197728485402851, "grad_norm": 2.1169981956481934, "learning_rate": 6.219089390390098e-06, "loss": 0.03573443591594696, "memory(GiB)": 122.96, "step": 55070, "token_acc": 0.9916036943744753, "train_speed(iter/s)": 0.231142 }, { "epoch": 4.198109612013111, "grad_norm": 1.6034080982208252, "learning_rate": 6.213307425105092e-06, "loss": 0.01675720661878586, "memory(GiB)": 122.96, "step": 55075, "token_acc": 0.9943052391799544, "train_speed(iter/s)": 0.231147 }, { "epoch": 4.198490738623371, "grad_norm": 1.5714973211288452, "learning_rate": 6.207527970783711e-06, "loss": 0.028781816363334656, "memory(GiB)": 122.96, "step": 55080, "token_acc": 0.9884493213976321, "train_speed(iter/s)": 0.231153 }, { "epoch": 4.198871865233631, "grad_norm": 1.1748396158218384, "learning_rate": 6.2017510277573745e-06, "loss": 0.03417149782180786, "memory(GiB)": 122.96, "step": 55085, "token_acc": 0.9875187969924812, "train_speed(iter/s)": 0.231157 }, { "epoch": 4.19925299184389, "grad_norm": 1.5857776403427124, "learning_rate": 6.1959765963573664e-06, "loss": 0.04178975224494934, "memory(GiB)": 122.96, "step": 55090, "token_acc": 0.9854891168376282, "train_speed(iter/s)": 0.231163 }, { "epoch": 4.19963411845415, "grad_norm": 3.191985845565796, "learning_rate": 6.190204676914835e-06, "loss": 0.03885223269462586, "memory(GiB)": 122.96, "step": 55095, "token_acc": 0.9887837364178058, "train_speed(iter/s)": 0.231171 }, { "epoch": 4.20001524506441, "grad_norm": 0.7045013308525085, "learning_rate": 6.184435269760752e-06, "loss": 0.030042463541030885, "memory(GiB)": 122.96, "step": 55100, "token_acc": 0.9862135687507558, "train_speed(iter/s)": 0.231172 }, { "epoch": 4.20039637167467, "grad_norm": 3.2799274921417236, "learning_rate": 6.1786683752259824e-06, "loss": 0.042986491322517396, "memory(GiB)": 122.96, "step": 55105, "token_acc": 0.9820661783278606, "train_speed(iter/s)": 0.231179 }, { "epoch": 4.20077749828493, "grad_norm": 1.387196660041809, "learning_rate": 6.172903993641221e-06, "loss": 0.04853796064853668, "memory(GiB)": 122.96, "step": 55110, "token_acc": 0.9838752312979117, "train_speed(iter/s)": 0.231185 }, { "epoch": 4.20115862489519, "grad_norm": 0.9536697864532471, "learning_rate": 6.167142125337034e-06, "loss": 0.03077118396759033, "memory(GiB)": 122.96, "step": 55115, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.231189 }, { "epoch": 4.20153975150545, "grad_norm": 0.9758539795875549, "learning_rate": 6.161382770643842e-06, "loss": 0.04011464416980744, "memory(GiB)": 122.96, "step": 55120, "token_acc": 0.9850837760523089, "train_speed(iter/s)": 0.231194 }, { "epoch": 4.20192087811571, "grad_norm": 1.7224823236465454, "learning_rate": 6.155625929891906e-06, "loss": 0.022836048901081086, "memory(GiB)": 122.96, "step": 55125, "token_acc": 0.9882196781151485, "train_speed(iter/s)": 0.231199 }, { "epoch": 4.20230200472597, "grad_norm": 0.8979042172431946, "learning_rate": 6.149871603411361e-06, "loss": 0.03307593166828156, "memory(GiB)": 122.96, "step": 55130, "token_acc": 0.9856733524355301, "train_speed(iter/s)": 0.231202 }, { "epoch": 4.20268313133623, "grad_norm": 1.689898133277893, "learning_rate": 6.144119791532205e-06, "loss": 0.03842396438121796, "memory(GiB)": 122.96, "step": 55135, "token_acc": 0.986848676544032, "train_speed(iter/s)": 0.231208 }, { "epoch": 4.203064257946489, "grad_norm": 1.1496270895004272, "learning_rate": 6.138370494584245e-06, "loss": 0.030379849672317504, "memory(GiB)": 122.96, "step": 55140, "token_acc": 0.9894527667380006, "train_speed(iter/s)": 0.231213 }, { "epoch": 4.203445384556749, "grad_norm": 1.3549721240997314, "learning_rate": 6.132623712897195e-06, "loss": 0.034192973375320436, "memory(GiB)": 122.96, "step": 55145, "token_acc": 0.9864284098620222, "train_speed(iter/s)": 0.23122 }, { "epoch": 4.2038265111670094, "grad_norm": 1.1207005977630615, "learning_rate": 6.1268794468006205e-06, "loss": 0.036942070722579955, "memory(GiB)": 122.96, "step": 55150, "token_acc": 0.9855827477586625, "train_speed(iter/s)": 0.231222 }, { "epoch": 4.2042076377772695, "grad_norm": 1.0955843925476074, "learning_rate": 6.121137696623897e-06, "loss": 0.03591034412384033, "memory(GiB)": 122.96, "step": 55155, "token_acc": 0.9874285714285714, "train_speed(iter/s)": 0.231229 }, { "epoch": 4.2045887643875295, "grad_norm": 0.6937577128410339, "learning_rate": 6.115398462696309e-06, "loss": 0.02268853783607483, "memory(GiB)": 122.96, "step": 55160, "token_acc": 0.9904699309891555, "train_speed(iter/s)": 0.23123 }, { "epoch": 4.2049698909977895, "grad_norm": 1.0608017444610596, "learning_rate": 6.109661745346978e-06, "loss": 0.025661033391952515, "memory(GiB)": 122.96, "step": 55165, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.231235 }, { "epoch": 4.2053510176080495, "grad_norm": 1.2367222309112549, "learning_rate": 6.103927544904858e-06, "loss": 0.03372465968132019, "memory(GiB)": 122.96, "step": 55170, "token_acc": 0.986171410194565, "train_speed(iter/s)": 0.23124 }, { "epoch": 4.2057321442183095, "grad_norm": 1.0857067108154297, "learning_rate": 6.098195861698797e-06, "loss": 0.028401729464530946, "memory(GiB)": 122.96, "step": 55175, "token_acc": 0.9843564356435643, "train_speed(iter/s)": 0.231245 }, { "epoch": 4.20611327082857, "grad_norm": 1.0472482442855835, "learning_rate": 6.0924666960574805e-06, "loss": 0.049645066261291504, "memory(GiB)": 122.96, "step": 55180, "token_acc": 0.9839799749687109, "train_speed(iter/s)": 0.231252 }, { "epoch": 4.20649439743883, "grad_norm": 0.5535577535629272, "learning_rate": 6.0867400483094306e-06, "loss": 0.026069051027297972, "memory(GiB)": 122.96, "step": 55185, "token_acc": 0.9889724310776943, "train_speed(iter/s)": 0.231259 }, { "epoch": 4.206875524049089, "grad_norm": 0.5371598601341248, "learning_rate": 6.081015918783056e-06, "loss": 0.032494640350341795, "memory(GiB)": 122.96, "step": 55190, "token_acc": 0.9859433340654513, "train_speed(iter/s)": 0.231264 }, { "epoch": 4.207256650659349, "grad_norm": 1.0634384155273438, "learning_rate": 6.0752943078066226e-06, "loss": 0.028981700539588928, "memory(GiB)": 122.96, "step": 55195, "token_acc": 0.9896514161220044, "train_speed(iter/s)": 0.231268 }, { "epoch": 4.207637777269609, "grad_norm": 1.4177008867263794, "learning_rate": 6.069575215708212e-06, "loss": 0.04638499617576599, "memory(GiB)": 122.96, "step": 55200, "token_acc": 0.9774879023774459, "train_speed(iter/s)": 0.231273 }, { "epoch": 4.207637777269609, "eval_loss": 0.05128999426960945, "eval_runtime": 188.2748, "eval_samples_per_second": 2.815, "eval_steps_per_second": 2.815, "eval_token_acc": 0.9795569544003373, "step": 55200 }, { "epoch": 4.208018903879869, "grad_norm": 0.5684129595756531, "learning_rate": 6.0638586428158064e-06, "loss": 0.028097471594810484, "memory(GiB)": 122.96, "step": 55205, "token_acc": 0.9800147444179239, "train_speed(iter/s)": 0.231096 }, { "epoch": 4.208400030490129, "grad_norm": 0.7357848286628723, "learning_rate": 6.058144589457226e-06, "loss": 0.016154921054840087, "memory(GiB)": 122.96, "step": 55210, "token_acc": 0.9938618925831202, "train_speed(iter/s)": 0.231104 }, { "epoch": 4.208781157100389, "grad_norm": 1.7192988395690918, "learning_rate": 6.052433055960127e-06, "loss": 0.05158147811889648, "memory(GiB)": 122.96, "step": 55215, "token_acc": 0.9875940614195647, "train_speed(iter/s)": 0.231108 }, { "epoch": 4.209162283710649, "grad_norm": 0.7445062398910522, "learning_rate": 6.0467240426520526e-06, "loss": 0.016115473210811616, "memory(GiB)": 122.96, "step": 55220, "token_acc": 0.9913983237759153, "train_speed(iter/s)": 0.231115 }, { "epoch": 4.209543410320909, "grad_norm": 2.424006938934326, "learning_rate": 6.041017549860395e-06, "loss": 0.02891588807106018, "memory(GiB)": 122.96, "step": 55225, "token_acc": 0.9884203626829802, "train_speed(iter/s)": 0.231121 }, { "epoch": 4.209924536931169, "grad_norm": 0.7885124087333679, "learning_rate": 6.035313577912377e-06, "loss": 0.042253345251083374, "memory(GiB)": 122.96, "step": 55230, "token_acc": 0.9836393383021269, "train_speed(iter/s)": 0.231126 }, { "epoch": 4.210305663541429, "grad_norm": 1.1044069528579712, "learning_rate": 6.029612127135104e-06, "loss": 0.03578483760356903, "memory(GiB)": 122.96, "step": 55235, "token_acc": 0.990482664853841, "train_speed(iter/s)": 0.23113 }, { "epoch": 4.210686790151688, "grad_norm": 0.7533672451972961, "learning_rate": 6.023913197855535e-06, "loss": 0.019311246275901795, "memory(GiB)": 122.96, "step": 55240, "token_acc": 0.99258811072455, "train_speed(iter/s)": 0.231134 }, { "epoch": 4.211067916761948, "grad_norm": 1.5038245916366577, "learning_rate": 6.018216790400455e-06, "loss": 0.03279012143611908, "memory(GiB)": 122.96, "step": 55245, "token_acc": 0.9869035269709544, "train_speed(iter/s)": 0.231136 }, { "epoch": 4.211449043372208, "grad_norm": 0.497211217880249, "learning_rate": 6.012522905096557e-06, "loss": 0.025078490376472473, "memory(GiB)": 122.96, "step": 55250, "token_acc": 0.9903273809523809, "train_speed(iter/s)": 0.231145 }, { "epoch": 4.211830169982468, "grad_norm": 0.9427999258041382, "learning_rate": 6.006831542270336e-06, "loss": 0.026688313484191893, "memory(GiB)": 122.96, "step": 55255, "token_acc": 0.9856517509727627, "train_speed(iter/s)": 0.231151 }, { "epoch": 4.212211296592728, "grad_norm": 0.8263587355613708, "learning_rate": 6.001142702248175e-06, "loss": 0.04322470426559448, "memory(GiB)": 122.96, "step": 55260, "token_acc": 0.9858934169278997, "train_speed(iter/s)": 0.231157 }, { "epoch": 4.212592423202988, "grad_norm": 0.749843955039978, "learning_rate": 5.995456385356307e-06, "loss": 0.03127419352531433, "memory(GiB)": 122.96, "step": 55265, "token_acc": 0.9857813362782993, "train_speed(iter/s)": 0.23116 }, { "epoch": 4.212973549813248, "grad_norm": 0.8890780806541443, "learning_rate": 5.9897725919208e-06, "loss": 0.02349926233291626, "memory(GiB)": 122.96, "step": 55270, "token_acc": 0.9886018237082067, "train_speed(iter/s)": 0.231165 }, { "epoch": 4.213354676423508, "grad_norm": 0.42937788367271423, "learning_rate": 5.9840913222676045e-06, "loss": 0.04700961410999298, "memory(GiB)": 122.96, "step": 55275, "token_acc": 0.9714190761050183, "train_speed(iter/s)": 0.231172 }, { "epoch": 4.213735803033768, "grad_norm": 1.2323603630065918, "learning_rate": 5.97841257672252e-06, "loss": 0.05711854100227356, "memory(GiB)": 122.96, "step": 55280, "token_acc": 0.9797015961138098, "train_speed(iter/s)": 0.231175 }, { "epoch": 4.214116929644028, "grad_norm": 0.15172363817691803, "learning_rate": 5.97273635561118e-06, "loss": 0.012025836855173111, "memory(GiB)": 122.96, "step": 55285, "token_acc": 0.9976762199845082, "train_speed(iter/s)": 0.231184 }, { "epoch": 4.214498056254287, "grad_norm": 1.4292503595352173, "learning_rate": 5.967062659259109e-06, "loss": 0.024781396985054015, "memory(GiB)": 122.96, "step": 55290, "token_acc": 0.991, "train_speed(iter/s)": 0.231192 }, { "epoch": 4.214879182864547, "grad_norm": 0.6128196716308594, "learning_rate": 5.961391487991646e-06, "loss": 0.025586360692977907, "memory(GiB)": 122.96, "step": 55295, "token_acc": 0.9883875248838753, "train_speed(iter/s)": 0.231196 }, { "epoch": 4.215260309474807, "grad_norm": 0.7270740270614624, "learning_rate": 5.95572284213402e-06, "loss": 0.03917216360569, "memory(GiB)": 122.96, "step": 55300, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.231202 }, { "epoch": 4.215641436085067, "grad_norm": 1.8320974111557007, "learning_rate": 5.950056722011305e-06, "loss": 0.03055128753185272, "memory(GiB)": 122.96, "step": 55305, "token_acc": 0.9916107382550335, "train_speed(iter/s)": 0.231208 }, { "epoch": 4.216022562695327, "grad_norm": 0.6406719088554382, "learning_rate": 5.944393127948411e-06, "loss": 0.03373225331306458, "memory(GiB)": 122.96, "step": 55310, "token_acc": 0.9875618374558304, "train_speed(iter/s)": 0.231212 }, { "epoch": 4.216403689305587, "grad_norm": 1.390854835510254, "learning_rate": 5.93873206027013e-06, "loss": 0.02096572667360306, "memory(GiB)": 122.96, "step": 55315, "token_acc": 0.9937214611872146, "train_speed(iter/s)": 0.231218 }, { "epoch": 4.216784815915847, "grad_norm": 0.853208065032959, "learning_rate": 5.933073519301103e-06, "loss": 0.028709876537322997, "memory(GiB)": 122.96, "step": 55320, "token_acc": 0.9904100529100529, "train_speed(iter/s)": 0.231219 }, { "epoch": 4.217165942526107, "grad_norm": 2.4721550941467285, "learning_rate": 5.927417505365807e-06, "loss": 0.02685352861881256, "memory(GiB)": 122.96, "step": 55325, "token_acc": 0.9867160278745645, "train_speed(iter/s)": 0.231225 }, { "epoch": 4.2175470691363675, "grad_norm": 0.938816487789154, "learning_rate": 5.921764018788595e-06, "loss": 0.027258116006851196, "memory(GiB)": 122.96, "step": 55330, "token_acc": 0.9881011403073872, "train_speed(iter/s)": 0.231227 }, { "epoch": 4.217928195746627, "grad_norm": 1.3245903253555298, "learning_rate": 5.916113059893674e-06, "loss": 0.033673858642578124, "memory(GiB)": 122.96, "step": 55335, "token_acc": 0.9846057832327855, "train_speed(iter/s)": 0.231231 }, { "epoch": 4.218309322356887, "grad_norm": 0.898905336856842, "learning_rate": 5.910464629005091e-06, "loss": 0.02315828800201416, "memory(GiB)": 122.96, "step": 55340, "token_acc": 0.9878987898789879, "train_speed(iter/s)": 0.231234 }, { "epoch": 4.218690448967147, "grad_norm": 0.8714662194252014, "learning_rate": 5.904818726446759e-06, "loss": 0.01832747757434845, "memory(GiB)": 122.96, "step": 55345, "token_acc": 0.9908306364617044, "train_speed(iter/s)": 0.231241 }, { "epoch": 4.219071575577407, "grad_norm": 1.1509640216827393, "learning_rate": 5.899175352542457e-06, "loss": 0.05176819562911987, "memory(GiB)": 122.96, "step": 55350, "token_acc": 0.9824959481361426, "train_speed(iter/s)": 0.231247 }, { "epoch": 4.219452702187667, "grad_norm": 1.6465777158737183, "learning_rate": 5.893534507615783e-06, "loss": 0.042900896072387694, "memory(GiB)": 122.96, "step": 55355, "token_acc": 0.9816940738442445, "train_speed(iter/s)": 0.231253 }, { "epoch": 4.219833828797927, "grad_norm": 0.7855560779571533, "learning_rate": 5.8878961919902275e-06, "loss": 0.02973032891750336, "memory(GiB)": 122.96, "step": 55360, "token_acc": 0.9887751083103584, "train_speed(iter/s)": 0.231259 }, { "epoch": 4.220214955408187, "grad_norm": 1.3315565586090088, "learning_rate": 5.882260405989132e-06, "loss": 0.04169154167175293, "memory(GiB)": 122.96, "step": 55365, "token_acc": 0.9815059445178336, "train_speed(iter/s)": 0.231266 }, { "epoch": 4.220596082018447, "grad_norm": 1.2262063026428223, "learning_rate": 5.87662714993566e-06, "loss": 0.023150771856307983, "memory(GiB)": 122.96, "step": 55370, "token_acc": 0.9901389511429852, "train_speed(iter/s)": 0.231272 }, { "epoch": 4.220977208628707, "grad_norm": 0.9751487970352173, "learning_rate": 5.870996424152864e-06, "loss": 0.05374323129653931, "memory(GiB)": 122.96, "step": 55375, "token_acc": 0.9746865959498554, "train_speed(iter/s)": 0.231278 }, { "epoch": 4.221358335238967, "grad_norm": 0.8549553155899048, "learning_rate": 5.865368228963636e-06, "loss": 0.04377599060535431, "memory(GiB)": 122.96, "step": 55380, "token_acc": 0.9834311356575768, "train_speed(iter/s)": 0.231283 }, { "epoch": 4.221739461849226, "grad_norm": 1.7336773872375488, "learning_rate": 5.859742564690735e-06, "loss": 0.061805224418640135, "memory(GiB)": 122.96, "step": 55385, "token_acc": 0.9770303527481542, "train_speed(iter/s)": 0.231287 }, { "epoch": 4.222120588459486, "grad_norm": 0.7327417135238647, "learning_rate": 5.8541194316567684e-06, "loss": 0.03314964771270752, "memory(GiB)": 122.96, "step": 55390, "token_acc": 0.9857603949117144, "train_speed(iter/s)": 0.231293 }, { "epoch": 4.222501715069746, "grad_norm": 1.0110148191452026, "learning_rate": 5.848498830184179e-06, "loss": 0.03203316032886505, "memory(GiB)": 122.96, "step": 55395, "token_acc": 0.9813053460150869, "train_speed(iter/s)": 0.231299 }, { "epoch": 4.222882841680006, "grad_norm": 0.5330761671066284, "learning_rate": 5.842880760595298e-06, "loss": 0.03214051127433777, "memory(GiB)": 122.96, "step": 55400, "token_acc": 0.9889949830069591, "train_speed(iter/s)": 0.231303 }, { "epoch": 4.222882841680006, "eval_loss": 0.05143863335251808, "eval_runtime": 190.6969, "eval_samples_per_second": 2.779, "eval_steps_per_second": 2.779, "eval_token_acc": 0.9796849587374254, "step": 55400 }, { "epoch": 4.223263968290266, "grad_norm": 1.4159185886383057, "learning_rate": 5.837265223212302e-06, "loss": 0.05170719623565674, "memory(GiB)": 122.96, "step": 55405, "token_acc": 0.9796420937084753, "train_speed(iter/s)": 0.231124 }, { "epoch": 4.223645094900526, "grad_norm": 0.7959277033805847, "learning_rate": 5.831652218357192e-06, "loss": 0.014931032061576843, "memory(GiB)": 122.96, "step": 55410, "token_acc": 0.9941646191646192, "train_speed(iter/s)": 0.23113 }, { "epoch": 4.224026221510786, "grad_norm": 0.5972065925598145, "learning_rate": 5.826041746351863e-06, "loss": 0.034714192152023315, "memory(GiB)": 122.96, "step": 55415, "token_acc": 0.9873740956163994, "train_speed(iter/s)": 0.231134 }, { "epoch": 4.224407348121046, "grad_norm": 3.350816249847412, "learning_rate": 5.820433807518055e-06, "loss": 0.04720616042613983, "memory(GiB)": 122.96, "step": 55420, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.231142 }, { "epoch": 4.224788474731306, "grad_norm": 2.4044203758239746, "learning_rate": 5.814828402177342e-06, "loss": 0.03379968106746674, "memory(GiB)": 122.96, "step": 55425, "token_acc": 0.9885540761504321, "train_speed(iter/s)": 0.231147 }, { "epoch": 4.225169601341566, "grad_norm": 1.9135187864303589, "learning_rate": 5.809225530651175e-06, "loss": 0.041115564107894895, "memory(GiB)": 122.96, "step": 55430, "token_acc": 0.9797837329572168, "train_speed(iter/s)": 0.23115 }, { "epoch": 4.225550727951825, "grad_norm": 0.9383395314216614, "learning_rate": 5.803625193260864e-06, "loss": 0.0339178740978241, "memory(GiB)": 122.96, "step": 55435, "token_acc": 0.9907054337464252, "train_speed(iter/s)": 0.231156 }, { "epoch": 4.225931854562085, "grad_norm": 1.1289457082748413, "learning_rate": 5.798027390327543e-06, "loss": 0.037212294340133664, "memory(GiB)": 122.96, "step": 55440, "token_acc": 0.9819196428571428, "train_speed(iter/s)": 0.231162 }, { "epoch": 4.226312981172345, "grad_norm": 1.9063247442245483, "learning_rate": 5.792432122172231e-06, "loss": 0.03086470365524292, "memory(GiB)": 122.96, "step": 55445, "token_acc": 0.9887580299785867, "train_speed(iter/s)": 0.231168 }, { "epoch": 4.226694107782605, "grad_norm": 0.5388110876083374, "learning_rate": 5.786839389115794e-06, "loss": 0.04246129095554352, "memory(GiB)": 122.96, "step": 55450, "token_acc": 0.9835234474017744, "train_speed(iter/s)": 0.231173 }, { "epoch": 4.227075234392865, "grad_norm": 0.1315968781709671, "learning_rate": 5.781249191478938e-06, "loss": 0.031179898977279664, "memory(GiB)": 122.96, "step": 55455, "token_acc": 0.9897585954645208, "train_speed(iter/s)": 0.23118 }, { "epoch": 4.227456361003125, "grad_norm": 1.5521327257156372, "learning_rate": 5.775661529582249e-06, "loss": 0.049688971042633055, "memory(GiB)": 122.96, "step": 55460, "token_acc": 0.982310372554275, "train_speed(iter/s)": 0.231186 }, { "epoch": 4.227837487613385, "grad_norm": 0.6377988457679749, "learning_rate": 5.770076403746133e-06, "loss": 0.034239640831947325, "memory(GiB)": 122.96, "step": 55465, "token_acc": 0.9853691731881592, "train_speed(iter/s)": 0.231192 }, { "epoch": 4.228218614223645, "grad_norm": 1.3450336456298828, "learning_rate": 5.764493814290883e-06, "loss": 0.03402649760246277, "memory(GiB)": 122.96, "step": 55470, "token_acc": 0.9851757729775519, "train_speed(iter/s)": 0.231196 }, { "epoch": 4.228599740833905, "grad_norm": 0.7203817963600159, "learning_rate": 5.75891376153665e-06, "loss": 0.02172076404094696, "memory(GiB)": 122.96, "step": 55475, "token_acc": 0.9931124959002952, "train_speed(iter/s)": 0.231202 }, { "epoch": 4.228980867444165, "grad_norm": 0.10536673665046692, "learning_rate": 5.753336245803398e-06, "loss": 0.02292395383119583, "memory(GiB)": 122.96, "step": 55480, "token_acc": 0.988398415393322, "train_speed(iter/s)": 0.231209 }, { "epoch": 4.2293619940544245, "grad_norm": 1.3946874141693115, "learning_rate": 5.747761267410978e-06, "loss": 0.03022457957267761, "memory(GiB)": 122.96, "step": 55485, "token_acc": 0.9877594465141033, "train_speed(iter/s)": 0.231217 }, { "epoch": 4.2297431206646845, "grad_norm": 0.3025609850883484, "learning_rate": 5.742188826679107e-06, "loss": 0.03815813958644867, "memory(GiB)": 122.96, "step": 55490, "token_acc": 0.9863657733897508, "train_speed(iter/s)": 0.231221 }, { "epoch": 4.2301242472749445, "grad_norm": 0.31161943078041077, "learning_rate": 5.736618923927317e-06, "loss": 0.02344596982002258, "memory(GiB)": 122.96, "step": 55495, "token_acc": 0.9883870967741936, "train_speed(iter/s)": 0.231224 }, { "epoch": 4.2305053738852045, "grad_norm": 0.16098657250404358, "learning_rate": 5.7310515594750205e-06, "loss": 0.018935438990592957, "memory(GiB)": 122.96, "step": 55500, "token_acc": 0.9910019455252919, "train_speed(iter/s)": 0.23123 }, { "epoch": 4.2308865004954646, "grad_norm": 1.453188180923462, "learning_rate": 5.725486733641494e-06, "loss": 0.02893900275230408, "memory(GiB)": 122.96, "step": 55505, "token_acc": 0.988589766446782, "train_speed(iter/s)": 0.231234 }, { "epoch": 4.231267627105725, "grad_norm": 0.9316104650497437, "learning_rate": 5.719924446745828e-06, "loss": 0.025292667746543884, "memory(GiB)": 122.96, "step": 55510, "token_acc": 0.989634748272458, "train_speed(iter/s)": 0.231239 }, { "epoch": 4.231648753715985, "grad_norm": 2.1183507442474365, "learning_rate": 5.7143646991070275e-06, "loss": 0.03176144957542419, "memory(GiB)": 122.96, "step": 55515, "token_acc": 0.9916652775462577, "train_speed(iter/s)": 0.231244 }, { "epoch": 4.232029880326245, "grad_norm": 1.5971215963363647, "learning_rate": 5.708807491043894e-06, "loss": 0.050456440448760985, "memory(GiB)": 122.96, "step": 55520, "token_acc": 0.9871934604904632, "train_speed(iter/s)": 0.23125 }, { "epoch": 4.232411006936505, "grad_norm": 0.4628068804740906, "learning_rate": 5.703252822875116e-06, "loss": 0.07084723114967346, "memory(GiB)": 122.96, "step": 55525, "token_acc": 0.9782405968293441, "train_speed(iter/s)": 0.231253 }, { "epoch": 4.232792133546765, "grad_norm": 0.42939600348472595, "learning_rate": 5.697700694919234e-06, "loss": 0.032654482126235965, "memory(GiB)": 122.96, "step": 55530, "token_acc": 0.9898334794040315, "train_speed(iter/s)": 0.231257 }, { "epoch": 4.233173260157024, "grad_norm": 0.1929548680782318, "learning_rate": 5.69215110749462e-06, "loss": 0.024700118601322173, "memory(GiB)": 122.96, "step": 55535, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.231265 }, { "epoch": 4.233554386767284, "grad_norm": 0.7563052177429199, "learning_rate": 5.686604060919526e-06, "loss": 0.020641586184501647, "memory(GiB)": 122.96, "step": 55540, "token_acc": 0.988249694002448, "train_speed(iter/s)": 0.23127 }, { "epoch": 4.233935513377544, "grad_norm": 0.9931066036224365, "learning_rate": 5.681059555512058e-06, "loss": 0.03583637475967407, "memory(GiB)": 122.96, "step": 55545, "token_acc": 0.9830102374210412, "train_speed(iter/s)": 0.231276 }, { "epoch": 4.234316639987804, "grad_norm": 2.455418586730957, "learning_rate": 5.675517591590152e-06, "loss": 0.050568246841430665, "memory(GiB)": 122.96, "step": 55550, "token_acc": 0.979702300405954, "train_speed(iter/s)": 0.231282 }, { "epoch": 4.234697766598064, "grad_norm": 0.5062316060066223, "learning_rate": 5.6699781694716185e-06, "loss": 0.024629752337932586, "memory(GiB)": 122.96, "step": 55555, "token_acc": 0.9918908956874309, "train_speed(iter/s)": 0.231285 }, { "epoch": 4.235078893208324, "grad_norm": 0.9456076622009277, "learning_rate": 5.6644412894741314e-06, "loss": 0.03336159586906433, "memory(GiB)": 122.96, "step": 55560, "token_acc": 0.9817454363590897, "train_speed(iter/s)": 0.231291 }, { "epoch": 4.235460019818584, "grad_norm": 0.27335476875305176, "learning_rate": 5.658906951915188e-06, "loss": 0.05252414345741272, "memory(GiB)": 122.96, "step": 55565, "token_acc": 0.9820816864295125, "train_speed(iter/s)": 0.231296 }, { "epoch": 4.235841146428844, "grad_norm": 0.8343319296836853, "learning_rate": 5.653375157112156e-06, "loss": 0.027564799785614012, "memory(GiB)": 122.96, "step": 55570, "token_acc": 0.9915480427046264, "train_speed(iter/s)": 0.231302 }, { "epoch": 4.236222273039104, "grad_norm": 1.1917659044265747, "learning_rate": 5.6478459053822816e-06, "loss": 0.04632106423377991, "memory(GiB)": 122.96, "step": 55575, "token_acc": 0.982182084452038, "train_speed(iter/s)": 0.231307 }, { "epoch": 4.236603399649364, "grad_norm": 0.9330425262451172, "learning_rate": 5.642319197042611e-06, "loss": 0.020627635717391967, "memory(GiB)": 122.96, "step": 55580, "token_acc": 0.9877908646940534, "train_speed(iter/s)": 0.231312 }, { "epoch": 4.236984526259623, "grad_norm": 1.00416898727417, "learning_rate": 5.6367950324100975e-06, "loss": 0.0322589099407196, "memory(GiB)": 122.96, "step": 55585, "token_acc": 0.9878682842287695, "train_speed(iter/s)": 0.231317 }, { "epoch": 4.237365652869883, "grad_norm": 1.6256523132324219, "learning_rate": 5.6312734118015185e-06, "loss": 0.021205219626426696, "memory(GiB)": 122.96, "step": 55590, "token_acc": 0.9887857695282289, "train_speed(iter/s)": 0.231321 }, { "epoch": 4.237746779480143, "grad_norm": 1.8889739513397217, "learning_rate": 5.625754335533512e-06, "loss": 0.062058103084564206, "memory(GiB)": 122.96, "step": 55595, "token_acc": 0.9818627450980392, "train_speed(iter/s)": 0.231329 }, { "epoch": 4.238127906090403, "grad_norm": 0.8898444771766663, "learning_rate": 5.620237803922568e-06, "loss": 0.025379469990730284, "memory(GiB)": 122.96, "step": 55600, "token_acc": 0.991161441789808, "train_speed(iter/s)": 0.231333 }, { "epoch": 4.238127906090403, "eval_loss": 0.051023125648498535, "eval_runtime": 186.7651, "eval_samples_per_second": 2.838, "eval_steps_per_second": 2.838, "eval_token_acc": 0.9797150774049757, "step": 55600 }, { "epoch": 4.238509032700663, "grad_norm": 1.6823430061340332, "learning_rate": 5.61472381728505e-06, "loss": 0.03135631680488586, "memory(GiB)": 122.96, "step": 55605, "token_acc": 0.9798871791842642, "train_speed(iter/s)": 0.23116 }, { "epoch": 4.238890159310923, "grad_norm": 0.6572668552398682, "learning_rate": 5.609212375937134e-06, "loss": 0.0784042239189148, "memory(GiB)": 122.96, "step": 55610, "token_acc": 0.9674906874365052, "train_speed(iter/s)": 0.231167 }, { "epoch": 4.239271285921183, "grad_norm": 0.9683722257614136, "learning_rate": 5.603703480194894e-06, "loss": 0.029264092445373535, "memory(GiB)": 122.96, "step": 55615, "token_acc": 0.985897435897436, "train_speed(iter/s)": 0.231171 }, { "epoch": 4.239652412531443, "grad_norm": 0.35510164499282837, "learning_rate": 5.598197130374244e-06, "loss": 0.04463421702384949, "memory(GiB)": 122.96, "step": 55620, "token_acc": 0.9825842696629213, "train_speed(iter/s)": 0.231178 }, { "epoch": 4.240033539141703, "grad_norm": 0.8370898962020874, "learning_rate": 5.592693326790932e-06, "loss": 0.031641560792922976, "memory(GiB)": 122.96, "step": 55625, "token_acc": 0.9881221719457014, "train_speed(iter/s)": 0.231183 }, { "epoch": 4.240414665751963, "grad_norm": 0.9573915600776672, "learning_rate": 5.5871920697605775e-06, "loss": 0.020558997988700867, "memory(GiB)": 122.96, "step": 55630, "token_acc": 0.9952254641909815, "train_speed(iter/s)": 0.231191 }, { "epoch": 4.240795792362222, "grad_norm": 0.9567024111747742, "learning_rate": 5.581693359598672e-06, "loss": 0.04411880671977997, "memory(GiB)": 122.96, "step": 55635, "token_acc": 0.9800214822771214, "train_speed(iter/s)": 0.231195 }, { "epoch": 4.241176918972482, "grad_norm": 0.9903087019920349, "learning_rate": 5.576197196620514e-06, "loss": 0.024760468304157256, "memory(GiB)": 122.96, "step": 55640, "token_acc": 0.9895417379729987, "train_speed(iter/s)": 0.2312 }, { "epoch": 4.241558045582742, "grad_norm": 3.717097520828247, "learning_rate": 5.570703581141295e-06, "loss": 0.031729042530059814, "memory(GiB)": 122.96, "step": 55645, "token_acc": 0.9842922028766087, "train_speed(iter/s)": 0.231206 }, { "epoch": 4.241939172193002, "grad_norm": 0.4493067264556885, "learning_rate": 5.565212513476048e-06, "loss": 0.023640228807926177, "memory(GiB)": 122.96, "step": 55650, "token_acc": 0.9935537533790809, "train_speed(iter/s)": 0.231212 }, { "epoch": 4.2423202988032624, "grad_norm": 1.25039803981781, "learning_rate": 5.559723993939664e-06, "loss": 0.031138277053833006, "memory(GiB)": 122.96, "step": 55655, "token_acc": 0.988427143608627, "train_speed(iter/s)": 0.231218 }, { "epoch": 4.2427014254135225, "grad_norm": 0.9705250263214111, "learning_rate": 5.554238022846886e-06, "loss": 0.04189582467079163, "memory(GiB)": 122.96, "step": 55660, "token_acc": 0.9845063458051755, "train_speed(iter/s)": 0.231222 }, { "epoch": 4.2430825520237825, "grad_norm": 1.6370806694030762, "learning_rate": 5.548754600512301e-06, "loss": 0.04755587577819824, "memory(GiB)": 122.96, "step": 55665, "token_acc": 0.9806896551724138, "train_speed(iter/s)": 0.231228 }, { "epoch": 4.2434636786340425, "grad_norm": 0.9621847867965698, "learning_rate": 5.543273727250353e-06, "loss": 0.03403976559638977, "memory(GiB)": 122.96, "step": 55670, "token_acc": 0.9843487041400202, "train_speed(iter/s)": 0.231232 }, { "epoch": 4.2438448052443025, "grad_norm": 1.2499693632125854, "learning_rate": 5.537795403375368e-06, "loss": 0.02431725710630417, "memory(GiB)": 122.96, "step": 55675, "token_acc": 0.9909584086799277, "train_speed(iter/s)": 0.231239 }, { "epoch": 4.244225931854562, "grad_norm": 1.286186933517456, "learning_rate": 5.532319629201471e-06, "loss": 0.030416026711463928, "memory(GiB)": 122.96, "step": 55680, "token_acc": 0.988401808531551, "train_speed(iter/s)": 0.231244 }, { "epoch": 4.244607058464822, "grad_norm": 0.8849012851715088, "learning_rate": 5.526846405042701e-06, "loss": 0.031170442700386047, "memory(GiB)": 122.96, "step": 55685, "token_acc": 0.9872367581365666, "train_speed(iter/s)": 0.231249 }, { "epoch": 4.244988185075082, "grad_norm": 0.7004885077476501, "learning_rate": 5.521375731212902e-06, "loss": 0.02484828680753708, "memory(GiB)": 122.96, "step": 55690, "token_acc": 0.9896947307019249, "train_speed(iter/s)": 0.231254 }, { "epoch": 4.245369311685342, "grad_norm": 1.2400579452514648, "learning_rate": 5.515907608025794e-06, "loss": 0.031246325373649596, "memory(GiB)": 122.96, "step": 55695, "token_acc": 0.9868829823955816, "train_speed(iter/s)": 0.231261 }, { "epoch": 4.245750438295602, "grad_norm": 0.5791195034980774, "learning_rate": 5.510442035794966e-06, "loss": 0.043377363681793214, "memory(GiB)": 122.96, "step": 55700, "token_acc": 0.9848823226249785, "train_speed(iter/s)": 0.231265 }, { "epoch": 4.246131564905862, "grad_norm": 1.2839840650558472, "learning_rate": 5.504979014833822e-06, "loss": 0.059115856885910034, "memory(GiB)": 122.96, "step": 55705, "token_acc": 0.9694706073400454, "train_speed(iter/s)": 0.231271 }, { "epoch": 4.246512691516122, "grad_norm": 1.0479323863983154, "learning_rate": 5.4995185454556456e-06, "loss": 0.040796682238578796, "memory(GiB)": 122.96, "step": 55710, "token_acc": 0.9812313379781032, "train_speed(iter/s)": 0.231274 }, { "epoch": 4.246893818126382, "grad_norm": 0.6933945417404175, "learning_rate": 5.494060627973585e-06, "loss": 0.03109516203403473, "memory(GiB)": 122.96, "step": 55715, "token_acc": 0.9867744305657604, "train_speed(iter/s)": 0.23128 }, { "epoch": 4.247274944736642, "grad_norm": 1.0038052797317505, "learning_rate": 5.488605262700602e-06, "loss": 0.04874084591865539, "memory(GiB)": 122.96, "step": 55720, "token_acc": 0.9775987918449535, "train_speed(iter/s)": 0.231286 }, { "epoch": 4.247656071346902, "grad_norm": 2.268214702606201, "learning_rate": 5.483152449949552e-06, "loss": 0.03826970756053925, "memory(GiB)": 122.96, "step": 55725, "token_acc": 0.9842843326885881, "train_speed(iter/s)": 0.231291 }, { "epoch": 4.248037197957161, "grad_norm": 1.2620925903320312, "learning_rate": 5.477702190033135e-06, "loss": 0.052023154497146604, "memory(GiB)": 122.96, "step": 55730, "token_acc": 0.9833380803189974, "train_speed(iter/s)": 0.231294 }, { "epoch": 4.248418324567421, "grad_norm": 0.6020711064338684, "learning_rate": 5.472254483263883e-06, "loss": 0.040706795454025266, "memory(GiB)": 122.96, "step": 55735, "token_acc": 0.9811240721102863, "train_speed(iter/s)": 0.231298 }, { "epoch": 4.248799451177681, "grad_norm": 0.8475333452224731, "learning_rate": 5.466809329954198e-06, "loss": 0.035697543621063234, "memory(GiB)": 122.96, "step": 55740, "token_acc": 0.9864399483426604, "train_speed(iter/s)": 0.231303 }, { "epoch": 4.249180577787941, "grad_norm": 1.0141565799713135, "learning_rate": 5.461366730416345e-06, "loss": 0.03310554027557373, "memory(GiB)": 122.96, "step": 55745, "token_acc": 0.9853848652159792, "train_speed(iter/s)": 0.23131 }, { "epoch": 4.249561704398201, "grad_norm": 0.9373081922531128, "learning_rate": 5.45592668496242e-06, "loss": 0.023779386281967164, "memory(GiB)": 122.96, "step": 55750, "token_acc": 0.9915555555555555, "train_speed(iter/s)": 0.231318 }, { "epoch": 4.249942831008461, "grad_norm": 0.9089663624763489, "learning_rate": 5.4504891939043904e-06, "loss": 0.022957149147987365, "memory(GiB)": 122.96, "step": 55755, "token_acc": 0.9862700228832952, "train_speed(iter/s)": 0.231324 }, { "epoch": 4.250323957618721, "grad_norm": 0.7605732679367065, "learning_rate": 5.4450542575540774e-06, "loss": 0.037996339797973636, "memory(GiB)": 122.96, "step": 55760, "token_acc": 0.989563765393446, "train_speed(iter/s)": 0.231327 }, { "epoch": 4.250705084228981, "grad_norm": 1.0327438116073608, "learning_rate": 5.439621876223139e-06, "loss": 0.03594544529914856, "memory(GiB)": 122.96, "step": 55765, "token_acc": 0.984437350359138, "train_speed(iter/s)": 0.231332 }, { "epoch": 4.251086210839241, "grad_norm": 0.9550861716270447, "learning_rate": 5.434192050223092e-06, "loss": 0.03485516607761383, "memory(GiB)": 122.96, "step": 55770, "token_acc": 0.9847623966942148, "train_speed(iter/s)": 0.231338 }, { "epoch": 4.251467337449501, "grad_norm": 0.9575262665748596, "learning_rate": 5.428764779865336e-06, "loss": 0.04105111360549927, "memory(GiB)": 122.96, "step": 55775, "token_acc": 0.9839867476532302, "train_speed(iter/s)": 0.231344 }, { "epoch": 4.25184846405976, "grad_norm": 1.4627716541290283, "learning_rate": 5.423340065461063e-06, "loss": 0.03346918225288391, "memory(GiB)": 122.96, "step": 55780, "token_acc": 0.9884769539078156, "train_speed(iter/s)": 0.231349 }, { "epoch": 4.25222959067002, "grad_norm": 1.3987767696380615, "learning_rate": 5.417917907321396e-06, "loss": 0.034051910042762756, "memory(GiB)": 122.96, "step": 55785, "token_acc": 0.9858417377812257, "train_speed(iter/s)": 0.231353 }, { "epoch": 4.25261071728028, "grad_norm": 2.2202696800231934, "learning_rate": 5.412498305757241e-06, "loss": 0.04685159325599671, "memory(GiB)": 122.96, "step": 55790, "token_acc": 0.9805912212600776, "train_speed(iter/s)": 0.231355 }, { "epoch": 4.25299184389054, "grad_norm": 2.2798538208007812, "learning_rate": 5.407081261079394e-06, "loss": 0.03173722624778748, "memory(GiB)": 122.96, "step": 55795, "token_acc": 0.9837973528069375, "train_speed(iter/s)": 0.231361 }, { "epoch": 4.2533729705008, "grad_norm": 1.3044331073760986, "learning_rate": 5.401666773598513e-06, "loss": 0.0327546238899231, "memory(GiB)": 122.96, "step": 55800, "token_acc": 0.9857414448669202, "train_speed(iter/s)": 0.231367 }, { "epoch": 4.2533729705008, "eval_loss": 0.05061187222599983, "eval_runtime": 186.1415, "eval_samples_per_second": 2.847, "eval_steps_per_second": 2.847, "eval_token_acc": 0.9797226070718631, "step": 55800 }, { "epoch": 4.25375409711106, "grad_norm": 0.10742887854576111, "learning_rate": 5.396254843625071e-06, "loss": 0.03288579285144806, "memory(GiB)": 122.96, "step": 55805, "token_acc": 0.9799062587789984, "train_speed(iter/s)": 0.231195 }, { "epoch": 4.25413522372132, "grad_norm": 0.8881111145019531, "learning_rate": 5.39084547146943e-06, "loss": 0.03049021363258362, "memory(GiB)": 122.96, "step": 55810, "token_acc": 0.9873943945286309, "train_speed(iter/s)": 0.231198 }, { "epoch": 4.25451635033158, "grad_norm": 1.100844144821167, "learning_rate": 5.385438657441794e-06, "loss": 0.027132943272590637, "memory(GiB)": 122.96, "step": 55815, "token_acc": 0.9867817147053424, "train_speed(iter/s)": 0.231202 }, { "epoch": 4.25489747694184, "grad_norm": 1.0397592782974243, "learning_rate": 5.380034401852207e-06, "loss": 0.016745986044406892, "memory(GiB)": 122.96, "step": 55820, "token_acc": 0.992717643164515, "train_speed(iter/s)": 0.231209 }, { "epoch": 4.2552786035521, "grad_norm": 0.923952043056488, "learning_rate": 5.374632705010585e-06, "loss": 0.027237117290496826, "memory(GiB)": 122.96, "step": 55825, "token_acc": 0.98842476094615, "train_speed(iter/s)": 0.231214 }, { "epoch": 4.2556597301623595, "grad_norm": 1.0628695487976074, "learning_rate": 5.3692335672267e-06, "loss": 0.027714183926582335, "memory(GiB)": 122.96, "step": 55830, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.231219 }, { "epoch": 4.25604085677262, "grad_norm": 0.8408164381980896, "learning_rate": 5.363836988810145e-06, "loss": 0.03212621808052063, "memory(GiB)": 122.96, "step": 55835, "token_acc": 0.9869011976047904, "train_speed(iter/s)": 0.231224 }, { "epoch": 4.25642198338288, "grad_norm": 0.7279913425445557, "learning_rate": 5.3584429700704046e-06, "loss": 0.02750459611415863, "memory(GiB)": 122.96, "step": 55840, "token_acc": 0.9896670493685419, "train_speed(iter/s)": 0.23123 }, { "epoch": 4.25680310999314, "grad_norm": 0.979219377040863, "learning_rate": 5.3530515113168085e-06, "loss": 0.042927712202072144, "memory(GiB)": 122.96, "step": 55845, "token_acc": 0.9851592664092664, "train_speed(iter/s)": 0.231231 }, { "epoch": 4.2571842366034, "grad_norm": 0.6738657355308533, "learning_rate": 5.347662612858512e-06, "loss": 0.02330757975578308, "memory(GiB)": 122.96, "step": 55850, "token_acc": 0.9895052473763118, "train_speed(iter/s)": 0.231233 }, { "epoch": 4.25756536321366, "grad_norm": 0.6510804295539856, "learning_rate": 5.34227627500456e-06, "loss": 0.04608000218868256, "memory(GiB)": 122.96, "step": 55855, "token_acc": 0.9884083816317432, "train_speed(iter/s)": 0.23124 }, { "epoch": 4.25794648982392, "grad_norm": 2.241509199142456, "learning_rate": 5.3368924980638165e-06, "loss": 0.03260180950164795, "memory(GiB)": 122.96, "step": 55860, "token_acc": 0.9863539445628998, "train_speed(iter/s)": 0.231246 }, { "epoch": 4.25832761643418, "grad_norm": 0.7324682474136353, "learning_rate": 5.331511282345025e-06, "loss": 0.027350258827209473, "memory(GiB)": 122.96, "step": 55865, "token_acc": 0.9913587604290822, "train_speed(iter/s)": 0.231253 }, { "epoch": 4.25870874304444, "grad_norm": 0.8822376132011414, "learning_rate": 5.326132628156788e-06, "loss": 0.03706401884555817, "memory(GiB)": 122.96, "step": 55870, "token_acc": 0.9880788053708119, "train_speed(iter/s)": 0.231254 }, { "epoch": 4.2590898696547, "grad_norm": 0.7976823449134827, "learning_rate": 5.320756535807519e-06, "loss": 0.027686327695846558, "memory(GiB)": 122.96, "step": 55875, "token_acc": 0.9904171364148816, "train_speed(iter/s)": 0.23126 }, { "epoch": 4.259470996264959, "grad_norm": 0.9616652727127075, "learning_rate": 5.315383005605529e-06, "loss": 0.022088515758514404, "memory(GiB)": 122.96, "step": 55880, "token_acc": 0.9890792694407833, "train_speed(iter/s)": 0.231265 }, { "epoch": 4.259852122875219, "grad_norm": 3.0265095233917236, "learning_rate": 5.310012037858969e-06, "loss": 0.04564012289047241, "memory(GiB)": 122.96, "step": 55885, "token_acc": 0.98528, "train_speed(iter/s)": 0.231271 }, { "epoch": 4.260233249485479, "grad_norm": 0.5774654150009155, "learning_rate": 5.304643632875822e-06, "loss": 0.03886063694953919, "memory(GiB)": 122.96, "step": 55890, "token_acc": 0.983341045812124, "train_speed(iter/s)": 0.231276 }, { "epoch": 4.260614376095739, "grad_norm": 0.5466727614402771, "learning_rate": 5.299277790963953e-06, "loss": 0.01950731873512268, "memory(GiB)": 122.96, "step": 55895, "token_acc": 0.9930091657604474, "train_speed(iter/s)": 0.231281 }, { "epoch": 4.260995502705999, "grad_norm": 1.1077451705932617, "learning_rate": 5.293914512431075e-06, "loss": 0.015646776556968688, "memory(GiB)": 122.96, "step": 55900, "token_acc": 0.9910098264687435, "train_speed(iter/s)": 0.231286 }, { "epoch": 4.261376629316259, "grad_norm": 0.2987518012523651, "learning_rate": 5.288553797584728e-06, "loss": 0.02945254147052765, "memory(GiB)": 122.96, "step": 55905, "token_acc": 0.9900779588944011, "train_speed(iter/s)": 0.231292 }, { "epoch": 4.261757755926519, "grad_norm": 2.2152915000915527, "learning_rate": 5.2831956467323305e-06, "loss": 0.038922616839408876, "memory(GiB)": 122.96, "step": 55910, "token_acc": 0.9833107803337844, "train_speed(iter/s)": 0.231299 }, { "epoch": 4.262138882536779, "grad_norm": 1.480915904045105, "learning_rate": 5.277840060181155e-06, "loss": 0.024721261858940125, "memory(GiB)": 122.96, "step": 55915, "token_acc": 0.9938434476693052, "train_speed(iter/s)": 0.231307 }, { "epoch": 4.262520009147039, "grad_norm": 0.8018785715103149, "learning_rate": 5.272487038238317e-06, "loss": 0.027003493905067445, "memory(GiB)": 122.96, "step": 55920, "token_acc": 0.9855623100303952, "train_speed(iter/s)": 0.231312 }, { "epoch": 4.262901135757298, "grad_norm": 0.8384920358657837, "learning_rate": 5.267136581210796e-06, "loss": 0.04031466841697693, "memory(GiB)": 122.96, "step": 55925, "token_acc": 0.9853095487932844, "train_speed(iter/s)": 0.231317 }, { "epoch": 4.263282262367558, "grad_norm": 2.0321948528289795, "learning_rate": 5.261788689405394e-06, "loss": 0.035682350397109985, "memory(GiB)": 122.96, "step": 55930, "token_acc": 0.9914666666666667, "train_speed(iter/s)": 0.231323 }, { "epoch": 4.263663388977818, "grad_norm": 0.5227847695350647, "learning_rate": 5.256443363128805e-06, "loss": 0.04486962854862213, "memory(GiB)": 122.96, "step": 55935, "token_acc": 0.9882171141587792, "train_speed(iter/s)": 0.231326 }, { "epoch": 4.264044515588078, "grad_norm": 2.0290746688842773, "learning_rate": 5.2511006026875585e-06, "loss": 0.043025851249694824, "memory(GiB)": 122.96, "step": 55940, "token_acc": 0.9867619247741122, "train_speed(iter/s)": 0.23133 }, { "epoch": 4.264425642198338, "grad_norm": 1.4065628051757812, "learning_rate": 5.245760408388023e-06, "loss": 0.034803324937820436, "memory(GiB)": 122.96, "step": 55945, "token_acc": 0.9866049280635025, "train_speed(iter/s)": 0.231334 }, { "epoch": 4.264806768808598, "grad_norm": 0.663159966468811, "learning_rate": 5.240422780536441e-06, "loss": 0.025256389379501344, "memory(GiB)": 122.96, "step": 55950, "token_acc": 0.9913935176707562, "train_speed(iter/s)": 0.231339 }, { "epoch": 4.265187895418858, "grad_norm": 1.8307753801345825, "learning_rate": 5.235087719438919e-06, "loss": 0.05578843355178833, "memory(GiB)": 122.96, "step": 55955, "token_acc": 0.9779296875, "train_speed(iter/s)": 0.231344 }, { "epoch": 4.265569022029118, "grad_norm": 1.1265560388565063, "learning_rate": 5.229755225401367e-06, "loss": 0.03542043566703797, "memory(GiB)": 122.96, "step": 55960, "token_acc": 0.9826415094339622, "train_speed(iter/s)": 0.231351 }, { "epoch": 4.265950148639378, "grad_norm": 0.4838615655899048, "learning_rate": 5.2244252987295965e-06, "loss": 0.02178962379693985, "memory(GiB)": 122.96, "step": 55965, "token_acc": 0.9921985815602837, "train_speed(iter/s)": 0.231355 }, { "epoch": 4.266331275249638, "grad_norm": 2.8898351192474365, "learning_rate": 5.219097939729256e-06, "loss": 0.035409939289093015, "memory(GiB)": 122.96, "step": 55970, "token_acc": 0.9895300706111517, "train_speed(iter/s)": 0.231362 }, { "epoch": 4.266712401859898, "grad_norm": 0.9155134558677673, "learning_rate": 5.213773148705836e-06, "loss": 0.03228876888751984, "memory(GiB)": 122.96, "step": 55975, "token_acc": 0.9907825772636906, "train_speed(iter/s)": 0.231367 }, { "epoch": 4.267093528470157, "grad_norm": 0.9192335605621338, "learning_rate": 5.208450925964687e-06, "loss": 0.027496135234832762, "memory(GiB)": 122.96, "step": 55980, "token_acc": 0.9881261595547309, "train_speed(iter/s)": 0.231371 }, { "epoch": 4.2674746550804175, "grad_norm": 2.3140456676483154, "learning_rate": 5.203131271811035e-06, "loss": 0.04288991689682007, "memory(GiB)": 122.96, "step": 55985, "token_acc": 0.9870100273473108, "train_speed(iter/s)": 0.231377 }, { "epoch": 4.2678557816906775, "grad_norm": 1.1888647079467773, "learning_rate": 5.19781418654991e-06, "loss": 0.025165802240371703, "memory(GiB)": 122.96, "step": 55990, "token_acc": 0.9907001228285665, "train_speed(iter/s)": 0.231381 }, { "epoch": 4.2682369083009375, "grad_norm": 0.706360399723053, "learning_rate": 5.192499670486228e-06, "loss": 0.03362097442150116, "memory(GiB)": 122.96, "step": 55995, "token_acc": 0.9871134020618557, "train_speed(iter/s)": 0.231386 }, { "epoch": 4.2686180349111975, "grad_norm": 1.821107268333435, "learning_rate": 5.187187723924774e-06, "loss": 0.030249857902526857, "memory(GiB)": 122.96, "step": 56000, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.231392 }, { "epoch": 4.2686180349111975, "eval_loss": 0.050435010343790054, "eval_runtime": 186.1172, "eval_samples_per_second": 2.848, "eval_steps_per_second": 2.848, "eval_token_acc": 0.9797301367387506, "step": 56000 }, { "epoch": 4.2689991615214575, "grad_norm": 0.7955253720283508, "learning_rate": 5.181878347170132e-06, "loss": 0.029318276047706603, "memory(GiB)": 122.96, "step": 56005, "token_acc": 0.9800358166189111, "train_speed(iter/s)": 0.231218 }, { "epoch": 4.2693802881317175, "grad_norm": 0.5767116546630859, "learning_rate": 5.176571540526792e-06, "loss": 0.039894679188728334, "memory(GiB)": 122.96, "step": 56010, "token_acc": 0.9817785700138641, "train_speed(iter/s)": 0.231224 }, { "epoch": 4.269761414741978, "grad_norm": 0.8838576674461365, "learning_rate": 5.171267304299071e-06, "loss": 0.02735731899738312, "memory(GiB)": 122.96, "step": 56015, "token_acc": 0.9882422104644327, "train_speed(iter/s)": 0.231232 }, { "epoch": 4.270142541352238, "grad_norm": 0.8391122221946716, "learning_rate": 5.165965638791137e-06, "loss": 0.03018256425857544, "memory(GiB)": 122.96, "step": 56020, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.231238 }, { "epoch": 4.270523667962497, "grad_norm": 1.3880473375320435, "learning_rate": 5.160666544307024e-06, "loss": 0.03469514846801758, "memory(GiB)": 122.96, "step": 56025, "token_acc": 0.98900595510765, "train_speed(iter/s)": 0.231243 }, { "epoch": 4.270904794572757, "grad_norm": 0.8367213010787964, "learning_rate": 5.155370021150596e-06, "loss": 0.0213482066988945, "memory(GiB)": 122.96, "step": 56030, "token_acc": 0.990025223572575, "train_speed(iter/s)": 0.231248 }, { "epoch": 4.271285921183017, "grad_norm": 1.4721362590789795, "learning_rate": 5.150076069625587e-06, "loss": 0.02963692247867584, "memory(GiB)": 122.96, "step": 56035, "token_acc": 0.9889842632331902, "train_speed(iter/s)": 0.231249 }, { "epoch": 4.271667047793277, "grad_norm": 1.4627041816711426, "learning_rate": 5.144784690035604e-06, "loss": 0.034291785955429074, "memory(GiB)": 122.96, "step": 56040, "token_acc": 0.9893981405969663, "train_speed(iter/s)": 0.231253 }, { "epoch": 4.272048174403537, "grad_norm": 0.9179987907409668, "learning_rate": 5.139495882684042e-06, "loss": 0.02333555817604065, "memory(GiB)": 122.96, "step": 56045, "token_acc": 0.9921609076843734, "train_speed(iter/s)": 0.231254 }, { "epoch": 4.272429301013797, "grad_norm": 1.3345279693603516, "learning_rate": 5.134209647874222e-06, "loss": 0.029423293471336365, "memory(GiB)": 122.96, "step": 56050, "token_acc": 0.9926881720430107, "train_speed(iter/s)": 0.231261 }, { "epoch": 4.272810427624057, "grad_norm": 1.6184276342391968, "learning_rate": 5.128925985909289e-06, "loss": 0.030446305871009827, "memory(GiB)": 122.96, "step": 56055, "token_acc": 0.9899592944369063, "train_speed(iter/s)": 0.231265 }, { "epoch": 4.273191554234317, "grad_norm": 1.2767040729522705, "learning_rate": 5.1236448970922115e-06, "loss": 0.04707072675228119, "memory(GiB)": 122.96, "step": 56060, "token_acc": 0.9856837606837607, "train_speed(iter/s)": 0.23127 }, { "epoch": 4.273572680844577, "grad_norm": 2.884556293487549, "learning_rate": 5.118366381725848e-06, "loss": 0.03010045289993286, "memory(GiB)": 122.96, "step": 56065, "token_acc": 0.9867947178871549, "train_speed(iter/s)": 0.231277 }, { "epoch": 4.273953807454837, "grad_norm": 2.132399082183838, "learning_rate": 5.1130904401129055e-06, "loss": 0.05185847878456116, "memory(GiB)": 122.96, "step": 56070, "token_acc": 0.982716513244411, "train_speed(iter/s)": 0.231282 }, { "epoch": 4.274334934065096, "grad_norm": 0.8832416534423828, "learning_rate": 5.107817072555915e-06, "loss": 0.03215884566307068, "memory(GiB)": 122.96, "step": 56075, "token_acc": 0.9864308214199177, "train_speed(iter/s)": 0.231288 }, { "epoch": 4.274716060675356, "grad_norm": 0.526527464389801, "learning_rate": 5.102546279357301e-06, "loss": 0.023228850960731507, "memory(GiB)": 122.96, "step": 56080, "token_acc": 0.9906984906984907, "train_speed(iter/s)": 0.231292 }, { "epoch": 4.275097187285616, "grad_norm": 3.236386775970459, "learning_rate": 5.097278060819299e-06, "loss": 0.03415350317955017, "memory(GiB)": 122.96, "step": 56085, "token_acc": 0.9870603848706039, "train_speed(iter/s)": 0.231299 }, { "epoch": 4.275478313895876, "grad_norm": 2.2770841121673584, "learning_rate": 5.0920124172440295e-06, "loss": 0.028768056631088258, "memory(GiB)": 122.96, "step": 56090, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.231306 }, { "epoch": 4.275859440506136, "grad_norm": 1.090156078338623, "learning_rate": 5.086749348933456e-06, "loss": 0.030387488007545472, "memory(GiB)": 122.96, "step": 56095, "token_acc": 0.9821570182394924, "train_speed(iter/s)": 0.231312 }, { "epoch": 4.276240567116396, "grad_norm": 4.913140296936035, "learning_rate": 5.08148885618937e-06, "loss": 0.04598428606987, "memory(GiB)": 122.96, "step": 56100, "token_acc": 0.9845581395348837, "train_speed(iter/s)": 0.231316 }, { "epoch": 4.276621693726656, "grad_norm": 1.187787413597107, "learning_rate": 5.076230939313459e-06, "loss": 0.033177369832992555, "memory(GiB)": 122.96, "step": 56105, "token_acc": 0.9844167408726625, "train_speed(iter/s)": 0.231321 }, { "epoch": 4.277002820336916, "grad_norm": 1.08171546459198, "learning_rate": 5.070975598607236e-06, "loss": 0.03173680603504181, "memory(GiB)": 122.96, "step": 56110, "token_acc": 0.9890338438268104, "train_speed(iter/s)": 0.231327 }, { "epoch": 4.277383946947176, "grad_norm": 0.7866901159286499, "learning_rate": 5.065722834372055e-06, "loss": 0.028880318999290465, "memory(GiB)": 122.96, "step": 56115, "token_acc": 0.987602840076637, "train_speed(iter/s)": 0.231328 }, { "epoch": 4.277765073557436, "grad_norm": 1.2954652309417725, "learning_rate": 5.060472646909154e-06, "loss": 0.04040428102016449, "memory(GiB)": 122.96, "step": 56120, "token_acc": 0.9813639968279143, "train_speed(iter/s)": 0.231333 }, { "epoch": 4.278146200167695, "grad_norm": 0.7754842042922974, "learning_rate": 5.055225036519612e-06, "loss": 0.028511819243431092, "memory(GiB)": 122.96, "step": 56125, "token_acc": 0.9905325443786982, "train_speed(iter/s)": 0.231341 }, { "epoch": 4.278527326777955, "grad_norm": 1.3386951684951782, "learning_rate": 5.049980003504329e-06, "loss": 0.03959351181983948, "memory(GiB)": 122.96, "step": 56130, "token_acc": 0.983982683982684, "train_speed(iter/s)": 0.231347 }, { "epoch": 4.278908453388215, "grad_norm": 1.1254962682724, "learning_rate": 5.044737548164102e-06, "loss": 0.038133054971694946, "memory(GiB)": 122.96, "step": 56135, "token_acc": 0.9865223155103845, "train_speed(iter/s)": 0.231353 }, { "epoch": 4.279289579998475, "grad_norm": 0.7885187864303589, "learning_rate": 5.03949767079957e-06, "loss": 0.026303672790527345, "memory(GiB)": 122.96, "step": 56140, "token_acc": 0.9891072697134738, "train_speed(iter/s)": 0.23136 }, { "epoch": 4.279670706608735, "grad_norm": 1.0124826431274414, "learning_rate": 5.0342603717111965e-06, "loss": 0.03664742112159729, "memory(GiB)": 122.96, "step": 56145, "token_acc": 0.9864885152379522, "train_speed(iter/s)": 0.231365 }, { "epoch": 4.280051833218995, "grad_norm": 2.797420024871826, "learning_rate": 5.029025651199321e-06, "loss": 0.05330157876014709, "memory(GiB)": 122.96, "step": 56150, "token_acc": 0.9823985680190931, "train_speed(iter/s)": 0.231372 }, { "epoch": 4.280432959829255, "grad_norm": 1.333232045173645, "learning_rate": 5.023793509564145e-06, "loss": 0.03743847012519837, "memory(GiB)": 122.96, "step": 56155, "token_acc": 0.9856020942408377, "train_speed(iter/s)": 0.231379 }, { "epoch": 4.280814086439515, "grad_norm": 0.6326130032539368, "learning_rate": 5.018563947105686e-06, "loss": 0.01651999056339264, "memory(GiB)": 122.96, "step": 56160, "token_acc": 0.9918573943661971, "train_speed(iter/s)": 0.231381 }, { "epoch": 4.2811952130497755, "grad_norm": 2.9814226627349854, "learning_rate": 5.013336964123844e-06, "loss": 0.027526196837425233, "memory(GiB)": 122.96, "step": 56165, "token_acc": 0.9864195265186273, "train_speed(iter/s)": 0.231387 }, { "epoch": 4.2815763396600355, "grad_norm": 1.218881607055664, "learning_rate": 5.008112560918371e-06, "loss": 0.0338642954826355, "memory(GiB)": 122.96, "step": 56170, "token_acc": 0.9838957055214724, "train_speed(iter/s)": 0.231395 }, { "epoch": 4.281957466270295, "grad_norm": 1.0354249477386475, "learning_rate": 5.002890737788851e-06, "loss": 0.028844505548477173, "memory(GiB)": 122.96, "step": 56175, "token_acc": 0.9888156580786899, "train_speed(iter/s)": 0.2314 }, { "epoch": 4.282338592880555, "grad_norm": 1.47127366065979, "learning_rate": 4.997671495034728e-06, "loss": 0.062215662002563475, "memory(GiB)": 122.96, "step": 56180, "token_acc": 0.9790304396843292, "train_speed(iter/s)": 0.231407 }, { "epoch": 4.282719719490815, "grad_norm": 3.424586296081543, "learning_rate": 4.992454832955318e-06, "loss": 0.023804795742034913, "memory(GiB)": 122.96, "step": 56185, "token_acc": 0.9903578315834584, "train_speed(iter/s)": 0.231414 }, { "epoch": 4.283100846101075, "grad_norm": 1.097609281539917, "learning_rate": 4.987240751849753e-06, "loss": 0.03785799145698547, "memory(GiB)": 122.96, "step": 56190, "token_acc": 0.9852858744394619, "train_speed(iter/s)": 0.231417 }, { "epoch": 4.283481972711335, "grad_norm": 2.096236228942871, "learning_rate": 4.982029252017062e-06, "loss": 0.0590688943862915, "memory(GiB)": 122.96, "step": 56195, "token_acc": 0.9768465365480291, "train_speed(iter/s)": 0.231423 }, { "epoch": 4.283863099321595, "grad_norm": 0.8003596067428589, "learning_rate": 4.976820333756071e-06, "loss": 0.023588380217552184, "memory(GiB)": 122.96, "step": 56200, "token_acc": 0.9905596763317599, "train_speed(iter/s)": 0.231429 }, { "epoch": 4.283863099321595, "eval_loss": 0.05034811794757843, "eval_runtime": 162.042, "eval_samples_per_second": 3.271, "eval_steps_per_second": 3.271, "eval_token_acc": 0.9797301367387506, "step": 56200 }, { "epoch": 4.284244225931855, "grad_norm": 2.373654842376709, "learning_rate": 4.971613997365504e-06, "loss": 0.028111782670021058, "memory(GiB)": 122.96, "step": 56205, "token_acc": 0.979880349610432, "train_speed(iter/s)": 0.231283 }, { "epoch": 4.284625352542115, "grad_norm": 0.951248288154602, "learning_rate": 4.9664102431439266e-06, "loss": 0.024873843789100646, "memory(GiB)": 122.96, "step": 56210, "token_acc": 0.9884479224892864, "train_speed(iter/s)": 0.231288 }, { "epoch": 4.285006479152375, "grad_norm": 1.2183493375778198, "learning_rate": 4.961209071389727e-06, "loss": 0.03191918134689331, "memory(GiB)": 122.96, "step": 56215, "token_acc": 0.9832402234636871, "train_speed(iter/s)": 0.231295 }, { "epoch": 4.285387605762635, "grad_norm": 0.6407907009124756, "learning_rate": 4.9560104824011855e-06, "loss": 0.01554737687110901, "memory(GiB)": 122.96, "step": 56220, "token_acc": 0.9926058704907014, "train_speed(iter/s)": 0.2313 }, { "epoch": 4.285768732372894, "grad_norm": 0.7040459513664246, "learning_rate": 4.950814476476423e-06, "loss": 0.026825445890426635, "memory(GiB)": 122.96, "step": 56225, "token_acc": 0.9886113152094048, "train_speed(iter/s)": 0.231305 }, { "epoch": 4.286149858983154, "grad_norm": 0.8625585436820984, "learning_rate": 4.945621053913385e-06, "loss": 0.030958375334739684, "memory(GiB)": 122.96, "step": 56230, "token_acc": 0.9907730673316708, "train_speed(iter/s)": 0.231311 }, { "epoch": 4.286530985593414, "grad_norm": 1.4686434268951416, "learning_rate": 4.9404302150099e-06, "loss": 0.04194161295890808, "memory(GiB)": 122.96, "step": 56235, "token_acc": 0.9852348993288591, "train_speed(iter/s)": 0.231317 }, { "epoch": 4.286912112203674, "grad_norm": 0.7705807685852051, "learning_rate": 4.935241960063652e-06, "loss": 0.01748828887939453, "memory(GiB)": 122.96, "step": 56240, "token_acc": 0.992377420683972, "train_speed(iter/s)": 0.231324 }, { "epoch": 4.287293238813934, "grad_norm": 0.8479730486869812, "learning_rate": 4.930056289372143e-06, "loss": 0.033521583676338194, "memory(GiB)": 122.96, "step": 56245, "token_acc": 0.9873382104670794, "train_speed(iter/s)": 0.231331 }, { "epoch": 4.287674365424194, "grad_norm": 1.7344906330108643, "learning_rate": 4.924873203232766e-06, "loss": 0.02181389629840851, "memory(GiB)": 122.96, "step": 56250, "token_acc": 0.9881563363600474, "train_speed(iter/s)": 0.231338 }, { "epoch": 4.288055492034454, "grad_norm": 0.8991094827651978, "learning_rate": 4.919692701942724e-06, "loss": 0.03527817726135254, "memory(GiB)": 122.96, "step": 56255, "token_acc": 0.9851587450685704, "train_speed(iter/s)": 0.231344 }, { "epoch": 4.288436618644714, "grad_norm": 0.5780420303344727, "learning_rate": 4.914514785799107e-06, "loss": 0.024499839544296263, "memory(GiB)": 122.96, "step": 56260, "token_acc": 0.9921829762594094, "train_speed(iter/s)": 0.231347 }, { "epoch": 4.288817745254974, "grad_norm": 0.516294538974762, "learning_rate": 4.909339455098855e-06, "loss": 0.01602325141429901, "memory(GiB)": 122.96, "step": 56265, "token_acc": 0.9924199355694523, "train_speed(iter/s)": 0.231352 }, { "epoch": 4.289198871865233, "grad_norm": 1.0164282321929932, "learning_rate": 4.90416671013873e-06, "loss": 0.04259299635887146, "memory(GiB)": 122.96, "step": 56270, "token_acc": 0.9876044969731911, "train_speed(iter/s)": 0.23136 }, { "epoch": 4.289579998475493, "grad_norm": 1.803491234779358, "learning_rate": 4.898996551215379e-06, "loss": 0.033911556005477905, "memory(GiB)": 122.96, "step": 56275, "token_acc": 0.9829749103942652, "train_speed(iter/s)": 0.231363 }, { "epoch": 4.289961125085753, "grad_norm": 0.8840876817703247, "learning_rate": 4.893828978625287e-06, "loss": 0.025949466228485107, "memory(GiB)": 122.96, "step": 56280, "token_acc": 0.9891334633602675, "train_speed(iter/s)": 0.23137 }, { "epoch": 4.290342251696013, "grad_norm": 1.0328891277313232, "learning_rate": 4.888663992664771e-06, "loss": 0.022670072317123414, "memory(GiB)": 122.96, "step": 56285, "token_acc": 0.9907970906931869, "train_speed(iter/s)": 0.231374 }, { "epoch": 4.290723378306273, "grad_norm": 1.0483834743499756, "learning_rate": 4.883501593630035e-06, "loss": 0.02787022292613983, "memory(GiB)": 122.96, "step": 56290, "token_acc": 0.9889898860581232, "train_speed(iter/s)": 0.231378 }, { "epoch": 4.291104504916533, "grad_norm": 1.2503665685653687, "learning_rate": 4.87834178181713e-06, "loss": 0.01933097392320633, "memory(GiB)": 122.96, "step": 56295, "token_acc": 0.9939255884586181, "train_speed(iter/s)": 0.231386 }, { "epoch": 4.291485631526793, "grad_norm": 1.8338510990142822, "learning_rate": 4.8731845575219205e-06, "loss": 0.041626283526420595, "memory(GiB)": 122.96, "step": 56300, "token_acc": 0.9814945613787258, "train_speed(iter/s)": 0.231391 }, { "epoch": 4.291866758137053, "grad_norm": 1.3895814418792725, "learning_rate": 4.868029921040168e-06, "loss": 0.028496125340461732, "memory(GiB)": 122.96, "step": 56305, "token_acc": 0.9932523616734144, "train_speed(iter/s)": 0.231399 }, { "epoch": 4.292247884747313, "grad_norm": 0.9830102324485779, "learning_rate": 4.862877872667465e-06, "loss": 0.021399299800395965, "memory(GiB)": 122.96, "step": 56310, "token_acc": 0.991800878477306, "train_speed(iter/s)": 0.231405 }, { "epoch": 4.292629011357573, "grad_norm": 1.5869462490081787, "learning_rate": 4.857728412699236e-06, "loss": 0.038104474544525146, "memory(GiB)": 122.96, "step": 56315, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.231412 }, { "epoch": 4.293010137967833, "grad_norm": 2.220918893814087, "learning_rate": 4.852581541430818e-06, "loss": 0.022610053420066833, "memory(GiB)": 122.96, "step": 56320, "token_acc": 0.9926870429401837, "train_speed(iter/s)": 0.231418 }, { "epoch": 4.2933912645780925, "grad_norm": 1.1566609144210815, "learning_rate": 4.847437259157328e-06, "loss": 0.03175105154514313, "memory(GiB)": 122.96, "step": 56325, "token_acc": 0.9881324172392255, "train_speed(iter/s)": 0.231425 }, { "epoch": 4.2937723911883525, "grad_norm": 1.0312066078186035, "learning_rate": 4.842295566173782e-06, "loss": 0.04431872069835663, "memory(GiB)": 122.96, "step": 56330, "token_acc": 0.9849669272399278, "train_speed(iter/s)": 0.23143 }, { "epoch": 4.2941535177986125, "grad_norm": 0.6221259832382202, "learning_rate": 4.837156462775033e-06, "loss": 0.025273922085762023, "memory(GiB)": 122.96, "step": 56335, "token_acc": 0.9890532544378698, "train_speed(iter/s)": 0.231434 }, { "epoch": 4.294534644408873, "grad_norm": 0.02295490726828575, "learning_rate": 4.8320199492557674e-06, "loss": 0.04114666879177094, "memory(GiB)": 122.96, "step": 56340, "token_acc": 0.9869186046511628, "train_speed(iter/s)": 0.231439 }, { "epoch": 4.294915771019133, "grad_norm": 0.7442665100097656, "learning_rate": 4.8268860259105595e-06, "loss": 0.03767063319683075, "memory(GiB)": 122.96, "step": 56345, "token_acc": 0.9824986537425956, "train_speed(iter/s)": 0.231441 }, { "epoch": 4.295296897629393, "grad_norm": 1.0683692693710327, "learning_rate": 4.821754693033814e-06, "loss": 0.022991889715194704, "memory(GiB)": 122.96, "step": 56350, "token_acc": 0.9898150072749948, "train_speed(iter/s)": 0.231446 }, { "epoch": 4.295678024239653, "grad_norm": 1.0162808895111084, "learning_rate": 4.816625950919779e-06, "loss": 0.041716134548187254, "memory(GiB)": 122.96, "step": 56355, "token_acc": 0.9801849405548216, "train_speed(iter/s)": 0.231452 }, { "epoch": 4.296059150849913, "grad_norm": 1.8937252759933472, "learning_rate": 4.811499799862562e-06, "loss": 0.02901039719581604, "memory(GiB)": 122.96, "step": 56360, "token_acc": 0.9873657612128869, "train_speed(iter/s)": 0.231456 }, { "epoch": 4.296440277460173, "grad_norm": 1.30540132522583, "learning_rate": 4.806376240156146e-06, "loss": 0.02855513393878937, "memory(GiB)": 122.96, "step": 56365, "token_acc": 0.9898111332007953, "train_speed(iter/s)": 0.231463 }, { "epoch": 4.296821404070432, "grad_norm": 1.186202883720398, "learning_rate": 4.8012552720943184e-06, "loss": 0.05185282230377197, "memory(GiB)": 122.96, "step": 56370, "token_acc": 0.9815416420555227, "train_speed(iter/s)": 0.231468 }, { "epoch": 4.297202530680692, "grad_norm": 0.8980030417442322, "learning_rate": 4.796136895970754e-06, "loss": 0.032538068294525144, "memory(GiB)": 122.96, "step": 56375, "token_acc": 0.9856346121345276, "train_speed(iter/s)": 0.231473 }, { "epoch": 4.297583657290952, "grad_norm": 0.536116898059845, "learning_rate": 4.791021112078975e-06, "loss": 0.0286103755235672, "memory(GiB)": 122.96, "step": 56380, "token_acc": 0.9900656946002243, "train_speed(iter/s)": 0.231478 }, { "epoch": 4.297964783901212, "grad_norm": 1.0454872846603394, "learning_rate": 4.7859079207123294e-06, "loss": 0.0465530127286911, "memory(GiB)": 122.96, "step": 56385, "token_acc": 0.9841947210368263, "train_speed(iter/s)": 0.231482 }, { "epoch": 4.298345910511472, "grad_norm": 0.9710280895233154, "learning_rate": 4.780797322164049e-06, "loss": 0.02104971706867218, "memory(GiB)": 122.96, "step": 56390, "token_acc": 0.9915325994919559, "train_speed(iter/s)": 0.231489 }, { "epoch": 4.298727037121732, "grad_norm": 0.23185372352600098, "learning_rate": 4.775689316727205e-06, "loss": 0.012664739787578583, "memory(GiB)": 122.96, "step": 56395, "token_acc": 0.9934754240974336, "train_speed(iter/s)": 0.231498 }, { "epoch": 4.299108163731992, "grad_norm": 1.638534426689148, "learning_rate": 4.770583904694709e-06, "loss": 0.028618121147155763, "memory(GiB)": 122.96, "step": 56400, "token_acc": 0.9903813122638269, "train_speed(iter/s)": 0.231505 }, { "epoch": 4.299108163731992, "eval_loss": 0.05009985715150833, "eval_runtime": 157.5861, "eval_samples_per_second": 3.363, "eval_steps_per_second": 3.363, "eval_token_acc": 0.9798732004096139, "step": 56400 }, { "epoch": 4.299489290342252, "grad_norm": 1.3606458902359009, "learning_rate": 4.765481086359331e-06, "loss": 0.029828649759292603, "memory(GiB)": 122.96, "step": 56405, "token_acc": 0.9801406883106492, "train_speed(iter/s)": 0.231362 }, { "epoch": 4.299870416952512, "grad_norm": 0.8846539258956909, "learning_rate": 4.760380862013708e-06, "loss": 0.02670426368713379, "memory(GiB)": 122.96, "step": 56410, "token_acc": 0.9867408741201505, "train_speed(iter/s)": 0.231367 }, { "epoch": 4.300251543562772, "grad_norm": 1.2048697471618652, "learning_rate": 4.755283231950297e-06, "loss": 0.03298570215702057, "memory(GiB)": 122.96, "step": 56415, "token_acc": 0.9896592244418332, "train_speed(iter/s)": 0.231374 }, { "epoch": 4.300632670173031, "grad_norm": 0.44583770632743835, "learning_rate": 4.750188196461441e-06, "loss": 0.027601492404937745, "memory(GiB)": 122.96, "step": 56420, "token_acc": 0.9817450495049505, "train_speed(iter/s)": 0.231382 }, { "epoch": 4.301013796783291, "grad_norm": 1.3444002866744995, "learning_rate": 4.745095755839296e-06, "loss": 0.02856728732585907, "memory(GiB)": 122.96, "step": 56425, "token_acc": 0.9865685372585097, "train_speed(iter/s)": 0.231387 }, { "epoch": 4.301394923393551, "grad_norm": 0.7602562308311462, "learning_rate": 4.7400059103759e-06, "loss": 0.025354331731796263, "memory(GiB)": 122.96, "step": 56430, "token_acc": 0.9893758300132802, "train_speed(iter/s)": 0.231394 }, { "epoch": 4.301776050003811, "grad_norm": 0.9740633368492126, "learning_rate": 4.7349186603631464e-06, "loss": 0.029451555013656615, "memory(GiB)": 122.96, "step": 56435, "token_acc": 0.9875577675079986, "train_speed(iter/s)": 0.231402 }, { "epoch": 4.302157176614071, "grad_norm": 1.1155682802200317, "learning_rate": 4.729834006092742e-06, "loss": 0.024903278052806854, "memory(GiB)": 122.96, "step": 56440, "token_acc": 0.988491316174932, "train_speed(iter/s)": 0.231407 }, { "epoch": 4.302538303224331, "grad_norm": 1.5005555152893066, "learning_rate": 4.724751947856265e-06, "loss": 0.02688722014427185, "memory(GiB)": 122.96, "step": 56445, "token_acc": 0.9904775440591245, "train_speed(iter/s)": 0.231408 }, { "epoch": 4.302919429834591, "grad_norm": 0.9731636047363281, "learning_rate": 4.719672485945181e-06, "loss": 0.0447279155254364, "memory(GiB)": 122.96, "step": 56450, "token_acc": 0.9801801801801802, "train_speed(iter/s)": 0.231416 }, { "epoch": 4.303300556444851, "grad_norm": 0.7500284910202026, "learning_rate": 4.714595620650747e-06, "loss": 0.03467515707015991, "memory(GiB)": 122.96, "step": 56455, "token_acc": 0.9882424797679035, "train_speed(iter/s)": 0.231421 }, { "epoch": 4.303681683055111, "grad_norm": 0.6046462655067444, "learning_rate": 4.709521352264112e-06, "loss": 0.03794448971748352, "memory(GiB)": 122.96, "step": 56460, "token_acc": 0.9813963668198731, "train_speed(iter/s)": 0.231428 }, { "epoch": 4.304062809665371, "grad_norm": 0.9128597974777222, "learning_rate": 4.704449681076245e-06, "loss": 0.03026048243045807, "memory(GiB)": 122.96, "step": 56465, "token_acc": 0.9906143344709898, "train_speed(iter/s)": 0.231434 }, { "epoch": 4.30444393627563, "grad_norm": 0.6909599900245667, "learning_rate": 4.699380607377996e-06, "loss": 0.021975766122341155, "memory(GiB)": 122.96, "step": 56470, "token_acc": 0.9922598803799695, "train_speed(iter/s)": 0.231437 }, { "epoch": 4.30482506288589, "grad_norm": 1.0626152753829956, "learning_rate": 4.694314131460048e-06, "loss": 0.02790490388870239, "memory(GiB)": 122.96, "step": 56475, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.231443 }, { "epoch": 4.30520618949615, "grad_norm": 0.7717025279998779, "learning_rate": 4.689250253612937e-06, "loss": 0.03643999397754669, "memory(GiB)": 122.96, "step": 56480, "token_acc": 0.9852317790236566, "train_speed(iter/s)": 0.231446 }, { "epoch": 4.30558731610641, "grad_norm": 0.44502148032188416, "learning_rate": 4.6841889741270575e-06, "loss": 0.027820149064064027, "memory(GiB)": 122.96, "step": 56485, "token_acc": 0.9867201549315258, "train_speed(iter/s)": 0.231451 }, { "epoch": 4.3059684427166705, "grad_norm": 0.9880298376083374, "learning_rate": 4.679130293292655e-06, "loss": 0.048440805077552794, "memory(GiB)": 122.96, "step": 56490, "token_acc": 0.9868467204489653, "train_speed(iter/s)": 0.231456 }, { "epoch": 4.3063495693269305, "grad_norm": 1.061919093132019, "learning_rate": 4.674074211399809e-06, "loss": 0.02590230107307434, "memory(GiB)": 122.96, "step": 56495, "token_acc": 0.9876499647141849, "train_speed(iter/s)": 0.231464 }, { "epoch": 4.3067306959371905, "grad_norm": 1.260136604309082, "learning_rate": 4.669020728738472e-06, "loss": 0.029977703094482423, "memory(GiB)": 122.96, "step": 56500, "token_acc": 0.987886724504829, "train_speed(iter/s)": 0.231469 }, { "epoch": 4.3071118225474505, "grad_norm": 1.5586352348327637, "learning_rate": 4.663969845598437e-06, "loss": 0.030046704411506652, "memory(GiB)": 122.96, "step": 56505, "token_acc": 0.9878903760356915, "train_speed(iter/s)": 0.231475 }, { "epoch": 4.3074929491577105, "grad_norm": 1.5927186012268066, "learning_rate": 4.658921562269342e-06, "loss": 0.0444591611623764, "memory(GiB)": 122.96, "step": 56510, "token_acc": 0.9801111436092425, "train_speed(iter/s)": 0.231478 }, { "epoch": 4.30787407576797, "grad_norm": 0.886110246181488, "learning_rate": 4.653875879040686e-06, "loss": 0.030528730154037474, "memory(GiB)": 122.96, "step": 56515, "token_acc": 0.9876985014538134, "train_speed(iter/s)": 0.231484 }, { "epoch": 4.30825520237823, "grad_norm": 1.7657686471939087, "learning_rate": 4.6488327962018245e-06, "loss": 0.03859785795211792, "memory(GiB)": 122.96, "step": 56520, "token_acc": 0.9849744245524297, "train_speed(iter/s)": 0.231492 }, { "epoch": 4.30863632898849, "grad_norm": 1.267701506614685, "learning_rate": 4.643792314041939e-06, "loss": 0.027880534529685974, "memory(GiB)": 122.96, "step": 56525, "token_acc": 0.9889780759554331, "train_speed(iter/s)": 0.231493 }, { "epoch": 4.30901745559875, "grad_norm": 0.6529945731163025, "learning_rate": 4.63875443285009e-06, "loss": 0.01989876925945282, "memory(GiB)": 122.96, "step": 56530, "token_acc": 0.9910699241786015, "train_speed(iter/s)": 0.231498 }, { "epoch": 4.30939858220901, "grad_norm": 1.4095760583877563, "learning_rate": 4.633719152915173e-06, "loss": 0.03992711901664734, "memory(GiB)": 122.96, "step": 56535, "token_acc": 0.985852683376903, "train_speed(iter/s)": 0.231503 }, { "epoch": 4.30977970881927, "grad_norm": 1.404532551765442, "learning_rate": 4.628686474525934e-06, "loss": 0.027997109293937682, "memory(GiB)": 122.96, "step": 56540, "token_acc": 0.9904420549581839, "train_speed(iter/s)": 0.23151 }, { "epoch": 4.31016083542953, "grad_norm": 0.6581978797912598, "learning_rate": 4.623656397970977e-06, "loss": 0.031526225805282596, "memory(GiB)": 122.96, "step": 56545, "token_acc": 0.9917929292929293, "train_speed(iter/s)": 0.231515 }, { "epoch": 4.31054196203979, "grad_norm": 0.9870343208312988, "learning_rate": 4.618628923538759e-06, "loss": 0.03007081151008606, "memory(GiB)": 122.96, "step": 56550, "token_acc": 0.9855805717176828, "train_speed(iter/s)": 0.231522 }, { "epoch": 4.31092308865005, "grad_norm": 0.45915722846984863, "learning_rate": 4.6136040515175724e-06, "loss": 0.03478447198867798, "memory(GiB)": 122.96, "step": 56555, "token_acc": 0.9866298811544991, "train_speed(iter/s)": 0.231529 }, { "epoch": 4.31130421526031, "grad_norm": 1.6996009349822998, "learning_rate": 4.60858178219557e-06, "loss": 0.032695254683494566, "memory(GiB)": 122.96, "step": 56560, "token_acc": 0.986712777575537, "train_speed(iter/s)": 0.231534 }, { "epoch": 4.31168534187057, "grad_norm": 2.0134456157684326, "learning_rate": 4.603562115860771e-06, "loss": 0.03146355450153351, "memory(GiB)": 122.96, "step": 56565, "token_acc": 0.9896981745888307, "train_speed(iter/s)": 0.231538 }, { "epoch": 4.312066468480829, "grad_norm": 0.9508880376815796, "learning_rate": 4.5985450528010124e-06, "loss": 0.029457780718803405, "memory(GiB)": 122.96, "step": 56570, "token_acc": 0.9871215544509715, "train_speed(iter/s)": 0.231544 }, { "epoch": 4.312447595091089, "grad_norm": 0.8441643714904785, "learning_rate": 4.593530593304007e-06, "loss": 0.04024452865123749, "memory(GiB)": 122.96, "step": 56575, "token_acc": 0.9890534449452673, "train_speed(iter/s)": 0.231553 }, { "epoch": 4.312828721701349, "grad_norm": 0.7354947328567505, "learning_rate": 4.58851873765731e-06, "loss": 0.027288484573364257, "memory(GiB)": 122.96, "step": 56580, "token_acc": 0.9886492622020431, "train_speed(iter/s)": 0.231557 }, { "epoch": 4.313209848311609, "grad_norm": 0.7304196953773499, "learning_rate": 4.583509486148324e-06, "loss": 0.029281842708587646, "memory(GiB)": 122.96, "step": 56585, "token_acc": 0.9891720586293411, "train_speed(iter/s)": 0.23156 }, { "epoch": 4.313590974921869, "grad_norm": 0.9571777582168579, "learning_rate": 4.578502839064325e-06, "loss": 0.04082332849502564, "memory(GiB)": 122.96, "step": 56590, "token_acc": 0.9853801169590644, "train_speed(iter/s)": 0.231564 }, { "epoch": 4.313972101532129, "grad_norm": 1.1139880418777466, "learning_rate": 4.573498796692393e-06, "loss": 0.03275468349456787, "memory(GiB)": 122.96, "step": 56595, "token_acc": 0.9845258375922771, "train_speed(iter/s)": 0.231567 }, { "epoch": 4.314353228142389, "grad_norm": 2.072366237640381, "learning_rate": 4.5684973593195066e-06, "loss": 0.04082944393157959, "memory(GiB)": 122.96, "step": 56600, "token_acc": 0.9874939874939875, "train_speed(iter/s)": 0.231573 }, { "epoch": 4.314353228142389, "eval_loss": 0.050422653555870056, "eval_runtime": 159.7732, "eval_samples_per_second": 3.317, "eval_steps_per_second": 3.317, "eval_token_acc": 0.9798957894102764, "step": 56600 }, { "epoch": 4.314734354752649, "grad_norm": 1.5450422763824463, "learning_rate": 4.563498527232474e-06, "loss": 0.03235548734664917, "memory(GiB)": 122.96, "step": 56605, "token_acc": 0.9799590489939015, "train_speed(iter/s)": 0.231429 }, { "epoch": 4.315115481362909, "grad_norm": 0.8790038824081421, "learning_rate": 4.5585023007179425e-06, "loss": 0.029085662961006165, "memory(GiB)": 122.96, "step": 56610, "token_acc": 0.9883879781420765, "train_speed(iter/s)": 0.231437 }, { "epoch": 4.315496607973168, "grad_norm": 2.1348702907562256, "learning_rate": 4.553508680062424e-06, "loss": 0.037862366437911986, "memory(GiB)": 122.96, "step": 56615, "token_acc": 0.9844626672421234, "train_speed(iter/s)": 0.231442 }, { "epoch": 4.315877734583428, "grad_norm": 0.885884165763855, "learning_rate": 4.548517665552299e-06, "loss": 0.03170434534549713, "memory(GiB)": 122.96, "step": 56620, "token_acc": 0.9897990726429675, "train_speed(iter/s)": 0.231449 }, { "epoch": 4.316258861193688, "grad_norm": 0.7388194799423218, "learning_rate": 4.543529257473755e-06, "loss": 0.024867814779281617, "memory(GiB)": 122.96, "step": 56625, "token_acc": 0.9915458937198067, "train_speed(iter/s)": 0.231451 }, { "epoch": 4.316639987803948, "grad_norm": 1.0172159671783447, "learning_rate": 4.5385434561128645e-06, "loss": 0.022394607961177825, "memory(GiB)": 122.96, "step": 56630, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.231457 }, { "epoch": 4.317021114414208, "grad_norm": 0.9668006896972656, "learning_rate": 4.533560261755554e-06, "loss": 0.029089680314064024, "memory(GiB)": 122.96, "step": 56635, "token_acc": 0.9838333033950063, "train_speed(iter/s)": 0.231464 }, { "epoch": 4.317402241024468, "grad_norm": 0.16417518258094788, "learning_rate": 4.528579674687555e-06, "loss": 0.03749719262123108, "memory(GiB)": 122.96, "step": 56640, "token_acc": 0.9836100468284377, "train_speed(iter/s)": 0.23147 }, { "epoch": 4.317783367634728, "grad_norm": 1.5132347345352173, "learning_rate": 4.523601695194513e-06, "loss": 0.042767000198364255, "memory(GiB)": 122.96, "step": 56645, "token_acc": 0.9847915242652084, "train_speed(iter/s)": 0.231474 }, { "epoch": 4.318164494244988, "grad_norm": 1.5520461797714233, "learning_rate": 4.518626323561864e-06, "loss": 0.04277833998203277, "memory(GiB)": 122.96, "step": 56650, "token_acc": 0.9887400504756358, "train_speed(iter/s)": 0.231479 }, { "epoch": 4.318545620855248, "grad_norm": 0.68074631690979, "learning_rate": 4.513653560074943e-06, "loss": 0.03450406789779663, "memory(GiB)": 122.96, "step": 56655, "token_acc": 0.9869053339740509, "train_speed(iter/s)": 0.231483 }, { "epoch": 4.318926747465508, "grad_norm": 0.6428031921386719, "learning_rate": 4.5086834050189096e-06, "loss": 0.03410144746303558, "memory(GiB)": 122.96, "step": 56660, "token_acc": 0.9883095627776479, "train_speed(iter/s)": 0.231489 }, { "epoch": 4.319307874075768, "grad_norm": 1.3498306274414062, "learning_rate": 4.50371585867877e-06, "loss": 0.03771767318248749, "memory(GiB)": 122.96, "step": 56665, "token_acc": 0.9883749690823646, "train_speed(iter/s)": 0.231495 }, { "epoch": 4.319689000686028, "grad_norm": 1.3586755990982056, "learning_rate": 4.498750921339401e-06, "loss": 0.02689318358898163, "memory(GiB)": 122.96, "step": 56670, "token_acc": 0.9908015768725361, "train_speed(iter/s)": 0.231502 }, { "epoch": 4.320070127296288, "grad_norm": 0.4459330141544342, "learning_rate": 4.493788593285519e-06, "loss": 0.036771321296691896, "memory(GiB)": 122.96, "step": 56675, "token_acc": 0.9887887447790723, "train_speed(iter/s)": 0.231508 }, { "epoch": 4.320451253906548, "grad_norm": 1.7902473211288452, "learning_rate": 4.4888288748016816e-06, "loss": 0.026561135053634645, "memory(GiB)": 122.96, "step": 56680, "token_acc": 0.9882121807465619, "train_speed(iter/s)": 0.231515 }, { "epoch": 4.320832380516808, "grad_norm": 1.2405246496200562, "learning_rate": 4.483871766172309e-06, "loss": 0.031094864010810852, "memory(GiB)": 122.96, "step": 56685, "token_acc": 0.9933903576982893, "train_speed(iter/s)": 0.231523 }, { "epoch": 4.321213507127068, "grad_norm": 2.102415084838867, "learning_rate": 4.478917267681682e-06, "loss": 0.033961498737335206, "memory(GiB)": 122.96, "step": 56690, "token_acc": 0.9860788863109049, "train_speed(iter/s)": 0.23153 }, { "epoch": 4.321594633737328, "grad_norm": 1.4777919054031372, "learning_rate": 4.473965379613893e-06, "loss": 0.03399845063686371, "memory(GiB)": 122.96, "step": 56695, "token_acc": 0.9862700228832952, "train_speed(iter/s)": 0.231536 }, { "epoch": 4.321975760347588, "grad_norm": 0.5477937459945679, "learning_rate": 4.469016102252927e-06, "loss": 0.021122771501541137, "memory(GiB)": 122.96, "step": 56700, "token_acc": 0.9894996911673872, "train_speed(iter/s)": 0.23154 }, { "epoch": 4.322356886957848, "grad_norm": 0.5334409475326538, "learning_rate": 4.464069435882601e-06, "loss": 0.03244448900222778, "memory(GiB)": 122.96, "step": 56705, "token_acc": 0.9898544331715924, "train_speed(iter/s)": 0.231547 }, { "epoch": 4.322738013568108, "grad_norm": 1.236446499824524, "learning_rate": 4.459125380786577e-06, "loss": 0.026854994893074035, "memory(GiB)": 122.96, "step": 56710, "token_acc": 0.9895918606010993, "train_speed(iter/s)": 0.231548 }, { "epoch": 4.323119140178367, "grad_norm": 1.4728659391403198, "learning_rate": 4.454183937248374e-06, "loss": 0.036658179759979245, "memory(GiB)": 122.96, "step": 56715, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.231555 }, { "epoch": 4.323500266788627, "grad_norm": 1.3209854364395142, "learning_rate": 4.449245105551364e-06, "loss": 0.023862193524837493, "memory(GiB)": 122.96, "step": 56720, "token_acc": 0.9863620866007501, "train_speed(iter/s)": 0.231562 }, { "epoch": 4.323881393398887, "grad_norm": 1.103216290473938, "learning_rate": 4.444308885978765e-06, "loss": 0.023098348081111907, "memory(GiB)": 122.96, "step": 56725, "token_acc": 0.9890543692804593, "train_speed(iter/s)": 0.231568 }, { "epoch": 4.324262520009147, "grad_norm": 1.09901762008667, "learning_rate": 4.439375278813657e-06, "loss": 0.03125913441181183, "memory(GiB)": 122.96, "step": 56730, "token_acc": 0.9911807937285644, "train_speed(iter/s)": 0.231576 }, { "epoch": 4.324643646619407, "grad_norm": 0.3981832265853882, "learning_rate": 4.43444428433894e-06, "loss": 0.034559914469718934, "memory(GiB)": 122.96, "step": 56735, "token_acc": 0.9811719906917706, "train_speed(iter/s)": 0.231583 }, { "epoch": 4.325024773229667, "grad_norm": 0.9162402749061584, "learning_rate": 4.4295159028373945e-06, "loss": 0.030220368504524232, "memory(GiB)": 122.96, "step": 56740, "token_acc": 0.9841842397336293, "train_speed(iter/s)": 0.231589 }, { "epoch": 4.325405899839927, "grad_norm": 0.8675771951675415, "learning_rate": 4.42459013459165e-06, "loss": 0.03574210107326507, "memory(GiB)": 122.96, "step": 56745, "token_acc": 0.9850008823010411, "train_speed(iter/s)": 0.231595 }, { "epoch": 4.325787026450187, "grad_norm": 1.582871675491333, "learning_rate": 4.4196669798841575e-06, "loss": 0.025450804829597475, "memory(GiB)": 122.96, "step": 56750, "token_acc": 0.9886808881149325, "train_speed(iter/s)": 0.231601 }, { "epoch": 4.326168153060447, "grad_norm": 1.7442567348480225, "learning_rate": 4.414746438997242e-06, "loss": 0.042752915620803834, "memory(GiB)": 122.96, "step": 56755, "token_acc": 0.9821664464993395, "train_speed(iter/s)": 0.231608 }, { "epoch": 4.326549279670707, "grad_norm": 1.5975000858306885, "learning_rate": 4.409828512213082e-06, "loss": 0.02404576390981674, "memory(GiB)": 122.96, "step": 56760, "token_acc": 0.9903325599381284, "train_speed(iter/s)": 0.231616 }, { "epoch": 4.326930406280966, "grad_norm": 1.3463395833969116, "learning_rate": 4.404913199813687e-06, "loss": 0.057933861017227174, "memory(GiB)": 122.96, "step": 56765, "token_acc": 0.9804660726525017, "train_speed(iter/s)": 0.231622 }, { "epoch": 4.327311532891226, "grad_norm": 0.6645733118057251, "learning_rate": 4.400000502080936e-06, "loss": 0.0376004308462143, "memory(GiB)": 122.96, "step": 56770, "token_acc": 0.9880416213697779, "train_speed(iter/s)": 0.231627 }, { "epoch": 4.327692659501486, "grad_norm": 2.9712092876434326, "learning_rate": 4.395090419296549e-06, "loss": 0.062117534875869754, "memory(GiB)": 122.96, "step": 56775, "token_acc": 0.9764075067024128, "train_speed(iter/s)": 0.231635 }, { "epoch": 4.328073786111746, "grad_norm": 1.65714430809021, "learning_rate": 4.3901829517420885e-06, "loss": 0.029206568002700807, "memory(GiB)": 122.96, "step": 56780, "token_acc": 0.9886510558827755, "train_speed(iter/s)": 0.23164 }, { "epoch": 4.328454912722006, "grad_norm": 1.311539649963379, "learning_rate": 4.3852780996989805e-06, "loss": 0.026852670311927795, "memory(GiB)": 122.96, "step": 56785, "token_acc": 0.986232790988736, "train_speed(iter/s)": 0.231645 }, { "epoch": 4.328836039332266, "grad_norm": 1.1189994812011719, "learning_rate": 4.380375863448505e-06, "loss": 0.03580468595027923, "memory(GiB)": 122.96, "step": 56790, "token_acc": 0.9857682899710919, "train_speed(iter/s)": 0.231651 }, { "epoch": 4.329217165942526, "grad_norm": 0.11035322397947311, "learning_rate": 4.375476243271765e-06, "loss": 0.02525656223297119, "memory(GiB)": 122.96, "step": 56795, "token_acc": 0.9911786786786787, "train_speed(iter/s)": 0.231656 }, { "epoch": 4.329598292552786, "grad_norm": 1.3669482469558716, "learning_rate": 4.3705792394497346e-06, "loss": 0.0252657413482666, "memory(GiB)": 122.96, "step": 56800, "token_acc": 0.9850980392156863, "train_speed(iter/s)": 0.231664 }, { "epoch": 4.329598292552786, "eval_loss": 0.05016002804040909, "eval_runtime": 157.8678, "eval_samples_per_second": 3.357, "eval_steps_per_second": 3.357, "eval_token_acc": 0.9797150774049757, "step": 56800 }, { "epoch": 4.329979419163046, "grad_norm": 0.8016194701194763, "learning_rate": 4.365684852263252e-06, "loss": 0.032298633456230165, "memory(GiB)": 122.96, "step": 56805, "token_acc": 0.9800191735203593, "train_speed(iter/s)": 0.231519 }, { "epoch": 4.330360545773306, "grad_norm": 1.410971999168396, "learning_rate": 4.3607930819929645e-06, "loss": 0.02681463062763214, "memory(GiB)": 122.96, "step": 56810, "token_acc": 0.9857111453066608, "train_speed(iter/s)": 0.231525 }, { "epoch": 4.330741672383565, "grad_norm": 1.290440559387207, "learning_rate": 4.3559039289194085e-06, "loss": 0.028167355060577392, "memory(GiB)": 122.96, "step": 56815, "token_acc": 0.9840619307832422, "train_speed(iter/s)": 0.231534 }, { "epoch": 4.3311227989938255, "grad_norm": 1.3759077787399292, "learning_rate": 4.351017393322937e-06, "loss": 0.02801782488822937, "memory(GiB)": 122.96, "step": 56820, "token_acc": 0.9869825566258786, "train_speed(iter/s)": 0.231541 }, { "epoch": 4.3315039256040855, "grad_norm": 1.0497753620147705, "learning_rate": 4.346133475483782e-06, "loss": 0.03208264112472534, "memory(GiB)": 122.96, "step": 56825, "token_acc": 0.9856242118537201, "train_speed(iter/s)": 0.231548 }, { "epoch": 4.3318850522143455, "grad_norm": 0.6383605003356934, "learning_rate": 4.341252175682026e-06, "loss": 0.023679612576961516, "memory(GiB)": 122.96, "step": 56830, "token_acc": 0.9899425287356322, "train_speed(iter/s)": 0.231551 }, { "epoch": 4.3322661788246055, "grad_norm": 1.1624906063079834, "learning_rate": 4.33637349419756e-06, "loss": 0.04364684522151947, "memory(GiB)": 122.96, "step": 56835, "token_acc": 0.9823067935236187, "train_speed(iter/s)": 0.231556 }, { "epoch": 4.3326473054348655, "grad_norm": 1.170672059059143, "learning_rate": 4.331497431310172e-06, "loss": 0.03822368383407593, "memory(GiB)": 122.96, "step": 56840, "token_acc": 0.984759671746776, "train_speed(iter/s)": 0.231562 }, { "epoch": 4.3330284320451256, "grad_norm": 0.6033211350440979, "learning_rate": 4.326623987299477e-06, "loss": 0.023332826793193817, "memory(GiB)": 122.96, "step": 56845, "token_acc": 0.9873543268038681, "train_speed(iter/s)": 0.23157 }, { "epoch": 4.333409558655386, "grad_norm": 0.6579980254173279, "learning_rate": 4.321753162444952e-06, "loss": 0.026081454753875733, "memory(GiB)": 122.96, "step": 56850, "token_acc": 0.9901194852941176, "train_speed(iter/s)": 0.231577 }, { "epoch": 4.333790685265646, "grad_norm": 0.5874403715133667, "learning_rate": 4.316884957025913e-06, "loss": 0.025949698686599732, "memory(GiB)": 122.96, "step": 56855, "token_acc": 0.9903366583541147, "train_speed(iter/s)": 0.231582 }, { "epoch": 4.334171811875905, "grad_norm": 1.035077452659607, "learning_rate": 4.312019371321518e-06, "loss": 0.03357393443584442, "memory(GiB)": 122.96, "step": 56860, "token_acc": 0.9874199955859634, "train_speed(iter/s)": 0.231589 }, { "epoch": 4.334552938486165, "grad_norm": 0.9752175211906433, "learning_rate": 4.307156405610796e-06, "loss": 0.01783519983291626, "memory(GiB)": 122.96, "step": 56865, "token_acc": 0.9952012796587577, "train_speed(iter/s)": 0.231596 }, { "epoch": 4.334934065096425, "grad_norm": 0.7628051042556763, "learning_rate": 4.302296060172623e-06, "loss": 0.02231002151966095, "memory(GiB)": 122.96, "step": 56870, "token_acc": 0.9929411764705882, "train_speed(iter/s)": 0.231602 }, { "epoch": 4.335315191706685, "grad_norm": 0.9223611950874329, "learning_rate": 4.297438335285692e-06, "loss": 0.03773881494998932, "memory(GiB)": 122.96, "step": 56875, "token_acc": 0.9842940973420781, "train_speed(iter/s)": 0.231606 }, { "epoch": 4.335696318316945, "grad_norm": 0.9382603168487549, "learning_rate": 4.292583231228592e-06, "loss": 0.031791788339614865, "memory(GiB)": 122.96, "step": 56880, "token_acc": 0.9861517976031957, "train_speed(iter/s)": 0.231613 }, { "epoch": 4.336077444927205, "grad_norm": 0.5173014998435974, "learning_rate": 4.287730748279744e-06, "loss": 0.0247573122382164, "memory(GiB)": 122.96, "step": 56885, "token_acc": 0.9878364389233955, "train_speed(iter/s)": 0.231619 }, { "epoch": 4.336458571537465, "grad_norm": 0.17217504978179932, "learning_rate": 4.2828808867174e-06, "loss": 0.021451196074485777, "memory(GiB)": 122.96, "step": 56890, "token_acc": 0.991858098284385, "train_speed(iter/s)": 0.231625 }, { "epoch": 4.336839698147725, "grad_norm": 0.5322732925415039, "learning_rate": 4.2780336468196795e-06, "loss": 0.038912123441696166, "memory(GiB)": 122.96, "step": 56895, "token_acc": 0.9874702959602506, "train_speed(iter/s)": 0.231633 }, { "epoch": 4.337220824757985, "grad_norm": 1.4924726486206055, "learning_rate": 4.273189028864566e-06, "loss": 0.036540687084198, "memory(GiB)": 122.96, "step": 56900, "token_acc": 0.9869461298032727, "train_speed(iter/s)": 0.231638 }, { "epoch": 4.337601951368245, "grad_norm": 0.8098247647285461, "learning_rate": 4.268347033129849e-06, "loss": 0.030013051629066468, "memory(GiB)": 122.96, "step": 56905, "token_acc": 0.9900145243282498, "train_speed(iter/s)": 0.231642 }, { "epoch": 4.337983077978505, "grad_norm": 0.891287624835968, "learning_rate": 4.263507659893212e-06, "loss": 0.03570747971534729, "memory(GiB)": 122.96, "step": 56910, "token_acc": 0.9874963224477787, "train_speed(iter/s)": 0.231646 }, { "epoch": 4.338364204588764, "grad_norm": 0.13738885521888733, "learning_rate": 4.258670909432177e-06, "loss": 0.01868680864572525, "memory(GiB)": 122.96, "step": 56915, "token_acc": 0.9872080916257623, "train_speed(iter/s)": 0.231652 }, { "epoch": 4.338745331199024, "grad_norm": 2.4237728118896484, "learning_rate": 4.253836782024095e-06, "loss": 0.02780998945236206, "memory(GiB)": 122.96, "step": 56920, "token_acc": 0.9917686318131257, "train_speed(iter/s)": 0.231657 }, { "epoch": 4.339126457809284, "grad_norm": 2.1395299434661865, "learning_rate": 4.249005277946177e-06, "loss": 0.03003830313682556, "memory(GiB)": 122.96, "step": 56925, "token_acc": 0.990765679107349, "train_speed(iter/s)": 0.231665 }, { "epoch": 4.339507584419544, "grad_norm": 3.5259904861450195, "learning_rate": 4.244176397475513e-06, "loss": 0.031600204110145566, "memory(GiB)": 122.96, "step": 56930, "token_acc": 0.9919331604724864, "train_speed(iter/s)": 0.231672 }, { "epoch": 4.339888711029804, "grad_norm": 0.797493577003479, "learning_rate": 4.239350140888987e-06, "loss": 0.028476014733314514, "memory(GiB)": 122.96, "step": 56935, "token_acc": 0.9902680866691149, "train_speed(iter/s)": 0.231677 }, { "epoch": 4.340269837640064, "grad_norm": 0.845337450504303, "learning_rate": 4.234526508463371e-06, "loss": 0.026389342546463013, "memory(GiB)": 122.96, "step": 56940, "token_acc": 0.9892737430167597, "train_speed(iter/s)": 0.231682 }, { "epoch": 4.340650964250324, "grad_norm": 0.6613597869873047, "learning_rate": 4.229705500475295e-06, "loss": 0.022658462822437286, "memory(GiB)": 122.96, "step": 56945, "token_acc": 0.9916149068322981, "train_speed(iter/s)": 0.231686 }, { "epoch": 4.341032090860584, "grad_norm": 2.4841151237487793, "learning_rate": 4.224887117201198e-06, "loss": 0.033138760924339296, "memory(GiB)": 122.96, "step": 56950, "token_acc": 0.9876387487386479, "train_speed(iter/s)": 0.231693 }, { "epoch": 4.341413217470844, "grad_norm": 1.0868453979492188, "learning_rate": 4.2200713589174046e-06, "loss": 0.017836572229862215, "memory(GiB)": 122.96, "step": 56955, "token_acc": 0.9870987098709871, "train_speed(iter/s)": 0.2317 }, { "epoch": 4.341794344081103, "grad_norm": 0.6126849055290222, "learning_rate": 4.2152582259000814e-06, "loss": 0.034357988834381105, "memory(GiB)": 122.96, "step": 56960, "token_acc": 0.9823446327683616, "train_speed(iter/s)": 0.231706 }, { "epoch": 4.342175470691363, "grad_norm": 0.9399639368057251, "learning_rate": 4.210447718425226e-06, "loss": 0.03803393840789795, "memory(GiB)": 122.96, "step": 56965, "token_acc": 0.9838323353293413, "train_speed(iter/s)": 0.231711 }, { "epoch": 4.342556597301623, "grad_norm": 0.676488995552063, "learning_rate": 4.205639836768699e-06, "loss": 0.041442877054214476, "memory(GiB)": 122.96, "step": 56970, "token_acc": 0.9829625457617572, "train_speed(iter/s)": 0.231715 }, { "epoch": 4.342937723911883, "grad_norm": 1.228368878364563, "learning_rate": 4.200834581206231e-06, "loss": 0.026723092794418334, "memory(GiB)": 122.96, "step": 56975, "token_acc": 0.9879953954941622, "train_speed(iter/s)": 0.231719 }, { "epoch": 4.343318850522143, "grad_norm": 1.7128833532333374, "learning_rate": 4.196031952013341e-06, "loss": 0.03198819160461426, "memory(GiB)": 122.96, "step": 56980, "token_acc": 0.9892974753018661, "train_speed(iter/s)": 0.231725 }, { "epoch": 4.343699977132403, "grad_norm": 0.8593846559524536, "learning_rate": 4.191231949465485e-06, "loss": 0.03973855972290039, "memory(GiB)": 122.96, "step": 56985, "token_acc": 0.9831052542659233, "train_speed(iter/s)": 0.231731 }, { "epoch": 4.344081103742663, "grad_norm": 1.1370562314987183, "learning_rate": 4.186434573837883e-06, "loss": 0.029904705286026, "memory(GiB)": 122.96, "step": 56990, "token_acc": 0.9882690302398331, "train_speed(iter/s)": 0.231738 }, { "epoch": 4.3444622303529234, "grad_norm": 0.47150859236717224, "learning_rate": 4.181639825405664e-06, "loss": 0.02946779429912567, "memory(GiB)": 122.96, "step": 56995, "token_acc": 0.9910820451843044, "train_speed(iter/s)": 0.231739 }, { "epoch": 4.3448433569631835, "grad_norm": 0.9129472374916077, "learning_rate": 4.1768477044437815e-06, "loss": 0.03758901655673981, "memory(GiB)": 122.96, "step": 57000, "token_acc": 0.9865102639296187, "train_speed(iter/s)": 0.231743 }, { "epoch": 4.3448433569631835, "eval_loss": 0.05087516829371452, "eval_runtime": 160.7345, "eval_samples_per_second": 3.297, "eval_steps_per_second": 3.297, "eval_token_acc": 0.9796623697367628, "step": 57000 }, { "epoch": 4.3452244835734435, "grad_norm": 2.7259042263031006, "learning_rate": 4.1720582112270315e-06, "loss": 0.05784919261932373, "memory(GiB)": 122.96, "step": 57005, "token_acc": 0.979751963379499, "train_speed(iter/s)": 0.231598 }, { "epoch": 4.345605610183703, "grad_norm": 0.16044487059116364, "learning_rate": 4.167271346030077e-06, "loss": 0.01622035950422287, "memory(GiB)": 122.96, "step": 57010, "token_acc": 0.9943492183085327, "train_speed(iter/s)": 0.231601 }, { "epoch": 4.345986736793963, "grad_norm": 0.8297127485275269, "learning_rate": 4.162487109127428e-06, "loss": 0.029871591925621034, "memory(GiB)": 122.96, "step": 57015, "token_acc": 0.9841027208804647, "train_speed(iter/s)": 0.231609 }, { "epoch": 4.346367863404223, "grad_norm": 0.9313675761222839, "learning_rate": 4.157705500793424e-06, "loss": 0.031971073150634764, "memory(GiB)": 122.96, "step": 57020, "token_acc": 0.9851380042462845, "train_speed(iter/s)": 0.231614 }, { "epoch": 4.346748990014483, "grad_norm": 0.9252403974533081, "learning_rate": 4.15292652130228e-06, "loss": 0.03501693606376648, "memory(GiB)": 122.96, "step": 57025, "token_acc": 0.9825647710607789, "train_speed(iter/s)": 0.23162 }, { "epoch": 4.347130116624743, "grad_norm": 1.4747933149337769, "learning_rate": 4.148150170928039e-06, "loss": 0.020332756638526916, "memory(GiB)": 122.96, "step": 57030, "token_acc": 0.9904823707549211, "train_speed(iter/s)": 0.231625 }, { "epoch": 4.347511243235003, "grad_norm": 0.9133774042129517, "learning_rate": 4.143376449944608e-06, "loss": 0.036401450634002686, "memory(GiB)": 122.96, "step": 57035, "token_acc": 0.9850871665616467, "train_speed(iter/s)": 0.231626 }, { "epoch": 4.347892369845263, "grad_norm": 1.055947184562683, "learning_rate": 4.138605358625741e-06, "loss": 0.03411855399608612, "memory(GiB)": 122.96, "step": 57040, "token_acc": 0.9878478337442762, "train_speed(iter/s)": 0.231632 }, { "epoch": 4.348273496455523, "grad_norm": 1.7831945419311523, "learning_rate": 4.13383689724503e-06, "loss": 0.02385149598121643, "memory(GiB)": 122.96, "step": 57045, "token_acc": 0.9891764705882353, "train_speed(iter/s)": 0.231641 }, { "epoch": 4.348654623065783, "grad_norm": 2.0175955295562744, "learning_rate": 4.129071066075924e-06, "loss": 0.056249606609344485, "memory(GiB)": 122.96, "step": 57050, "token_acc": 0.978098142500693, "train_speed(iter/s)": 0.231648 }, { "epoch": 4.349035749676043, "grad_norm": 1.5738128423690796, "learning_rate": 4.1243078653917355e-06, "loss": 0.03970653116703034, "memory(GiB)": 122.96, "step": 57055, "token_acc": 0.988272921108742, "train_speed(iter/s)": 0.231654 }, { "epoch": 4.349416876286302, "grad_norm": 0.7045892477035522, "learning_rate": 4.119547295465592e-06, "loss": 0.0346729576587677, "memory(GiB)": 122.96, "step": 57060, "token_acc": 0.9906357811729917, "train_speed(iter/s)": 0.231658 }, { "epoch": 4.349798002896562, "grad_norm": 0.9246518015861511, "learning_rate": 4.114789356570503e-06, "loss": 0.03057208061218262, "memory(GiB)": 122.96, "step": 57065, "token_acc": 0.9887272727272727, "train_speed(iter/s)": 0.231664 }, { "epoch": 4.350179129506822, "grad_norm": 1.2600059509277344, "learning_rate": 4.110034048979317e-06, "loss": 0.028766649961471557, "memory(GiB)": 122.96, "step": 57070, "token_acc": 0.9893153937475268, "train_speed(iter/s)": 0.231667 }, { "epoch": 4.350560256117082, "grad_norm": 0.23422418534755707, "learning_rate": 4.105281372964715e-06, "loss": 0.041221892833709715, "memory(GiB)": 122.96, "step": 57075, "token_acc": 0.9849864632045287, "train_speed(iter/s)": 0.231671 }, { "epoch": 4.350941382727342, "grad_norm": 2.01975417137146, "learning_rate": 4.100531328799245e-06, "loss": 0.04339967370033264, "memory(GiB)": 122.96, "step": 57080, "token_acc": 0.981163666806195, "train_speed(iter/s)": 0.231676 }, { "epoch": 4.351322509337602, "grad_norm": 2.1346895694732666, "learning_rate": 4.095783916755319e-06, "loss": 0.030268388986587524, "memory(GiB)": 122.96, "step": 57085, "token_acc": 0.982957669048928, "train_speed(iter/s)": 0.231685 }, { "epoch": 4.351703635947862, "grad_norm": 1.1857895851135254, "learning_rate": 4.091039137105152e-06, "loss": 0.04486008882522583, "memory(GiB)": 122.96, "step": 57090, "token_acc": 0.9860975609756097, "train_speed(iter/s)": 0.231692 }, { "epoch": 4.352084762558122, "grad_norm": 1.5839109420776367, "learning_rate": 4.0862969901208416e-06, "loss": 0.017622722685337065, "memory(GiB)": 122.96, "step": 57095, "token_acc": 0.9926082365364308, "train_speed(iter/s)": 0.231698 }, { "epoch": 4.352465889168382, "grad_norm": 1.3125064373016357, "learning_rate": 4.0815574760743495e-06, "loss": 0.026409608125686646, "memory(GiB)": 122.96, "step": 57100, "token_acc": 0.9896338404949006, "train_speed(iter/s)": 0.231702 }, { "epoch": 4.352847015778641, "grad_norm": 0.0983426496386528, "learning_rate": 4.076820595237435e-06, "loss": 0.03371442556381225, "memory(GiB)": 122.96, "step": 57105, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.231709 }, { "epoch": 4.353228142388901, "grad_norm": 1.8781272172927856, "learning_rate": 4.072086347881754e-06, "loss": 0.02735028266906738, "memory(GiB)": 122.96, "step": 57110, "token_acc": 0.9900130264871906, "train_speed(iter/s)": 0.231716 }, { "epoch": 4.353609268999161, "grad_norm": 1.0089811086654663, "learning_rate": 4.067354734278789e-06, "loss": 0.019203273952007292, "memory(GiB)": 122.96, "step": 57115, "token_acc": 0.9914356898162566, "train_speed(iter/s)": 0.231721 }, { "epoch": 4.353990395609421, "grad_norm": 0.13944296538829803, "learning_rate": 4.062625754699873e-06, "loss": 0.02814592719078064, "memory(GiB)": 122.96, "step": 57120, "token_acc": 0.9856834143706105, "train_speed(iter/s)": 0.231728 }, { "epoch": 4.354371522219681, "grad_norm": 0.5130220651626587, "learning_rate": 4.0578994094162045e-06, "loss": 0.019051577150821685, "memory(GiB)": 122.96, "step": 57125, "token_acc": 0.9948994049305753, "train_speed(iter/s)": 0.231736 }, { "epoch": 4.354752648829941, "grad_norm": 0.6363046169281006, "learning_rate": 4.053175698698802e-06, "loss": 0.02947595715522766, "memory(GiB)": 122.96, "step": 57130, "token_acc": 0.9899533548618586, "train_speed(iter/s)": 0.231741 }, { "epoch": 4.355133775440201, "grad_norm": 1.192973256111145, "learning_rate": 4.048454622818548e-06, "loss": 0.038931792974472045, "memory(GiB)": 122.96, "step": 57135, "token_acc": 0.9811698717948718, "train_speed(iter/s)": 0.231749 }, { "epoch": 4.355514902050461, "grad_norm": 1.1527025699615479, "learning_rate": 4.043736182046193e-06, "loss": 0.032477089762687684, "memory(GiB)": 122.96, "step": 57140, "token_acc": 0.9882948790095667, "train_speed(iter/s)": 0.231751 }, { "epoch": 4.355896028660721, "grad_norm": 0.8629696369171143, "learning_rate": 4.0390203766522975e-06, "loss": 0.026650914549827577, "memory(GiB)": 122.96, "step": 57145, "token_acc": 0.9901787151827403, "train_speed(iter/s)": 0.231754 }, { "epoch": 4.356277155270981, "grad_norm": 0.050860777497291565, "learning_rate": 4.034307206907295e-06, "loss": 0.019665509462356567, "memory(GiB)": 122.96, "step": 57150, "token_acc": 0.9892255892255892, "train_speed(iter/s)": 0.231763 }, { "epoch": 4.356658281881241, "grad_norm": 0.37209731340408325, "learning_rate": 4.029596673081476e-06, "loss": 0.02328375428915024, "memory(GiB)": 122.96, "step": 57155, "token_acc": 0.9890185312285518, "train_speed(iter/s)": 0.23177 }, { "epoch": 4.3570394084915005, "grad_norm": 0.917543888092041, "learning_rate": 4.024888775444951e-06, "loss": 0.042275351285934445, "memory(GiB)": 122.96, "step": 57160, "token_acc": 0.9854642539305843, "train_speed(iter/s)": 0.231775 }, { "epoch": 4.3574205351017605, "grad_norm": 1.1013929843902588, "learning_rate": 4.020183514267706e-06, "loss": 0.02508768141269684, "memory(GiB)": 122.96, "step": 57165, "token_acc": 0.9865889212827988, "train_speed(iter/s)": 0.231783 }, { "epoch": 4.3578016617120205, "grad_norm": 1.4473906755447388, "learning_rate": 4.0154808898195725e-06, "loss": 0.039225584268569945, "memory(GiB)": 122.96, "step": 57170, "token_acc": 0.9793189889283476, "train_speed(iter/s)": 0.23179 }, { "epoch": 4.358182788322281, "grad_norm": 0.8847504258155823, "learning_rate": 4.0107809023702035e-06, "loss": 0.034545397758483885, "memory(GiB)": 122.96, "step": 57175, "token_acc": 0.9875920370875375, "train_speed(iter/s)": 0.231791 }, { "epoch": 4.358563914932541, "grad_norm": 0.4996013939380646, "learning_rate": 4.006083552189138e-06, "loss": 0.04050042331218719, "memory(GiB)": 122.96, "step": 57180, "token_acc": 0.990588803088803, "train_speed(iter/s)": 0.231798 }, { "epoch": 4.358945041542801, "grad_norm": 0.9691295027732849, "learning_rate": 4.00138883954575e-06, "loss": 0.025456950068473816, "memory(GiB)": 122.96, "step": 57185, "token_acc": 0.9870205629283944, "train_speed(iter/s)": 0.231802 }, { "epoch": 4.359326168153061, "grad_norm": 0.9959986805915833, "learning_rate": 3.996696764709246e-06, "loss": 0.02466481626033783, "memory(GiB)": 122.96, "step": 57190, "token_acc": 0.9898209898209899, "train_speed(iter/s)": 0.231807 }, { "epoch": 4.359707294763321, "grad_norm": 0.6335867643356323, "learning_rate": 3.992007327948705e-06, "loss": 0.033452349901199344, "memory(GiB)": 122.96, "step": 57195, "token_acc": 0.9853492733239568, "train_speed(iter/s)": 0.231809 }, { "epoch": 4.360088421373581, "grad_norm": 0.9419013261795044, "learning_rate": 3.987320529533034e-06, "loss": 0.02715916037559509, "memory(GiB)": 122.96, "step": 57200, "token_acc": 0.9893543956043956, "train_speed(iter/s)": 0.231817 }, { "epoch": 4.360088421373581, "eval_loss": 0.049890827387571335, "eval_runtime": 158.8278, "eval_samples_per_second": 3.337, "eval_steps_per_second": 3.337, "eval_token_acc": 0.980001204746702, "step": 57200 }, { "epoch": 4.36046954798384, "grad_norm": 1.2357265949249268, "learning_rate": 3.982636369731008e-06, "loss": 0.03638900816440582, "memory(GiB)": 122.96, "step": 57205, "token_acc": 0.9801233171585976, "train_speed(iter/s)": 0.231675 }, { "epoch": 4.3608506745941, "grad_norm": 5.209682941436768, "learning_rate": 3.977954848811244e-06, "loss": 0.0355132520198822, "memory(GiB)": 122.96, "step": 57210, "token_acc": 0.9915190350546551, "train_speed(iter/s)": 0.23168 }, { "epoch": 4.36123180120436, "grad_norm": 0.46227043867111206, "learning_rate": 3.973275967042195e-06, "loss": 0.022060906887054442, "memory(GiB)": 122.96, "step": 57215, "token_acc": 0.9886973180076628, "train_speed(iter/s)": 0.231684 }, { "epoch": 4.36161292781462, "grad_norm": 0.3112508952617645, "learning_rate": 3.968599724692179e-06, "loss": 0.02524166405200958, "memory(GiB)": 122.96, "step": 57220, "token_acc": 0.9908123791102514, "train_speed(iter/s)": 0.231692 }, { "epoch": 4.36199405442488, "grad_norm": 1.2877916097640991, "learning_rate": 3.963926122029366e-06, "loss": 0.025917065143585206, "memory(GiB)": 122.96, "step": 57225, "token_acc": 0.982880658436214, "train_speed(iter/s)": 0.231697 }, { "epoch": 4.36237518103514, "grad_norm": 2.360616683959961, "learning_rate": 3.9592551593217455e-06, "loss": 0.034145128726959226, "memory(GiB)": 122.96, "step": 57230, "token_acc": 0.9883192731992213, "train_speed(iter/s)": 0.231704 }, { "epoch": 4.3627563076454, "grad_norm": 0.660440981388092, "learning_rate": 3.954586836837187e-06, "loss": 0.03972390294075012, "memory(GiB)": 122.96, "step": 57235, "token_acc": 0.9876417233560091, "train_speed(iter/s)": 0.231706 }, { "epoch": 4.36313743425566, "grad_norm": 0.8386480212211609, "learning_rate": 3.949921154843411e-06, "loss": 0.03664419949054718, "memory(GiB)": 122.96, "step": 57240, "token_acc": 0.9859116399945288, "train_speed(iter/s)": 0.231709 }, { "epoch": 4.36351856086592, "grad_norm": 1.1973665952682495, "learning_rate": 3.945258113607941e-06, "loss": 0.03508492112159729, "memory(GiB)": 122.96, "step": 57245, "token_acc": 0.9897049784880148, "train_speed(iter/s)": 0.231713 }, { "epoch": 4.36389968747618, "grad_norm": 0.9854652285575867, "learning_rate": 3.940597713398203e-06, "loss": 0.03675018846988678, "memory(GiB)": 122.96, "step": 57250, "token_acc": 0.9825897714907508, "train_speed(iter/s)": 0.231719 }, { "epoch": 4.36428081408644, "grad_norm": 1.2451251745224, "learning_rate": 3.935939954481443e-06, "loss": 0.05017796754837036, "memory(GiB)": 122.96, "step": 57255, "token_acc": 0.9896777442094663, "train_speed(iter/s)": 0.231726 }, { "epoch": 4.364661940696699, "grad_norm": 0.9345805048942566, "learning_rate": 3.931284837124765e-06, "loss": 0.026048028469085695, "memory(GiB)": 122.96, "step": 57260, "token_acc": 0.9894453642384106, "train_speed(iter/s)": 0.231732 }, { "epoch": 4.365043067306959, "grad_norm": 0.7015694975852966, "learning_rate": 3.926632361595123e-06, "loss": 0.023478634655475616, "memory(GiB)": 122.96, "step": 57265, "token_acc": 0.9875377643504532, "train_speed(iter/s)": 0.23174 }, { "epoch": 4.365424193917219, "grad_norm": 0.768681526184082, "learning_rate": 3.9219825281593015e-06, "loss": 0.02878339886665344, "memory(GiB)": 122.96, "step": 57270, "token_acc": 0.9856884603796324, "train_speed(iter/s)": 0.231745 }, { "epoch": 4.365805320527479, "grad_norm": 0.9510180354118347, "learning_rate": 3.917335337083955e-06, "loss": 0.052419412136077884, "memory(GiB)": 122.96, "step": 57275, "token_acc": 0.9750297265160524, "train_speed(iter/s)": 0.231752 }, { "epoch": 4.366186447137739, "grad_norm": 0.4959603548049927, "learning_rate": 3.912690788635581e-06, "loss": 0.01679455041885376, "memory(GiB)": 122.96, "step": 57280, "token_acc": 0.994621246077992, "train_speed(iter/s)": 0.231759 }, { "epoch": 4.366567573747999, "grad_norm": 0.6373704075813293, "learning_rate": 3.908048883080517e-06, "loss": 0.03832828998565674, "memory(GiB)": 122.96, "step": 57285, "token_acc": 0.9880312652662433, "train_speed(iter/s)": 0.231764 }, { "epoch": 4.366948700358259, "grad_norm": 1.0226895809173584, "learning_rate": 3.903409620684956e-06, "loss": 0.020756259560585022, "memory(GiB)": 122.96, "step": 57290, "token_acc": 0.9913389513108615, "train_speed(iter/s)": 0.231771 }, { "epoch": 4.367329826968519, "grad_norm": 0.5051595568656921, "learning_rate": 3.8987730017149395e-06, "loss": 0.027772173285484314, "memory(GiB)": 122.96, "step": 57295, "token_acc": 0.9900106269925611, "train_speed(iter/s)": 0.231773 }, { "epoch": 4.367710953578779, "grad_norm": 1.178842306137085, "learning_rate": 3.894139026436355e-06, "loss": 0.01525954008102417, "memory(GiB)": 122.96, "step": 57300, "token_acc": 0.9950313242600993, "train_speed(iter/s)": 0.231779 }, { "epoch": 4.368092080189038, "grad_norm": 0.49669602513313293, "learning_rate": 3.889507695114936e-06, "loss": 0.03322218358516693, "memory(GiB)": 122.96, "step": 57305, "token_acc": 0.9883151149641914, "train_speed(iter/s)": 0.231781 }, { "epoch": 4.368473206799298, "grad_norm": 0.5717547535896301, "learning_rate": 3.884879008016284e-06, "loss": 0.027977031469345093, "memory(GiB)": 122.96, "step": 57310, "token_acc": 0.9887024991441288, "train_speed(iter/s)": 0.231785 }, { "epoch": 4.368854333409558, "grad_norm": 0.4798009395599365, "learning_rate": 3.880252965405812e-06, "loss": 0.03612786531448364, "memory(GiB)": 122.96, "step": 57315, "token_acc": 0.9850857568978374, "train_speed(iter/s)": 0.231789 }, { "epoch": 4.369235460019818, "grad_norm": 0.5822060108184814, "learning_rate": 3.875629567548805e-06, "loss": 0.019123725593090057, "memory(GiB)": 122.96, "step": 57320, "token_acc": 0.9942832014072119, "train_speed(iter/s)": 0.231796 }, { "epoch": 4.3696165866300785, "grad_norm": 1.5208616256713867, "learning_rate": 3.871008814710414e-06, "loss": 0.029501986503601075, "memory(GiB)": 122.96, "step": 57325, "token_acc": 0.990400374619527, "train_speed(iter/s)": 0.231803 }, { "epoch": 4.3699977132403385, "grad_norm": 2.280104398727417, "learning_rate": 3.8663907071555875e-06, "loss": 0.03417094051837921, "memory(GiB)": 122.96, "step": 57330, "token_acc": 0.987906976744186, "train_speed(iter/s)": 0.23181 }, { "epoch": 4.3703788398505985, "grad_norm": 0.6439019441604614, "learning_rate": 3.861775245149174e-06, "loss": 0.02415521889925003, "memory(GiB)": 122.96, "step": 57335, "token_acc": 0.9904185577407968, "train_speed(iter/s)": 0.231813 }, { "epoch": 4.3707599664608585, "grad_norm": 0.2199016809463501, "learning_rate": 3.857162428955846e-06, "loss": 0.02261464297771454, "memory(GiB)": 122.96, "step": 57340, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.23182 }, { "epoch": 4.3711410930711185, "grad_norm": 2.870471954345703, "learning_rate": 3.85255225884012e-06, "loss": 0.024649070203304292, "memory(GiB)": 122.96, "step": 57345, "token_acc": 0.990294886151549, "train_speed(iter/s)": 0.231828 }, { "epoch": 4.3715222196813786, "grad_norm": 1.4274208545684814, "learning_rate": 3.847944735066372e-06, "loss": 0.04171229004859924, "memory(GiB)": 122.96, "step": 57350, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.231835 }, { "epoch": 4.371903346291638, "grad_norm": 1.2670103311538696, "learning_rate": 3.843339857898826e-06, "loss": 0.04300893843173981, "memory(GiB)": 122.96, "step": 57355, "token_acc": 0.9795796691012073, "train_speed(iter/s)": 0.23184 }, { "epoch": 4.372284472901898, "grad_norm": 1.412739634513855, "learning_rate": 3.83873762760154e-06, "loss": 0.03834398984909058, "memory(GiB)": 122.96, "step": 57360, "token_acc": 0.986180210060807, "train_speed(iter/s)": 0.231844 }, { "epoch": 4.372665599512158, "grad_norm": 2.178524971008301, "learning_rate": 3.834138044438434e-06, "loss": 0.031152617931365967, "memory(GiB)": 122.96, "step": 57365, "token_acc": 0.9829992387718853, "train_speed(iter/s)": 0.23185 }, { "epoch": 4.373046726122418, "grad_norm": 0.1299949735403061, "learning_rate": 3.82954110867329e-06, "loss": 0.018521997332572936, "memory(GiB)": 122.96, "step": 57370, "token_acc": 0.9935768030831346, "train_speed(iter/s)": 0.231855 }, { "epoch": 4.373427852732678, "grad_norm": 0.4766598045825958, "learning_rate": 3.824946820569691e-06, "loss": 0.03446870744228363, "memory(GiB)": 122.96, "step": 57375, "token_acc": 0.9840310746655158, "train_speed(iter/s)": 0.231861 }, { "epoch": 4.373808979342938, "grad_norm": 2.336611747741699, "learning_rate": 3.820355180391116e-06, "loss": 0.03377532958984375, "memory(GiB)": 122.96, "step": 57380, "token_acc": 0.9853268119164073, "train_speed(iter/s)": 0.231868 }, { "epoch": 4.374190105953198, "grad_norm": 2.054511785507202, "learning_rate": 3.815766188400877e-06, "loss": 0.05652294158935547, "memory(GiB)": 122.96, "step": 57385, "token_acc": 0.9802253391584272, "train_speed(iter/s)": 0.231874 }, { "epoch": 4.374571232563458, "grad_norm": 0.7373908758163452, "learning_rate": 3.811179844862117e-06, "loss": 0.0315952479839325, "memory(GiB)": 122.96, "step": 57390, "token_acc": 0.9872489280072219, "train_speed(iter/s)": 0.231875 }, { "epoch": 4.374952359173718, "grad_norm": 1.0309258699417114, "learning_rate": 3.8065961500378665e-06, "loss": 0.04296904802322388, "memory(GiB)": 122.96, "step": 57395, "token_acc": 0.9864654981140448, "train_speed(iter/s)": 0.231881 }, { "epoch": 4.375333485783978, "grad_norm": 0.4995093047618866, "learning_rate": 3.802015104190948e-06, "loss": 0.013017110526561737, "memory(GiB)": 122.96, "step": 57400, "token_acc": 0.9937355753379492, "train_speed(iter/s)": 0.231885 }, { "epoch": 4.375333485783978, "eval_loss": 0.04981434717774391, "eval_runtime": 159.5597, "eval_samples_per_second": 3.322, "eval_steps_per_second": 3.322, "eval_token_acc": 0.980001204746702, "step": 57400 }, { "epoch": 4.375714612394237, "grad_norm": 1.2804778814315796, "learning_rate": 3.7974367075840844e-06, "loss": 0.024369478225708008, "memory(GiB)": 122.96, "step": 57405, "token_acc": 0.9806422097046292, "train_speed(iter/s)": 0.231741 }, { "epoch": 4.376095739004497, "grad_norm": 1.4963215589523315, "learning_rate": 3.792860960479827e-06, "loss": 0.03690122663974762, "memory(GiB)": 122.96, "step": 57410, "token_acc": 0.9847739517451394, "train_speed(iter/s)": 0.231748 }, { "epoch": 4.376476865614757, "grad_norm": 1.076244831085205, "learning_rate": 3.788287863140555e-06, "loss": 0.030574107170104982, "memory(GiB)": 122.96, "step": 57415, "token_acc": 0.9846735668789809, "train_speed(iter/s)": 0.231754 }, { "epoch": 4.376857992225017, "grad_norm": 0.6721583008766174, "learning_rate": 3.783717415828536e-06, "loss": 0.04351860284805298, "memory(GiB)": 122.96, "step": 57420, "token_acc": 0.9856333468231485, "train_speed(iter/s)": 0.231758 }, { "epoch": 4.377239118835277, "grad_norm": 0.8572652339935303, "learning_rate": 3.779149618805844e-06, "loss": 0.043691623210906985, "memory(GiB)": 122.96, "step": 57425, "token_acc": 0.9785621481160676, "train_speed(iter/s)": 0.231766 }, { "epoch": 4.377620245445537, "grad_norm": 1.43747079372406, "learning_rate": 3.7745844723344305e-06, "loss": 0.02270539551973343, "memory(GiB)": 122.96, "step": 57430, "token_acc": 0.9908860759493671, "train_speed(iter/s)": 0.231774 }, { "epoch": 4.378001372055797, "grad_norm": 0.8126947283744812, "learning_rate": 3.7700219766760915e-06, "loss": 0.03272598385810852, "memory(GiB)": 122.96, "step": 57435, "token_acc": 0.98828025477707, "train_speed(iter/s)": 0.231778 }, { "epoch": 4.378382498666057, "grad_norm": 0.8399785757064819, "learning_rate": 3.7654621320924567e-06, "loss": 0.020665791630744935, "memory(GiB)": 122.96, "step": 57440, "token_acc": 0.9914204003813155, "train_speed(iter/s)": 0.231785 }, { "epoch": 4.378763625276317, "grad_norm": 0.7253152132034302, "learning_rate": 3.7609049388450113e-06, "loss": 0.028352153301239014, "memory(GiB)": 122.96, "step": 57445, "token_acc": 0.9896710630859685, "train_speed(iter/s)": 0.231789 }, { "epoch": 4.379144751886576, "grad_norm": 1.5330610275268555, "learning_rate": 3.7563503971950964e-06, "loss": 0.025085079669952392, "memory(GiB)": 122.96, "step": 57450, "token_acc": 0.9854922279792746, "train_speed(iter/s)": 0.231796 }, { "epoch": 4.379525878496836, "grad_norm": 0.6604595184326172, "learning_rate": 3.7517985074038865e-06, "loss": 0.01731601059436798, "memory(GiB)": 122.96, "step": 57455, "token_acc": 0.9919370320598964, "train_speed(iter/s)": 0.2318 }, { "epoch": 4.379907005107096, "grad_norm": 1.270491123199463, "learning_rate": 3.7472492697324114e-06, "loss": 0.04406019449234009, "memory(GiB)": 122.96, "step": 57460, "token_acc": 0.981520591341077, "train_speed(iter/s)": 0.231808 }, { "epoch": 4.380288131717356, "grad_norm": 3.0127785205841064, "learning_rate": 3.7427026844415568e-06, "loss": 0.06003514528274536, "memory(GiB)": 122.96, "step": 57465, "token_acc": 0.976592082616179, "train_speed(iter/s)": 0.231816 }, { "epoch": 4.380669258327616, "grad_norm": 1.6862977743148804, "learning_rate": 3.7381587517920423e-06, "loss": 0.024795380234718323, "memory(GiB)": 122.96, "step": 57470, "token_acc": 0.9889834752128193, "train_speed(iter/s)": 0.231823 }, { "epoch": 4.381050384937876, "grad_norm": 1.898153305053711, "learning_rate": 3.733617472044437e-06, "loss": 0.02864529490470886, "memory(GiB)": 122.96, "step": 57475, "token_acc": 0.9875916525662719, "train_speed(iter/s)": 0.231827 }, { "epoch": 4.381431511548136, "grad_norm": 0.9171663522720337, "learning_rate": 3.7290788454591772e-06, "loss": 0.03939524292945862, "memory(GiB)": 122.96, "step": 57480, "token_acc": 0.9868231046931408, "train_speed(iter/s)": 0.231833 }, { "epoch": 4.381812638158396, "grad_norm": 1.3775911331176758, "learning_rate": 3.724542872296516e-06, "loss": 0.05716167688369751, "memory(GiB)": 122.96, "step": 57485, "token_acc": 0.9829222011385199, "train_speed(iter/s)": 0.231838 }, { "epoch": 4.382193764768656, "grad_norm": 0.7587323188781738, "learning_rate": 3.7200095528165723e-06, "loss": 0.015353333950042725, "memory(GiB)": 122.96, "step": 57490, "token_acc": 0.9921807124239791, "train_speed(iter/s)": 0.231847 }, { "epoch": 4.382574891378916, "grad_norm": 1.577989935874939, "learning_rate": 3.7154788872793278e-06, "loss": 0.03936365842819214, "memory(GiB)": 122.96, "step": 57495, "token_acc": 0.9859082756853702, "train_speed(iter/s)": 0.231853 }, { "epoch": 4.3829560179891764, "grad_norm": 1.7480531930923462, "learning_rate": 3.710950875944574e-06, "loss": 0.032803797721862794, "memory(GiB)": 122.96, "step": 57500, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.231859 }, { "epoch": 4.383337144599436, "grad_norm": 1.3342615365982056, "learning_rate": 3.7064255190719768e-06, "loss": 0.01928279399871826, "memory(GiB)": 122.96, "step": 57505, "token_acc": 0.9906821963394343, "train_speed(iter/s)": 0.231862 }, { "epoch": 4.383718271209696, "grad_norm": 1.385642409324646, "learning_rate": 3.70190281692106e-06, "loss": 0.02898799777030945, "memory(GiB)": 122.96, "step": 57510, "token_acc": 0.9891996676820826, "train_speed(iter/s)": 0.231868 }, { "epoch": 4.384099397819956, "grad_norm": 1.1468937397003174, "learning_rate": 3.6973827697511455e-06, "loss": 0.029617494344711302, "memory(GiB)": 122.96, "step": 57515, "token_acc": 0.9859212405631503, "train_speed(iter/s)": 0.231874 }, { "epoch": 4.384480524430216, "grad_norm": 2.4350454807281494, "learning_rate": 3.6928653778214804e-06, "loss": 0.04338846206665039, "memory(GiB)": 122.96, "step": 57520, "token_acc": 0.9845743934931988, "train_speed(iter/s)": 0.231879 }, { "epoch": 4.384861651040476, "grad_norm": 0.700507640838623, "learning_rate": 3.6883506413910862e-06, "loss": 0.03866499364376068, "memory(GiB)": 122.96, "step": 57525, "token_acc": 0.9868603781647259, "train_speed(iter/s)": 0.231881 }, { "epoch": 4.385242777650736, "grad_norm": 1.2252708673477173, "learning_rate": 3.683838560718866e-06, "loss": 0.02343205362558365, "memory(GiB)": 122.96, "step": 57530, "token_acc": 0.9872626772410478, "train_speed(iter/s)": 0.231887 }, { "epoch": 4.385623904260996, "grad_norm": 3.0756826400756836, "learning_rate": 3.6793291360635862e-06, "loss": 0.06326195001602172, "memory(GiB)": 122.96, "step": 57535, "token_acc": 0.9872982270441916, "train_speed(iter/s)": 0.231894 }, { "epoch": 4.386005030871256, "grad_norm": 0.7031931281089783, "learning_rate": 3.6748223676838113e-06, "loss": 0.025710776448249817, "memory(GiB)": 122.96, "step": 57540, "token_acc": 0.9925550923168552, "train_speed(iter/s)": 0.231901 }, { "epoch": 4.386386157481516, "grad_norm": 2.2520368099212646, "learning_rate": 3.670318255838001e-06, "loss": 0.015797241032123564, "memory(GiB)": 122.96, "step": 57545, "token_acc": 0.9935064935064936, "train_speed(iter/s)": 0.23191 }, { "epoch": 4.386767284091775, "grad_norm": 1.3514937162399292, "learning_rate": 3.6658168007844496e-06, "loss": 0.040784454345703124, "memory(GiB)": 122.96, "step": 57550, "token_acc": 0.9879317519766958, "train_speed(iter/s)": 0.231917 }, { "epoch": 4.387148410702035, "grad_norm": 2.483224868774414, "learning_rate": 3.661318002781283e-06, "loss": 0.03884675800800323, "memory(GiB)": 122.96, "step": 57555, "token_acc": 0.9827844311377245, "train_speed(iter/s)": 0.231926 }, { "epoch": 4.387529537312295, "grad_norm": 1.0399075746536255, "learning_rate": 3.6568218620864903e-06, "loss": 0.031295251846313474, "memory(GiB)": 122.96, "step": 57560, "token_acc": 0.9889484259879437, "train_speed(iter/s)": 0.231933 }, { "epoch": 4.387910663922555, "grad_norm": 1.9910885095596313, "learning_rate": 3.6523283789579087e-06, "loss": 0.037061494588851926, "memory(GiB)": 122.96, "step": 57565, "token_acc": 0.9838882921589689, "train_speed(iter/s)": 0.23194 }, { "epoch": 4.388291790532815, "grad_norm": 1.953363060951233, "learning_rate": 3.6478375536532104e-06, "loss": 0.03270009160041809, "memory(GiB)": 122.96, "step": 57570, "token_acc": 0.9891811229248274, "train_speed(iter/s)": 0.231946 }, { "epoch": 4.388672917143075, "grad_norm": 1.418900728225708, "learning_rate": 3.6433493864299282e-06, "loss": 0.039741164445877074, "memory(GiB)": 122.96, "step": 57575, "token_acc": 0.9857685009487666, "train_speed(iter/s)": 0.231953 }, { "epoch": 4.389054043753335, "grad_norm": 0.863917887210846, "learning_rate": 3.6388638775454453e-06, "loss": 0.0196002796292305, "memory(GiB)": 122.96, "step": 57580, "token_acc": 0.9913411938098747, "train_speed(iter/s)": 0.231959 }, { "epoch": 4.389435170363595, "grad_norm": 2.5540530681610107, "learning_rate": 3.634381027256972e-06, "loss": 0.02943817377090454, "memory(GiB)": 122.96, "step": 57585, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.231964 }, { "epoch": 4.389816296973855, "grad_norm": 0.20705890655517578, "learning_rate": 3.6299008358215867e-06, "loss": 0.04704307019710541, "memory(GiB)": 122.96, "step": 57590, "token_acc": 0.982500705616709, "train_speed(iter/s)": 0.231971 }, { "epoch": 4.390197423584115, "grad_norm": 0.85664963722229, "learning_rate": 3.6254233034962004e-06, "loss": 0.026491311192512513, "memory(GiB)": 122.96, "step": 57595, "token_acc": 0.9923510466988728, "train_speed(iter/s)": 0.231979 }, { "epoch": 4.390578550194375, "grad_norm": 0.6854936480522156, "learning_rate": 3.620948430537585e-06, "loss": 0.029217037558555602, "memory(GiB)": 122.96, "step": 57600, "token_acc": 0.9904635174096252, "train_speed(iter/s)": 0.231984 }, { "epoch": 4.390578550194375, "eval_loss": 0.049159277230501175, "eval_runtime": 160.4791, "eval_samples_per_second": 3.303, "eval_steps_per_second": 3.303, "eval_token_acc": 0.980016264080477, "step": 57600 }, { "epoch": 4.390959676804634, "grad_norm": 0.24682940542697906, "learning_rate": 3.616476217202358e-06, "loss": 0.028985971212387086, "memory(GiB)": 122.96, "step": 57605, "token_acc": 0.9803846790383743, "train_speed(iter/s)": 0.231837 }, { "epoch": 4.391340803414894, "grad_norm": 0.5941678285598755, "learning_rate": 3.612006663746964e-06, "loss": 0.020784293115139008, "memory(GiB)": 122.96, "step": 57610, "token_acc": 0.9944194698496357, "train_speed(iter/s)": 0.231842 }, { "epoch": 4.391721930025154, "grad_norm": 0.9373700022697449, "learning_rate": 3.60753977042772e-06, "loss": 0.048187026381492616, "memory(GiB)": 122.96, "step": 57615, "token_acc": 0.982003599280144, "train_speed(iter/s)": 0.231849 }, { "epoch": 4.392103056635414, "grad_norm": 1.3020738363265991, "learning_rate": 3.6030755375007997e-06, "loss": 0.033248302340507505, "memory(GiB)": 122.96, "step": 57620, "token_acc": 0.9851323828920571, "train_speed(iter/s)": 0.231855 }, { "epoch": 4.392484183245674, "grad_norm": 1.004951000213623, "learning_rate": 3.598613965222175e-06, "loss": 0.03455447256565094, "memory(GiB)": 122.96, "step": 57625, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.231858 }, { "epoch": 4.392865309855934, "grad_norm": 1.184673547744751, "learning_rate": 3.5941550538477086e-06, "loss": 0.04665469229221344, "memory(GiB)": 122.96, "step": 57630, "token_acc": 0.9809837728194726, "train_speed(iter/s)": 0.231865 }, { "epoch": 4.393246436466194, "grad_norm": 1.2954802513122559, "learning_rate": 3.5896988036331124e-06, "loss": 0.03081212043762207, "memory(GiB)": 122.96, "step": 57635, "token_acc": 0.9882880637926739, "train_speed(iter/s)": 0.231871 }, { "epoch": 4.393627563076454, "grad_norm": 0.792701244354248, "learning_rate": 3.5852452148339098e-06, "loss": 0.01781158149242401, "memory(GiB)": 122.96, "step": 57640, "token_acc": 0.988517745302714, "train_speed(iter/s)": 0.231875 }, { "epoch": 4.394008689686714, "grad_norm": 2.6661739349365234, "learning_rate": 3.5807942877055024e-06, "loss": 0.03419716358184814, "memory(GiB)": 122.96, "step": 57645, "token_acc": 0.9826946847960445, "train_speed(iter/s)": 0.231882 }, { "epoch": 4.3943898162969734, "grad_norm": 1.5870498418807983, "learning_rate": 3.5763460225031298e-06, "loss": 0.02631135582923889, "memory(GiB)": 122.96, "step": 57650, "token_acc": 0.9908487349721873, "train_speed(iter/s)": 0.231887 }, { "epoch": 4.3947709429072335, "grad_norm": 0.837308943271637, "learning_rate": 3.5719004194818885e-06, "loss": 0.03358307778835297, "memory(GiB)": 122.96, "step": 57655, "token_acc": 0.9877586709414165, "train_speed(iter/s)": 0.231892 }, { "epoch": 4.3951520695174935, "grad_norm": 0.7220122814178467, "learning_rate": 3.5674574788967075e-06, "loss": 0.01582450270652771, "memory(GiB)": 122.96, "step": 57660, "token_acc": 0.9916186485070718, "train_speed(iter/s)": 0.231898 }, { "epoch": 4.3955331961277535, "grad_norm": 0.6346963047981262, "learning_rate": 3.5630172010023557e-06, "loss": 0.025852200388908387, "memory(GiB)": 122.96, "step": 57665, "token_acc": 0.9899936129444327, "train_speed(iter/s)": 0.231904 }, { "epoch": 4.3959143227380135, "grad_norm": 0.7726293802261353, "learning_rate": 3.5585795860534787e-06, "loss": 0.031912416219711304, "memory(GiB)": 122.96, "step": 57670, "token_acc": 0.9895261845386534, "train_speed(iter/s)": 0.231908 }, { "epoch": 4.3962954493482735, "grad_norm": 0.7318705916404724, "learning_rate": 3.554144634304557e-06, "loss": 0.036070674657821655, "memory(GiB)": 122.96, "step": 57675, "token_acc": 0.9850430578637256, "train_speed(iter/s)": 0.231912 }, { "epoch": 4.396676575958534, "grad_norm": 0.9472291469573975, "learning_rate": 3.5497123460098923e-06, "loss": 0.03258621096611023, "memory(GiB)": 122.96, "step": 57680, "token_acc": 0.9867942583732058, "train_speed(iter/s)": 0.231918 }, { "epoch": 4.397057702568794, "grad_norm": 1.0469608306884766, "learning_rate": 3.5452827214236707e-06, "loss": 0.025921228528022765, "memory(GiB)": 122.96, "step": 57685, "token_acc": 0.9892259240842035, "train_speed(iter/s)": 0.231922 }, { "epoch": 4.397438829179054, "grad_norm": 0.6196757555007935, "learning_rate": 3.5408557607999217e-06, "loss": 0.027020499110221863, "memory(GiB)": 122.96, "step": 57690, "token_acc": 0.9894278606965174, "train_speed(iter/s)": 0.231922 }, { "epoch": 4.397819955789314, "grad_norm": 0.87177973985672, "learning_rate": 3.5364314643924866e-06, "loss": 0.03696410655975342, "memory(GiB)": 122.96, "step": 57695, "token_acc": 0.9869141813755326, "train_speed(iter/s)": 0.231929 }, { "epoch": 4.398201082399573, "grad_norm": 1.3494101762771606, "learning_rate": 3.5320098324550853e-06, "loss": 0.032415884733200076, "memory(GiB)": 122.96, "step": 57700, "token_acc": 0.9863907531692767, "train_speed(iter/s)": 0.231935 }, { "epoch": 4.398582209009833, "grad_norm": 1.5426455736160278, "learning_rate": 3.527590865241298e-06, "loss": 0.027591854333877563, "memory(GiB)": 122.96, "step": 57705, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.231942 }, { "epoch": 4.398963335620093, "grad_norm": 0.841070830821991, "learning_rate": 3.5231745630045055e-06, "loss": 0.02103964388370514, "memory(GiB)": 122.96, "step": 57710, "token_acc": 0.9906569343065693, "train_speed(iter/s)": 0.231949 }, { "epoch": 4.399344462230353, "grad_norm": 1.328594446182251, "learning_rate": 3.5187609259979714e-06, "loss": 0.03195193409919739, "memory(GiB)": 122.96, "step": 57715, "token_acc": 0.985670864090317, "train_speed(iter/s)": 0.231955 }, { "epoch": 4.399725588840613, "grad_norm": 0.9910503625869751, "learning_rate": 3.514349954474805e-06, "loss": 0.03803335428237915, "memory(GiB)": 122.96, "step": 57720, "token_acc": 0.9810402254675891, "train_speed(iter/s)": 0.231962 }, { "epoch": 4.400106715450873, "grad_norm": 0.5866404175758362, "learning_rate": 3.5099416486879423e-06, "loss": 0.018152137100696564, "memory(GiB)": 122.96, "step": 57725, "token_acc": 0.9905176832393644, "train_speed(iter/s)": 0.231966 }, { "epoch": 4.400487842061133, "grad_norm": 0.8235952258110046, "learning_rate": 3.5055360088901866e-06, "loss": 0.019095434248447417, "memory(GiB)": 122.96, "step": 57730, "token_acc": 0.990625, "train_speed(iter/s)": 0.231975 }, { "epoch": 4.400868968671393, "grad_norm": 1.6702661514282227, "learning_rate": 3.501133035334192e-06, "loss": 0.02442755103111267, "memory(GiB)": 122.96, "step": 57735, "token_acc": 0.9868441908384847, "train_speed(iter/s)": 0.231978 }, { "epoch": 4.401250095281653, "grad_norm": 1.7537249326705933, "learning_rate": 3.4967327282724228e-06, "loss": 0.03980360925197601, "memory(GiB)": 122.96, "step": 57740, "token_acc": 0.977551444606111, "train_speed(iter/s)": 0.231984 }, { "epoch": 4.401631221891913, "grad_norm": 0.9801976084709167, "learning_rate": 3.4923350879572327e-06, "loss": 0.038367894291877744, "memory(GiB)": 122.96, "step": 57745, "token_acc": 0.983634216298296, "train_speed(iter/s)": 0.231989 }, { "epoch": 4.402012348502172, "grad_norm": 0.8367419242858887, "learning_rate": 3.487940114640814e-06, "loss": 0.028457483649253844, "memory(GiB)": 122.96, "step": 57750, "token_acc": 0.9856687898089171, "train_speed(iter/s)": 0.231994 }, { "epoch": 4.402393475112432, "grad_norm": 0.7114782929420471, "learning_rate": 3.4835478085751773e-06, "loss": 0.02939937710762024, "memory(GiB)": 122.96, "step": 57755, "token_acc": 0.9897393233499723, "train_speed(iter/s)": 0.231997 }, { "epoch": 4.402774601722692, "grad_norm": 0.8467615842819214, "learning_rate": 3.4791581700122144e-06, "loss": 0.03001244366168976, "memory(GiB)": 122.96, "step": 57760, "token_acc": 0.9910005293806247, "train_speed(iter/s)": 0.232005 }, { "epoch": 4.403155728332952, "grad_norm": 1.0033708810806274, "learning_rate": 3.474771199203647e-06, "loss": 0.01617414951324463, "memory(GiB)": 122.96, "step": 57765, "token_acc": 0.9922622683771126, "train_speed(iter/s)": 0.23201 }, { "epoch": 4.403536854943212, "grad_norm": 1.638433814048767, "learning_rate": 3.4703868964010398e-06, "loss": 0.03302818834781647, "memory(GiB)": 122.96, "step": 57770, "token_acc": 0.9873366834170855, "train_speed(iter/s)": 0.232016 }, { "epoch": 4.403917981553472, "grad_norm": 0.69074946641922, "learning_rate": 3.466005261855837e-06, "loss": 0.020507115125656127, "memory(GiB)": 122.96, "step": 57775, "token_acc": 0.9890042761148442, "train_speed(iter/s)": 0.232019 }, { "epoch": 4.404299108163732, "grad_norm": 0.7818564772605896, "learning_rate": 3.4616262958192645e-06, "loss": 0.022255422174930574, "memory(GiB)": 122.96, "step": 57780, "token_acc": 0.9897313024131439, "train_speed(iter/s)": 0.232025 }, { "epoch": 4.404680234773992, "grad_norm": 1.3662620782852173, "learning_rate": 3.457249998542472e-06, "loss": 0.022172981500625612, "memory(GiB)": 122.96, "step": 57785, "token_acc": 0.9903755868544601, "train_speed(iter/s)": 0.23203 }, { "epoch": 4.405061361384252, "grad_norm": 0.5006567239761353, "learning_rate": 3.4528763702764144e-06, "loss": 0.05400158166885376, "memory(GiB)": 122.96, "step": 57790, "token_acc": 0.971250432975407, "train_speed(iter/s)": 0.232037 }, { "epoch": 4.405442487994511, "grad_norm": 1.9671649932861328, "learning_rate": 3.4485054112718852e-06, "loss": 0.04103606939315796, "memory(GiB)": 122.96, "step": 57795, "token_acc": 0.9920014219694276, "train_speed(iter/s)": 0.232041 }, { "epoch": 4.405823614604771, "grad_norm": 0.682978093624115, "learning_rate": 3.444137121779545e-06, "loss": 0.023986056447029114, "memory(GiB)": 122.96, "step": 57800, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.232044 }, { "epoch": 4.405823614604771, "eval_loss": 0.04924232140183449, "eval_runtime": 158.3438, "eval_samples_per_second": 3.347, "eval_steps_per_second": 3.347, "eval_token_acc": 0.98009909041624, "step": 57800 }, { "epoch": 4.406204741215031, "grad_norm": 0.49942949414253235, "learning_rate": 3.439771502049899e-06, "loss": 0.03486639559268952, "memory(GiB)": 122.96, "step": 57805, "token_acc": 0.9805182182737097, "train_speed(iter/s)": 0.231901 }, { "epoch": 4.406585867825291, "grad_norm": 1.861627221107483, "learning_rate": 3.4354085523332857e-06, "loss": 0.03683125376701355, "memory(GiB)": 122.96, "step": 57810, "token_acc": 0.9849490925188137, "train_speed(iter/s)": 0.231906 }, { "epoch": 4.406966994435551, "grad_norm": 0.5204206109046936, "learning_rate": 3.4310482728799165e-06, "loss": 0.03254488706588745, "memory(GiB)": 122.96, "step": 57815, "token_acc": 0.9850199203187251, "train_speed(iter/s)": 0.231911 }, { "epoch": 4.407348121045811, "grad_norm": 2.2437307834625244, "learning_rate": 3.4266906639398134e-06, "loss": 0.03758853077888489, "memory(GiB)": 122.96, "step": 57820, "token_acc": 0.9851414614288622, "train_speed(iter/s)": 0.231917 }, { "epoch": 4.407729247656071, "grad_norm": 1.6119801998138428, "learning_rate": 3.422335725762876e-06, "loss": 0.03666484951972961, "memory(GiB)": 122.96, "step": 57825, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.231923 }, { "epoch": 4.4081103742663315, "grad_norm": 1.022993564605713, "learning_rate": 3.4179834585988447e-06, "loss": 0.025333493947982788, "memory(GiB)": 122.96, "step": 57830, "token_acc": 0.9896978021978022, "train_speed(iter/s)": 0.231929 }, { "epoch": 4.4084915008765915, "grad_norm": 0.6445615291595459, "learning_rate": 3.413633862697285e-06, "loss": 0.033560195565223695, "memory(GiB)": 122.96, "step": 57835, "token_acc": 0.986229344016024, "train_speed(iter/s)": 0.231935 }, { "epoch": 4.4088726274868515, "grad_norm": 0.22990448772907257, "learning_rate": 3.4092869383076377e-06, "loss": 0.018750400841236116, "memory(GiB)": 122.96, "step": 57840, "token_acc": 0.9904266389177939, "train_speed(iter/s)": 0.23194 }, { "epoch": 4.4092537540971115, "grad_norm": 0.9757771492004395, "learning_rate": 3.4049426856791857e-06, "loss": 0.05086647868156433, "memory(GiB)": 122.96, "step": 57845, "token_acc": 0.9815175097276264, "train_speed(iter/s)": 0.231947 }, { "epoch": 4.409634880707371, "grad_norm": 1.7180068492889404, "learning_rate": 3.4006011050610298e-06, "loss": 0.021427878737449647, "memory(GiB)": 122.96, "step": 57850, "token_acc": 0.9887679887679888, "train_speed(iter/s)": 0.231955 }, { "epoch": 4.410016007317631, "grad_norm": 3.6737020015716553, "learning_rate": 3.3962621967021603e-06, "loss": 0.029735422134399413, "memory(GiB)": 122.96, "step": 57855, "token_acc": 0.9900920028308563, "train_speed(iter/s)": 0.231963 }, { "epoch": 4.410397133927891, "grad_norm": 0.9897979497909546, "learning_rate": 3.3919259608513886e-06, "loss": 0.04148439466953278, "memory(GiB)": 122.96, "step": 57860, "token_acc": 0.9858008934269304, "train_speed(iter/s)": 0.231968 }, { "epoch": 4.410778260538151, "grad_norm": 0.5902113914489746, "learning_rate": 3.387592397757372e-06, "loss": 0.027893209457397462, "memory(GiB)": 122.96, "step": 57865, "token_acc": 0.9889574537187399, "train_speed(iter/s)": 0.231973 }, { "epoch": 4.411159387148411, "grad_norm": 0.8376398086547852, "learning_rate": 3.3832615076686225e-06, "loss": 0.0391187459230423, "memory(GiB)": 122.96, "step": 57870, "token_acc": 0.9874926714871995, "train_speed(iter/s)": 0.231979 }, { "epoch": 4.411540513758671, "grad_norm": 1.0960348844528198, "learning_rate": 3.3789332908335027e-06, "loss": 0.03503319025039673, "memory(GiB)": 122.96, "step": 57875, "token_acc": 0.989048473967684, "train_speed(iter/s)": 0.231983 }, { "epoch": 4.411921640368931, "grad_norm": 1.3230483531951904, "learning_rate": 3.3746077475002023e-06, "loss": 0.04643221795558929, "memory(GiB)": 122.96, "step": 57880, "token_acc": 0.9861402771944561, "train_speed(iter/s)": 0.231991 }, { "epoch": 4.412302766979191, "grad_norm": 1.053340196609497, "learning_rate": 3.3702848779167796e-06, "loss": 0.03921997249126434, "memory(GiB)": 122.96, "step": 57885, "token_acc": 0.9850671976107516, "train_speed(iter/s)": 0.231995 }, { "epoch": 4.412683893589451, "grad_norm": 0.1951836347579956, "learning_rate": 3.365964682331141e-06, "loss": 0.027813059091567994, "memory(GiB)": 122.96, "step": 57890, "token_acc": 0.9894268224819143, "train_speed(iter/s)": 0.232002 }, { "epoch": 4.41306502019971, "grad_norm": 3.0813069343566895, "learning_rate": 3.3616471609910104e-06, "loss": 0.03635947704315186, "memory(GiB)": 122.96, "step": 57895, "token_acc": 0.9877910920189916, "train_speed(iter/s)": 0.232006 }, { "epoch": 4.41344614680997, "grad_norm": 0.6885709166526794, "learning_rate": 3.357332314143985e-06, "loss": 0.029413706064224242, "memory(GiB)": 122.96, "step": 57900, "token_acc": 0.9873750657548659, "train_speed(iter/s)": 0.232012 }, { "epoch": 4.41382727342023, "grad_norm": 1.352856159210205, "learning_rate": 3.3530201420375163e-06, "loss": 0.014463508129119873, "memory(GiB)": 122.96, "step": 57905, "token_acc": 0.9959630911188004, "train_speed(iter/s)": 0.23202 }, { "epoch": 4.41420840003049, "grad_norm": 0.9718776941299438, "learning_rate": 3.348710644918862e-06, "loss": 0.02828814387321472, "memory(GiB)": 122.96, "step": 57910, "token_acc": 0.9884063866571748, "train_speed(iter/s)": 0.232021 }, { "epoch": 4.41458952664075, "grad_norm": 0.8573870062828064, "learning_rate": 3.344403823035164e-06, "loss": 0.02594425082206726, "memory(GiB)": 122.96, "step": 57915, "token_acc": 0.9907500502714659, "train_speed(iter/s)": 0.232027 }, { "epoch": 4.41497065325101, "grad_norm": 1.6297709941864014, "learning_rate": 3.3400996766334016e-06, "loss": 0.02965477406978607, "memory(GiB)": 122.96, "step": 57920, "token_acc": 0.9882693479585618, "train_speed(iter/s)": 0.232033 }, { "epoch": 4.41535177986127, "grad_norm": 1.0715593099594116, "learning_rate": 3.335798205960389e-06, "loss": 0.03421503305435181, "memory(GiB)": 122.96, "step": 57925, "token_acc": 0.9898880462260744, "train_speed(iter/s)": 0.232038 }, { "epoch": 4.41573290647153, "grad_norm": 0.25756850838661194, "learning_rate": 3.3314994112628117e-06, "loss": 0.027570644021034242, "memory(GiB)": 122.96, "step": 57930, "token_acc": 0.9882224048205971, "train_speed(iter/s)": 0.232046 }, { "epoch": 4.41611403308179, "grad_norm": 0.6623032689094543, "learning_rate": 3.327203292787168e-06, "loss": 0.047104498744010924, "memory(GiB)": 122.96, "step": 57935, "token_acc": 0.9824970828471412, "train_speed(iter/s)": 0.232053 }, { "epoch": 4.41649515969205, "grad_norm": 1.3237221240997314, "learning_rate": 3.3229098507798263e-06, "loss": 0.021547925472259522, "memory(GiB)": 122.96, "step": 57940, "token_acc": 0.9911764705882353, "train_speed(iter/s)": 0.232061 }, { "epoch": 4.416876286302309, "grad_norm": 1.3277007341384888, "learning_rate": 3.3186190854870016e-06, "loss": 0.04024717807769775, "memory(GiB)": 122.96, "step": 57945, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.232066 }, { "epoch": 4.417257412912569, "grad_norm": 1.0037074089050293, "learning_rate": 3.314330997154741e-06, "loss": 0.021969597041606902, "memory(GiB)": 122.96, "step": 57950, "token_acc": 0.9889110036963321, "train_speed(iter/s)": 0.232073 }, { "epoch": 4.417638539522829, "grad_norm": 1.387196660041809, "learning_rate": 3.3100455860289427e-06, "loss": 0.019466283917427062, "memory(GiB)": 122.96, "step": 57955, "token_acc": 0.9916810097532989, "train_speed(iter/s)": 0.232081 }, { "epoch": 4.418019666133089, "grad_norm": 0.8267048001289368, "learning_rate": 3.305762852355376e-06, "loss": 0.018920820951461793, "memory(GiB)": 122.96, "step": 57960, "token_acc": 0.9939890710382514, "train_speed(iter/s)": 0.232089 }, { "epoch": 4.418400792743349, "grad_norm": 1.0818935632705688, "learning_rate": 3.3014827963796126e-06, "loss": 0.019760940968990327, "memory(GiB)": 122.96, "step": 57965, "token_acc": 0.9882329477463103, "train_speed(iter/s)": 0.232095 }, { "epoch": 4.418781919353609, "grad_norm": 0.9902758002281189, "learning_rate": 3.2972054183471047e-06, "loss": 0.036160925030708314, "memory(GiB)": 122.96, "step": 57970, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.232101 }, { "epoch": 4.419163045963869, "grad_norm": 0.6587932705879211, "learning_rate": 3.2929307185031453e-06, "loss": 0.03624656498432159, "memory(GiB)": 122.96, "step": 57975, "token_acc": 0.9865689865689866, "train_speed(iter/s)": 0.232106 }, { "epoch": 4.419544172574129, "grad_norm": 1.8629083633422852, "learning_rate": 3.2886586970928555e-06, "loss": 0.05313829183578491, "memory(GiB)": 122.96, "step": 57980, "token_acc": 0.9782764382907615, "train_speed(iter/s)": 0.232112 }, { "epoch": 4.419925299184389, "grad_norm": 0.8565999865531921, "learning_rate": 3.284389354361228e-06, "loss": 0.026492860913276673, "memory(GiB)": 122.96, "step": 57985, "token_acc": 0.9862558191088451, "train_speed(iter/s)": 0.232118 }, { "epoch": 4.420306425794649, "grad_norm": 1.276581883430481, "learning_rate": 3.280122690553078e-06, "loss": 0.04626967310905457, "memory(GiB)": 122.96, "step": 57990, "token_acc": 0.9789169078131459, "train_speed(iter/s)": 0.232126 }, { "epoch": 4.4206875524049085, "grad_norm": 1.7679924964904785, "learning_rate": 3.275858705913082e-06, "loss": 0.034546518325805665, "memory(GiB)": 122.96, "step": 57995, "token_acc": 0.9857308137292711, "train_speed(iter/s)": 0.232132 }, { "epoch": 4.4210686790151685, "grad_norm": 2.0108230113983154, "learning_rate": 3.271597400685772e-06, "loss": 0.0195330947637558, "memory(GiB)": 122.96, "step": 58000, "token_acc": 0.993676603432701, "train_speed(iter/s)": 0.232141 }, { "epoch": 4.4210686790151685, "eval_loss": 0.048980142921209335, "eval_runtime": 156.9514, "eval_samples_per_second": 3.377, "eval_steps_per_second": 3.377, "eval_token_acc": 0.9800087344135895, "step": 58000 }, { "epoch": 4.4214498056254286, "grad_norm": 0.21740758419036865, "learning_rate": 3.267338775115497e-06, "loss": 0.03175306618213654, "memory(GiB)": 122.96, "step": 58005, "token_acc": 0.9802798057066403, "train_speed(iter/s)": 0.232 }, { "epoch": 4.421830932235689, "grad_norm": 1.8037909269332886, "learning_rate": 3.2630828294464787e-06, "loss": 0.05408978462219238, "memory(GiB)": 122.96, "step": 58010, "token_acc": 0.9834887334887334, "train_speed(iter/s)": 0.232005 }, { "epoch": 4.422212058845949, "grad_norm": 3.169363498687744, "learning_rate": 3.258829563922783e-06, "loss": 0.034756502509117125, "memory(GiB)": 122.96, "step": 58015, "token_acc": 0.9901143632486916, "train_speed(iter/s)": 0.232011 }, { "epoch": 4.422593185456209, "grad_norm": 0.72584468126297, "learning_rate": 3.2545789787882975e-06, "loss": 0.03697426021099091, "memory(GiB)": 122.96, "step": 58020, "token_acc": 0.9833024118738405, "train_speed(iter/s)": 0.232018 }, { "epoch": 4.422974312066469, "grad_norm": 0.9954128861427307, "learning_rate": 3.2503310742867777e-06, "loss": 0.04321256577968598, "memory(GiB)": 122.96, "step": 58025, "token_acc": 0.9854474896919718, "train_speed(iter/s)": 0.232023 }, { "epoch": 4.423355438676729, "grad_norm": 1.0396685600280762, "learning_rate": 3.2460858506618396e-06, "loss": 0.03093872368335724, "memory(GiB)": 122.96, "step": 58030, "token_acc": 0.9860195199155896, "train_speed(iter/s)": 0.232029 }, { "epoch": 4.423736565286989, "grad_norm": 1.0589455366134644, "learning_rate": 3.2418433081569057e-06, "loss": 0.021922938525676727, "memory(GiB)": 122.96, "step": 58035, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.232035 }, { "epoch": 4.424117691897248, "grad_norm": 0.7904384732246399, "learning_rate": 3.2376034470152695e-06, "loss": 0.0366991937160492, "memory(GiB)": 122.96, "step": 58040, "token_acc": 0.9862490450725745, "train_speed(iter/s)": 0.23204 }, { "epoch": 4.424498818507508, "grad_norm": 0.8042694330215454, "learning_rate": 3.2333662674800813e-06, "loss": 0.026309704780578612, "memory(GiB)": 122.96, "step": 58045, "token_acc": 0.9896729776247849, "train_speed(iter/s)": 0.232043 }, { "epoch": 4.424879945117768, "grad_norm": 0.17196081578731537, "learning_rate": 3.2291317697942968e-06, "loss": 0.02759680151939392, "memory(GiB)": 122.96, "step": 58050, "token_acc": 0.9893558197401161, "train_speed(iter/s)": 0.232048 }, { "epoch": 4.425261071728028, "grad_norm": 0.8437506556510925, "learning_rate": 3.2248999542007776e-06, "loss": 0.02846686840057373, "memory(GiB)": 122.96, "step": 58055, "token_acc": 0.990406140070355, "train_speed(iter/s)": 0.232055 }, { "epoch": 4.425642198338288, "grad_norm": 1.0075428485870361, "learning_rate": 3.220670820942179e-06, "loss": 0.03485849797725678, "memory(GiB)": 122.96, "step": 58060, "token_acc": 0.9827370527895922, "train_speed(iter/s)": 0.232062 }, { "epoch": 4.426023324948548, "grad_norm": 0.48605599999427795, "learning_rate": 3.216444370261024e-06, "loss": 0.018397243320941926, "memory(GiB)": 122.96, "step": 58065, "token_acc": 0.9929793769197016, "train_speed(iter/s)": 0.232069 }, { "epoch": 4.426404451558808, "grad_norm": 1.6452332735061646, "learning_rate": 3.21222060239969e-06, "loss": 0.046260124444961546, "memory(GiB)": 122.96, "step": 58070, "token_acc": 0.9843614248479583, "train_speed(iter/s)": 0.232076 }, { "epoch": 4.426785578169068, "grad_norm": 1.3656615018844604, "learning_rate": 3.2079995176003743e-06, "loss": 0.032697921991348265, "memory(GiB)": 122.96, "step": 58075, "token_acc": 0.9885159010600707, "train_speed(iter/s)": 0.232083 }, { "epoch": 4.427166704779328, "grad_norm": 0.8086431622505188, "learning_rate": 3.203781116105148e-06, "loss": 0.034242740273475646, "memory(GiB)": 122.96, "step": 58080, "token_acc": 0.9858233369683751, "train_speed(iter/s)": 0.232089 }, { "epoch": 4.427547831389588, "grad_norm": 0.21897390484809875, "learning_rate": 3.1995653981559183e-06, "loss": 0.018206483125686644, "memory(GiB)": 122.96, "step": 58085, "token_acc": 0.9916885389326334, "train_speed(iter/s)": 0.232097 }, { "epoch": 4.427928957999848, "grad_norm": 0.5835753083229065, "learning_rate": 3.1953523639944248e-06, "loss": 0.019384878873825073, "memory(GiB)": 122.96, "step": 58090, "token_acc": 0.9922708870077291, "train_speed(iter/s)": 0.232102 }, { "epoch": 4.428310084610107, "grad_norm": 0.965251624584198, "learning_rate": 3.1911420138622695e-06, "loss": 0.03341841399669647, "memory(GiB)": 122.96, "step": 58095, "token_acc": 0.9840656687590535, "train_speed(iter/s)": 0.232107 }, { "epoch": 4.428691211220367, "grad_norm": 0.6083664298057556, "learning_rate": 3.186934348000914e-06, "loss": 0.021438102424144744, "memory(GiB)": 122.96, "step": 58100, "token_acc": 0.9906584300135604, "train_speed(iter/s)": 0.232113 }, { "epoch": 4.429072337830627, "grad_norm": 1.9421762228012085, "learning_rate": 3.1827293666516267e-06, "loss": 0.023300044238567352, "memory(GiB)": 122.96, "step": 58105, "token_acc": 0.989983305509182, "train_speed(iter/s)": 0.232118 }, { "epoch": 4.429453464440887, "grad_norm": 0.7763165235519409, "learning_rate": 3.178527070055548e-06, "loss": 0.024998563528060912, "memory(GiB)": 122.96, "step": 58110, "token_acc": 0.9884383908774153, "train_speed(iter/s)": 0.232121 }, { "epoch": 4.429834591051147, "grad_norm": 0.8232695460319519, "learning_rate": 3.1743274584536744e-06, "loss": 0.02843540012836456, "memory(GiB)": 122.96, "step": 58115, "token_acc": 0.9885294603069813, "train_speed(iter/s)": 0.232121 }, { "epoch": 4.430215717661407, "grad_norm": 0.10406278818845749, "learning_rate": 3.1701305320868126e-06, "loss": 0.020504234731197356, "memory(GiB)": 122.96, "step": 58120, "token_acc": 0.9916749256689792, "train_speed(iter/s)": 0.232127 }, { "epoch": 4.430596844271667, "grad_norm": 0.9515517354011536, "learning_rate": 3.165936291195648e-06, "loss": 0.024630707502365113, "memory(GiB)": 122.96, "step": 58125, "token_acc": 0.9861720807726075, "train_speed(iter/s)": 0.232132 }, { "epoch": 4.430977970881927, "grad_norm": 1.018839716911316, "learning_rate": 3.161744736020711e-06, "loss": 0.023665951192379, "memory(GiB)": 122.96, "step": 58130, "token_acc": 0.9909509988048489, "train_speed(iter/s)": 0.232135 }, { "epoch": 4.431359097492187, "grad_norm": 0.9472254514694214, "learning_rate": 3.1575558668023476e-06, "loss": 0.03015182018280029, "memory(GiB)": 122.96, "step": 58135, "token_acc": 0.9888253638253638, "train_speed(iter/s)": 0.232142 }, { "epoch": 4.431740224102446, "grad_norm": 0.6966326832771301, "learning_rate": 3.1533696837807767e-06, "loss": 0.02285062223672867, "memory(GiB)": 122.96, "step": 58140, "token_acc": 0.9912536443148688, "train_speed(iter/s)": 0.232146 }, { "epoch": 4.432121350712706, "grad_norm": 0.26994091272354126, "learning_rate": 3.1491861871960736e-06, "loss": 0.03412860631942749, "memory(GiB)": 122.96, "step": 58145, "token_acc": 0.9888782967905942, "train_speed(iter/s)": 0.232154 }, { "epoch": 4.432502477322966, "grad_norm": 0.11044905334711075, "learning_rate": 3.145005377288118e-06, "loss": 0.0290594607591629, "memory(GiB)": 122.96, "step": 58150, "token_acc": 0.988666414809218, "train_speed(iter/s)": 0.232158 }, { "epoch": 4.4328836039332264, "grad_norm": 0.9089610576629639, "learning_rate": 3.140827254296674e-06, "loss": 0.02144276350736618, "memory(GiB)": 122.96, "step": 58155, "token_acc": 0.9880510918829831, "train_speed(iter/s)": 0.232166 }, { "epoch": 4.4332647305434865, "grad_norm": 0.5571759939193726, "learning_rate": 3.1366518184613334e-06, "loss": 0.029293784499168397, "memory(GiB)": 122.96, "step": 58160, "token_acc": 0.9895564289724874, "train_speed(iter/s)": 0.23217 }, { "epoch": 4.4336458571537465, "grad_norm": 0.6266617178916931, "learning_rate": 3.132479070021532e-06, "loss": 0.02829861044883728, "memory(GiB)": 122.96, "step": 58165, "token_acc": 0.9898887765419616, "train_speed(iter/s)": 0.232174 }, { "epoch": 4.4340269837640065, "grad_norm": 1.1608961820602417, "learning_rate": 3.128309009216579e-06, "loss": 0.029640501737594603, "memory(GiB)": 122.96, "step": 58170, "token_acc": 0.9870101276970498, "train_speed(iter/s)": 0.232181 }, { "epoch": 4.4344081103742665, "grad_norm": 0.12149921804666519, "learning_rate": 3.124141636285577e-06, "loss": 0.02867620885372162, "memory(GiB)": 122.96, "step": 58175, "token_acc": 0.9854691392194852, "train_speed(iter/s)": 0.232184 }, { "epoch": 4.4347892369845265, "grad_norm": 0.8352394700050354, "learning_rate": 3.1199769514675293e-06, "loss": 0.01649356186389923, "memory(GiB)": 122.96, "step": 58180, "token_acc": 0.9903368304803976, "train_speed(iter/s)": 0.23219 }, { "epoch": 4.435170363594787, "grad_norm": 1.0059140920639038, "learning_rate": 3.1158149550012507e-06, "loss": 0.031388971209526065, "memory(GiB)": 122.96, "step": 58185, "token_acc": 0.9885173247381144, "train_speed(iter/s)": 0.232196 }, { "epoch": 4.435551490205047, "grad_norm": 0.048272863030433655, "learning_rate": 3.1116556471254112e-06, "loss": 0.05026545524597168, "memory(GiB)": 122.96, "step": 58190, "token_acc": 0.980747015787447, "train_speed(iter/s)": 0.2322 }, { "epoch": 4.435932616815306, "grad_norm": 0.6339544653892517, "learning_rate": 3.107499028078542e-06, "loss": 0.01649473011493683, "memory(GiB)": 122.96, "step": 58195, "token_acc": 0.9927911275415896, "train_speed(iter/s)": 0.232205 }, { "epoch": 4.436313743425566, "grad_norm": 0.5887312889099121, "learning_rate": 3.103345098098992e-06, "loss": 0.031157466769218444, "memory(GiB)": 122.96, "step": 58200, "token_acc": 0.9861833308572278, "train_speed(iter/s)": 0.232209 }, { "epoch": 4.436313743425566, "eval_loss": 0.04928889870643616, "eval_runtime": 158.5738, "eval_samples_per_second": 3.342, "eval_steps_per_second": 3.342, "eval_token_acc": 0.9799108487440515, "step": 58200 }, { "epoch": 4.436694870035826, "grad_norm": 1.4222075939178467, "learning_rate": 3.09919385742497e-06, "loss": 0.030695736408233643, "memory(GiB)": 122.96, "step": 58205, "token_acc": 0.9801018760459176, "train_speed(iter/s)": 0.232069 }, { "epoch": 4.437075996646086, "grad_norm": 0.45601022243499756, "learning_rate": 3.0950453062945416e-06, "loss": 0.025674355030059815, "memory(GiB)": 122.96, "step": 58210, "token_acc": 0.991344732953346, "train_speed(iter/s)": 0.232075 }, { "epoch": 4.437457123256346, "grad_norm": 0.869012176990509, "learning_rate": 3.0908994449455886e-06, "loss": 0.03439017832279205, "memory(GiB)": 122.96, "step": 58215, "token_acc": 0.9871479871479871, "train_speed(iter/s)": 0.23208 }, { "epoch": 4.437838249866606, "grad_norm": 1.7268930673599243, "learning_rate": 3.0867562736158763e-06, "loss": 0.03278044164180756, "memory(GiB)": 122.96, "step": 58220, "token_acc": 0.9856341976173791, "train_speed(iter/s)": 0.232088 }, { "epoch": 4.438219376476866, "grad_norm": 0.5477996468544006, "learning_rate": 3.0826157925429867e-06, "loss": 0.016891853511333467, "memory(GiB)": 122.96, "step": 58225, "token_acc": 0.9930286599535244, "train_speed(iter/s)": 0.232093 }, { "epoch": 4.438600503087126, "grad_norm": 0.6966286301612854, "learning_rate": 3.078478001964352e-06, "loss": 0.042356681823730466, "memory(GiB)": 122.96, "step": 58230, "token_acc": 0.9808157943751165, "train_speed(iter/s)": 0.232098 }, { "epoch": 4.438981629697386, "grad_norm": 1.6819835901260376, "learning_rate": 3.0743429021172666e-06, "loss": 0.03549520373344421, "memory(GiB)": 122.96, "step": 58235, "token_acc": 0.9890784982935154, "train_speed(iter/s)": 0.232106 }, { "epoch": 4.439362756307645, "grad_norm": 1.0605305433273315, "learning_rate": 3.0702104932388566e-06, "loss": 0.035391539335250854, "memory(GiB)": 122.96, "step": 58240, "token_acc": 0.986198243412798, "train_speed(iter/s)": 0.232111 }, { "epoch": 4.439743882917905, "grad_norm": 1.3564244508743286, "learning_rate": 3.066080775566088e-06, "loss": 0.03244886696338654, "memory(GiB)": 122.96, "step": 58245, "token_acc": 0.9858377781864999, "train_speed(iter/s)": 0.232115 }, { "epoch": 4.440125009528165, "grad_norm": 1.2472724914550781, "learning_rate": 3.0619537493357888e-06, "loss": 0.0343135803937912, "memory(GiB)": 122.96, "step": 58250, "token_acc": 0.9842276171331374, "train_speed(iter/s)": 0.232121 }, { "epoch": 4.440506136138425, "grad_norm": 0.6026646494865417, "learning_rate": 3.05782941478463e-06, "loss": 0.022601570188999175, "memory(GiB)": 122.96, "step": 58255, "token_acc": 0.9951523545706371, "train_speed(iter/s)": 0.232128 }, { "epoch": 4.440887262748685, "grad_norm": 1.1260504722595215, "learning_rate": 3.0537077721491057e-06, "loss": 0.03453468382358551, "memory(GiB)": 122.96, "step": 58260, "token_acc": 0.9864351600651112, "train_speed(iter/s)": 0.232136 }, { "epoch": 4.441268389358945, "grad_norm": 1.2145941257476807, "learning_rate": 3.0495888216655832e-06, "loss": 0.040251871943473815, "memory(GiB)": 122.96, "step": 58265, "token_acc": 0.9838909541511772, "train_speed(iter/s)": 0.232142 }, { "epoch": 4.441649515969205, "grad_norm": 0.8725104331970215, "learning_rate": 3.0454725635702785e-06, "loss": 0.03908684849739075, "memory(GiB)": 122.96, "step": 58270, "token_acc": 0.9813643926788685, "train_speed(iter/s)": 0.232147 }, { "epoch": 4.442030642579465, "grad_norm": 0.6511116027832031, "learning_rate": 3.041358998099214e-06, "loss": 0.028995126485824585, "memory(GiB)": 122.96, "step": 58275, "token_acc": 0.9849074975657254, "train_speed(iter/s)": 0.232153 }, { "epoch": 4.442411769189725, "grad_norm": 0.9484047293663025, "learning_rate": 3.0372481254882957e-06, "loss": 0.026861637830734253, "memory(GiB)": 122.96, "step": 58280, "token_acc": 0.9874902267396404, "train_speed(iter/s)": 0.232156 }, { "epoch": 4.442792895799985, "grad_norm": 2.0905377864837646, "learning_rate": 3.0331399459732737e-06, "loss": 0.02032557427883148, "memory(GiB)": 122.96, "step": 58285, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.232164 }, { "epoch": 4.443174022410244, "grad_norm": 2.0775153636932373, "learning_rate": 3.0290344597897145e-06, "loss": 0.029581665992736816, "memory(GiB)": 122.96, "step": 58290, "token_acc": 0.9880663430420712, "train_speed(iter/s)": 0.232169 }, { "epoch": 4.443555149020504, "grad_norm": 0.8091530203819275, "learning_rate": 3.024931667173059e-06, "loss": 0.023634007573127745, "memory(GiB)": 122.96, "step": 58295, "token_acc": 0.9901360544217687, "train_speed(iter/s)": 0.232175 }, { "epoch": 4.443936275630764, "grad_norm": 1.0783971548080444, "learning_rate": 3.0208315683585843e-06, "loss": 0.049206975102424624, "memory(GiB)": 122.96, "step": 58300, "token_acc": 0.9811386253182134, "train_speed(iter/s)": 0.23218 }, { "epoch": 4.444317402241024, "grad_norm": 1.245119333267212, "learning_rate": 3.0167341635814028e-06, "loss": 0.028819045424461363, "memory(GiB)": 122.96, "step": 58305, "token_acc": 0.9875503478579275, "train_speed(iter/s)": 0.232183 }, { "epoch": 4.444698528851284, "grad_norm": 1.0270189046859741, "learning_rate": 3.0126394530764935e-06, "loss": 0.0307545006275177, "memory(GiB)": 122.96, "step": 58310, "token_acc": 0.9880636604774535, "train_speed(iter/s)": 0.23219 }, { "epoch": 4.445079655461544, "grad_norm": 0.8214865922927856, "learning_rate": 3.008547437078657e-06, "loss": 0.03009040355682373, "memory(GiB)": 122.96, "step": 58315, "token_acc": 0.986893663018511, "train_speed(iter/s)": 0.232195 }, { "epoch": 4.445460782071804, "grad_norm": 0.9736113548278809, "learning_rate": 3.0044581158225617e-06, "loss": 0.041349050402641294, "memory(GiB)": 122.96, "step": 58320, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.232197 }, { "epoch": 4.445841908682064, "grad_norm": 1.0476984977722168, "learning_rate": 3.000371489542714e-06, "loss": 0.03528337776660919, "memory(GiB)": 122.96, "step": 58325, "token_acc": 0.9806539509536785, "train_speed(iter/s)": 0.232204 }, { "epoch": 4.446223035292324, "grad_norm": 0.20259790122509003, "learning_rate": 2.9962875584734486e-06, "loss": 0.02186237871646881, "memory(GiB)": 122.96, "step": 58330, "token_acc": 0.9916599839615076, "train_speed(iter/s)": 0.232209 }, { "epoch": 4.4466041619025845, "grad_norm": 1.1591694355010986, "learning_rate": 2.9922063228489673e-06, "loss": 0.02191692292690277, "memory(GiB)": 122.96, "step": 58335, "token_acc": 0.9925020827547903, "train_speed(iter/s)": 0.232217 }, { "epoch": 4.446985288512844, "grad_norm": 1.0349624156951904, "learning_rate": 2.9881277829033217e-06, "loss": 0.028276726603507996, "memory(GiB)": 122.96, "step": 58340, "token_acc": 0.9870916334661355, "train_speed(iter/s)": 0.232222 }, { "epoch": 4.447366415123104, "grad_norm": 2.281031370162964, "learning_rate": 2.9840519388703745e-06, "loss": 0.022006803750991823, "memory(GiB)": 122.96, "step": 58345, "token_acc": 0.9934305070827345, "train_speed(iter/s)": 0.232229 }, { "epoch": 4.447747541733364, "grad_norm": 0.27579465508461, "learning_rate": 2.9799787909838726e-06, "loss": 0.015000584721565246, "memory(GiB)": 122.96, "step": 58350, "token_acc": 0.9892647609536886, "train_speed(iter/s)": 0.232234 }, { "epoch": 4.448128668343624, "grad_norm": 1.099178433418274, "learning_rate": 2.97590833947739e-06, "loss": 0.035849454998970035, "memory(GiB)": 122.96, "step": 58355, "token_acc": 0.9841240194247292, "train_speed(iter/s)": 0.23224 }, { "epoch": 4.448509794953884, "grad_norm": 0.47101083397865295, "learning_rate": 2.97184058458434e-06, "loss": 0.019146141409873963, "memory(GiB)": 122.96, "step": 58360, "token_acc": 0.9933083511777302, "train_speed(iter/s)": 0.232242 }, { "epoch": 4.448890921564144, "grad_norm": 1.4291999340057373, "learning_rate": 2.967775526537997e-06, "loss": 0.029675406217575074, "memory(GiB)": 122.96, "step": 58365, "token_acc": 0.9852717115286947, "train_speed(iter/s)": 0.232249 }, { "epoch": 4.449272048174404, "grad_norm": 0.9476715326309204, "learning_rate": 2.9637131655714744e-06, "loss": 0.03325777053833008, "memory(GiB)": 122.96, "step": 58370, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.232255 }, { "epoch": 4.449653174784664, "grad_norm": 1.0786081552505493, "learning_rate": 2.9596535019177195e-06, "loss": 0.03083682358264923, "memory(GiB)": 122.96, "step": 58375, "token_acc": 0.9875599520383693, "train_speed(iter/s)": 0.23226 }, { "epoch": 4.450034301394924, "grad_norm": 0.7458133101463318, "learning_rate": 2.9555965358095517e-06, "loss": 0.026840582489967346, "memory(GiB)": 122.96, "step": 58380, "token_acc": 0.9862976894142934, "train_speed(iter/s)": 0.232267 }, { "epoch": 4.450415428005183, "grad_norm": 1.139404058456421, "learning_rate": 2.9515422674796067e-06, "loss": 0.017826542258262634, "memory(GiB)": 122.96, "step": 58385, "token_acc": 0.9906700593723494, "train_speed(iter/s)": 0.232275 }, { "epoch": 4.450796554615443, "grad_norm": 2.2127692699432373, "learning_rate": 2.9474906971603766e-06, "loss": 0.028389915823936462, "memory(GiB)": 122.96, "step": 58390, "token_acc": 0.9885270170244264, "train_speed(iter/s)": 0.23228 }, { "epoch": 4.451177681225703, "grad_norm": 0.28387919068336487, "learning_rate": 2.9434418250842146e-06, "loss": 0.02027731537818909, "memory(GiB)": 122.96, "step": 58395, "token_acc": 0.9904559915164369, "train_speed(iter/s)": 0.232288 }, { "epoch": 4.451558807835963, "grad_norm": 0.7682221531867981, "learning_rate": 2.939395651483284e-06, "loss": 0.024406158924102785, "memory(GiB)": 122.96, "step": 58400, "token_acc": 0.98756146948221, "train_speed(iter/s)": 0.232293 }, { "epoch": 4.451558807835963, "eval_loss": 0.04881139099597931, "eval_runtime": 156.9073, "eval_samples_per_second": 3.378, "eval_steps_per_second": 3.378, "eval_token_acc": 0.9803249804228661, "step": 58400 }, { "epoch": 4.451939934446223, "grad_norm": 2.2710886001586914, "learning_rate": 2.9353521765896286e-06, "loss": 0.040872231125831604, "memory(GiB)": 122.96, "step": 58405, "token_acc": 0.9805622397371796, "train_speed(iter/s)": 0.232153 }, { "epoch": 4.452321061056483, "grad_norm": 1.2955021858215332, "learning_rate": 2.9313114006351227e-06, "loss": 0.02831556797027588, "memory(GiB)": 122.96, "step": 58410, "token_acc": 0.9896259038038353, "train_speed(iter/s)": 0.232157 }, { "epoch": 4.452702187666743, "grad_norm": 2.2006685733795166, "learning_rate": 2.927273323851476e-06, "loss": 0.029230961203575136, "memory(GiB)": 122.96, "step": 58415, "token_acc": 0.9892395982783357, "train_speed(iter/s)": 0.232163 }, { "epoch": 4.453083314277003, "grad_norm": 1.728107213973999, "learning_rate": 2.9232379464702587e-06, "loss": 0.025886327028274536, "memory(GiB)": 122.96, "step": 58420, "token_acc": 0.9906786590351594, "train_speed(iter/s)": 0.232168 }, { "epoch": 4.453464440887263, "grad_norm": 1.8402591943740845, "learning_rate": 2.9192052687228964e-06, "loss": 0.0384773850440979, "memory(GiB)": 122.96, "step": 58425, "token_acc": 0.9832456653029418, "train_speed(iter/s)": 0.232173 }, { "epoch": 4.453845567497523, "grad_norm": 1.266174077987671, "learning_rate": 2.915175290840616e-06, "loss": 0.01289404034614563, "memory(GiB)": 122.96, "step": 58430, "token_acc": 0.9954824136818329, "train_speed(iter/s)": 0.23218 }, { "epoch": 4.454226694107783, "grad_norm": 0.8660030961036682, "learning_rate": 2.911148013054538e-06, "loss": 0.039893466234207156, "memory(GiB)": 122.96, "step": 58435, "token_acc": 0.988263142321728, "train_speed(iter/s)": 0.232183 }, { "epoch": 4.454607820718042, "grad_norm": 1.322394847869873, "learning_rate": 2.9071234355956044e-06, "loss": 0.021267712116241455, "memory(GiB)": 122.96, "step": 58440, "token_acc": 0.993421052631579, "train_speed(iter/s)": 0.232189 }, { "epoch": 4.454988947328302, "grad_norm": 1.293222427368164, "learning_rate": 2.9031015586945985e-06, "loss": 0.02319294661283493, "memory(GiB)": 122.96, "step": 58445, "token_acc": 0.9905020352781547, "train_speed(iter/s)": 0.232194 }, { "epoch": 4.455370073938562, "grad_norm": 0.7662639617919922, "learning_rate": 2.8990823825821634e-06, "loss": 0.04244548380374909, "memory(GiB)": 122.96, "step": 58450, "token_acc": 0.9891000558971492, "train_speed(iter/s)": 0.2322 }, { "epoch": 4.455751200548822, "grad_norm": 1.2907568216323853, "learning_rate": 2.895065907488781e-06, "loss": 0.031272169947624204, "memory(GiB)": 122.96, "step": 58455, "token_acc": 0.9886218506908162, "train_speed(iter/s)": 0.232203 }, { "epoch": 4.456132327159082, "grad_norm": 0.5334659218788147, "learning_rate": 2.891052133644767e-06, "loss": 0.025861257314682008, "memory(GiB)": 122.96, "step": 58460, "token_acc": 0.9925982159802619, "train_speed(iter/s)": 0.232208 }, { "epoch": 4.456513453769342, "grad_norm": 0.8818627595901489, "learning_rate": 2.8870410612803156e-06, "loss": 0.02632417678833008, "memory(GiB)": 122.96, "step": 58465, "token_acc": 0.9882869692532943, "train_speed(iter/s)": 0.232212 }, { "epoch": 4.456894580379602, "grad_norm": 0.5386205315589905, "learning_rate": 2.8830326906254203e-06, "loss": 0.02091339826583862, "memory(GiB)": 122.96, "step": 58470, "token_acc": 0.9918578830495929, "train_speed(iter/s)": 0.232216 }, { "epoch": 4.457275706989862, "grad_norm": 0.6241896152496338, "learning_rate": 2.8790270219099467e-06, "loss": 0.027786344289779663, "memory(GiB)": 122.96, "step": 58475, "token_acc": 0.9850303838743145, "train_speed(iter/s)": 0.23222 }, { "epoch": 4.457656833600122, "grad_norm": 1.3326218128204346, "learning_rate": 2.875024055363618e-06, "loss": 0.02934412956237793, "memory(GiB)": 122.96, "step": 58480, "token_acc": 0.9872905834777586, "train_speed(iter/s)": 0.232227 }, { "epoch": 4.4580379602103815, "grad_norm": 0.9099288582801819, "learning_rate": 2.8710237912159656e-06, "loss": 0.04524487853050232, "memory(GiB)": 122.96, "step": 58485, "token_acc": 0.983729662077597, "train_speed(iter/s)": 0.232231 }, { "epoch": 4.4584190868206415, "grad_norm": 0.0020342697389423847, "learning_rate": 2.8670262296963903e-06, "loss": 0.031789141893386844, "memory(GiB)": 122.96, "step": 58490, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.232238 }, { "epoch": 4.4588002134309015, "grad_norm": 1.0770201683044434, "learning_rate": 2.8630313710341417e-06, "loss": 0.046379125118255614, "memory(GiB)": 122.96, "step": 58495, "token_acc": 0.9796080508474576, "train_speed(iter/s)": 0.232244 }, { "epoch": 4.4591813400411615, "grad_norm": 2.4092719554901123, "learning_rate": 2.859039215458298e-06, "loss": 0.036414426565170285, "memory(GiB)": 122.96, "step": 58500, "token_acc": 0.989844278943805, "train_speed(iter/s)": 0.232251 }, { "epoch": 4.4595624666514215, "grad_norm": 0.8487651944160461, "learning_rate": 2.855049763197798e-06, "loss": 0.041926464438438414, "memory(GiB)": 122.96, "step": 58505, "token_acc": 0.9831492229919491, "train_speed(iter/s)": 0.232256 }, { "epoch": 4.4599435932616815, "grad_norm": 1.7606645822525024, "learning_rate": 2.85106301448142e-06, "loss": 0.03978169858455658, "memory(GiB)": 122.96, "step": 58510, "token_acc": 0.9858356940509915, "train_speed(iter/s)": 0.232264 }, { "epoch": 4.460324719871942, "grad_norm": 2.105976104736328, "learning_rate": 2.84707896953777e-06, "loss": 0.02783275246620178, "memory(GiB)": 122.96, "step": 58515, "token_acc": 0.9849756959787892, "train_speed(iter/s)": 0.232272 }, { "epoch": 4.460705846482202, "grad_norm": 0.3989911675453186, "learning_rate": 2.843097628595326e-06, "loss": 0.019062311947345735, "memory(GiB)": 122.96, "step": 58520, "token_acc": 0.9934985778138968, "train_speed(iter/s)": 0.232279 }, { "epoch": 4.461086973092462, "grad_norm": 0.9929360747337341, "learning_rate": 2.839118991882406e-06, "loss": 0.03961658477783203, "memory(GiB)": 122.96, "step": 58525, "token_acc": 0.9826742763962495, "train_speed(iter/s)": 0.232285 }, { "epoch": 4.461468099702722, "grad_norm": 0.9976407289505005, "learning_rate": 2.835143059627149e-06, "loss": 0.030867105722427367, "memory(GiB)": 122.96, "step": 58530, "token_acc": 0.9893119572478289, "train_speed(iter/s)": 0.232291 }, { "epoch": 4.461849226312982, "grad_norm": 1.0114582777023315, "learning_rate": 2.8311698320575677e-06, "loss": 0.03822359144687652, "memory(GiB)": 122.96, "step": 58535, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.232296 }, { "epoch": 4.462230352923241, "grad_norm": 2.659564256668091, "learning_rate": 2.827199309401518e-06, "loss": 0.03957253098487854, "memory(GiB)": 122.96, "step": 58540, "token_acc": 0.9811975701475267, "train_speed(iter/s)": 0.232303 }, { "epoch": 4.462611479533501, "grad_norm": 1.2665292024612427, "learning_rate": 2.8232314918866685e-06, "loss": 0.026403939723968504, "memory(GiB)": 122.96, "step": 58545, "token_acc": 0.9906333630686887, "train_speed(iter/s)": 0.232311 }, { "epoch": 4.462992606143761, "grad_norm": 0.6554067730903625, "learning_rate": 2.8192663797405706e-06, "loss": 0.025102069973945616, "memory(GiB)": 122.96, "step": 58550, "token_acc": 0.9907995334974731, "train_speed(iter/s)": 0.232314 }, { "epoch": 4.463373732754021, "grad_norm": 1.140676736831665, "learning_rate": 2.815303973190597e-06, "loss": 0.018084868788719177, "memory(GiB)": 122.96, "step": 58555, "token_acc": 0.9898770788141721, "train_speed(iter/s)": 0.232321 }, { "epoch": 4.463754859364281, "grad_norm": 0.5999197363853455, "learning_rate": 2.8113442724639726e-06, "loss": 0.025027731060981752, "memory(GiB)": 122.96, "step": 58560, "token_acc": 0.9885099733431382, "train_speed(iter/s)": 0.232321 }, { "epoch": 4.464135985974541, "grad_norm": 1.4394930601119995, "learning_rate": 2.807387277787782e-06, "loss": 0.03120211660861969, "memory(GiB)": 122.96, "step": 58565, "token_acc": 0.9878264718902169, "train_speed(iter/s)": 0.232327 }, { "epoch": 4.464517112584801, "grad_norm": 1.0940921306610107, "learning_rate": 2.8034329893889264e-06, "loss": 0.026378309726715087, "memory(GiB)": 122.96, "step": 58570, "token_acc": 0.9850624104767751, "train_speed(iter/s)": 0.232331 }, { "epoch": 4.464898239195061, "grad_norm": 1.6029376983642578, "learning_rate": 2.7994814074941646e-06, "loss": 0.034075173735618594, "memory(GiB)": 122.96, "step": 58575, "token_acc": 0.9899029126213592, "train_speed(iter/s)": 0.232339 }, { "epoch": 4.465279365805321, "grad_norm": 1.227484107017517, "learning_rate": 2.7955325323301197e-06, "loss": 0.03780399262905121, "memory(GiB)": 122.96, "step": 58580, "token_acc": 0.9816564758198999, "train_speed(iter/s)": 0.232344 }, { "epoch": 4.46566049241558, "grad_norm": 4.939914703369141, "learning_rate": 2.791586364123211e-06, "loss": 0.033963510394096376, "memory(GiB)": 122.96, "step": 58585, "token_acc": 0.9852897911150338, "train_speed(iter/s)": 0.232351 }, { "epoch": 4.46604161902584, "grad_norm": 1.0272794961929321, "learning_rate": 2.787642903099763e-06, "loss": 0.02754482626914978, "memory(GiB)": 122.96, "step": 58590, "token_acc": 0.9876051350154936, "train_speed(iter/s)": 0.232359 }, { "epoch": 4.4664227456361, "grad_norm": 1.9486650228500366, "learning_rate": 2.7837021494859006e-06, "loss": 0.04836756587028503, "memory(GiB)": 122.96, "step": 58595, "token_acc": 0.9806142452624701, "train_speed(iter/s)": 0.232364 }, { "epoch": 4.46680387224636, "grad_norm": 0.38139691948890686, "learning_rate": 2.779764103507604e-06, "loss": 0.02991333305835724, "memory(GiB)": 122.96, "step": 58600, "token_acc": 0.9924599434495759, "train_speed(iter/s)": 0.232373 }, { "epoch": 4.46680387224636, "eval_loss": 0.04884311929345131, "eval_runtime": 158.7528, "eval_samples_per_second": 3.339, "eval_steps_per_second": 3.339, "eval_token_acc": 0.9802873320884284, "step": 58600 }, { "epoch": 4.46718499885662, "grad_norm": 1.3038325309753418, "learning_rate": 2.7758287653907145e-06, "loss": 0.02997695505619049, "memory(GiB)": 122.96, "step": 58605, "token_acc": 0.9804596343012436, "train_speed(iter/s)": 0.232235 }, { "epoch": 4.46756612546688, "grad_norm": 1.866551399230957, "learning_rate": 2.7718961353608963e-06, "loss": 0.06661640405654908, "memory(GiB)": 122.96, "step": 58610, "token_acc": 0.9777887462981244, "train_speed(iter/s)": 0.232241 }, { "epoch": 4.46794725207714, "grad_norm": 0.8185667991638184, "learning_rate": 2.7679662136436635e-06, "loss": 0.06892396211624145, "memory(GiB)": 122.96, "step": 58615, "token_acc": 0.979614124499454, "train_speed(iter/s)": 0.232244 }, { "epoch": 4.4683283786874, "grad_norm": 0.9550796747207642, "learning_rate": 2.764039000464397e-06, "loss": 0.04460042119026184, "memory(GiB)": 122.96, "step": 58620, "token_acc": 0.986594637855142, "train_speed(iter/s)": 0.232248 }, { "epoch": 4.46870950529766, "grad_norm": 1.0035653114318848, "learning_rate": 2.7601144960482827e-06, "loss": 0.022362005710601807, "memory(GiB)": 122.96, "step": 58625, "token_acc": 0.9904938557848365, "train_speed(iter/s)": 0.232253 }, { "epoch": 4.46909063190792, "grad_norm": 0.8505848050117493, "learning_rate": 2.7561927006203857e-06, "loss": 0.014931640028953553, "memory(GiB)": 122.96, "step": 58630, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.23226 }, { "epoch": 4.469471758518179, "grad_norm": 0.7875033617019653, "learning_rate": 2.752273614405604e-06, "loss": 0.02169519364833832, "memory(GiB)": 122.96, "step": 58635, "token_acc": 0.9929435483870968, "train_speed(iter/s)": 0.232267 }, { "epoch": 4.469852885128439, "grad_norm": 1.1741772890090942, "learning_rate": 2.7483572376286628e-06, "loss": 0.029314693808555604, "memory(GiB)": 122.96, "step": 58640, "token_acc": 0.9892195922193578, "train_speed(iter/s)": 0.232273 }, { "epoch": 4.470234011738699, "grad_norm": 1.5904589891433716, "learning_rate": 2.7444435705141657e-06, "loss": 0.0372830867767334, "memory(GiB)": 122.96, "step": 58645, "token_acc": 0.9864783047426842, "train_speed(iter/s)": 0.232279 }, { "epoch": 4.470615138348959, "grad_norm": 0.9146624803543091, "learning_rate": 2.7405326132865393e-06, "loss": 0.03993427753448486, "memory(GiB)": 122.96, "step": 58650, "token_acc": 0.9850196775422115, "train_speed(iter/s)": 0.232283 }, { "epoch": 4.470996264959219, "grad_norm": 0.6179817318916321, "learning_rate": 2.736624366170054e-06, "loss": 0.042012158036232, "memory(GiB)": 122.96, "step": 58655, "token_acc": 0.9839986525181068, "train_speed(iter/s)": 0.232288 }, { "epoch": 4.471377391569479, "grad_norm": 1.2325605154037476, "learning_rate": 2.7327188293888306e-06, "loss": 0.01824956387281418, "memory(GiB)": 122.96, "step": 58660, "token_acc": 0.9947871416159861, "train_speed(iter/s)": 0.232296 }, { "epoch": 4.4717585181797395, "grad_norm": 0.8534398674964905, "learning_rate": 2.7288160031668453e-06, "loss": 0.03861548900604248, "memory(GiB)": 122.96, "step": 58665, "token_acc": 0.9851831896551724, "train_speed(iter/s)": 0.232299 }, { "epoch": 4.4721396447899995, "grad_norm": 4.789297580718994, "learning_rate": 2.7249158877278856e-06, "loss": 0.04987538456916809, "memory(GiB)": 122.96, "step": 58670, "token_acc": 0.9788064696040156, "train_speed(iter/s)": 0.232307 }, { "epoch": 4.4725207714002595, "grad_norm": 0.5436508655548096, "learning_rate": 2.7210184832956176e-06, "loss": 0.022099606692790985, "memory(GiB)": 122.96, "step": 58675, "token_acc": 0.9914500683994528, "train_speed(iter/s)": 0.232312 }, { "epoch": 4.4729018980105195, "grad_norm": 0.713771402835846, "learning_rate": 2.717123790093551e-06, "loss": 0.037155759334564206, "memory(GiB)": 122.96, "step": 58680, "token_acc": 0.9832662571489091, "train_speed(iter/s)": 0.232317 }, { "epoch": 4.473283024620779, "grad_norm": 1.024732232093811, "learning_rate": 2.7132318083450015e-06, "loss": 0.03543824851512909, "memory(GiB)": 122.96, "step": 58685, "token_acc": 0.98732339849042, "train_speed(iter/s)": 0.232319 }, { "epoch": 4.473664151231039, "grad_norm": 0.1376591920852661, "learning_rate": 2.709342538273174e-06, "loss": 0.019700439274311067, "memory(GiB)": 122.96, "step": 58690, "token_acc": 0.9903934126258005, "train_speed(iter/s)": 0.232327 }, { "epoch": 4.474045277841299, "grad_norm": 0.5456061363220215, "learning_rate": 2.7054559801011016e-06, "loss": 0.044377601146698, "memory(GiB)": 122.96, "step": 58695, "token_acc": 0.9875647668393782, "train_speed(iter/s)": 0.23233 }, { "epoch": 4.474426404451559, "grad_norm": 0.767620325088501, "learning_rate": 2.70157213405165e-06, "loss": 0.017122538387775423, "memory(GiB)": 122.96, "step": 58700, "token_acc": 0.9915572232645403, "train_speed(iter/s)": 0.232337 }, { "epoch": 4.474807531061819, "grad_norm": 0.4434947669506073, "learning_rate": 2.697691000347541e-06, "loss": 0.04516446590423584, "memory(GiB)": 122.96, "step": 58705, "token_acc": 0.9831029185867896, "train_speed(iter/s)": 0.232341 }, { "epoch": 4.475188657672079, "grad_norm": 4.507698059082031, "learning_rate": 2.693812579211358e-06, "loss": 0.0510998010635376, "memory(GiB)": 122.96, "step": 58710, "token_acc": 0.9888935900148086, "train_speed(iter/s)": 0.232344 }, { "epoch": 4.475569784282339, "grad_norm": 1.197785496711731, "learning_rate": 2.689936870865473e-06, "loss": 0.02281830906867981, "memory(GiB)": 122.96, "step": 58715, "token_acc": 0.9913489266260814, "train_speed(iter/s)": 0.232352 }, { "epoch": 4.475950910892599, "grad_norm": 0.8552770614624023, "learning_rate": 2.6860638755321852e-06, "loss": 0.01971241980791092, "memory(GiB)": 122.96, "step": 58720, "token_acc": 0.9921653383763339, "train_speed(iter/s)": 0.232353 }, { "epoch": 4.476332037502859, "grad_norm": 1.0146162509918213, "learning_rate": 2.6821935934335575e-06, "loss": 0.028000441193580628, "memory(GiB)": 122.96, "step": 58725, "token_acc": 0.9851526557925554, "train_speed(iter/s)": 0.232359 }, { "epoch": 4.476713164113118, "grad_norm": 1.9020683765411377, "learning_rate": 2.67832602479155e-06, "loss": 0.03799598217010498, "memory(GiB)": 122.96, "step": 58730, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.232365 }, { "epoch": 4.477094290723378, "grad_norm": 0.8341664671897888, "learning_rate": 2.674461169827952e-06, "loss": 0.032101699709892274, "memory(GiB)": 122.96, "step": 58735, "token_acc": 0.9861426051902242, "train_speed(iter/s)": 0.232371 }, { "epoch": 4.477475417333638, "grad_norm": 0.9162484407424927, "learning_rate": 2.6705990287643813e-06, "loss": 0.024838604032993317, "memory(GiB)": 122.96, "step": 58740, "token_acc": 0.9894583198183587, "train_speed(iter/s)": 0.232375 }, { "epoch": 4.477856543943898, "grad_norm": 1.2500826120376587, "learning_rate": 2.6667396018223155e-06, "loss": 0.030315685272216796, "memory(GiB)": 122.96, "step": 58745, "token_acc": 0.9916272297051328, "train_speed(iter/s)": 0.232379 }, { "epoch": 4.478237670554158, "grad_norm": 0.7465413808822632, "learning_rate": 2.662882889223095e-06, "loss": 0.03066963851451874, "memory(GiB)": 122.96, "step": 58750, "token_acc": 0.9882687083230427, "train_speed(iter/s)": 0.23238 }, { "epoch": 4.478618797164418, "grad_norm": 0.5871665477752686, "learning_rate": 2.659028891187859e-06, "loss": 0.022656178474426268, "memory(GiB)": 122.96, "step": 58755, "token_acc": 0.9894691596819256, "train_speed(iter/s)": 0.232385 }, { "epoch": 4.478999923774678, "grad_norm": 0.8276354670524597, "learning_rate": 2.655177607937637e-06, "loss": 0.02131238728761673, "memory(GiB)": 122.96, "step": 58760, "token_acc": 0.9924550203134068, "train_speed(iter/s)": 0.232392 }, { "epoch": 4.479381050384938, "grad_norm": 3.8448102474212646, "learning_rate": 2.651329039693262e-06, "loss": 0.028858768939971923, "memory(GiB)": 122.96, "step": 58765, "token_acc": 0.9925824175824176, "train_speed(iter/s)": 0.232398 }, { "epoch": 4.479762176995198, "grad_norm": 0.8636740446090698, "learning_rate": 2.6474831866754426e-06, "loss": 0.029450887441635133, "memory(GiB)": 122.96, "step": 58770, "token_acc": 0.9882879779538408, "train_speed(iter/s)": 0.232404 }, { "epoch": 4.480143303605458, "grad_norm": 1.2246664762496948, "learning_rate": 2.6436400491047287e-06, "loss": 0.0387860119342804, "memory(GiB)": 122.96, "step": 58775, "token_acc": 0.9847585432376063, "train_speed(iter/s)": 0.232408 }, { "epoch": 4.480524430215718, "grad_norm": 0.6238369941711426, "learning_rate": 2.639799627201489e-06, "loss": 0.030539613962173463, "memory(GiB)": 122.96, "step": 58780, "token_acc": 0.9873029772329247, "train_speed(iter/s)": 0.232413 }, { "epoch": 4.480905556825977, "grad_norm": 0.7980039715766907, "learning_rate": 2.6359619211859645e-06, "loss": 0.016242820024490356, "memory(GiB)": 122.96, "step": 58785, "token_acc": 0.993421052631579, "train_speed(iter/s)": 0.232419 }, { "epoch": 4.481286683436237, "grad_norm": 0.3720232844352722, "learning_rate": 2.6321269312782335e-06, "loss": 0.023035402595996856, "memory(GiB)": 122.96, "step": 58790, "token_acc": 0.9887834339948232, "train_speed(iter/s)": 0.232427 }, { "epoch": 4.481667810046497, "grad_norm": 1.9309014081954956, "learning_rate": 2.6282946576981994e-06, "loss": 0.03602511882781982, "memory(GiB)": 122.96, "step": 58795, "token_acc": 0.985544848035582, "train_speed(iter/s)": 0.232429 }, { "epoch": 4.482048936656757, "grad_norm": 0.9791076183319092, "learning_rate": 2.6244651006656405e-06, "loss": 0.015380094945430755, "memory(GiB)": 122.96, "step": 58800, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.232437 }, { "epoch": 4.482048936656757, "eval_loss": 0.04842289537191391, "eval_runtime": 158.3848, "eval_samples_per_second": 3.346, "eval_steps_per_second": 3.346, "eval_token_acc": 0.9804303957592916, "step": 58800 }, { "epoch": 4.482430063267017, "grad_norm": 0.9074917435646057, "learning_rate": 2.62063826040016e-06, "loss": 0.02341914176940918, "memory(GiB)": 122.96, "step": 58805, "token_acc": 0.9807664212248715, "train_speed(iter/s)": 0.232298 }, { "epoch": 4.482811189877277, "grad_norm": 1.1580435037612915, "learning_rate": 2.616814137121204e-06, "loss": 0.04305016398429871, "memory(GiB)": 122.96, "step": 58810, "token_acc": 0.9853095487932844, "train_speed(iter/s)": 0.232304 }, { "epoch": 4.483192316487537, "grad_norm": 1.9132534265518188, "learning_rate": 2.6129927310480753e-06, "loss": 0.02530284523963928, "memory(GiB)": 122.96, "step": 58815, "token_acc": 0.9942375886524822, "train_speed(iter/s)": 0.232311 }, { "epoch": 4.483573443097797, "grad_norm": 1.1347721815109253, "learning_rate": 2.6091740423999143e-06, "loss": 0.020037820935249327, "memory(GiB)": 122.96, "step": 58820, "token_acc": 0.9910350448247759, "train_speed(iter/s)": 0.232318 }, { "epoch": 4.483954569708057, "grad_norm": 0.7857437133789062, "learning_rate": 2.605358071395697e-06, "loss": 0.023429441452026366, "memory(GiB)": 122.96, "step": 58825, "token_acc": 0.9887323943661972, "train_speed(iter/s)": 0.232324 }, { "epoch": 4.4843356963183165, "grad_norm": 1.0449252128601074, "learning_rate": 2.6015448182542536e-06, "loss": 0.02144862413406372, "memory(GiB)": 122.96, "step": 58830, "token_acc": 0.9881731253145445, "train_speed(iter/s)": 0.232329 }, { "epoch": 4.4847168229285765, "grad_norm": 0.6336116194725037, "learning_rate": 2.5977342831942696e-06, "loss": 0.020291432738304138, "memory(GiB)": 122.96, "step": 58835, "token_acc": 0.9897610921501706, "train_speed(iter/s)": 0.232336 }, { "epoch": 4.485097949538837, "grad_norm": 0.8035302758216858, "learning_rate": 2.5939264664342432e-06, "loss": 0.037300470471382144, "memory(GiB)": 122.96, "step": 58840, "token_acc": 0.9875946512002578, "train_speed(iter/s)": 0.232339 }, { "epoch": 4.485479076149097, "grad_norm": 0.9188675284385681, "learning_rate": 2.5901213681925386e-06, "loss": 0.028478652238845825, "memory(GiB)": 122.96, "step": 58845, "token_acc": 0.987288769253776, "train_speed(iter/s)": 0.232345 }, { "epoch": 4.485860202759357, "grad_norm": 0.7433770298957825, "learning_rate": 2.5863189886873697e-06, "loss": 0.035176658630371095, "memory(GiB)": 122.96, "step": 58850, "token_acc": 0.9870531116669119, "train_speed(iter/s)": 0.23235 }, { "epoch": 4.486241329369617, "grad_norm": 0.5480220913887024, "learning_rate": 2.582519328136779e-06, "loss": 0.018670706450939177, "memory(GiB)": 122.96, "step": 58855, "token_acc": 0.9936013125512715, "train_speed(iter/s)": 0.232354 }, { "epoch": 4.486622455979877, "grad_norm": 1.257667899131775, "learning_rate": 2.578722386758664e-06, "loss": 0.02547484338283539, "memory(GiB)": 122.96, "step": 58860, "token_acc": 0.9907202828104287, "train_speed(iter/s)": 0.23236 }, { "epoch": 4.487003582590137, "grad_norm": 1.4495033025741577, "learning_rate": 2.574928164770757e-06, "loss": 0.0367518275976181, "memory(GiB)": 122.96, "step": 58865, "token_acc": 0.9897270325799824, "train_speed(iter/s)": 0.232367 }, { "epoch": 4.487384709200397, "grad_norm": 0.8923691511154175, "learning_rate": 2.5711366623906385e-06, "loss": 0.05246408581733704, "memory(GiB)": 122.96, "step": 58870, "token_acc": 0.9861648016276704, "train_speed(iter/s)": 0.232371 }, { "epoch": 4.487765835810657, "grad_norm": 0.545430064201355, "learning_rate": 2.567347879835741e-06, "loss": 0.02600800096988678, "memory(GiB)": 122.96, "step": 58875, "token_acc": 0.9873274068469832, "train_speed(iter/s)": 0.232377 }, { "epoch": 4.488146962420916, "grad_norm": 0.9786388874053955, "learning_rate": 2.5635618173233233e-06, "loss": 0.02902292013168335, "memory(GiB)": 122.96, "step": 58880, "token_acc": 0.9888678615161972, "train_speed(iter/s)": 0.23238 }, { "epoch": 4.488528089031176, "grad_norm": 1.0377041101455688, "learning_rate": 2.5597784750705066e-06, "loss": 0.03175482451915741, "memory(GiB)": 122.96, "step": 58885, "token_acc": 0.987067075843928, "train_speed(iter/s)": 0.232383 }, { "epoch": 4.488909215641436, "grad_norm": 0.47545745968818665, "learning_rate": 2.555997853294245e-06, "loss": 0.022861213982105257, "memory(GiB)": 122.96, "step": 58890, "token_acc": 0.9898312660632473, "train_speed(iter/s)": 0.232385 }, { "epoch": 4.489290342251696, "grad_norm": 0.672337532043457, "learning_rate": 2.5522199522113375e-06, "loss": 0.024768924713134764, "memory(GiB)": 122.96, "step": 58895, "token_acc": 0.9876702245123298, "train_speed(iter/s)": 0.232391 }, { "epoch": 4.489671468861956, "grad_norm": 1.0599043369293213, "learning_rate": 2.5484447720384276e-06, "loss": 0.02579427361488342, "memory(GiB)": 122.96, "step": 58900, "token_acc": 0.9875, "train_speed(iter/s)": 0.232394 }, { "epoch": 4.490052595472216, "grad_norm": 1.4782673120498657, "learning_rate": 2.544672312992019e-06, "loss": 0.03504183292388916, "memory(GiB)": 122.96, "step": 58905, "token_acc": 0.9876373626373627, "train_speed(iter/s)": 0.232402 }, { "epoch": 4.490433722082476, "grad_norm": 2.1021382808685303, "learning_rate": 2.540902575288423e-06, "loss": 0.03324805498123169, "memory(GiB)": 122.96, "step": 58910, "token_acc": 0.9877384196185286, "train_speed(iter/s)": 0.232406 }, { "epoch": 4.490814848692736, "grad_norm": 2.2099149227142334, "learning_rate": 2.537135559143827e-06, "loss": 0.034390589594841, "memory(GiB)": 122.96, "step": 58915, "token_acc": 0.9887788778877887, "train_speed(iter/s)": 0.232414 }, { "epoch": 4.491195975302996, "grad_norm": 0.09914572536945343, "learning_rate": 2.533371264774259e-06, "loss": 0.018933041393756865, "memory(GiB)": 122.96, "step": 58920, "token_acc": 0.9919632606199771, "train_speed(iter/s)": 0.232421 }, { "epoch": 4.491577101913256, "grad_norm": 3.6800155639648438, "learning_rate": 2.5296096923955736e-06, "loss": 0.06279722452163697, "memory(GiB)": 122.96, "step": 58925, "token_acc": 0.9791614361034396, "train_speed(iter/s)": 0.232427 }, { "epoch": 4.491958228523515, "grad_norm": 2.4938931465148926, "learning_rate": 2.5258508422234816e-06, "loss": 0.0488945484161377, "memory(GiB)": 122.96, "step": 58930, "token_acc": 0.9779323578795874, "train_speed(iter/s)": 0.232432 }, { "epoch": 4.492339355133775, "grad_norm": 0.9473447203636169, "learning_rate": 2.522094714473544e-06, "loss": 0.028436344861984254, "memory(GiB)": 122.96, "step": 58935, "token_acc": 0.9913432835820896, "train_speed(iter/s)": 0.232438 }, { "epoch": 4.492720481744035, "grad_norm": 1.2059382200241089, "learning_rate": 2.5183413093611383e-06, "loss": 0.03406867086887359, "memory(GiB)": 122.96, "step": 58940, "token_acc": 0.9896269698783163, "train_speed(iter/s)": 0.232442 }, { "epoch": 4.493101608354295, "grad_norm": 1.0838170051574707, "learning_rate": 2.5145906271015317e-06, "loss": 0.023083811998367308, "memory(GiB)": 122.96, "step": 58945, "token_acc": 0.9926103021082373, "train_speed(iter/s)": 0.232447 }, { "epoch": 4.493482734964555, "grad_norm": 0.9644151926040649, "learning_rate": 2.5108426679097795e-06, "loss": 0.04214789867401123, "memory(GiB)": 122.96, "step": 58950, "token_acc": 0.9830913069425105, "train_speed(iter/s)": 0.232453 }, { "epoch": 4.493863861574815, "grad_norm": 0.903826892375946, "learning_rate": 2.507097432000827e-06, "loss": 0.028593805432319642, "memory(GiB)": 122.96, "step": 58955, "token_acc": 0.9867924528301887, "train_speed(iter/s)": 0.232457 }, { "epoch": 4.494244988185075, "grad_norm": 1.0287188291549683, "learning_rate": 2.503354919589451e-06, "loss": 0.026117533445358276, "memory(GiB)": 122.96, "step": 58960, "token_acc": 0.9913934944943678, "train_speed(iter/s)": 0.232459 }, { "epoch": 4.494626114795335, "grad_norm": 0.6805478930473328, "learning_rate": 2.4996151308902484e-06, "loss": 0.03258500397205353, "memory(GiB)": 122.96, "step": 58965, "token_acc": 0.9909571745435932, "train_speed(iter/s)": 0.232463 }, { "epoch": 4.495007241405595, "grad_norm": 0.7507529854774475, "learning_rate": 2.4958780661176905e-06, "loss": 0.027879178524017334, "memory(GiB)": 122.96, "step": 58970, "token_acc": 0.992255125284738, "train_speed(iter/s)": 0.232468 }, { "epoch": 4.495388368015854, "grad_norm": 0.908513605594635, "learning_rate": 2.4921437254860846e-06, "loss": 0.023573027551174165, "memory(GiB)": 122.96, "step": 58975, "token_acc": 0.9894795127353266, "train_speed(iter/s)": 0.232474 }, { "epoch": 4.495769494626114, "grad_norm": 1.094321370124817, "learning_rate": 2.4884121092095703e-06, "loss": 0.012840729951858521, "memory(GiB)": 122.96, "step": 58980, "token_acc": 0.992, "train_speed(iter/s)": 0.23248 }, { "epoch": 4.496150621236374, "grad_norm": 0.8562510013580322, "learning_rate": 2.4846832175021373e-06, "loss": 0.024349665641784667, "memory(GiB)": 122.96, "step": 58985, "token_acc": 0.9901347783798805, "train_speed(iter/s)": 0.232482 }, { "epoch": 4.4965317478466345, "grad_norm": 3.1774649620056152, "learning_rate": 2.480957050577626e-06, "loss": 0.035820472240448, "memory(GiB)": 122.96, "step": 58990, "token_acc": 0.9913702928870293, "train_speed(iter/s)": 0.232487 }, { "epoch": 4.4969128744568945, "grad_norm": 1.224403738975525, "learning_rate": 2.477233608649715e-06, "loss": 0.02210884392261505, "memory(GiB)": 122.96, "step": 58995, "token_acc": 0.9900383141762452, "train_speed(iter/s)": 0.232493 }, { "epoch": 4.4972940010671545, "grad_norm": 1.1882511377334595, "learning_rate": 2.473512891931923e-06, "loss": 0.026896572113037108, "memory(GiB)": 122.96, "step": 59000, "token_acc": 0.9884878048780488, "train_speed(iter/s)": 0.232497 }, { "epoch": 4.4972940010671545, "eval_loss": 0.048255596309900284, "eval_runtime": 157.985, "eval_samples_per_second": 3.355, "eval_steps_per_second": 3.355, "eval_token_acc": 0.9804906330943919, "step": 59000 }, { "epoch": 4.4976751276774145, "grad_norm": 1.0309727191925049, "learning_rate": 2.4697949006376174e-06, "loss": 0.03410537838935852, "memory(GiB)": 122.96, "step": 59005, "token_acc": 0.9805398161770061, "train_speed(iter/s)": 0.232358 }, { "epoch": 4.4980562542876745, "grad_norm": 1.2259430885314941, "learning_rate": 2.466079634980001e-06, "loss": 0.01955498307943344, "memory(GiB)": 122.96, "step": 59010, "token_acc": 0.9904919388176933, "train_speed(iter/s)": 0.232365 }, { "epoch": 4.4984373808979345, "grad_norm": 1.2581634521484375, "learning_rate": 2.462367095172147e-06, "loss": 0.0333198070526123, "memory(GiB)": 122.96, "step": 59015, "token_acc": 0.9881707526144351, "train_speed(iter/s)": 0.232371 }, { "epoch": 4.498818507508195, "grad_norm": 1.0542749166488647, "learning_rate": 2.458657281426924e-06, "loss": 0.028895360231399537, "memory(GiB)": 122.96, "step": 59020, "token_acc": 0.987944294325504, "train_speed(iter/s)": 0.232377 }, { "epoch": 4.499199634118455, "grad_norm": 1.542134404182434, "learning_rate": 2.454950193957095e-06, "loss": 0.04427205324172974, "memory(GiB)": 122.96, "step": 59025, "token_acc": 0.9839001447178003, "train_speed(iter/s)": 0.232382 }, { "epoch": 4.499580760728714, "grad_norm": 7.581893444061279, "learning_rate": 2.4512458329752354e-06, "loss": 0.0412689208984375, "memory(GiB)": 122.96, "step": 59030, "token_acc": 0.9829376854599406, "train_speed(iter/s)": 0.232386 }, { "epoch": 4.499961887338974, "grad_norm": 2.919800281524658, "learning_rate": 2.447544198693774e-06, "loss": 0.042187425494194034, "memory(GiB)": 122.96, "step": 59035, "token_acc": 0.9869446199199832, "train_speed(iter/s)": 0.232392 }, { "epoch": 4.500343013949234, "grad_norm": 0.6602823138237, "learning_rate": 2.443845291324981e-06, "loss": 0.01897834837436676, "memory(GiB)": 122.96, "step": 59040, "token_acc": 0.9899126769045469, "train_speed(iter/s)": 0.232397 }, { "epoch": 4.500724140559494, "grad_norm": 1.0085598230361938, "learning_rate": 2.4401491110809748e-06, "loss": 0.025445157289505006, "memory(GiB)": 122.96, "step": 59045, "token_acc": 0.9870922728065585, "train_speed(iter/s)": 0.232403 }, { "epoch": 4.501105267169754, "grad_norm": 2.225587844848633, "learning_rate": 2.4364556581737084e-06, "loss": 0.02714349329471588, "memory(GiB)": 122.96, "step": 59050, "token_acc": 0.9880059970014993, "train_speed(iter/s)": 0.232406 }, { "epoch": 4.501486393780014, "grad_norm": 1.1109521389007568, "learning_rate": 2.43276493281499e-06, "loss": 0.03491811454296112, "memory(GiB)": 122.96, "step": 59055, "token_acc": 0.9872192579724531, "train_speed(iter/s)": 0.232408 }, { "epoch": 4.501867520390274, "grad_norm": 1.374549388885498, "learning_rate": 2.4290769352164733e-06, "loss": 0.03553078770637512, "memory(GiB)": 122.96, "step": 59060, "token_acc": 0.9877767527675276, "train_speed(iter/s)": 0.232414 }, { "epoch": 4.502248647000534, "grad_norm": 1.537404179573059, "learning_rate": 2.425391665589627e-06, "loss": 0.0377911776304245, "memory(GiB)": 122.96, "step": 59065, "token_acc": 0.9862581662536607, "train_speed(iter/s)": 0.23242 }, { "epoch": 4.502629773610794, "grad_norm": 0.5495750904083252, "learning_rate": 2.4217091241457935e-06, "loss": 0.03972398340702057, "memory(GiB)": 122.96, "step": 59070, "token_acc": 0.9884520208963431, "train_speed(iter/s)": 0.232423 }, { "epoch": 4.503010900221053, "grad_norm": 2.9759106636047363, "learning_rate": 2.418029311096165e-06, "loss": 0.03728658556938171, "memory(GiB)": 122.96, "step": 59075, "token_acc": 0.9881764438381082, "train_speed(iter/s)": 0.23243 }, { "epoch": 4.503392026831313, "grad_norm": 0.744644820690155, "learning_rate": 2.4143522266517337e-06, "loss": 0.024471300840377807, "memory(GiB)": 122.96, "step": 59080, "token_acc": 0.9884133806765091, "train_speed(iter/s)": 0.232435 }, { "epoch": 4.503773153441573, "grad_norm": 0.778300404548645, "learning_rate": 2.4106778710233814e-06, "loss": 0.025973179936408998, "memory(GiB)": 122.96, "step": 59085, "token_acc": 0.9912871287128713, "train_speed(iter/s)": 0.232441 }, { "epoch": 4.504154280051833, "grad_norm": 0.7813193798065186, "learning_rate": 2.407006244421822e-06, "loss": 0.026257318258285523, "memory(GiB)": 122.96, "step": 59090, "token_acc": 0.989549602590521, "train_speed(iter/s)": 0.232444 }, { "epoch": 4.504535406662093, "grad_norm": 0.7900437116622925, "learning_rate": 2.4033373470575826e-06, "loss": 0.027383172512054445, "memory(GiB)": 122.96, "step": 59095, "token_acc": 0.9868257519264231, "train_speed(iter/s)": 0.23245 }, { "epoch": 4.504916533272353, "grad_norm": 2.2154102325439453, "learning_rate": 2.3996711791410774e-06, "loss": 0.03221116065979004, "memory(GiB)": 122.96, "step": 59100, "token_acc": 0.9872540870047104, "train_speed(iter/s)": 0.232457 }, { "epoch": 4.505297659882613, "grad_norm": 2.1204917430877686, "learning_rate": 2.3960077408825386e-06, "loss": 0.04464305341243744, "memory(GiB)": 122.96, "step": 59105, "token_acc": 0.984070796460177, "train_speed(iter/s)": 0.232464 }, { "epoch": 4.505678786492873, "grad_norm": 0.2336188405752182, "learning_rate": 2.3923470324920426e-06, "loss": 0.039981862902641295, "memory(GiB)": 122.96, "step": 59110, "token_acc": 0.9807213930348259, "train_speed(iter/s)": 0.232471 }, { "epoch": 4.506059913103133, "grad_norm": 1.3670543432235718, "learning_rate": 2.388689054179527e-06, "loss": 0.03979058563709259, "memory(GiB)": 122.96, "step": 59115, "token_acc": 0.9844216816047802, "train_speed(iter/s)": 0.232476 }, { "epoch": 4.506441039713393, "grad_norm": 0.9342288374900818, "learning_rate": 2.385033806154735e-06, "loss": 0.03484660983085632, "memory(GiB)": 122.96, "step": 59120, "token_acc": 0.9858188472095151, "train_speed(iter/s)": 0.232481 }, { "epoch": 4.506822166323653, "grad_norm": 0.6748875975608826, "learning_rate": 2.381381288627299e-06, "loss": 0.021957483887672425, "memory(GiB)": 122.96, "step": 59125, "token_acc": 0.9907254740313273, "train_speed(iter/s)": 0.232487 }, { "epoch": 4.507203292933912, "grad_norm": 1.1308940649032593, "learning_rate": 2.377731501806674e-06, "loss": 0.03931613862514496, "memory(GiB)": 122.96, "step": 59130, "token_acc": 0.9841517857142857, "train_speed(iter/s)": 0.232493 }, { "epoch": 4.507584419544172, "grad_norm": 0.9447954297065735, "learning_rate": 2.3740844459021473e-06, "loss": 0.019270297884941102, "memory(GiB)": 122.96, "step": 59135, "token_acc": 0.992723263506064, "train_speed(iter/s)": 0.232499 }, { "epoch": 4.507965546154432, "grad_norm": 4.6478095054626465, "learning_rate": 2.370440121122869e-06, "loss": 0.04787872433662414, "memory(GiB)": 122.96, "step": 59140, "token_acc": 0.9854901960784314, "train_speed(iter/s)": 0.232505 }, { "epoch": 4.508346672764692, "grad_norm": 1.5334807634353638, "learning_rate": 2.366798527677827e-06, "loss": 0.035903871059417725, "memory(GiB)": 122.96, "step": 59145, "token_acc": 0.9836677554829678, "train_speed(iter/s)": 0.23251 }, { "epoch": 4.508727799374952, "grad_norm": 1.630660891532898, "learning_rate": 2.3631596657758383e-06, "loss": 0.0346203088760376, "memory(GiB)": 122.96, "step": 59150, "token_acc": 0.9895571456198028, "train_speed(iter/s)": 0.232514 }, { "epoch": 4.509108925985212, "grad_norm": 1.0682584047317505, "learning_rate": 2.359523535625585e-06, "loss": 0.04764093160629272, "memory(GiB)": 122.96, "step": 59155, "token_acc": 0.9803979803979804, "train_speed(iter/s)": 0.232521 }, { "epoch": 4.509490052595472, "grad_norm": 1.0743567943572998, "learning_rate": 2.3558901374355734e-06, "loss": 0.02417047768831253, "memory(GiB)": 122.96, "step": 59160, "token_acc": 0.9920472036942022, "train_speed(iter/s)": 0.232527 }, { "epoch": 4.509871179205732, "grad_norm": 0.9620904922485352, "learning_rate": 2.35225947141417e-06, "loss": 0.024049782752990724, "memory(GiB)": 122.96, "step": 59165, "token_acc": 0.9880976397014323, "train_speed(iter/s)": 0.232532 }, { "epoch": 4.5102523058159925, "grad_norm": 2.828744411468506, "learning_rate": 2.3486315377695745e-06, "loss": 0.041955432295799254, "memory(GiB)": 122.96, "step": 59170, "token_acc": 0.9821668264621285, "train_speed(iter/s)": 0.232538 }, { "epoch": 4.510633432426252, "grad_norm": 0.4569729268550873, "learning_rate": 2.3450063367098264e-06, "loss": 0.022255441546440123, "memory(GiB)": 122.96, "step": 59175, "token_acc": 0.9889033942558747, "train_speed(iter/s)": 0.232544 }, { "epoch": 4.511014559036512, "grad_norm": 0.7939974665641785, "learning_rate": 2.34138386844282e-06, "loss": 0.02972511649131775, "memory(GiB)": 122.96, "step": 59180, "token_acc": 0.9883028268168526, "train_speed(iter/s)": 0.232546 }, { "epoch": 4.511395685646772, "grad_norm": 1.0517349243164062, "learning_rate": 2.33776413317629e-06, "loss": 0.031627827882766725, "memory(GiB)": 122.96, "step": 59185, "token_acc": 0.9881091944397923, "train_speed(iter/s)": 0.23255 }, { "epoch": 4.511776812257032, "grad_norm": 0.936591625213623, "learning_rate": 2.334147131117803e-06, "loss": 0.026798653602600097, "memory(GiB)": 122.96, "step": 59190, "token_acc": 0.9905882352941177, "train_speed(iter/s)": 0.232558 }, { "epoch": 4.512157938867292, "grad_norm": 1.2887248992919922, "learning_rate": 2.3305328624747822e-06, "loss": 0.03167259097099304, "memory(GiB)": 122.96, "step": 59195, "token_acc": 0.9872939560439561, "train_speed(iter/s)": 0.232565 }, { "epoch": 4.512539065477552, "grad_norm": 1.2832838296890259, "learning_rate": 2.32692132745449e-06, "loss": 0.03672055900096893, "memory(GiB)": 122.96, "step": 59200, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.232568 }, { "epoch": 4.512539065477552, "eval_loss": 0.04806428402662277, "eval_runtime": 199.7208, "eval_samples_per_second": 2.654, "eval_steps_per_second": 2.654, "eval_token_acc": 0.9804529847599542, "step": 59200 }, { "epoch": 4.512920192087812, "grad_norm": 1.4501534700393677, "learning_rate": 2.3233125262640267e-06, "loss": 0.02646632194519043, "memory(GiB)": 122.96, "step": 59205, "token_acc": 0.9806061586661862, "train_speed(iter/s)": 0.232392 }, { "epoch": 4.513301318698072, "grad_norm": 0.6811836957931519, "learning_rate": 2.3197064591103436e-06, "loss": 0.04569809138774872, "memory(GiB)": 122.96, "step": 59210, "token_acc": 0.9858070194740896, "train_speed(iter/s)": 0.232392 }, { "epoch": 4.513682445308332, "grad_norm": 0.8343686461448669, "learning_rate": 2.3161031262002363e-06, "loss": 0.032369408011436465, "memory(GiB)": 122.96, "step": 59215, "token_acc": 0.98721071863581, "train_speed(iter/s)": 0.232395 }, { "epoch": 4.514063571918591, "grad_norm": 1.013788104057312, "learning_rate": 2.312502527740329e-06, "loss": 0.0365339457988739, "memory(GiB)": 122.96, "step": 59220, "token_acc": 0.985616772306192, "train_speed(iter/s)": 0.232399 }, { "epoch": 4.514444698528852, "grad_norm": 0.6791442632675171, "learning_rate": 2.308904663937106e-06, "loss": 0.024265801906585692, "memory(GiB)": 122.96, "step": 59225, "token_acc": 0.9894141992995861, "train_speed(iter/s)": 0.232398 }, { "epoch": 4.514825825139111, "grad_norm": 0.9160165786743164, "learning_rate": 2.305309534996891e-06, "loss": 0.04631746411323547, "memory(GiB)": 122.96, "step": 59230, "token_acc": 0.9812291077397789, "train_speed(iter/s)": 0.232402 }, { "epoch": 4.515206951749371, "grad_norm": 4.522104740142822, "learning_rate": 2.3017171411258366e-06, "loss": 0.03270387947559357, "memory(GiB)": 122.96, "step": 59235, "token_acc": 0.9918319719953326, "train_speed(iter/s)": 0.232409 }, { "epoch": 4.515588078359631, "grad_norm": 0.673285961151123, "learning_rate": 2.29812748252996e-06, "loss": 0.023082280158996583, "memory(GiB)": 122.96, "step": 59240, "token_acc": 0.9913096508614118, "train_speed(iter/s)": 0.232412 }, { "epoch": 4.515969204969891, "grad_norm": 0.8149601817131042, "learning_rate": 2.294540559415115e-06, "loss": 0.03350680470466614, "memory(GiB)": 122.96, "step": 59245, "token_acc": 0.987275811924382, "train_speed(iter/s)": 0.232413 }, { "epoch": 4.516350331580151, "grad_norm": 1.5069636106491089, "learning_rate": 2.2909563719869687e-06, "loss": 0.021573127806186677, "memory(GiB)": 122.96, "step": 59250, "token_acc": 0.989736572015053, "train_speed(iter/s)": 0.23242 }, { "epoch": 4.516731458190411, "grad_norm": 0.7998501658439636, "learning_rate": 2.2873749204510963e-06, "loss": 0.023532551527023316, "memory(GiB)": 122.96, "step": 59255, "token_acc": 0.9918839578656536, "train_speed(iter/s)": 0.232423 }, { "epoch": 4.517112584800671, "grad_norm": 0.6253056526184082, "learning_rate": 2.2837962050128505e-06, "loss": 0.02539536952972412, "memory(GiB)": 122.96, "step": 59260, "token_acc": 0.9920844327176781, "train_speed(iter/s)": 0.232427 }, { "epoch": 4.517493711410931, "grad_norm": 1.0590099096298218, "learning_rate": 2.280220225877466e-06, "loss": 0.029252460598945616, "memory(GiB)": 122.96, "step": 59265, "token_acc": 0.9857429718875502, "train_speed(iter/s)": 0.232431 }, { "epoch": 4.517874838021191, "grad_norm": 0.9471097588539124, "learning_rate": 2.2766469832500083e-06, "loss": 0.018694031238555908, "memory(GiB)": 122.96, "step": 59270, "token_acc": 0.9921783339851389, "train_speed(iter/s)": 0.232437 }, { "epoch": 4.51825596463145, "grad_norm": 0.3153873682022095, "learning_rate": 2.273076477335373e-06, "loss": 0.030390965938568115, "memory(GiB)": 122.96, "step": 59275, "token_acc": 0.9813990044537595, "train_speed(iter/s)": 0.232443 }, { "epoch": 4.51863709124171, "grad_norm": 1.117079734802246, "learning_rate": 2.2695087083383247e-06, "loss": 0.03349434435367584, "memory(GiB)": 122.96, "step": 59280, "token_acc": 0.9872954349698536, "train_speed(iter/s)": 0.232446 }, { "epoch": 4.51901821785197, "grad_norm": 0.7060087323188782, "learning_rate": 2.2659436764634666e-06, "loss": 0.025044816732406616, "memory(GiB)": 122.96, "step": 59285, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.23245 }, { "epoch": 4.51939934446223, "grad_norm": 0.9655941724777222, "learning_rate": 2.262381381915213e-06, "loss": 0.03594998717308044, "memory(GiB)": 122.96, "step": 59290, "token_acc": 0.9836745270795543, "train_speed(iter/s)": 0.232456 }, { "epoch": 4.51978047107249, "grad_norm": 2.23012113571167, "learning_rate": 2.258821824897861e-06, "loss": 0.03434213995933533, "memory(GiB)": 122.96, "step": 59295, "token_acc": 0.9846368715083799, "train_speed(iter/s)": 0.232462 }, { "epoch": 4.52016159768275, "grad_norm": 1.2015433311462402, "learning_rate": 2.2552650056155366e-06, "loss": 0.014914745092391967, "memory(GiB)": 122.96, "step": 59300, "token_acc": 0.9938180747718576, "train_speed(iter/s)": 0.232467 }, { "epoch": 4.52054272429301, "grad_norm": 0.8435158729553223, "learning_rate": 2.2517109242721934e-06, "loss": 0.039533483982086184, "memory(GiB)": 122.96, "step": 59305, "token_acc": 0.975973487986744, "train_speed(iter/s)": 0.232472 }, { "epoch": 4.52092385090327, "grad_norm": 0.8554381728172302, "learning_rate": 2.248159581071646e-06, "loss": 0.029383367300033568, "memory(GiB)": 122.96, "step": 59310, "token_acc": 0.9873899119295436, "train_speed(iter/s)": 0.232476 }, { "epoch": 4.52130497751353, "grad_norm": 1.167482852935791, "learning_rate": 2.2446109762175647e-06, "loss": 0.03343854248523712, "memory(GiB)": 122.96, "step": 59315, "token_acc": 0.9871159563924677, "train_speed(iter/s)": 0.232479 }, { "epoch": 4.5216861041237895, "grad_norm": 0.9088460206985474, "learning_rate": 2.2410651099134204e-06, "loss": 0.02998122274875641, "memory(GiB)": 122.96, "step": 59320, "token_acc": 0.9863227146814404, "train_speed(iter/s)": 0.232483 }, { "epoch": 4.5220672307340495, "grad_norm": 1.1944597959518433, "learning_rate": 2.2375219823625725e-06, "loss": 0.03850245177745819, "memory(GiB)": 122.96, "step": 59325, "token_acc": 0.9840482098546615, "train_speed(iter/s)": 0.232486 }, { "epoch": 4.5224483573443095, "grad_norm": 1.5016146898269653, "learning_rate": 2.233981593768186e-06, "loss": 0.07224138975143432, "memory(GiB)": 122.96, "step": 59330, "token_acc": 0.9770433405897486, "train_speed(iter/s)": 0.232489 }, { "epoch": 4.5228294839545695, "grad_norm": 1.0423824787139893, "learning_rate": 2.230443944333299e-06, "loss": 0.030390077829360963, "memory(GiB)": 122.96, "step": 59335, "token_acc": 0.989126559714795, "train_speed(iter/s)": 0.232493 }, { "epoch": 4.5232106105648295, "grad_norm": 0.6940978765487671, "learning_rate": 2.2269090342607766e-06, "loss": 0.03304453492164612, "memory(GiB)": 122.96, "step": 59340, "token_acc": 0.9860041987403779, "train_speed(iter/s)": 0.232496 }, { "epoch": 4.5235917371750896, "grad_norm": 0.5522743463516235, "learning_rate": 2.2233768637533183e-06, "loss": 0.015040203928947449, "memory(GiB)": 122.96, "step": 59345, "token_acc": 0.9924675324675325, "train_speed(iter/s)": 0.232501 }, { "epoch": 4.52397286378535, "grad_norm": 1.5150868892669678, "learning_rate": 2.219847433013489e-06, "loss": 0.017863033711910246, "memory(GiB)": 122.96, "step": 59350, "token_acc": 0.9921787709497206, "train_speed(iter/s)": 0.232504 }, { "epoch": 4.52435399039561, "grad_norm": 0.940869927406311, "learning_rate": 2.2163207422436883e-06, "loss": 0.02834937870502472, "memory(GiB)": 122.96, "step": 59355, "token_acc": 0.9892097746747064, "train_speed(iter/s)": 0.232507 }, { "epoch": 4.52473511700587, "grad_norm": 1.3798832893371582, "learning_rate": 2.2127967916461435e-06, "loss": 0.030871984362602235, "memory(GiB)": 122.96, "step": 59360, "token_acc": 0.9847799259563965, "train_speed(iter/s)": 0.23251 }, { "epoch": 4.52511624361613, "grad_norm": 1.2688530683517456, "learning_rate": 2.2092755814229426e-06, "loss": 0.033095327019691465, "memory(GiB)": 122.96, "step": 59365, "token_acc": 0.9841530054644809, "train_speed(iter/s)": 0.232515 }, { "epoch": 4.52549737022639, "grad_norm": 0.4319153428077698, "learning_rate": 2.2057571117760136e-06, "loss": 0.026617607474327086, "memory(GiB)": 122.96, "step": 59370, "token_acc": 0.9852161785216178, "train_speed(iter/s)": 0.232519 }, { "epoch": 4.525878496836649, "grad_norm": 3.2344508171081543, "learning_rate": 2.202241382907122e-06, "loss": 0.03872561454772949, "memory(GiB)": 122.96, "step": 59375, "token_acc": 0.9892537313432835, "train_speed(iter/s)": 0.232525 }, { "epoch": 4.526259623446909, "grad_norm": 0.4669230282306671, "learning_rate": 2.1987283950178737e-06, "loss": 0.027013123035430908, "memory(GiB)": 122.96, "step": 59380, "token_acc": 0.988141163023289, "train_speed(iter/s)": 0.232527 }, { "epoch": 4.526640750057169, "grad_norm": 2.558762311935425, "learning_rate": 2.1952181483097302e-06, "loss": 0.06078287959098816, "memory(GiB)": 122.96, "step": 59385, "token_acc": 0.9753042233357194, "train_speed(iter/s)": 0.232533 }, { "epoch": 4.527021876667429, "grad_norm": 0.8609647750854492, "learning_rate": 2.1917106429839795e-06, "loss": 0.02150699943304062, "memory(GiB)": 122.96, "step": 59390, "token_acc": 0.991011984021305, "train_speed(iter/s)": 0.232539 }, { "epoch": 4.527403003277689, "grad_norm": 1.43874192237854, "learning_rate": 2.1882058792417783e-06, "loss": 0.03964447379112244, "memory(GiB)": 122.96, "step": 59395, "token_acc": 0.9851283656856606, "train_speed(iter/s)": 0.232541 }, { "epoch": 4.527784129887949, "grad_norm": 0.929912269115448, "learning_rate": 2.184703857284087e-06, "loss": 0.03581387996673584, "memory(GiB)": 122.96, "step": 59400, "token_acc": 0.9852811950790861, "train_speed(iter/s)": 0.232542 }, { "epoch": 4.527784129887949, "eval_loss": 0.04835975915193558, "eval_runtime": 211.3789, "eval_samples_per_second": 2.507, "eval_steps_per_second": 2.507, "eval_token_acc": 0.9805358110957171, "step": 59400 }, { "epoch": 4.528165256498209, "grad_norm": 0.884985089302063, "learning_rate": 2.1812045773117407e-06, "loss": 0.033932077884674075, "memory(GiB)": 122.96, "step": 59405, "token_acc": 0.9807518248175182, "train_speed(iter/s)": 0.232354 }, { "epoch": 4.528546383108469, "grad_norm": 0.26062050461769104, "learning_rate": 2.177708039525411e-06, "loss": 0.0184480682015419, "memory(GiB)": 122.96, "step": 59410, "token_acc": 0.9938217122683142, "train_speed(iter/s)": 0.232361 }, { "epoch": 4.528927509718729, "grad_norm": 0.9507620334625244, "learning_rate": 2.174214244125594e-06, "loss": 0.029236042499542238, "memory(GiB)": 122.96, "step": 59415, "token_acc": 0.9883662747060296, "train_speed(iter/s)": 0.232364 }, { "epoch": 4.529308636328988, "grad_norm": 1.0780410766601562, "learning_rate": 2.170723191312657e-06, "loss": 0.02966539263725281, "memory(GiB)": 122.96, "step": 59420, "token_acc": 0.984, "train_speed(iter/s)": 0.232368 }, { "epoch": 4.529689762939248, "grad_norm": 0.8624927997589111, "learning_rate": 2.1672348812867892e-06, "loss": 0.028631231188774107, "memory(GiB)": 122.96, "step": 59425, "token_acc": 0.9873995617238861, "train_speed(iter/s)": 0.232371 }, { "epoch": 4.530070889549508, "grad_norm": 1.3525956869125366, "learning_rate": 2.1637493142480312e-06, "loss": 0.03756705522537231, "memory(GiB)": 122.96, "step": 59430, "token_acc": 0.9854196301564723, "train_speed(iter/s)": 0.232377 }, { "epoch": 4.530452016159768, "grad_norm": 0.1477954238653183, "learning_rate": 2.1602664903962565e-06, "loss": 0.023720067739486695, "memory(GiB)": 122.96, "step": 59435, "token_acc": 0.9849773242630385, "train_speed(iter/s)": 0.232381 }, { "epoch": 4.530833142770028, "grad_norm": 2.100870132446289, "learning_rate": 2.156786409931205e-06, "loss": 0.02915475070476532, "memory(GiB)": 122.96, "step": 59440, "token_acc": 0.9887715546049993, "train_speed(iter/s)": 0.232384 }, { "epoch": 4.531214269380288, "grad_norm": 1.07608163356781, "learning_rate": 2.153309073052423e-06, "loss": 0.03442515432834625, "memory(GiB)": 122.96, "step": 59445, "token_acc": 0.9890677194047981, "train_speed(iter/s)": 0.232386 }, { "epoch": 4.531595395990548, "grad_norm": 0.8724117875099182, "learning_rate": 2.149834479959334e-06, "loss": 0.028535571694374085, "memory(GiB)": 122.96, "step": 59450, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.23239 }, { "epoch": 4.531976522600808, "grad_norm": 1.9680774211883545, "learning_rate": 2.14636263085119e-06, "loss": 0.03844791054725647, "memory(GiB)": 122.96, "step": 59455, "token_acc": 0.9884004884004884, "train_speed(iter/s)": 0.232391 }, { "epoch": 4.532357649211068, "grad_norm": 1.440451979637146, "learning_rate": 2.1428935259270754e-06, "loss": 0.02326098531484604, "memory(GiB)": 122.96, "step": 59460, "token_acc": 0.9892057354599646, "train_speed(iter/s)": 0.232394 }, { "epoch": 4.532738775821328, "grad_norm": 0.6049251556396484, "learning_rate": 2.1394271653859323e-06, "loss": 0.03685930073261261, "memory(GiB)": 122.96, "step": 59465, "token_acc": 0.9877859204974462, "train_speed(iter/s)": 0.232399 }, { "epoch": 4.533119902431588, "grad_norm": 2.584005355834961, "learning_rate": 2.1359635494265507e-06, "loss": 0.028444743156433104, "memory(GiB)": 122.96, "step": 59470, "token_acc": 0.9894705174488568, "train_speed(iter/s)": 0.232398 }, { "epoch": 4.533501029041847, "grad_norm": 0.29105398058891296, "learning_rate": 2.132502678247533e-06, "loss": 0.03365178108215332, "memory(GiB)": 122.96, "step": 59475, "token_acc": 0.9837133550488599, "train_speed(iter/s)": 0.232403 }, { "epoch": 4.533882155652107, "grad_norm": 0.8819000720977783, "learning_rate": 2.1290445520473543e-06, "loss": 0.022716154158115388, "memory(GiB)": 122.96, "step": 59480, "token_acc": 0.9928707224334601, "train_speed(iter/s)": 0.23241 }, { "epoch": 4.534263282262367, "grad_norm": 1.008579969406128, "learning_rate": 2.1255891710243283e-06, "loss": 0.025982236862182616, "memory(GiB)": 122.96, "step": 59485, "token_acc": 0.988173455978975, "train_speed(iter/s)": 0.232417 }, { "epoch": 4.534644408872627, "grad_norm": 1.2348617315292358, "learning_rate": 2.1221365353765956e-06, "loss": 0.019303226470947267, "memory(GiB)": 122.96, "step": 59490, "token_acc": 0.991474318985236, "train_speed(iter/s)": 0.23242 }, { "epoch": 4.5350255354828874, "grad_norm": 0.2707468271255493, "learning_rate": 2.1186866453021594e-06, "loss": 0.026720145344734193, "memory(GiB)": 122.96, "step": 59495, "token_acc": 0.9881512130900884, "train_speed(iter/s)": 0.232425 }, { "epoch": 4.5354066620931475, "grad_norm": 1.0585404634475708, "learning_rate": 2.115239500998839e-06, "loss": 0.01873173415660858, "memory(GiB)": 122.96, "step": 59500, "token_acc": 0.99302649930265, "train_speed(iter/s)": 0.232428 }, { "epoch": 4.5357877887034075, "grad_norm": 1.264786958694458, "learning_rate": 2.1117951026643212e-06, "loss": 0.04239074289798737, "memory(GiB)": 122.96, "step": 59505, "token_acc": 0.9856598016781083, "train_speed(iter/s)": 0.23243 }, { "epoch": 4.5361689153136675, "grad_norm": 1.0012396574020386, "learning_rate": 2.1083534504961313e-06, "loss": 0.020904065668582918, "memory(GiB)": 122.96, "step": 59510, "token_acc": 0.9920154185022027, "train_speed(iter/s)": 0.232436 }, { "epoch": 4.5365500419239275, "grad_norm": 0.6450436115264893, "learning_rate": 2.1049145446916217e-06, "loss": 0.030609449744224547, "memory(GiB)": 122.96, "step": 59515, "token_acc": 0.9883016058704669, "train_speed(iter/s)": 0.232436 }, { "epoch": 4.536931168534187, "grad_norm": 0.052250176668167114, "learning_rate": 2.1014783854479915e-06, "loss": 0.014717184007167816, "memory(GiB)": 122.96, "step": 59520, "token_acc": 0.994657014822475, "train_speed(iter/s)": 0.23244 }, { "epoch": 4.537312295144447, "grad_norm": 0.462763249874115, "learning_rate": 2.098044972962315e-06, "loss": 0.024439053237438203, "memory(GiB)": 122.96, "step": 59525, "token_acc": 0.9916363636363636, "train_speed(iter/s)": 0.232443 }, { "epoch": 4.537693421754707, "grad_norm": 0.47965481877326965, "learning_rate": 2.094614307431464e-06, "loss": 0.025448337197303772, "memory(GiB)": 122.96, "step": 59530, "token_acc": 0.9920263953808084, "train_speed(iter/s)": 0.232448 }, { "epoch": 4.538074548364967, "grad_norm": 0.37837326526641846, "learning_rate": 2.0911863890521743e-06, "loss": 0.028212955594062804, "memory(GiB)": 122.96, "step": 59535, "token_acc": 0.9873380447585395, "train_speed(iter/s)": 0.232452 }, { "epoch": 4.538455674975227, "grad_norm": 1.0939854383468628, "learning_rate": 2.087761218021023e-06, "loss": 0.023519554734230043, "memory(GiB)": 122.96, "step": 59540, "token_acc": 0.9874270318814549, "train_speed(iter/s)": 0.232457 }, { "epoch": 4.538836801585487, "grad_norm": 0.9382004737854004, "learning_rate": 2.084338794534424e-06, "loss": 0.03578497171401977, "memory(GiB)": 122.96, "step": 59545, "token_acc": 0.9828781084386465, "train_speed(iter/s)": 0.232463 }, { "epoch": 4.539217928195747, "grad_norm": 0.8781613111495972, "learning_rate": 2.0809191187886435e-06, "loss": 0.02501700520515442, "memory(GiB)": 122.96, "step": 59550, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.23247 }, { "epoch": 4.539599054806007, "grad_norm": 1.4861927032470703, "learning_rate": 2.077502190979774e-06, "loss": 0.030407717823982237, "memory(GiB)": 122.96, "step": 59555, "token_acc": 0.9902462813947818, "train_speed(iter/s)": 0.232474 }, { "epoch": 4.539980181416267, "grad_norm": 3.324665069580078, "learning_rate": 2.0740880113037655e-06, "loss": 0.03359721004962921, "memory(GiB)": 122.96, "step": 59560, "token_acc": 0.988314606741573, "train_speed(iter/s)": 0.23248 }, { "epoch": 4.540361308026526, "grad_norm": 1.0918350219726562, "learning_rate": 2.0706765799564097e-06, "loss": 0.02911856770515442, "memory(GiB)": 122.96, "step": 59565, "token_acc": 0.9878923766816143, "train_speed(iter/s)": 0.232484 }, { "epoch": 4.540742434636787, "grad_norm": 0.8269166350364685, "learning_rate": 2.0672678971333293e-06, "loss": 0.04684201180934906, "memory(GiB)": 122.96, "step": 59570, "token_acc": 0.9786008230452675, "train_speed(iter/s)": 0.232487 }, { "epoch": 4.541123561247046, "grad_norm": 2.4232094287872314, "learning_rate": 2.0638619630300002e-06, "loss": 0.025732648372650147, "memory(GiB)": 122.96, "step": 59575, "token_acc": 0.990699276610403, "train_speed(iter/s)": 0.232492 }, { "epoch": 4.541504687857306, "grad_norm": 0.4151006042957306, "learning_rate": 2.060458777841745e-06, "loss": 0.035278862714767455, "memory(GiB)": 122.96, "step": 59580, "token_acc": 0.9796491228070175, "train_speed(iter/s)": 0.232498 }, { "epoch": 4.541885814467566, "grad_norm": 1.357782244682312, "learning_rate": 2.057058341763701e-06, "loss": 0.02455410659313202, "memory(GiB)": 122.96, "step": 59585, "token_acc": 0.9886931369971076, "train_speed(iter/s)": 0.232504 }, { "epoch": 4.542266941077826, "grad_norm": 1.349265217781067, "learning_rate": 2.0536606549908855e-06, "loss": 0.05196772217750549, "memory(GiB)": 122.96, "step": 59590, "token_acc": 0.9864540091763163, "train_speed(iter/s)": 0.232509 }, { "epoch": 4.542648067688086, "grad_norm": 0.20614959299564362, "learning_rate": 2.0502657177181363e-06, "loss": 0.028534284234046935, "memory(GiB)": 122.96, "step": 59595, "token_acc": 0.985732270247587, "train_speed(iter/s)": 0.232515 }, { "epoch": 4.543029194298346, "grad_norm": 1.257615327835083, "learning_rate": 2.0468735301401252e-06, "loss": 0.03679392337799072, "memory(GiB)": 122.96, "step": 59600, "token_acc": 0.9802685667306111, "train_speed(iter/s)": 0.232521 }, { "epoch": 4.543029194298346, "eval_loss": 0.04827757179737091, "eval_runtime": 211.7134, "eval_samples_per_second": 2.503, "eval_steps_per_second": 2.503, "eval_token_acc": 0.9804755737606168, "step": 59600 }, { "epoch": 4.543410320908606, "grad_norm": 0.3239976465702057, "learning_rate": 2.0434840924513922e-06, "loss": 0.02414417117834091, "memory(GiB)": 122.96, "step": 59605, "token_acc": 0.9808141342143121, "train_speed(iter/s)": 0.232334 }, { "epoch": 4.543791447518866, "grad_norm": 1.5926331281661987, "learning_rate": 2.040097404846303e-06, "loss": 0.02683834731578827, "memory(GiB)": 122.96, "step": 59610, "token_acc": 0.9936942296252231, "train_speed(iter/s)": 0.232338 }, { "epoch": 4.544172574129126, "grad_norm": 1.5649784803390503, "learning_rate": 2.0367134675190637e-06, "loss": 0.03284373879432678, "memory(GiB)": 122.96, "step": 59615, "token_acc": 0.9913594470046083, "train_speed(iter/s)": 0.232342 }, { "epoch": 4.544553700739385, "grad_norm": 0.5726308822631836, "learning_rate": 2.0333322806637243e-06, "loss": 0.021368817985057832, "memory(GiB)": 122.96, "step": 59620, "token_acc": 0.9902319902319903, "train_speed(iter/s)": 0.232343 }, { "epoch": 4.544934827349645, "grad_norm": 1.134871244430542, "learning_rate": 2.0299538444741973e-06, "loss": 0.033299988508224486, "memory(GiB)": 122.96, "step": 59625, "token_acc": 0.9886398526251151, "train_speed(iter/s)": 0.232348 }, { "epoch": 4.545315953959905, "grad_norm": 1.4380630254745483, "learning_rate": 2.0265781591442047e-06, "loss": 0.04057266116142273, "memory(GiB)": 122.96, "step": 59630, "token_acc": 0.9873632591474916, "train_speed(iter/s)": 0.232352 }, { "epoch": 4.545697080570165, "grad_norm": 1.261387825012207, "learning_rate": 2.023205224867325e-06, "loss": 0.020338305830955507, "memory(GiB)": 122.96, "step": 59635, "token_acc": 0.9925890509203921, "train_speed(iter/s)": 0.232357 }, { "epoch": 4.546078207180425, "grad_norm": 1.383431315422058, "learning_rate": 2.0198350418369982e-06, "loss": 0.03750979602336883, "memory(GiB)": 122.96, "step": 59640, "token_acc": 0.986130374479889, "train_speed(iter/s)": 0.232361 }, { "epoch": 4.546459333790685, "grad_norm": 0.5154494643211365, "learning_rate": 2.0164676102464643e-06, "loss": 0.033510491251945496, "memory(GiB)": 122.96, "step": 59645, "token_acc": 0.9865961199294533, "train_speed(iter/s)": 0.232364 }, { "epoch": 4.546840460400945, "grad_norm": 0.8755080103874207, "learning_rate": 2.013102930288846e-06, "loss": 0.02149340510368347, "memory(GiB)": 122.96, "step": 59650, "token_acc": 0.9901361215229828, "train_speed(iter/s)": 0.232368 }, { "epoch": 4.547221587011205, "grad_norm": 0.17881257832050323, "learning_rate": 2.0097410021570893e-06, "loss": 0.019809289276599883, "memory(GiB)": 122.96, "step": 59655, "token_acc": 0.9895882631329863, "train_speed(iter/s)": 0.232371 }, { "epoch": 4.547602713621465, "grad_norm": 0.7774648666381836, "learning_rate": 2.0063818260439846e-06, "loss": 0.03220491409301758, "memory(GiB)": 122.96, "step": 59660, "token_acc": 0.9896367740611532, "train_speed(iter/s)": 0.232371 }, { "epoch": 4.5479838402317245, "grad_norm": 0.7214168310165405, "learning_rate": 2.003025402142167e-06, "loss": 0.03375124931335449, "memory(GiB)": 122.96, "step": 59665, "token_acc": 0.9842620161633348, "train_speed(iter/s)": 0.232376 }, { "epoch": 4.5483649668419845, "grad_norm": 2.1024253368377686, "learning_rate": 1.9996717306441036e-06, "loss": 0.03130369782447815, "memory(GiB)": 122.96, "step": 59670, "token_acc": 0.9891245241979336, "train_speed(iter/s)": 0.232384 }, { "epoch": 4.548746093452245, "grad_norm": 0.8422136306762695, "learning_rate": 1.9963208117421195e-06, "loss": 0.03149842917919159, "memory(GiB)": 122.96, "step": 59675, "token_acc": 0.9903459372485921, "train_speed(iter/s)": 0.232389 }, { "epoch": 4.549127220062505, "grad_norm": 0.7072480320930481, "learning_rate": 1.992972645628377e-06, "loss": 0.025007554888725282, "memory(GiB)": 122.96, "step": 59680, "token_acc": 0.9902565768106528, "train_speed(iter/s)": 0.232394 }, { "epoch": 4.549508346672765, "grad_norm": 1.1228365898132324, "learning_rate": 1.9896272324948727e-06, "loss": 0.04127134084701538, "memory(GiB)": 122.96, "step": 59685, "token_acc": 0.9830927835051546, "train_speed(iter/s)": 0.232399 }, { "epoch": 4.549889473283025, "grad_norm": 1.2025084495544434, "learning_rate": 1.9862845725334477e-06, "loss": 0.02617310583591461, "memory(GiB)": 122.96, "step": 59690, "token_acc": 0.9889828841235491, "train_speed(iter/s)": 0.232403 }, { "epoch": 4.550270599893285, "grad_norm": 0.6244208216667175, "learning_rate": 1.982944665935793e-06, "loss": 0.015453299880027771, "memory(GiB)": 122.96, "step": 59695, "token_acc": 0.9937417030153612, "train_speed(iter/s)": 0.232407 }, { "epoch": 4.550651726503545, "grad_norm": 0.9140555262565613, "learning_rate": 1.979607512893433e-06, "loss": 0.03021601140499115, "memory(GiB)": 122.96, "step": 59700, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.232412 }, { "epoch": 4.551032853113805, "grad_norm": 1.0098390579223633, "learning_rate": 1.976273113597743e-06, "loss": 0.02145916372537613, "memory(GiB)": 122.96, "step": 59705, "token_acc": 0.9924840285606915, "train_speed(iter/s)": 0.232418 }, { "epoch": 4.551413979724065, "grad_norm": 1.2782061100006104, "learning_rate": 1.972941468239936e-06, "loss": 0.021961221098899843, "memory(GiB)": 122.96, "step": 59710, "token_acc": 0.9918438683367318, "train_speed(iter/s)": 0.232424 }, { "epoch": 4.551795106334325, "grad_norm": 1.9232922792434692, "learning_rate": 1.9696125770110597e-06, "loss": 0.02751024067401886, "memory(GiB)": 122.96, "step": 59715, "token_acc": 0.9907801418439717, "train_speed(iter/s)": 0.232429 }, { "epoch": 4.552176232944584, "grad_norm": 0.7703972458839417, "learning_rate": 1.9662864401020163e-06, "loss": 0.020924940705299377, "memory(GiB)": 122.96, "step": 59720, "token_acc": 0.9929814710836609, "train_speed(iter/s)": 0.232434 }, { "epoch": 4.552557359554844, "grad_norm": 0.8063812851905823, "learning_rate": 1.9629630577035373e-06, "loss": 0.021520860493183136, "memory(GiB)": 122.96, "step": 59725, "token_acc": 0.9899222574143391, "train_speed(iter/s)": 0.23244 }, { "epoch": 4.552938486165104, "grad_norm": 3.1564042568206787, "learning_rate": 1.959642430006209e-06, "loss": 0.037638595700263976, "memory(GiB)": 122.96, "step": 59730, "token_acc": 0.9863387978142076, "train_speed(iter/s)": 0.232444 }, { "epoch": 4.553319612775364, "grad_norm": 0.9854797124862671, "learning_rate": 1.956324557200456e-06, "loss": 0.03558739721775055, "memory(GiB)": 122.96, "step": 59735, "token_acc": 0.9851111948737279, "train_speed(iter/s)": 0.232448 }, { "epoch": 4.553700739385624, "grad_norm": 1.093687653541565, "learning_rate": 1.9530094394765384e-06, "loss": 0.031200188398361205, "memory(GiB)": 122.96, "step": 59740, "token_acc": 0.9833836858006042, "train_speed(iter/s)": 0.232453 }, { "epoch": 4.554081865995884, "grad_norm": 0.8157335519790649, "learning_rate": 1.9496970770245593e-06, "loss": 0.026190707087516786, "memory(GiB)": 122.96, "step": 59745, "token_acc": 0.9890770618298427, "train_speed(iter/s)": 0.232454 }, { "epoch": 4.554462992606144, "grad_norm": 0.7674251198768616, "learning_rate": 1.9463874700344774e-06, "loss": 0.026014244556427, "memory(GiB)": 122.96, "step": 59750, "token_acc": 0.9877171959257041, "train_speed(iter/s)": 0.23246 }, { "epoch": 4.554844119216404, "grad_norm": 0.7561858296394348, "learning_rate": 1.9430806186960805e-06, "loss": 0.048482227325439456, "memory(GiB)": 122.96, "step": 59755, "token_acc": 0.9806247528667458, "train_speed(iter/s)": 0.232466 }, { "epoch": 4.555225245826664, "grad_norm": 0.4120377004146576, "learning_rate": 1.9397765231989885e-06, "loss": 0.01676030308008194, "memory(GiB)": 122.96, "step": 59760, "token_acc": 0.9931020490971799, "train_speed(iter/s)": 0.23247 }, { "epoch": 4.555606372436923, "grad_norm": 1.666172742843628, "learning_rate": 1.9364751837327e-06, "loss": 0.031013494729995726, "memory(GiB)": 122.96, "step": 59765, "token_acc": 0.9874182184197282, "train_speed(iter/s)": 0.232474 }, { "epoch": 4.555987499047183, "grad_norm": 1.091766119003296, "learning_rate": 1.933176600486508e-06, "loss": 0.032971763610839845, "memory(GiB)": 122.96, "step": 59770, "token_acc": 0.9874141876430206, "train_speed(iter/s)": 0.232478 }, { "epoch": 4.556368625657443, "grad_norm": 4.682402610778809, "learning_rate": 1.9298807736495785e-06, "loss": 0.062862229347229, "memory(GiB)": 122.96, "step": 59775, "token_acc": 0.982911494438175, "train_speed(iter/s)": 0.232482 }, { "epoch": 4.556749752267703, "grad_norm": 0.9916756749153137, "learning_rate": 1.926587703410926e-06, "loss": 0.0299111008644104, "memory(GiB)": 122.96, "step": 59780, "token_acc": 0.987460815047022, "train_speed(iter/s)": 0.232488 }, { "epoch": 4.557130878877963, "grad_norm": 0.3585330545902252, "learning_rate": 1.9232973899593665e-06, "loss": 0.028886407613754272, "memory(GiB)": 122.96, "step": 59785, "token_acc": 0.9858314690529456, "train_speed(iter/s)": 0.232493 }, { "epoch": 4.557512005488223, "grad_norm": 0.9467634558677673, "learning_rate": 1.920009833483605e-06, "loss": 0.03280780613422394, "memory(GiB)": 122.96, "step": 59790, "token_acc": 0.9878965706950302, "train_speed(iter/s)": 0.232498 }, { "epoch": 4.557893132098483, "grad_norm": 1.2908377647399902, "learning_rate": 1.916725034172162e-06, "loss": 0.021145665645599367, "memory(GiB)": 122.96, "step": 59795, "token_acc": 0.9891586501755993, "train_speed(iter/s)": 0.232501 }, { "epoch": 4.558274258708743, "grad_norm": 1.866194725036621, "learning_rate": 1.9134429922134043e-06, "loss": 0.020305800437927245, "memory(GiB)": 122.96, "step": 59800, "token_acc": 0.9911458333333333, "train_speed(iter/s)": 0.232509 }, { "epoch": 4.558274258708743, "eval_loss": 0.04819339141249657, "eval_runtime": 214.5971, "eval_samples_per_second": 2.47, "eval_steps_per_second": 2.47, "eval_token_acc": 0.980505692428167, "step": 59800 }, { "epoch": 4.558655385319003, "grad_norm": 1.4477639198303223, "learning_rate": 1.9101637077955423e-06, "loss": 0.03159805834293365, "memory(GiB)": 122.96, "step": 59805, "token_acc": 0.9808029821736797, "train_speed(iter/s)": 0.232319 }, { "epoch": 4.559036511929262, "grad_norm": 0.2909989655017853, "learning_rate": 1.9068871811066259e-06, "loss": 0.014122414588928222, "memory(GiB)": 122.96, "step": 59810, "token_acc": 0.9951645399597046, "train_speed(iter/s)": 0.232322 }, { "epoch": 4.559417638539523, "grad_norm": 0.8563489317893982, "learning_rate": 1.9036134123345484e-06, "loss": 0.04422850608825683, "memory(GiB)": 122.96, "step": 59815, "token_acc": 0.9842013888888889, "train_speed(iter/s)": 0.232326 }, { "epoch": 4.559798765149782, "grad_norm": 1.4926763772964478, "learning_rate": 1.9003424016670545e-06, "loss": 0.04460325837135315, "memory(GiB)": 122.96, "step": 59820, "token_acc": 0.9847009735744089, "train_speed(iter/s)": 0.232332 }, { "epoch": 4.5601798917600425, "grad_norm": 1.5516494512557983, "learning_rate": 1.897074149291711e-06, "loss": 0.031041663885116578, "memory(GiB)": 122.96, "step": 59825, "token_acc": 0.9880085653104925, "train_speed(iter/s)": 0.232336 }, { "epoch": 4.5605610183703025, "grad_norm": 0.8611523509025574, "learning_rate": 1.89380865539594e-06, "loss": 0.021941904723644257, "memory(GiB)": 122.96, "step": 59830, "token_acc": 0.9916923489213453, "train_speed(iter/s)": 0.232338 }, { "epoch": 4.5609421449805625, "grad_norm": 0.6052525043487549, "learning_rate": 1.8905459201670028e-06, "loss": 0.0304135799407959, "memory(GiB)": 122.96, "step": 59835, "token_acc": 0.9883903357389394, "train_speed(iter/s)": 0.23234 }, { "epoch": 4.5613232715908225, "grad_norm": 2.273098945617676, "learning_rate": 1.8872859437919998e-06, "loss": 0.016959524154663085, "memory(GiB)": 122.96, "step": 59840, "token_acc": 0.9937152879275066, "train_speed(iter/s)": 0.232343 }, { "epoch": 4.5617043982010825, "grad_norm": 0.649159848690033, "learning_rate": 1.884028726457876e-06, "loss": 0.0347236692905426, "memory(GiB)": 122.96, "step": 59845, "token_acc": 0.9904336734693877, "train_speed(iter/s)": 0.232348 }, { "epoch": 4.5620855248113426, "grad_norm": 2.4965548515319824, "learning_rate": 1.8807742683514262e-06, "loss": 0.03398744761943817, "memory(GiB)": 122.96, "step": 59850, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.232355 }, { "epoch": 4.562466651421603, "grad_norm": 1.1448906660079956, "learning_rate": 1.877522569659268e-06, "loss": 0.024915549159049987, "memory(GiB)": 122.96, "step": 59855, "token_acc": 0.9861078408288203, "train_speed(iter/s)": 0.232361 }, { "epoch": 4.562847778031863, "grad_norm": 0.12641915678977966, "learning_rate": 1.8742736305678742e-06, "loss": 0.010688535869121552, "memory(GiB)": 122.96, "step": 59860, "token_acc": 0.9953636627064619, "train_speed(iter/s)": 0.232367 }, { "epoch": 4.563228904642122, "grad_norm": 2.5580317974090576, "learning_rate": 1.8710274512635629e-06, "loss": 0.04527593851089477, "memory(GiB)": 122.96, "step": 59865, "token_acc": 0.979463243873979, "train_speed(iter/s)": 0.23237 }, { "epoch": 4.563610031252382, "grad_norm": 0.42424237728118896, "learning_rate": 1.8677840319324735e-06, "loss": 0.048147889971733096, "memory(GiB)": 122.96, "step": 59870, "token_acc": 0.9848733369783124, "train_speed(iter/s)": 0.232373 }, { "epoch": 4.563991157862642, "grad_norm": 0.7715582847595215, "learning_rate": 1.8645433727606077e-06, "loss": 0.03324068784713745, "memory(GiB)": 122.96, "step": 59875, "token_acc": 0.985363433391218, "train_speed(iter/s)": 0.232375 }, { "epoch": 4.564372284472902, "grad_norm": 1.0919480323791504, "learning_rate": 1.861305473933811e-06, "loss": 0.026423835754394533, "memory(GiB)": 122.96, "step": 59880, "token_acc": 0.9852748691099477, "train_speed(iter/s)": 0.232381 }, { "epoch": 4.564753411083162, "grad_norm": 1.1979573965072632, "learning_rate": 1.8580703356377516e-06, "loss": 0.020244407653808593, "memory(GiB)": 122.96, "step": 59885, "token_acc": 0.9926330798479087, "train_speed(iter/s)": 0.232384 }, { "epoch": 4.565134537693422, "grad_norm": 0.54510897397995, "learning_rate": 1.8548379580579534e-06, "loss": 0.022035560011863707, "memory(GiB)": 122.96, "step": 59890, "token_acc": 0.9903863930486226, "train_speed(iter/s)": 0.232388 }, { "epoch": 4.565515664303682, "grad_norm": 2.2099459171295166, "learning_rate": 1.8516083413797791e-06, "loss": 0.03979501128196716, "memory(GiB)": 122.96, "step": 59895, "token_acc": 0.983530254206946, "train_speed(iter/s)": 0.232391 }, { "epoch": 4.565896790913942, "grad_norm": 0.8551263809204102, "learning_rate": 1.8483814857884253e-06, "loss": 0.028426167368888856, "memory(GiB)": 122.96, "step": 59900, "token_acc": 0.9895382395382395, "train_speed(iter/s)": 0.232394 }, { "epoch": 4.566277917524202, "grad_norm": 2.0555331707000732, "learning_rate": 1.845157391468949e-06, "loss": 0.014395399391651154, "memory(GiB)": 122.96, "step": 59905, "token_acc": 0.9934490664919751, "train_speed(iter/s)": 0.232399 }, { "epoch": 4.566659044134461, "grad_norm": 0.00016283598961308599, "learning_rate": 1.8419360586062251e-06, "loss": 0.014266480505466462, "memory(GiB)": 122.96, "step": 59910, "token_acc": 0.9920687376074026, "train_speed(iter/s)": 0.232406 }, { "epoch": 4.567040170744721, "grad_norm": 0.6601319313049316, "learning_rate": 1.8387174873849833e-06, "loss": 0.03153957724571228, "memory(GiB)": 122.96, "step": 59915, "token_acc": 0.9908922246584584, "train_speed(iter/s)": 0.232409 }, { "epoch": 4.567421297354981, "grad_norm": 1.1454966068267822, "learning_rate": 1.8355016779898038e-06, "loss": 0.03184411525726318, "memory(GiB)": 122.96, "step": 59920, "token_acc": 0.9877977919814062, "train_speed(iter/s)": 0.232415 }, { "epoch": 4.567802423965241, "grad_norm": 0.5530071258544922, "learning_rate": 1.832288630605089e-06, "loss": 0.01895516961812973, "memory(GiB)": 122.96, "step": 59925, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.232419 }, { "epoch": 4.568183550575501, "grad_norm": 1.614607572555542, "learning_rate": 1.8290783454150973e-06, "loss": 0.029742008447647093, "memory(GiB)": 122.96, "step": 59930, "token_acc": 0.9918662449164031, "train_speed(iter/s)": 0.232426 }, { "epoch": 4.568564677185761, "grad_norm": 1.926332712173462, "learning_rate": 1.825870822603931e-06, "loss": 0.026098307967185975, "memory(GiB)": 122.96, "step": 59935, "token_acc": 0.9873566598059395, "train_speed(iter/s)": 0.232431 }, { "epoch": 4.568945803796021, "grad_norm": 0.6676890850067139, "learning_rate": 1.8226660623555047e-06, "loss": 0.024030840396881102, "memory(GiB)": 122.96, "step": 59940, "token_acc": 0.9900800360725961, "train_speed(iter/s)": 0.23243 }, { "epoch": 4.569326930406281, "grad_norm": 1.5346708297729492, "learning_rate": 1.819464064853621e-06, "loss": 0.018864962458610534, "memory(GiB)": 122.96, "step": 59945, "token_acc": 0.9912023460410557, "train_speed(iter/s)": 0.232435 }, { "epoch": 4.569708057016541, "grad_norm": 0.9163938164710999, "learning_rate": 1.816264830281883e-06, "loss": 0.029303383827209473, "memory(GiB)": 122.96, "step": 59950, "token_acc": 0.9859624078039496, "train_speed(iter/s)": 0.232439 }, { "epoch": 4.570089183626801, "grad_norm": 0.9648891091346741, "learning_rate": 1.813068358823755e-06, "loss": 0.02263110429048538, "memory(GiB)": 122.96, "step": 59955, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.232444 }, { "epoch": 4.570470310237061, "grad_norm": 1.7188870906829834, "learning_rate": 1.8098746506625464e-06, "loss": 0.03433563709259033, "memory(GiB)": 122.96, "step": 59960, "token_acc": 0.9852263701350278, "train_speed(iter/s)": 0.232447 }, { "epoch": 4.57085143684732, "grad_norm": 0.7120586633682251, "learning_rate": 1.8066837059813934e-06, "loss": 0.022353214025497437, "memory(GiB)": 122.96, "step": 59965, "token_acc": 0.9920178206794134, "train_speed(iter/s)": 0.23245 }, { "epoch": 4.57123256345758, "grad_norm": 0.6888728737831116, "learning_rate": 1.8034955249632834e-06, "loss": 0.021001023054122925, "memory(GiB)": 122.96, "step": 59970, "token_acc": 0.9864205065608789, "train_speed(iter/s)": 0.232452 }, { "epoch": 4.57161369006784, "grad_norm": 0.12956003844738007, "learning_rate": 1.8003101077910535e-06, "loss": 0.01822526752948761, "memory(GiB)": 122.96, "step": 59975, "token_acc": 0.9911618883380039, "train_speed(iter/s)": 0.232456 }, { "epoch": 4.5719948166781, "grad_norm": 0.689012348651886, "learning_rate": 1.7971274546473572e-06, "loss": 0.034342071413993834, "memory(GiB)": 122.96, "step": 59980, "token_acc": 0.9867614611424369, "train_speed(iter/s)": 0.232457 }, { "epoch": 4.57237594328836, "grad_norm": 0.846711277961731, "learning_rate": 1.7939475657147153e-06, "loss": 0.02447666972875595, "memory(GiB)": 122.96, "step": 59985, "token_acc": 0.9918984015765273, "train_speed(iter/s)": 0.232462 }, { "epoch": 4.57275706989862, "grad_norm": 1.3503602743148804, "learning_rate": 1.7907704411754822e-06, "loss": 0.02795928418636322, "memory(GiB)": 122.96, "step": 59990, "token_acc": 0.9880072285197963, "train_speed(iter/s)": 0.232464 }, { "epoch": 4.57313819650888, "grad_norm": 0.5484017729759216, "learning_rate": 1.7875960812118397e-06, "loss": 0.013801859319210052, "memory(GiB)": 122.96, "step": 59995, "token_acc": 0.9943364168397206, "train_speed(iter/s)": 0.232467 }, { "epoch": 4.5735193231191404, "grad_norm": 0.8491193056106567, "learning_rate": 1.7844244860058313e-06, "loss": 0.03162351846694946, "memory(GiB)": 122.96, "step": 60000, "token_acc": 0.9843167701863355, "train_speed(iter/s)": 0.23247 }, { "epoch": 4.5735193231191404, "eval_loss": 0.04785448685288429, "eval_runtime": 220.2794, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "eval_token_acc": 0.980701463767243, "step": 60000 }, { "epoch": 4.5739004497294005, "grad_norm": 0.9440217018127441, "learning_rate": 1.7812556557393333e-06, "loss": 0.015989485383033752, "memory(GiB)": 122.96, "step": 60005, "token_acc": 0.9810926048332619, "train_speed(iter/s)": 0.232274 }, { "epoch": 4.57428157633966, "grad_norm": 0.6773310899734497, "learning_rate": 1.7780895905940565e-06, "loss": 0.04291318953037262, "memory(GiB)": 122.96, "step": 60010, "token_acc": 0.983868044226935, "train_speed(iter/s)": 0.232278 }, { "epoch": 4.57466270294992, "grad_norm": 0.7233074903488159, "learning_rate": 1.7749262907515662e-06, "loss": 0.01898895800113678, "memory(GiB)": 122.96, "step": 60015, "token_acc": 0.9918990203466466, "train_speed(iter/s)": 0.232282 }, { "epoch": 4.57504382956018, "grad_norm": 1.5137418508529663, "learning_rate": 1.7717657563932677e-06, "loss": 0.02928108870983124, "memory(GiB)": 122.96, "step": 60020, "token_acc": 0.988568503668316, "train_speed(iter/s)": 0.232286 }, { "epoch": 4.57542495617044, "grad_norm": 1.1302114725112915, "learning_rate": 1.768607987700388e-06, "loss": 0.030040445923805236, "memory(GiB)": 122.96, "step": 60025, "token_acc": 0.9905342182569905, "train_speed(iter/s)": 0.232288 }, { "epoch": 4.5758060827807, "grad_norm": 0.13967666029930115, "learning_rate": 1.7654529848540214e-06, "loss": 0.040157470107078555, "memory(GiB)": 122.96, "step": 60030, "token_acc": 0.9840031213421772, "train_speed(iter/s)": 0.232294 }, { "epoch": 4.57618720939096, "grad_norm": 1.895655632019043, "learning_rate": 1.7623007480350951e-06, "loss": 0.04450112581253052, "memory(GiB)": 122.96, "step": 60035, "token_acc": 0.9866558835422564, "train_speed(iter/s)": 0.2323 }, { "epoch": 4.57656833600122, "grad_norm": 1.3249233961105347, "learning_rate": 1.7591512774243646e-06, "loss": 0.044917309284210206, "memory(GiB)": 122.96, "step": 60040, "token_acc": 0.9834053586862576, "train_speed(iter/s)": 0.232304 }, { "epoch": 4.57694946261148, "grad_norm": 1.1808003187179565, "learning_rate": 1.7560045732024466e-06, "loss": 0.044404846429824826, "memory(GiB)": 122.96, "step": 60045, "token_acc": 0.9811046511627907, "train_speed(iter/s)": 0.232303 }, { "epoch": 4.57733058922174, "grad_norm": 1.0066592693328857, "learning_rate": 1.752860635549791e-06, "loss": 0.02665989100933075, "memory(GiB)": 122.96, "step": 60050, "token_acc": 0.991475166790215, "train_speed(iter/s)": 0.23231 }, { "epoch": 4.577711715832, "grad_norm": 1.4281504154205322, "learning_rate": 1.7497194646466707e-06, "loss": 0.027522575855255128, "memory(GiB)": 122.96, "step": 60055, "token_acc": 0.9890282131661442, "train_speed(iter/s)": 0.232313 }, { "epoch": 4.57809284244226, "grad_norm": 1.17691171169281, "learning_rate": 1.7465810606732413e-06, "loss": 0.03984653055667877, "memory(GiB)": 122.96, "step": 60060, "token_acc": 0.9826281871672737, "train_speed(iter/s)": 0.232319 }, { "epoch": 4.578473969052519, "grad_norm": 1.8963079452514648, "learning_rate": 1.7434454238094643e-06, "loss": 0.030718156695365907, "memory(GiB)": 122.96, "step": 60065, "token_acc": 0.9893948845913911, "train_speed(iter/s)": 0.232318 }, { "epoch": 4.578855095662779, "grad_norm": 0.6634041666984558, "learning_rate": 1.7403125542351462e-06, "loss": 0.03846811652183533, "memory(GiB)": 122.96, "step": 60070, "token_acc": 0.9869737148173994, "train_speed(iter/s)": 0.232322 }, { "epoch": 4.579236222273039, "grad_norm": 1.0475953817367554, "learning_rate": 1.737182452129965e-06, "loss": 0.02282102108001709, "memory(GiB)": 122.96, "step": 60075, "token_acc": 0.991543690930194, "train_speed(iter/s)": 0.232325 }, { "epoch": 4.579617348883299, "grad_norm": 2.204859972000122, "learning_rate": 1.7340551176733943e-06, "loss": 0.034926393628120424, "memory(GiB)": 122.96, "step": 60080, "token_acc": 0.9857066285510094, "train_speed(iter/s)": 0.232329 }, { "epoch": 4.579998475493559, "grad_norm": 0.704266369342804, "learning_rate": 1.7309305510447792e-06, "loss": 0.02819114625453949, "memory(GiB)": 122.96, "step": 60085, "token_acc": 0.9883867710174199, "train_speed(iter/s)": 0.232334 }, { "epoch": 4.580379602103819, "grad_norm": 0.8263523578643799, "learning_rate": 1.72780875242331e-06, "loss": 0.04407320618629455, "memory(GiB)": 122.96, "step": 60090, "token_acc": 0.9810732576263639, "train_speed(iter/s)": 0.232337 }, { "epoch": 4.580760728714079, "grad_norm": 1.746094822883606, "learning_rate": 1.7246897219879876e-06, "loss": 0.049853771924972534, "memory(GiB)": 122.96, "step": 60095, "token_acc": 0.9839269406392694, "train_speed(iter/s)": 0.23234 }, { "epoch": 4.581141855324339, "grad_norm": 1.2198140621185303, "learning_rate": 1.7215734599176913e-06, "loss": 0.03984477519989014, "memory(GiB)": 122.96, "step": 60100, "token_acc": 0.98287362538309, "train_speed(iter/s)": 0.232343 }, { "epoch": 4.581522981934599, "grad_norm": 2.425952911376953, "learning_rate": 1.7184599663911172e-06, "loss": 0.03478081226348877, "memory(GiB)": 122.96, "step": 60105, "token_acc": 0.9860221316249272, "train_speed(iter/s)": 0.232347 }, { "epoch": 4.581904108544858, "grad_norm": 1.125320553779602, "learning_rate": 1.715349241586811e-06, "loss": 0.023632806539535523, "memory(GiB)": 122.96, "step": 60110, "token_acc": 0.9910571702331523, "train_speed(iter/s)": 0.23235 }, { "epoch": 4.582285235155118, "grad_norm": 0.9611056447029114, "learning_rate": 1.7122412856831582e-06, "loss": 0.020388785004615783, "memory(GiB)": 122.96, "step": 60115, "token_acc": 0.9950765864332604, "train_speed(iter/s)": 0.232355 }, { "epoch": 4.582666361765378, "grad_norm": 2.912971258163452, "learning_rate": 1.7091360988583827e-06, "loss": 0.020713424682617186, "memory(GiB)": 122.96, "step": 60120, "token_acc": 0.9929039301310044, "train_speed(iter/s)": 0.232361 }, { "epoch": 4.583047488375638, "grad_norm": 1.0447680950164795, "learning_rate": 1.706033681290553e-06, "loss": 0.025502729415893554, "memory(GiB)": 122.96, "step": 60125, "token_acc": 0.9857962305381044, "train_speed(iter/s)": 0.232365 }, { "epoch": 4.583428614985898, "grad_norm": 1.3650598526000977, "learning_rate": 1.7029340331575883e-06, "loss": 0.016788786649703978, "memory(GiB)": 122.96, "step": 60130, "token_acc": 0.9908963585434174, "train_speed(iter/s)": 0.232372 }, { "epoch": 4.583809741596158, "grad_norm": 3.726581573486328, "learning_rate": 1.6998371546372238e-06, "loss": 0.048850458860397336, "memory(GiB)": 122.96, "step": 60135, "token_acc": 0.9863829787234043, "train_speed(iter/s)": 0.232375 }, { "epoch": 4.584190868206418, "grad_norm": 1.9532380104064941, "learning_rate": 1.6967430459070566e-06, "loss": 0.0351662278175354, "memory(GiB)": 122.96, "step": 60140, "token_acc": 0.9859202145491116, "train_speed(iter/s)": 0.232381 }, { "epoch": 4.584571994816678, "grad_norm": 0.6070446968078613, "learning_rate": 1.6936517071445335e-06, "loss": 0.041287145018577574, "memory(GiB)": 122.96, "step": 60145, "token_acc": 0.9851656730902537, "train_speed(iter/s)": 0.232383 }, { "epoch": 4.584953121426938, "grad_norm": 0.547359824180603, "learning_rate": 1.6905631385269016e-06, "loss": 0.03146966099739075, "memory(GiB)": 122.96, "step": 60150, "token_acc": 0.9937637564196625, "train_speed(iter/s)": 0.232387 }, { "epoch": 4.5853342480371975, "grad_norm": 1.8083281517028809, "learning_rate": 1.687477340231297e-06, "loss": 0.0327758252620697, "memory(GiB)": 122.96, "step": 60155, "token_acc": 0.9885289431757833, "train_speed(iter/s)": 0.232388 }, { "epoch": 4.585715374647458, "grad_norm": 1.201610803604126, "learning_rate": 1.6843943124346728e-06, "loss": 0.01426461637020111, "memory(GiB)": 122.96, "step": 60160, "token_acc": 0.9935085007727975, "train_speed(iter/s)": 0.232394 }, { "epoch": 4.5860965012577175, "grad_norm": 1.7381525039672852, "learning_rate": 1.681314055313815e-06, "loss": 0.0716774582862854, "memory(GiB)": 122.96, "step": 60165, "token_acc": 0.9790764790764791, "train_speed(iter/s)": 0.232399 }, { "epoch": 4.5864776278679775, "grad_norm": 0.4354534447193146, "learning_rate": 1.6782365690453771e-06, "loss": 0.04054155051708221, "memory(GiB)": 122.96, "step": 60170, "token_acc": 0.9862263817764436, "train_speed(iter/s)": 0.232403 }, { "epoch": 4.5868587544782375, "grad_norm": 1.0597255229949951, "learning_rate": 1.675161853805829e-06, "loss": 0.026748090982437134, "memory(GiB)": 122.96, "step": 60175, "token_acc": 0.9868620515411825, "train_speed(iter/s)": 0.23241 }, { "epoch": 4.587239881088498, "grad_norm": 0.7224579453468323, "learning_rate": 1.6720899097714903e-06, "loss": 0.031559419631958005, "memory(GiB)": 122.96, "step": 60180, "token_acc": 0.9909792129690156, "train_speed(iter/s)": 0.232411 }, { "epoch": 4.587621007698758, "grad_norm": 1.1298805475234985, "learning_rate": 1.6690207371185319e-06, "loss": 0.03997042179107666, "memory(GiB)": 122.96, "step": 60185, "token_acc": 0.9812382739212008, "train_speed(iter/s)": 0.232417 }, { "epoch": 4.588002134309018, "grad_norm": 1.400958776473999, "learning_rate": 1.6659543360229457e-06, "loss": 0.03959152102470398, "memory(GiB)": 122.96, "step": 60190, "token_acc": 0.9864608376565861, "train_speed(iter/s)": 0.232421 }, { "epoch": 4.588383260919278, "grad_norm": 0.7632793188095093, "learning_rate": 1.6628907066605804e-06, "loss": 0.027781376242637636, "memory(GiB)": 122.96, "step": 60195, "token_acc": 0.9857677124454602, "train_speed(iter/s)": 0.232423 }, { "epoch": 4.588764387529538, "grad_norm": 2.5302255153656006, "learning_rate": 1.6598298492071285e-06, "loss": 0.036502805352210996, "memory(GiB)": 122.96, "step": 60200, "token_acc": 0.9893723015609432, "train_speed(iter/s)": 0.232429 }, { "epoch": 4.588764387529538, "eval_loss": 0.04778910055756569, "eval_runtime": 218.2845, "eval_samples_per_second": 2.428, "eval_steps_per_second": 2.428, "eval_token_acc": 0.9804529847599542, "step": 60200 }, { "epoch": 4.589145514139798, "grad_norm": 1.1907320022583008, "learning_rate": 1.6567717638381053e-06, "loss": 0.030229973793029784, "memory(GiB)": 122.96, "step": 60205, "token_acc": 0.9808775883927314, "train_speed(iter/s)": 0.232234 }, { "epoch": 4.589526640750057, "grad_norm": 1.1080923080444336, "learning_rate": 1.6537164507288817e-06, "loss": 0.025506001710891724, "memory(GiB)": 122.96, "step": 60210, "token_acc": 0.9912044954800879, "train_speed(iter/s)": 0.232238 }, { "epoch": 4.589907767360317, "grad_norm": 0.7423890233039856, "learning_rate": 1.6506639100546672e-06, "loss": 0.04169832468032837, "memory(GiB)": 122.96, "step": 60215, "token_acc": 0.9870303261491512, "train_speed(iter/s)": 0.232242 }, { "epoch": 4.590288893970577, "grad_norm": 1.869046926498413, "learning_rate": 1.6476141419905056e-06, "loss": 0.031049197912216185, "memory(GiB)": 122.96, "step": 60220, "token_acc": 0.9841332804442682, "train_speed(iter/s)": 0.232248 }, { "epoch": 4.590670020580837, "grad_norm": 0.8535139560699463, "learning_rate": 1.64456714671129e-06, "loss": 0.037983039021492006, "memory(GiB)": 122.96, "step": 60225, "token_acc": 0.9844023127605217, "train_speed(iter/s)": 0.232252 }, { "epoch": 4.591051147191097, "grad_norm": 1.0083580017089844, "learning_rate": 1.6415229243917585e-06, "loss": 0.02485654056072235, "memory(GiB)": 122.96, "step": 60230, "token_acc": 0.9931997136721546, "train_speed(iter/s)": 0.232256 }, { "epoch": 4.591432273801357, "grad_norm": 2.17631459236145, "learning_rate": 1.6384814752064714e-06, "loss": 0.044647216796875, "memory(GiB)": 122.96, "step": 60235, "token_acc": 0.9859105931466342, "train_speed(iter/s)": 0.23226 }, { "epoch": 4.591813400411617, "grad_norm": 1.051814317703247, "learning_rate": 1.635442799329845e-06, "loss": 0.04464126825332641, "memory(GiB)": 122.96, "step": 60240, "token_acc": 0.9811628596855533, "train_speed(iter/s)": 0.232263 }, { "epoch": 4.592194527021877, "grad_norm": 0.5999324321746826, "learning_rate": 1.6324068969361451e-06, "loss": 0.02101951986551285, "memory(GiB)": 122.96, "step": 60245, "token_acc": 0.985, "train_speed(iter/s)": 0.232267 }, { "epoch": 4.592575653632137, "grad_norm": 1.5771499872207642, "learning_rate": 1.6293737681994493e-06, "loss": 0.027857202291488647, "memory(GiB)": 122.96, "step": 60250, "token_acc": 0.9886241682764542, "train_speed(iter/s)": 0.232272 }, { "epoch": 4.592956780242396, "grad_norm": 2.9690494537353516, "learning_rate": 1.626343413293696e-06, "loss": 0.0625740647315979, "memory(GiB)": 122.96, "step": 60255, "token_acc": 0.9776339022954679, "train_speed(iter/s)": 0.232277 }, { "epoch": 4.593337906852656, "grad_norm": 0.938224732875824, "learning_rate": 1.6233158323926745e-06, "loss": 0.025192296504974364, "memory(GiB)": 122.96, "step": 60260, "token_acc": 0.9926032463529896, "train_speed(iter/s)": 0.232281 }, { "epoch": 4.593719033462916, "grad_norm": 1.1227422952651978, "learning_rate": 1.6202910256699899e-06, "loss": 0.039169132709503174, "memory(GiB)": 122.96, "step": 60265, "token_acc": 0.9842883548983364, "train_speed(iter/s)": 0.232286 }, { "epoch": 4.594100160073176, "grad_norm": 0.22451379895210266, "learning_rate": 1.6172689932991092e-06, "loss": 0.028187331557273865, "memory(GiB)": 122.96, "step": 60270, "token_acc": 0.9876347951114306, "train_speed(iter/s)": 0.232286 }, { "epoch": 4.594481286683436, "grad_norm": 1.1264325380325317, "learning_rate": 1.614249735453327e-06, "loss": 0.04928330779075622, "memory(GiB)": 122.96, "step": 60275, "token_acc": 0.989344262295082, "train_speed(iter/s)": 0.232291 }, { "epoch": 4.594862413293696, "grad_norm": 0.47045502066612244, "learning_rate": 1.6112332523057883e-06, "loss": 0.016138830780982973, "memory(GiB)": 122.96, "step": 60280, "token_acc": 0.9960642833715972, "train_speed(iter/s)": 0.232297 }, { "epoch": 4.595243539903956, "grad_norm": 1.0440360307693481, "learning_rate": 1.6082195440294711e-06, "loss": 0.04160493612289429, "memory(GiB)": 122.96, "step": 60285, "token_acc": 0.986627043090639, "train_speed(iter/s)": 0.232302 }, { "epoch": 4.595624666514216, "grad_norm": 1.0293899774551392, "learning_rate": 1.605208610797193e-06, "loss": 0.018251010775566102, "memory(GiB)": 122.96, "step": 60290, "token_acc": 0.9914841849148418, "train_speed(iter/s)": 0.232306 }, { "epoch": 4.596005793124476, "grad_norm": 1.9113883972167969, "learning_rate": 1.6022004527816215e-06, "loss": 0.03104698657989502, "memory(GiB)": 122.96, "step": 60295, "token_acc": 0.9886669227814061, "train_speed(iter/s)": 0.23231 }, { "epoch": 4.596386919734736, "grad_norm": 0.964566171169281, "learning_rate": 1.5991950701552626e-06, "loss": 0.0317289799451828, "memory(GiB)": 122.96, "step": 60300, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.232313 }, { "epoch": 4.596768046344996, "grad_norm": 2.8679873943328857, "learning_rate": 1.5961924630904624e-06, "loss": 0.039047205448150636, "memory(GiB)": 122.96, "step": 60305, "token_acc": 0.9865449303219606, "train_speed(iter/s)": 0.232317 }, { "epoch": 4.597149172955255, "grad_norm": 0.8691472411155701, "learning_rate": 1.5931926317593992e-06, "loss": 0.022492413222789765, "memory(GiB)": 122.96, "step": 60310, "token_acc": 0.9898122253296044, "train_speed(iter/s)": 0.232321 }, { "epoch": 4.597530299565515, "grad_norm": 0.9712971448898315, "learning_rate": 1.590195576334108e-06, "loss": 0.016670326888561248, "memory(GiB)": 122.96, "step": 60315, "token_acc": 0.9934891979875703, "train_speed(iter/s)": 0.232327 }, { "epoch": 4.597911426175775, "grad_norm": 0.7216866612434387, "learning_rate": 1.5872012969864402e-06, "loss": 0.03517068326473236, "memory(GiB)": 122.96, "step": 60320, "token_acc": 0.9892150988615938, "train_speed(iter/s)": 0.232333 }, { "epoch": 4.598292552786035, "grad_norm": 0.9319965839385986, "learning_rate": 1.5842097938881251e-06, "loss": 0.02844511568546295, "memory(GiB)": 122.96, "step": 60325, "token_acc": 0.9884780957204355, "train_speed(iter/s)": 0.232333 }, { "epoch": 4.5986736793962955, "grad_norm": 1.0959216356277466, "learning_rate": 1.5812210672106976e-06, "loss": 0.04616422653198242, "memory(GiB)": 122.96, "step": 60330, "token_acc": 0.9859613428280773, "train_speed(iter/s)": 0.232336 }, { "epoch": 4.5990548060065555, "grad_norm": 0.5280647277832031, "learning_rate": 1.5782351171255538e-06, "loss": 0.02717118263244629, "memory(GiB)": 122.96, "step": 60335, "token_acc": 0.9905691056910569, "train_speed(iter/s)": 0.232337 }, { "epoch": 4.5994359326168155, "grad_norm": 0.8890611529350281, "learning_rate": 1.5752519438039237e-06, "loss": 0.022702060639858246, "memory(GiB)": 122.96, "step": 60340, "token_acc": 0.9905161005734451, "train_speed(iter/s)": 0.232342 }, { "epoch": 4.5998170592270755, "grad_norm": 1.4445242881774902, "learning_rate": 1.572271547416876e-06, "loss": 0.03399159908294678, "memory(GiB)": 122.96, "step": 60345, "token_acc": 0.9842873176206509, "train_speed(iter/s)": 0.232346 }, { "epoch": 4.6001981858373355, "grad_norm": 0.9531233906745911, "learning_rate": 1.5692939281353181e-06, "loss": 0.02634703814983368, "memory(GiB)": 122.96, "step": 60350, "token_acc": 0.9911842491918895, "train_speed(iter/s)": 0.232351 }, { "epoch": 4.600579312447595, "grad_norm": 1.616041898727417, "learning_rate": 1.5663190861300136e-06, "loss": 0.021078944206237793, "memory(GiB)": 122.96, "step": 60355, "token_acc": 0.9927814121362508, "train_speed(iter/s)": 0.232351 }, { "epoch": 4.600960439057855, "grad_norm": 1.3202298879623413, "learning_rate": 1.5633470215715428e-06, "loss": 0.03138360679149628, "memory(GiB)": 122.96, "step": 60360, "token_acc": 0.9889289712650827, "train_speed(iter/s)": 0.232353 }, { "epoch": 4.601341565668115, "grad_norm": 1.0044137239456177, "learning_rate": 1.5603777346303529e-06, "loss": 0.02727665603160858, "memory(GiB)": 122.96, "step": 60365, "token_acc": 0.989233602493271, "train_speed(iter/s)": 0.232355 }, { "epoch": 4.601722692278375, "grad_norm": 2.341463804244995, "learning_rate": 1.5574112254767125e-06, "loss": 0.02946229577064514, "memory(GiB)": 122.96, "step": 60370, "token_acc": 0.9888728621471254, "train_speed(iter/s)": 0.232358 }, { "epoch": 4.602103818888635, "grad_norm": 0.8675979971885681, "learning_rate": 1.5544474942807309e-06, "loss": 0.020540449023246764, "memory(GiB)": 122.96, "step": 60375, "token_acc": 0.9921221073362876, "train_speed(iter/s)": 0.232364 }, { "epoch": 4.602484945498895, "grad_norm": 0.7815234065055847, "learning_rate": 1.5514865412123769e-06, "loss": 0.028404626250267028, "memory(GiB)": 122.96, "step": 60380, "token_acc": 0.9878345498783455, "train_speed(iter/s)": 0.232368 }, { "epoch": 4.602866072109155, "grad_norm": 1.1786706447601318, "learning_rate": 1.5485283664414374e-06, "loss": 0.04037463665008545, "memory(GiB)": 122.96, "step": 60385, "token_acc": 0.9822949724402873, "train_speed(iter/s)": 0.232372 }, { "epoch": 4.603247198719415, "grad_norm": 0.7871391177177429, "learning_rate": 1.5455729701375544e-06, "loss": 0.031095612049102783, "memory(GiB)": 122.96, "step": 60390, "token_acc": 0.9888170974155069, "train_speed(iter/s)": 0.232376 }, { "epoch": 4.603628325329675, "grad_norm": 1.697455644607544, "learning_rate": 1.5426203524702087e-06, "loss": 0.06850939989089966, "memory(GiB)": 122.96, "step": 60395, "token_acc": 0.9775262194106875, "train_speed(iter/s)": 0.23238 }, { "epoch": 4.604009451939935, "grad_norm": 0.8423616290092468, "learning_rate": 1.5396705136087153e-06, "loss": 0.016543598473072053, "memory(GiB)": 122.96, "step": 60400, "token_acc": 0.9923617476321418, "train_speed(iter/s)": 0.232386 }, { "epoch": 4.604009451939935, "eval_loss": 0.04711604490876198, "eval_runtime": 220.0164, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9807993494367809, "step": 60400 }, { "epoch": 4.604390578550195, "grad_norm": 1.954870581626892, "learning_rate": 1.5367234537222331e-06, "loss": 0.029045552015304565, "memory(GiB)": 122.96, "step": 60405, "token_acc": 0.9812549540478588, "train_speed(iter/s)": 0.232191 }, { "epoch": 4.604771705160454, "grad_norm": 1.1792799234390259, "learning_rate": 1.5337791729797601e-06, "loss": 0.033573535084724423, "memory(GiB)": 122.96, "step": 60410, "token_acc": 0.9862651875330164, "train_speed(iter/s)": 0.232193 }, { "epoch": 4.605152831770714, "grad_norm": 2.309124708175659, "learning_rate": 1.5308376715501504e-06, "loss": 0.03355427384376526, "memory(GiB)": 122.96, "step": 60415, "token_acc": 0.9865740009477176, "train_speed(iter/s)": 0.232197 }, { "epoch": 4.605533958380974, "grad_norm": 1.00972318649292, "learning_rate": 1.527898949602069e-06, "loss": 0.03464475572109223, "memory(GiB)": 122.96, "step": 60420, "token_acc": 0.9880303389428775, "train_speed(iter/s)": 0.232197 }, { "epoch": 4.605915084991234, "grad_norm": 0.5804013013839722, "learning_rate": 1.5249630073040422e-06, "loss": 0.03323263227939606, "memory(GiB)": 122.96, "step": 60425, "token_acc": 0.9886990077177509, "train_speed(iter/s)": 0.232197 }, { "epoch": 4.606296211601494, "grad_norm": 1.0567662715911865, "learning_rate": 1.5220298448244407e-06, "loss": 0.037587472796440126, "memory(GiB)": 122.96, "step": 60430, "token_acc": 0.9845655487804879, "train_speed(iter/s)": 0.2322 }, { "epoch": 4.606677338211754, "grad_norm": 1.4615020751953125, "learning_rate": 1.5190994623314636e-06, "loss": 0.04613993167877197, "memory(GiB)": 122.96, "step": 60435, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.232206 }, { "epoch": 4.607058464822014, "grad_norm": 1.3508583307266235, "learning_rate": 1.5161718599931485e-06, "loss": 0.023120780289173127, "memory(GiB)": 122.96, "step": 60440, "token_acc": 0.9918681318681318, "train_speed(iter/s)": 0.232211 }, { "epoch": 4.607439591432274, "grad_norm": 0.27848517894744873, "learning_rate": 1.5132470379773888e-06, "loss": 0.01077825054526329, "memory(GiB)": 122.96, "step": 60445, "token_acc": 0.9944068981589373, "train_speed(iter/s)": 0.232216 }, { "epoch": 4.607820718042534, "grad_norm": 0.9888471364974976, "learning_rate": 1.5103249964519006e-06, "loss": 0.025972363352775574, "memory(GiB)": 122.96, "step": 60450, "token_acc": 0.9834158415841584, "train_speed(iter/s)": 0.23222 }, { "epoch": 4.608201844652793, "grad_norm": 0.7890436053276062, "learning_rate": 1.5074057355842553e-06, "loss": 0.02236652821302414, "memory(GiB)": 122.96, "step": 60455, "token_acc": 0.9925396331986323, "train_speed(iter/s)": 0.232224 }, { "epoch": 4.608582971263053, "grad_norm": 0.5020701289176941, "learning_rate": 1.5044892555418576e-06, "loss": 0.018180637061595915, "memory(GiB)": 122.96, "step": 60460, "token_acc": 0.9925202576355703, "train_speed(iter/s)": 0.232228 }, { "epoch": 4.608964097873313, "grad_norm": 0.5753322839736938, "learning_rate": 1.5015755564919576e-06, "loss": 0.020305061340332033, "memory(GiB)": 122.96, "step": 60465, "token_acc": 0.9911639762107052, "train_speed(iter/s)": 0.232232 }, { "epoch": 4.609345224483573, "grad_norm": 0.38562124967575073, "learning_rate": 1.498664638601638e-06, "loss": 0.02703651785850525, "memory(GiB)": 122.96, "step": 60470, "token_acc": 0.9842122395833334, "train_speed(iter/s)": 0.232234 }, { "epoch": 4.609726351093833, "grad_norm": 0.4953431189060211, "learning_rate": 1.495756502037826e-06, "loss": 0.012278829514980317, "memory(GiB)": 122.96, "step": 60475, "token_acc": 0.9931626649705836, "train_speed(iter/s)": 0.232238 }, { "epoch": 4.610107477704093, "grad_norm": 0.9084377288818359, "learning_rate": 1.4928511469672946e-06, "loss": 0.018532338738441467, "memory(GiB)": 122.96, "step": 60480, "token_acc": 0.9881305637982196, "train_speed(iter/s)": 0.232245 }, { "epoch": 4.610488604314353, "grad_norm": 1.2257671356201172, "learning_rate": 1.4899485735566487e-06, "loss": 0.027209565043449402, "memory(GiB)": 122.96, "step": 60485, "token_acc": 0.9886047100531781, "train_speed(iter/s)": 0.23225 }, { "epoch": 4.610869730924613, "grad_norm": 0.6328903436660767, "learning_rate": 1.487048781972339e-06, "loss": 0.014005079865455627, "memory(GiB)": 122.96, "step": 60490, "token_acc": 0.9955823293172691, "train_speed(iter/s)": 0.232256 }, { "epoch": 4.611250857534873, "grad_norm": 0.7289530634880066, "learning_rate": 1.4841517723806542e-06, "loss": 0.02788618803024292, "memory(GiB)": 122.96, "step": 60495, "token_acc": 0.9867887508175278, "train_speed(iter/s)": 0.232258 }, { "epoch": 4.6116319841451325, "grad_norm": 1.662116527557373, "learning_rate": 1.4812575449477228e-06, "loss": 0.04066420197486877, "memory(GiB)": 122.96, "step": 60500, "token_acc": 0.986801791185482, "train_speed(iter/s)": 0.232263 }, { "epoch": 4.6120131107553926, "grad_norm": 3.475571870803833, "learning_rate": 1.4783660998395122e-06, "loss": 0.026362782716751097, "memory(GiB)": 122.96, "step": 60505, "token_acc": 0.9896640826873385, "train_speed(iter/s)": 0.232265 }, { "epoch": 4.612394237365653, "grad_norm": 0.461713582277298, "learning_rate": 1.475477437221845e-06, "loss": 0.04226141571998596, "memory(GiB)": 122.96, "step": 60510, "token_acc": 0.9839640795381654, "train_speed(iter/s)": 0.232266 }, { "epoch": 4.612775363975913, "grad_norm": 1.7202541828155518, "learning_rate": 1.472591557260361e-06, "loss": 0.04059189856052399, "memory(GiB)": 122.96, "step": 60515, "token_acc": 0.985894580549369, "train_speed(iter/s)": 0.232269 }, { "epoch": 4.613156490586173, "grad_norm": 0.5399847030639648, "learning_rate": 1.4697084601205557e-06, "loss": 0.030712932348251343, "memory(GiB)": 122.96, "step": 60520, "token_acc": 0.9887284951552304, "train_speed(iter/s)": 0.232269 }, { "epoch": 4.613537617196433, "grad_norm": 1.6612892150878906, "learning_rate": 1.466828145967769e-06, "loss": 0.025882846117019652, "memory(GiB)": 122.96, "step": 60525, "token_acc": 0.9889484259879437, "train_speed(iter/s)": 0.232275 }, { "epoch": 4.613918743806693, "grad_norm": 1.8940273523330688, "learning_rate": 1.4639506149671633e-06, "loss": 0.04016627669334412, "memory(GiB)": 122.96, "step": 60530, "token_acc": 0.98828125, "train_speed(iter/s)": 0.232279 }, { "epoch": 4.614299870416953, "grad_norm": 1.3457883596420288, "learning_rate": 1.461075867283751e-06, "loss": 0.034187743067741395, "memory(GiB)": 122.96, "step": 60535, "token_acc": 0.9885258654220148, "train_speed(iter/s)": 0.232284 }, { "epoch": 4.614680997027213, "grad_norm": 1.0306673049926758, "learning_rate": 1.4582039030823946e-06, "loss": 0.02613743245601654, "memory(GiB)": 122.96, "step": 60540, "token_acc": 0.989070480081716, "train_speed(iter/s)": 0.232286 }, { "epoch": 4.615062123637473, "grad_norm": 0.5974879860877991, "learning_rate": 1.455334722527779e-06, "loss": 0.020207615196704866, "memory(GiB)": 122.96, "step": 60545, "token_acc": 0.9900797929695924, "train_speed(iter/s)": 0.232291 }, { "epoch": 4.615443250247733, "grad_norm": 1.1610548496246338, "learning_rate": 1.452468325784445e-06, "loss": 0.05483081340789795, "memory(GiB)": 122.96, "step": 60550, "token_acc": 0.9778933680104032, "train_speed(iter/s)": 0.232294 }, { "epoch": 4.615824376857992, "grad_norm": 0.931641697883606, "learning_rate": 1.4496047130167667e-06, "loss": 0.031997448205947875, "memory(GiB)": 122.96, "step": 60555, "token_acc": 0.9874658724249193, "train_speed(iter/s)": 0.232296 }, { "epoch": 4.616205503468252, "grad_norm": 0.8772879838943481, "learning_rate": 1.4467438843889514e-06, "loss": 0.03065740466117859, "memory(GiB)": 122.96, "step": 60560, "token_acc": 0.9860081320258312, "train_speed(iter/s)": 0.232297 }, { "epoch": 4.616586630078512, "grad_norm": 1.0568422079086304, "learning_rate": 1.4438858400650567e-06, "loss": 0.028476798534393312, "memory(GiB)": 122.96, "step": 60565, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.232301 }, { "epoch": 4.616967756688772, "grad_norm": 0.7462822794914246, "learning_rate": 1.4410305802089908e-06, "loss": 0.032854008674621585, "memory(GiB)": 122.96, "step": 60570, "token_acc": 0.9883909914093336, "train_speed(iter/s)": 0.232305 }, { "epoch": 4.617348883299032, "grad_norm": 0.26486948132514954, "learning_rate": 1.4381781049844778e-06, "loss": 0.02071422040462494, "memory(GiB)": 122.96, "step": 60575, "token_acc": 0.9908459595959596, "train_speed(iter/s)": 0.232311 }, { "epoch": 4.617730009909292, "grad_norm": 1.3376753330230713, "learning_rate": 1.4353284145550872e-06, "loss": 0.03996903300285339, "memory(GiB)": 122.96, "step": 60580, "token_acc": 0.9821037253469685, "train_speed(iter/s)": 0.232315 }, { "epoch": 4.618111136519552, "grad_norm": 1.6657531261444092, "learning_rate": 1.4324815090842548e-06, "loss": 0.02808789610862732, "memory(GiB)": 122.96, "step": 60585, "token_acc": 0.9884892086330935, "train_speed(iter/s)": 0.232319 }, { "epoch": 4.618492263129812, "grad_norm": 2.362541913986206, "learning_rate": 1.4296373887352165e-06, "loss": 0.031555795669555665, "memory(GiB)": 122.96, "step": 60590, "token_acc": 0.9882110226937814, "train_speed(iter/s)": 0.232321 }, { "epoch": 4.618873389740072, "grad_norm": 0.40944018959999084, "learning_rate": 1.4267960536710922e-06, "loss": 0.0232081338763237, "memory(GiB)": 122.96, "step": 60595, "token_acc": 0.9887865438526231, "train_speed(iter/s)": 0.232328 }, { "epoch": 4.619254516350331, "grad_norm": 1.1540981531143188, "learning_rate": 1.4239575040548014e-06, "loss": 0.0308624267578125, "memory(GiB)": 122.96, "step": 60600, "token_acc": 0.9854833882071237, "train_speed(iter/s)": 0.232329 }, { "epoch": 4.619254516350331, "eval_loss": 0.04741703346371651, "eval_runtime": 218.4448, "eval_samples_per_second": 2.426, "eval_steps_per_second": 2.426, "eval_token_acc": 0.9808671164387688, "step": 60600 }, { "epoch": 4.619635642960591, "grad_norm": 0.639030396938324, "learning_rate": 1.421121740049125e-06, "loss": 0.0338226318359375, "memory(GiB)": 122.96, "step": 60605, "token_acc": 0.9812943422047754, "train_speed(iter/s)": 0.232139 }, { "epoch": 4.620016769570851, "grad_norm": 0.9705949425697327, "learning_rate": 1.4182887618166885e-06, "loss": 0.03828955292701721, "memory(GiB)": 122.96, "step": 60610, "token_acc": 0.9893981914561896, "train_speed(iter/s)": 0.232143 }, { "epoch": 4.620397896181111, "grad_norm": 2.011972188949585, "learning_rate": 1.4154585695199395e-06, "loss": 0.03169429004192352, "memory(GiB)": 122.96, "step": 60615, "token_acc": 0.9853807549639974, "train_speed(iter/s)": 0.232148 }, { "epoch": 4.620779022791371, "grad_norm": 1.4156197309494019, "learning_rate": 1.4126311633211819e-06, "loss": 0.06127045154571533, "memory(GiB)": 122.96, "step": 60620, "token_acc": 0.9740200546946217, "train_speed(iter/s)": 0.232154 }, { "epoch": 4.621160149401631, "grad_norm": 1.8500258922576904, "learning_rate": 1.4098065433825637e-06, "loss": 0.04739214181900024, "memory(GiB)": 122.96, "step": 60625, "token_acc": 0.9849838187702266, "train_speed(iter/s)": 0.232154 }, { "epoch": 4.621541276011891, "grad_norm": 1.0170401334762573, "learning_rate": 1.4069847098660437e-06, "loss": 0.03260248899459839, "memory(GiB)": 122.96, "step": 60630, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.23216 }, { "epoch": 4.621922402622151, "grad_norm": 1.5877408981323242, "learning_rate": 1.4041656629334542e-06, "loss": 0.03542076349258423, "memory(GiB)": 122.96, "step": 60635, "token_acc": 0.9842818428184282, "train_speed(iter/s)": 0.232165 }, { "epoch": 4.622303529232411, "grad_norm": 2.0237069129943848, "learning_rate": 1.4013494027464547e-06, "loss": 0.037897327542304994, "memory(GiB)": 122.96, "step": 60640, "token_acc": 0.9875870420829549, "train_speed(iter/s)": 0.232169 }, { "epoch": 4.622684655842671, "grad_norm": 2.6795835494995117, "learning_rate": 1.398535929466538e-06, "loss": 0.046442723274230956, "memory(GiB)": 122.96, "step": 60645, "token_acc": 0.9759887005649718, "train_speed(iter/s)": 0.232173 }, { "epoch": 4.623065782452931, "grad_norm": 0.819282591342926, "learning_rate": 1.3957252432550471e-06, "loss": 0.03213365375995636, "memory(GiB)": 122.96, "step": 60650, "token_acc": 0.9868787928489421, "train_speed(iter/s)": 0.232176 }, { "epoch": 4.6234469090631904, "grad_norm": 0.8229780793190002, "learning_rate": 1.3929173442731646e-06, "loss": 0.024974478781223296, "memory(GiB)": 122.96, "step": 60655, "token_acc": 0.9910885396703718, "train_speed(iter/s)": 0.232176 }, { "epoch": 4.6238280356734505, "grad_norm": 0.7513054609298706, "learning_rate": 1.3901122326819061e-06, "loss": 0.022850276529788972, "memory(GiB)": 122.96, "step": 60660, "token_acc": 0.9915281850765721, "train_speed(iter/s)": 0.232182 }, { "epoch": 4.6242091622837105, "grad_norm": 4.19672155380249, "learning_rate": 1.3873099086421371e-06, "loss": 0.038404762744903564, "memory(GiB)": 122.96, "step": 60665, "token_acc": 0.9881588999236058, "train_speed(iter/s)": 0.232187 }, { "epoch": 4.6245902888939705, "grad_norm": 1.238061785697937, "learning_rate": 1.3845103723145514e-06, "loss": 0.024101121723651885, "memory(GiB)": 122.96, "step": 60670, "token_acc": 0.99045445113094, "train_speed(iter/s)": 0.232191 }, { "epoch": 4.6249714155042305, "grad_norm": 1.5947785377502441, "learning_rate": 1.381713623859693e-06, "loss": 0.040961471199989316, "memory(GiB)": 122.96, "step": 60675, "token_acc": 0.9788940481215703, "train_speed(iter/s)": 0.232197 }, { "epoch": 4.6253525421144905, "grad_norm": 1.2246191501617432, "learning_rate": 1.3789196634379443e-06, "loss": 0.020083144307136536, "memory(GiB)": 122.96, "step": 60680, "token_acc": 0.9925187032418953, "train_speed(iter/s)": 0.232201 }, { "epoch": 4.625733668724751, "grad_norm": 1.0509718656539917, "learning_rate": 1.3761284912095218e-06, "loss": 0.024473166465759276, "memory(GiB)": 122.96, "step": 60685, "token_acc": 0.9897302647859441, "train_speed(iter/s)": 0.232203 }, { "epoch": 4.626114795335011, "grad_norm": 1.1027723550796509, "learning_rate": 1.3733401073344865e-06, "loss": 0.03910635113716125, "memory(GiB)": 122.96, "step": 60690, "token_acc": 0.9804946011842564, "train_speed(iter/s)": 0.232209 }, { "epoch": 4.626495921945271, "grad_norm": 1.5473670959472656, "learning_rate": 1.3705545119727435e-06, "loss": 0.017747431993484497, "memory(GiB)": 122.96, "step": 60695, "token_acc": 0.9920554084334895, "train_speed(iter/s)": 0.232211 }, { "epoch": 4.62687704855553, "grad_norm": 1.848470687866211, "learning_rate": 1.3677717052840267e-06, "loss": 0.02820538878440857, "memory(GiB)": 122.96, "step": 60700, "token_acc": 0.9899749373433584, "train_speed(iter/s)": 0.232217 }, { "epoch": 4.62725817516579, "grad_norm": 1.926586627960205, "learning_rate": 1.3649916874279245e-06, "loss": 0.02229377329349518, "memory(GiB)": 122.96, "step": 60705, "token_acc": 0.9899569583931134, "train_speed(iter/s)": 0.232221 }, { "epoch": 4.62763930177605, "grad_norm": 1.134567379951477, "learning_rate": 1.3622144585638542e-06, "loss": 0.039626327157020566, "memory(GiB)": 122.96, "step": 60710, "token_acc": 0.9785384406411302, "train_speed(iter/s)": 0.232226 }, { "epoch": 4.62802042838631, "grad_norm": 1.9666521549224854, "learning_rate": 1.359440018851077e-06, "loss": 0.02587990164756775, "memory(GiB)": 122.96, "step": 60715, "token_acc": 0.9906505648617062, "train_speed(iter/s)": 0.232232 }, { "epoch": 4.62840155499657, "grad_norm": 1.7332028150558472, "learning_rate": 1.3566683684486936e-06, "loss": 0.04547636508941651, "memory(GiB)": 122.96, "step": 60720, "token_acc": 0.988653787181846, "train_speed(iter/s)": 0.232237 }, { "epoch": 4.62878268160683, "grad_norm": 1.292776346206665, "learning_rate": 1.3538995075156492e-06, "loss": 0.02467718571424484, "memory(GiB)": 122.96, "step": 60725, "token_acc": 0.9895083932853717, "train_speed(iter/s)": 0.232242 }, { "epoch": 4.62916380821709, "grad_norm": 0.8719351887702942, "learning_rate": 1.351133436210722e-06, "loss": 0.05328918099403381, "memory(GiB)": 122.96, "step": 60730, "token_acc": 0.9825156576200418, "train_speed(iter/s)": 0.232246 }, { "epoch": 4.62954493482735, "grad_norm": 0.8221322894096375, "learning_rate": 1.3483701546925354e-06, "loss": 0.026733216643333436, "memory(GiB)": 122.96, "step": 60735, "token_acc": 0.9844645829480303, "train_speed(iter/s)": 0.232251 }, { "epoch": 4.62992606143761, "grad_norm": 0.7159673571586609, "learning_rate": 1.3456096631195459e-06, "loss": 0.02159619629383087, "memory(GiB)": 122.96, "step": 60740, "token_acc": 0.9900018178512998, "train_speed(iter/s)": 0.232253 }, { "epoch": 4.630307188047869, "grad_norm": 1.2413570880889893, "learning_rate": 1.34285196165006e-06, "loss": 0.018571449816226958, "memory(GiB)": 122.96, "step": 60745, "token_acc": 0.9901408450704225, "train_speed(iter/s)": 0.232257 }, { "epoch": 4.63068831465813, "grad_norm": 1.0842870473861694, "learning_rate": 1.3400970504422238e-06, "loss": 0.020454996824264528, "memory(GiB)": 122.96, "step": 60750, "token_acc": 0.9955197132616488, "train_speed(iter/s)": 0.232263 }, { "epoch": 4.631069441268389, "grad_norm": 0.5409120917320251, "learning_rate": 1.3373449296540053e-06, "loss": 0.022233276069164275, "memory(GiB)": 122.96, "step": 60755, "token_acc": 0.9866156787762906, "train_speed(iter/s)": 0.232266 }, { "epoch": 4.631450567878649, "grad_norm": 0.7430676221847534, "learning_rate": 1.334595599443228e-06, "loss": 0.017131757736206055, "memory(GiB)": 122.96, "step": 60760, "token_acc": 0.99168500855955, "train_speed(iter/s)": 0.232271 }, { "epoch": 4.631831694488909, "grad_norm": 1.3821676969528198, "learning_rate": 1.3318490599675715e-06, "loss": 0.03269219398498535, "memory(GiB)": 122.96, "step": 60765, "token_acc": 0.9856041131105399, "train_speed(iter/s)": 0.232275 }, { "epoch": 4.632212821099169, "grad_norm": 1.7403305768966675, "learning_rate": 1.3291053113845098e-06, "loss": 0.044626444578170776, "memory(GiB)": 122.96, "step": 60770, "token_acc": 0.984382625671059, "train_speed(iter/s)": 0.232281 }, { "epoch": 4.632593947709429, "grad_norm": 2.045459270477295, "learning_rate": 1.3263643538514058e-06, "loss": 0.023094524443149567, "memory(GiB)": 122.96, "step": 60775, "token_acc": 0.9904175222450377, "train_speed(iter/s)": 0.232285 }, { "epoch": 4.632975074319689, "grad_norm": 1.418128252029419, "learning_rate": 1.3236261875254342e-06, "loss": 0.026492989063262938, "memory(GiB)": 122.96, "step": 60780, "token_acc": 0.9901315789473685, "train_speed(iter/s)": 0.232288 }, { "epoch": 4.633356200929949, "grad_norm": 1.2903869152069092, "learning_rate": 1.3208908125636077e-06, "loss": 0.04507728219032288, "memory(GiB)": 122.96, "step": 60785, "token_acc": 0.9849740932642487, "train_speed(iter/s)": 0.232293 }, { "epoch": 4.633737327540209, "grad_norm": 2.3505120277404785, "learning_rate": 1.3181582291227956e-06, "loss": 0.032853543758392334, "memory(GiB)": 122.96, "step": 60790, "token_acc": 0.988056206088993, "train_speed(iter/s)": 0.232297 }, { "epoch": 4.634118454150469, "grad_norm": 0.5790948867797852, "learning_rate": 1.315428437359706e-06, "loss": 0.0162195160984993, "memory(GiB)": 122.96, "step": 60795, "token_acc": 0.9913085465958474, "train_speed(iter/s)": 0.232303 }, { "epoch": 4.634499580760728, "grad_norm": 1.6790337562561035, "learning_rate": 1.3127014374308632e-06, "loss": 0.022939407825469972, "memory(GiB)": 122.96, "step": 60800, "token_acc": 0.9886965736488873, "train_speed(iter/s)": 0.232306 }, { "epoch": 4.634499580760728, "eval_loss": 0.047551922500133514, "eval_runtime": 222.8452, "eval_samples_per_second": 2.378, "eval_steps_per_second": 2.378, "eval_token_acc": 0.9809348834407566, "step": 60800 }, { "epoch": 4.634880707370988, "grad_norm": 0.6930747032165527, "learning_rate": 1.3099772294926594e-06, "loss": 0.026609230041503906, "memory(GiB)": 122.96, "step": 60805, "token_acc": 0.9811366723761148, "train_speed(iter/s)": 0.232114 }, { "epoch": 4.635261833981248, "grad_norm": 0.9142654538154602, "learning_rate": 1.3072558137013135e-06, "loss": 0.025716793537139893, "memory(GiB)": 122.96, "step": 60810, "token_acc": 0.9916365280289331, "train_speed(iter/s)": 0.232119 }, { "epoch": 4.635642960591508, "grad_norm": 0.7066440582275391, "learning_rate": 1.3045371902128846e-06, "loss": 0.03184525370597839, "memory(GiB)": 122.96, "step": 60815, "token_acc": 0.9872913032643907, "train_speed(iter/s)": 0.23212 }, { "epoch": 4.636024087201768, "grad_norm": 0.7985535860061646, "learning_rate": 1.3018213591832696e-06, "loss": 0.035155081748962404, "memory(GiB)": 122.96, "step": 60820, "token_acc": 0.9871506986027944, "train_speed(iter/s)": 0.232121 }, { "epoch": 4.636405213812028, "grad_norm": 1.1715788841247559, "learning_rate": 1.2991083207682164e-06, "loss": 0.02949814796447754, "memory(GiB)": 122.96, "step": 60825, "token_acc": 0.990303648890023, "train_speed(iter/s)": 0.232126 }, { "epoch": 4.636786340422288, "grad_norm": 0.8028376698493958, "learning_rate": 1.2963980751233008e-06, "loss": 0.016097447276115416, "memory(GiB)": 122.96, "step": 60830, "token_acc": 0.9922572960095295, "train_speed(iter/s)": 0.23213 }, { "epoch": 4.6371674670325485, "grad_norm": 1.7293095588684082, "learning_rate": 1.293690622403937e-06, "loss": 0.01777866929769516, "memory(GiB)": 122.96, "step": 60835, "token_acc": 0.9933520838660189, "train_speed(iter/s)": 0.232131 }, { "epoch": 4.6375485936428085, "grad_norm": 0.3175983726978302, "learning_rate": 1.290985962765401e-06, "loss": 0.04016014039516449, "memory(GiB)": 122.96, "step": 60840, "token_acc": 0.9799574855754631, "train_speed(iter/s)": 0.232137 }, { "epoch": 4.637929720253068, "grad_norm": 1.547991394996643, "learning_rate": 1.288284096362774e-06, "loss": 0.0553737998008728, "memory(GiB)": 122.96, "step": 60845, "token_acc": 0.9822012350163458, "train_speed(iter/s)": 0.232143 }, { "epoch": 4.638310846863328, "grad_norm": 1.1655960083007812, "learning_rate": 1.2855850233510158e-06, "loss": 0.04579851031303406, "memory(GiB)": 122.96, "step": 60850, "token_acc": 0.9830600108127591, "train_speed(iter/s)": 0.232147 }, { "epoch": 4.638691973473588, "grad_norm": 1.2653907537460327, "learning_rate": 1.2828887438848802e-06, "loss": 0.03539060652256012, "memory(GiB)": 122.96, "step": 60855, "token_acc": 0.9882677708764666, "train_speed(iter/s)": 0.232151 }, { "epoch": 4.639073100083848, "grad_norm": 0.8968381881713867, "learning_rate": 1.2801952581190047e-06, "loss": 0.025895309448242188, "memory(GiB)": 122.96, "step": 60860, "token_acc": 0.991690635985938, "train_speed(iter/s)": 0.232154 }, { "epoch": 4.639454226694108, "grad_norm": 1.7907832860946655, "learning_rate": 1.2775045662078433e-06, "loss": 0.025793179869651794, "memory(GiB)": 122.96, "step": 60865, "token_acc": 0.9869997968718262, "train_speed(iter/s)": 0.232158 }, { "epoch": 4.639835353304368, "grad_norm": 1.6447900533676147, "learning_rate": 1.274816668305695e-06, "loss": 0.03707886040210724, "memory(GiB)": 122.96, "step": 60870, "token_acc": 0.9862436210339472, "train_speed(iter/s)": 0.232163 }, { "epoch": 4.640216479914628, "grad_norm": 1.0288949012756348, "learning_rate": 1.2721315645667032e-06, "loss": 0.02903560996055603, "memory(GiB)": 122.96, "step": 60875, "token_acc": 0.9875157806143919, "train_speed(iter/s)": 0.232165 }, { "epoch": 4.640597606524888, "grad_norm": 1.0418990850448608, "learning_rate": 1.269449255144839e-06, "loss": 0.03530450463294983, "memory(GiB)": 122.96, "step": 60880, "token_acc": 0.98761835396941, "train_speed(iter/s)": 0.232169 }, { "epoch": 4.640978733135148, "grad_norm": 0.7457214593887329, "learning_rate": 1.2667697401939183e-06, "loss": 0.015837322175502776, "memory(GiB)": 122.96, "step": 60885, "token_acc": 0.9949220166848023, "train_speed(iter/s)": 0.232175 }, { "epoch": 4.641359859745408, "grad_norm": 1.5198224782943726, "learning_rate": 1.2640930198676125e-06, "loss": 0.020224574208259582, "memory(GiB)": 122.96, "step": 60890, "token_acc": 0.9916651567312919, "train_speed(iter/s)": 0.232179 }, { "epoch": 4.641740986355668, "grad_norm": 1.3879709243774414, "learning_rate": 1.2614190943193993e-06, "loss": 0.024478282034397125, "memory(GiB)": 122.96, "step": 60895, "token_acc": 0.9908485856905158, "train_speed(iter/s)": 0.232186 }, { "epoch": 4.642122112965927, "grad_norm": 2.139388084411621, "learning_rate": 1.2587479637026335e-06, "loss": 0.0459387868642807, "memory(GiB)": 122.96, "step": 60900, "token_acc": 0.9815279361459521, "train_speed(iter/s)": 0.232189 }, { "epoch": 4.642503239576187, "grad_norm": 0.6582933068275452, "learning_rate": 1.2560796281704867e-06, "loss": 0.021988961100578307, "memory(GiB)": 122.96, "step": 60905, "token_acc": 0.9897020458602225, "train_speed(iter/s)": 0.232191 }, { "epoch": 4.642884366186447, "grad_norm": 0.9625617861747742, "learning_rate": 1.2534140878759647e-06, "loss": 0.03884890675544739, "memory(GiB)": 122.96, "step": 60910, "token_acc": 0.9848942598187311, "train_speed(iter/s)": 0.232192 }, { "epoch": 4.643265492796707, "grad_norm": 0.8598637580871582, "learning_rate": 1.2507513429719397e-06, "loss": 0.020272910594940186, "memory(GiB)": 122.96, "step": 60915, "token_acc": 0.9932046332046333, "train_speed(iter/s)": 0.232194 }, { "epoch": 4.643646619406967, "grad_norm": 0.7010855674743652, "learning_rate": 1.2480913936111006e-06, "loss": 0.019646824896335603, "memory(GiB)": 122.96, "step": 60920, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.2322 }, { "epoch": 4.644027746017227, "grad_norm": 1.77031409740448, "learning_rate": 1.2454342399459806e-06, "loss": 0.029390883445739747, "memory(GiB)": 122.96, "step": 60925, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.232207 }, { "epoch": 4.644408872627487, "grad_norm": 1.978651523590088, "learning_rate": 1.2427798821289583e-06, "loss": 0.030956292152404787, "memory(GiB)": 122.96, "step": 60930, "token_acc": 0.9854848672019765, "train_speed(iter/s)": 0.232212 }, { "epoch": 4.644789999237747, "grad_norm": 4.312848091125488, "learning_rate": 1.2401283203122505e-06, "loss": 0.024297848343849182, "memory(GiB)": 122.96, "step": 60935, "token_acc": 0.9879281302638967, "train_speed(iter/s)": 0.232218 }, { "epoch": 4.645171125848007, "grad_norm": 2.0148231983184814, "learning_rate": 1.237479554647908e-06, "loss": 0.0364683598279953, "memory(GiB)": 122.96, "step": 60940, "token_acc": 0.9866873751941424, "train_speed(iter/s)": 0.232223 }, { "epoch": 4.645552252458266, "grad_norm": 0.7453345060348511, "learning_rate": 1.2348335852878313e-06, "loss": 0.033798706531524655, "memory(GiB)": 122.96, "step": 60945, "token_acc": 0.9863251155624037, "train_speed(iter/s)": 0.232221 }, { "epoch": 4.645933379068526, "grad_norm": 2.073413133621216, "learning_rate": 1.2321904123837491e-06, "loss": 0.0502602219581604, "memory(GiB)": 122.96, "step": 60950, "token_acc": 0.980480187390201, "train_speed(iter/s)": 0.232224 }, { "epoch": 4.646314505678786, "grad_norm": 1.3760384321212769, "learning_rate": 1.2295500360872347e-06, "loss": 0.022798720002174377, "memory(GiB)": 122.96, "step": 60955, "token_acc": 0.9896678966789668, "train_speed(iter/s)": 0.23223 }, { "epoch": 4.646695632289046, "grad_norm": 2.8886334896087646, "learning_rate": 1.2269124565497004e-06, "loss": 0.040892386436462404, "memory(GiB)": 122.96, "step": 60960, "token_acc": 0.9894756984309223, "train_speed(iter/s)": 0.232234 }, { "epoch": 4.647076758899306, "grad_norm": 1.017041563987732, "learning_rate": 1.2242776739224082e-06, "loss": 0.013414829969406128, "memory(GiB)": 122.96, "step": 60965, "token_acc": 0.9951541850220265, "train_speed(iter/s)": 0.23224 }, { "epoch": 4.647457885509566, "grad_norm": 0.5610625147819519, "learning_rate": 1.2216456883564431e-06, "loss": 0.02452038824558258, "memory(GiB)": 122.96, "step": 60970, "token_acc": 0.9892431809450634, "train_speed(iter/s)": 0.232246 }, { "epoch": 4.647839012119826, "grad_norm": 1.9398943185806274, "learning_rate": 1.2190165000027343e-06, "loss": 0.037284481525421145, "memory(GiB)": 122.96, "step": 60975, "token_acc": 0.9802823758519961, "train_speed(iter/s)": 0.232251 }, { "epoch": 4.648220138730086, "grad_norm": 1.347019076347351, "learning_rate": 1.216390109012061e-06, "loss": 0.033004969358444214, "memory(GiB)": 122.96, "step": 60980, "token_acc": 0.9862897765741367, "train_speed(iter/s)": 0.232254 }, { "epoch": 4.648601265340346, "grad_norm": 0.9502182006835938, "learning_rate": 1.2137665155350308e-06, "loss": 0.02264381796121597, "memory(GiB)": 122.96, "step": 60985, "token_acc": 0.9926324737886086, "train_speed(iter/s)": 0.232259 }, { "epoch": 4.648982391950606, "grad_norm": 1.3928273916244507, "learning_rate": 1.2111457197220955e-06, "loss": 0.04179536998271942, "memory(GiB)": 122.96, "step": 60990, "token_acc": 0.9864983342100648, "train_speed(iter/s)": 0.232263 }, { "epoch": 4.649363518560866, "grad_norm": 1.5768187046051025, "learning_rate": 1.2085277217235402e-06, "loss": 0.027929714322090148, "memory(GiB)": 122.96, "step": 60995, "token_acc": 0.9833887043189369, "train_speed(iter/s)": 0.232268 }, { "epoch": 4.6497446451711255, "grad_norm": 1.1759685277938843, "learning_rate": 1.2059125216895062e-06, "loss": 0.027248209714889525, "memory(GiB)": 122.96, "step": 61000, "token_acc": 0.9896519285042333, "train_speed(iter/s)": 0.232273 }, { "epoch": 4.6497446451711255, "eval_loss": 0.047376252710819244, "eval_runtime": 221.0618, "eval_samples_per_second": 2.398, "eval_steps_per_second": 2.398, "eval_token_acc": 0.9808897054394313, "step": 61000 }, { "epoch": 4.6501257717813855, "grad_norm": 0.17850451171398163, "learning_rate": 1.2033001197699566e-06, "loss": 0.013813818991184234, "memory(GiB)": 122.96, "step": 61005, "token_acc": 0.9811182658968084, "train_speed(iter/s)": 0.232083 }, { "epoch": 4.6505068983916455, "grad_norm": 2.2087550163269043, "learning_rate": 1.2006905161146998e-06, "loss": 0.0296069860458374, "memory(GiB)": 122.96, "step": 61010, "token_acc": 0.9851718714895529, "train_speed(iter/s)": 0.232088 }, { "epoch": 4.650888025001906, "grad_norm": 0.5064737200737, "learning_rate": 1.1980837108733822e-06, "loss": 0.022315962612628935, "memory(GiB)": 122.96, "step": 61015, "token_acc": 0.9928017718715393, "train_speed(iter/s)": 0.232095 }, { "epoch": 4.651269151612166, "grad_norm": 1.6522910594940186, "learning_rate": 1.1954797041955012e-06, "loss": 0.03992014229297638, "memory(GiB)": 122.96, "step": 61020, "token_acc": 0.9831450268568254, "train_speed(iter/s)": 0.232098 }, { "epoch": 4.651650278222426, "grad_norm": 1.1453275680541992, "learning_rate": 1.1928784962303762e-06, "loss": 0.022047913074493407, "memory(GiB)": 122.96, "step": 61025, "token_acc": 0.9913534497970707, "train_speed(iter/s)": 0.232101 }, { "epoch": 4.652031404832686, "grad_norm": 1.0719425678253174, "learning_rate": 1.1902800871271768e-06, "loss": 0.037893146276474, "memory(GiB)": 122.96, "step": 61030, "token_acc": 0.9875901875901876, "train_speed(iter/s)": 0.232107 }, { "epoch": 4.652412531442946, "grad_norm": 1.3731259107589722, "learning_rate": 1.1876844770349115e-06, "loss": 0.0212628573179245, "memory(GiB)": 122.96, "step": 61035, "token_acc": 0.9911605360707157, "train_speed(iter/s)": 0.232112 }, { "epoch": 4.652793658053206, "grad_norm": 0.6540417671203613, "learning_rate": 1.185091666102417e-06, "loss": 0.036983382701873777, "memory(GiB)": 122.96, "step": 61040, "token_acc": 0.9862330407023144, "train_speed(iter/s)": 0.232116 }, { "epoch": 4.653174784663465, "grad_norm": 0.9115381836891174, "learning_rate": 1.1825016544783906e-06, "loss": 0.04454312920570373, "memory(GiB)": 122.96, "step": 61045, "token_acc": 0.9820701792982071, "train_speed(iter/s)": 0.232116 }, { "epoch": 4.653555911273725, "grad_norm": 0.41214799880981445, "learning_rate": 1.1799144423113583e-06, "loss": 0.02541455626487732, "memory(GiB)": 122.96, "step": 61050, "token_acc": 0.9872162221732422, "train_speed(iter/s)": 0.232121 }, { "epoch": 4.653937037883985, "grad_norm": 0.229727640748024, "learning_rate": 1.1773300297496736e-06, "loss": 0.01942000985145569, "memory(GiB)": 122.96, "step": 61055, "token_acc": 0.9923165578179024, "train_speed(iter/s)": 0.232123 }, { "epoch": 4.654318164494245, "grad_norm": 1.1289477348327637, "learning_rate": 1.1747484169415568e-06, "loss": 0.028842803835868836, "memory(GiB)": 122.96, "step": 61060, "token_acc": 0.9875073199297286, "train_speed(iter/s)": 0.232126 }, { "epoch": 4.654699291104505, "grad_norm": 0.8330904841423035, "learning_rate": 1.1721696040350282e-06, "loss": 0.02264593541622162, "memory(GiB)": 122.96, "step": 61065, "token_acc": 0.9893727859970827, "train_speed(iter/s)": 0.232129 }, { "epoch": 4.655080417714765, "grad_norm": 0.6046236753463745, "learning_rate": 1.169593591177992e-06, "loss": 0.02747665047645569, "memory(GiB)": 122.96, "step": 61070, "token_acc": 0.9913860610806577, "train_speed(iter/s)": 0.232132 }, { "epoch": 4.655461544325025, "grad_norm": 1.0319052934646606, "learning_rate": 1.1670203785181633e-06, "loss": 0.037145569920539856, "memory(GiB)": 122.96, "step": 61075, "token_acc": 0.9872260488415779, "train_speed(iter/s)": 0.232133 }, { "epoch": 4.655842670935285, "grad_norm": 1.1024092435836792, "learning_rate": 1.164449966203096e-06, "loss": 0.03279047906398773, "memory(GiB)": 122.96, "step": 61080, "token_acc": 0.9880376344086022, "train_speed(iter/s)": 0.232133 }, { "epoch": 4.656223797545545, "grad_norm": 0.6266564726829529, "learning_rate": 1.1618823543802005e-06, "loss": 0.02527884840965271, "memory(GiB)": 122.96, "step": 61085, "token_acc": 0.9890818858560794, "train_speed(iter/s)": 0.232138 }, { "epoch": 4.656604924155804, "grad_norm": 1.9543403387069702, "learning_rate": 1.1593175431967252e-06, "loss": 0.018741035461425783, "memory(GiB)": 122.96, "step": 61090, "token_acc": 0.9911221590909091, "train_speed(iter/s)": 0.232141 }, { "epoch": 4.656986050766065, "grad_norm": 0.9495273232460022, "learning_rate": 1.1567555327997303e-06, "loss": 0.04945862293243408, "memory(GiB)": 122.96, "step": 61095, "token_acc": 0.9861270022883295, "train_speed(iter/s)": 0.232142 }, { "epoch": 4.657367177376324, "grad_norm": 1.0158766508102417, "learning_rate": 1.1541963233361486e-06, "loss": 0.021300849318504334, "memory(GiB)": 122.96, "step": 61100, "token_acc": 0.9906017886918296, "train_speed(iter/s)": 0.232146 }, { "epoch": 4.657748303986584, "grad_norm": 1.1285429000854492, "learning_rate": 1.1516399149527347e-06, "loss": 0.040990900993347165, "memory(GiB)": 122.96, "step": 61105, "token_acc": 0.9819909954977488, "train_speed(iter/s)": 0.23215 }, { "epoch": 4.658129430596844, "grad_norm": 1.8849905729293823, "learning_rate": 1.1490863077960879e-06, "loss": 0.040176373720169065, "memory(GiB)": 122.96, "step": 61110, "token_acc": 0.9830719442369927, "train_speed(iter/s)": 0.232155 }, { "epoch": 4.658510557207104, "grad_norm": 1.1330013275146484, "learning_rate": 1.146535502012641e-06, "loss": 0.017493507266044615, "memory(GiB)": 122.96, "step": 61115, "token_acc": 0.9961146187469645, "train_speed(iter/s)": 0.232161 }, { "epoch": 4.658891683817364, "grad_norm": 0.952081561088562, "learning_rate": 1.1439874977486774e-06, "loss": 0.04211142063140869, "memory(GiB)": 122.96, "step": 61120, "token_acc": 0.9828678713916921, "train_speed(iter/s)": 0.232165 }, { "epoch": 4.659272810427624, "grad_norm": 1.5412325859069824, "learning_rate": 1.1414422951503134e-06, "loss": 0.017467445135116576, "memory(GiB)": 122.96, "step": 61125, "token_acc": 0.9922330097087378, "train_speed(iter/s)": 0.232172 }, { "epoch": 4.659653937037884, "grad_norm": 0.6203048825263977, "learning_rate": 1.1388998943635042e-06, "loss": 0.02607642114162445, "memory(GiB)": 122.96, "step": 61130, "token_acc": 0.98673505482844, "train_speed(iter/s)": 0.232174 }, { "epoch": 4.660035063648144, "grad_norm": 1.8143306970596313, "learning_rate": 1.1363602955340391e-06, "loss": 0.028908094763755797, "memory(GiB)": 122.96, "step": 61135, "token_acc": 0.9919137466307277, "train_speed(iter/s)": 0.23218 }, { "epoch": 4.660416190258404, "grad_norm": 2.3708927631378174, "learning_rate": 1.133823498807557e-06, "loss": 0.03409013748168945, "memory(GiB)": 122.96, "step": 61140, "token_acc": 0.988360814742968, "train_speed(iter/s)": 0.232184 }, { "epoch": 4.660797316868663, "grad_norm": 0.5703150033950806, "learning_rate": 1.1312895043295357e-06, "loss": 0.01564977169036865, "memory(GiB)": 122.96, "step": 61145, "token_acc": 0.9927613941018767, "train_speed(iter/s)": 0.232188 }, { "epoch": 4.661178443478923, "grad_norm": 0.740776002407074, "learning_rate": 1.128758312245276e-06, "loss": 0.03463201820850372, "memory(GiB)": 122.96, "step": 61150, "token_acc": 0.9847011952191235, "train_speed(iter/s)": 0.23219 }, { "epoch": 4.661559570089183, "grad_norm": 0.9579795002937317, "learning_rate": 1.1262299226999396e-06, "loss": 0.029874277114868165, "memory(GiB)": 122.96, "step": 61155, "token_acc": 0.9853244078269825, "train_speed(iter/s)": 0.232194 }, { "epoch": 4.661940696699443, "grad_norm": 3.4181885719299316, "learning_rate": 1.1237043358385157e-06, "loss": 0.05609452724456787, "memory(GiB)": 122.96, "step": 61160, "token_acc": 0.9834352198777362, "train_speed(iter/s)": 0.232198 }, { "epoch": 4.6623218233097035, "grad_norm": 1.467902660369873, "learning_rate": 1.1211815518058333e-06, "loss": 0.016769374907016753, "memory(GiB)": 122.96, "step": 61165, "token_acc": 0.9940938762822505, "train_speed(iter/s)": 0.232203 }, { "epoch": 4.6627029499199635, "grad_norm": 2.0438809394836426, "learning_rate": 1.1186615707465652e-06, "loss": 0.030275991559028624, "memory(GiB)": 122.96, "step": 61170, "token_acc": 0.989075018208303, "train_speed(iter/s)": 0.232207 }, { "epoch": 4.6630840765302235, "grad_norm": 1.8113023042678833, "learning_rate": 1.1161443928052184e-06, "loss": 0.024242308735847474, "memory(GiB)": 122.96, "step": 61175, "token_acc": 0.9880014998125234, "train_speed(iter/s)": 0.232214 }, { "epoch": 4.6634652031404835, "grad_norm": 1.5368345975875854, "learning_rate": 1.1136300181261383e-06, "loss": 0.04363790154457092, "memory(GiB)": 122.96, "step": 61180, "token_acc": 0.9835504885993486, "train_speed(iter/s)": 0.232216 }, { "epoch": 4.6638463297507435, "grad_norm": 1.0030744075775146, "learning_rate": 1.1111184468535208e-06, "loss": 0.026553583145141602, "memory(GiB)": 122.96, "step": 61185, "token_acc": 0.9887096774193549, "train_speed(iter/s)": 0.232221 }, { "epoch": 4.664227456361003, "grad_norm": 0.9807528257369995, "learning_rate": 1.1086096791313893e-06, "loss": 0.01515219807624817, "memory(GiB)": 122.96, "step": 61190, "token_acc": 0.9946470391435263, "train_speed(iter/s)": 0.232226 }, { "epoch": 4.664608582971263, "grad_norm": 0.7072246074676514, "learning_rate": 1.1061037151036014e-06, "loss": 0.027915796637535094, "memory(GiB)": 122.96, "step": 61195, "token_acc": 0.9888629176405884, "train_speed(iter/s)": 0.232228 }, { "epoch": 4.664989709581523, "grad_norm": 0.6890553832054138, "learning_rate": 1.103600554913875e-06, "loss": 0.02189536690711975, "memory(GiB)": 122.96, "step": 61200, "token_acc": 0.9909528673573618, "train_speed(iter/s)": 0.232231 }, { "epoch": 4.664989709581523, "eval_loss": 0.04738510400056839, "eval_runtime": 221.98, "eval_samples_per_second": 2.388, "eval_steps_per_second": 2.388, "eval_token_acc": 0.9809047647732064, "step": 61200 }, { "epoch": 4.665370836191783, "grad_norm": 1.0901367664337158, "learning_rate": 1.1011001987057512e-06, "loss": 0.026385876536369323, "memory(GiB)": 122.96, "step": 61205, "token_acc": 0.9812427405798879, "train_speed(iter/s)": 0.232038 }, { "epoch": 4.665751962802043, "grad_norm": 1.1585325002670288, "learning_rate": 1.0986026466226097e-06, "loss": 0.03670257329940796, "memory(GiB)": 122.96, "step": 61210, "token_acc": 0.9861895794099184, "train_speed(iter/s)": 0.232043 }, { "epoch": 4.666133089412303, "grad_norm": 0.689922571182251, "learning_rate": 1.0961078988076745e-06, "loss": 0.021965204179286955, "memory(GiB)": 122.96, "step": 61215, "token_acc": 0.9938735177865613, "train_speed(iter/s)": 0.232047 }, { "epoch": 4.666514216022563, "grad_norm": 0.7392595410346985, "learning_rate": 1.0936159554040148e-06, "loss": 0.01721942126750946, "memory(GiB)": 122.96, "step": 61220, "token_acc": 0.993414863593603, "train_speed(iter/s)": 0.23205 }, { "epoch": 4.666895342632823, "grad_norm": 0.5252753496170044, "learning_rate": 1.0911268165545219e-06, "loss": 0.029862654209136964, "memory(GiB)": 122.96, "step": 61225, "token_acc": 0.9900788257678717, "train_speed(iter/s)": 0.232051 }, { "epoch": 4.667276469243083, "grad_norm": 0.8966323137283325, "learning_rate": 1.0886404824019425e-06, "loss": 0.022389745712280272, "memory(GiB)": 122.96, "step": 61230, "token_acc": 0.989577905158937, "train_speed(iter/s)": 0.232054 }, { "epoch": 4.667657595853343, "grad_norm": 1.6428639888763428, "learning_rate": 1.086156953088857e-06, "loss": 0.038077926635742186, "memory(GiB)": 122.96, "step": 61235, "token_acc": 0.9847730600292826, "train_speed(iter/s)": 0.232058 }, { "epoch": 4.668038722463603, "grad_norm": 0.19019271433353424, "learning_rate": 1.0836762287576795e-06, "loss": 0.03622758090496063, "memory(GiB)": 122.96, "step": 61240, "token_acc": 0.9807288016818501, "train_speed(iter/s)": 0.232064 }, { "epoch": 4.668419849073862, "grad_norm": 0.5325716733932495, "learning_rate": 1.0811983095506794e-06, "loss": 0.032621186971664426, "memory(GiB)": 122.96, "step": 61245, "token_acc": 0.9905911592401917, "train_speed(iter/s)": 0.232067 }, { "epoch": 4.668800975684122, "grad_norm": 0.9472758173942566, "learning_rate": 1.0787231956099375e-06, "loss": 0.024635913968086242, "memory(GiB)": 122.96, "step": 61250, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.23207 }, { "epoch": 4.669182102294382, "grad_norm": 0.8344020247459412, "learning_rate": 1.0762508870773902e-06, "loss": 0.024257178604602813, "memory(GiB)": 122.96, "step": 61255, "token_acc": 0.988772272394435, "train_speed(iter/s)": 0.232076 }, { "epoch": 4.669563228904642, "grad_norm": 2.424053192138672, "learning_rate": 1.0737813840948296e-06, "loss": 0.0257768452167511, "memory(GiB)": 122.96, "step": 61260, "token_acc": 0.9901800327332242, "train_speed(iter/s)": 0.232082 }, { "epoch": 4.669944355514902, "grad_norm": 2.4174084663391113, "learning_rate": 1.0713146868038592e-06, "loss": 0.03682131767272949, "memory(GiB)": 122.96, "step": 61265, "token_acc": 0.9886601535240753, "train_speed(iter/s)": 0.232085 }, { "epoch": 4.670325482125162, "grad_norm": 0.9286627173423767, "learning_rate": 1.0688507953459382e-06, "loss": 0.02891078293323517, "memory(GiB)": 122.96, "step": 61270, "token_acc": 0.9863644412962635, "train_speed(iter/s)": 0.232088 }, { "epoch": 4.670706608735422, "grad_norm": 1.3224529027938843, "learning_rate": 1.0663897098623531e-06, "loss": 0.02586783766746521, "memory(GiB)": 122.96, "step": 61275, "token_acc": 0.9875393494228751, "train_speed(iter/s)": 0.232089 }, { "epoch": 4.671087735345682, "grad_norm": 1.2540982961654663, "learning_rate": 1.0639314304942416e-06, "loss": 0.026846295595169066, "memory(GiB)": 122.96, "step": 61280, "token_acc": 0.9905482041587902, "train_speed(iter/s)": 0.232093 }, { "epoch": 4.671468861955942, "grad_norm": 1.5355980396270752, "learning_rate": 1.0614759573825684e-06, "loss": 0.026610863208770753, "memory(GiB)": 122.96, "step": 61285, "token_acc": 0.99265750828991, "train_speed(iter/s)": 0.232097 }, { "epoch": 4.671849988566201, "grad_norm": 0.9595994353294373, "learning_rate": 1.0590232906681486e-06, "loss": 0.03639009296894073, "memory(GiB)": 122.96, "step": 61290, "token_acc": 0.9873070325900515, "train_speed(iter/s)": 0.232103 }, { "epoch": 4.672231115176461, "grad_norm": 0.8145761489868164, "learning_rate": 1.0565734304916253e-06, "loss": 0.02731316089630127, "memory(GiB)": 122.96, "step": 61295, "token_acc": 0.987745376148871, "train_speed(iter/s)": 0.232104 }, { "epoch": 4.672612241786721, "grad_norm": 0.8485201597213745, "learning_rate": 1.0541263769934972e-06, "loss": 0.03316416144371033, "memory(GiB)": 122.96, "step": 61300, "token_acc": 0.9871231155778895, "train_speed(iter/s)": 0.232109 }, { "epoch": 4.672993368396981, "grad_norm": 1.3438727855682373, "learning_rate": 1.05168213031408e-06, "loss": 0.038492798805236816, "memory(GiB)": 122.96, "step": 61305, "token_acc": 0.9846529473317056, "train_speed(iter/s)": 0.232112 }, { "epoch": 4.673374495007241, "grad_norm": 1.0703741312026978, "learning_rate": 1.0492406905935393e-06, "loss": 0.027751418948173522, "memory(GiB)": 122.96, "step": 61310, "token_acc": 0.9852766563761577, "train_speed(iter/s)": 0.232117 }, { "epoch": 4.673755621617501, "grad_norm": 1.1000903844833374, "learning_rate": 1.0468020579718962e-06, "loss": 0.021775977313518526, "memory(GiB)": 122.96, "step": 61315, "token_acc": 0.9903732477622024, "train_speed(iter/s)": 0.23212 }, { "epoch": 4.674136748227761, "grad_norm": 0.29717740416526794, "learning_rate": 1.044366232588978e-06, "loss": 0.036657577753067015, "memory(GiB)": 122.96, "step": 61320, "token_acc": 0.9908496732026144, "train_speed(iter/s)": 0.232125 }, { "epoch": 4.674517874838021, "grad_norm": 0.17276863753795624, "learning_rate": 1.041933214584473e-06, "loss": 0.03333697617053986, "memory(GiB)": 122.96, "step": 61325, "token_acc": 0.9834857723577236, "train_speed(iter/s)": 0.232131 }, { "epoch": 4.674899001448281, "grad_norm": 1.5690613985061646, "learning_rate": 1.0395030040979137e-06, "loss": 0.05338284969329834, "memory(GiB)": 122.96, "step": 61330, "token_acc": 0.976399814900509, "train_speed(iter/s)": 0.232136 }, { "epoch": 4.675280128058541, "grad_norm": 0.8349639773368835, "learning_rate": 1.0370756012686444e-06, "loss": 0.04203073084354401, "memory(GiB)": 122.96, "step": 61335, "token_acc": 0.9893213099193165, "train_speed(iter/s)": 0.232141 }, { "epoch": 4.6756612546688014, "grad_norm": 0.5867727398872375, "learning_rate": 1.034651006235876e-06, "loss": 0.03405931890010834, "memory(GiB)": 122.96, "step": 61340, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.232144 }, { "epoch": 4.676042381279061, "grad_norm": 0.7253621220588684, "learning_rate": 1.0322292191386473e-06, "loss": 0.023121434450149535, "memory(GiB)": 122.96, "step": 61345, "token_acc": 0.9916302765647744, "train_speed(iter/s)": 0.232149 }, { "epoch": 4.676423507889321, "grad_norm": 2.2306067943573, "learning_rate": 1.029810240115836e-06, "loss": 0.03506519198417664, "memory(GiB)": 122.96, "step": 61350, "token_acc": 0.9884348496530455, "train_speed(iter/s)": 0.232153 }, { "epoch": 4.676804634499581, "grad_norm": 1.200028419494629, "learning_rate": 1.0273940693061534e-06, "loss": 0.023531691730022432, "memory(GiB)": 122.96, "step": 61355, "token_acc": 0.9890145395799677, "train_speed(iter/s)": 0.232157 }, { "epoch": 4.677185761109841, "grad_norm": 1.9438990354537964, "learning_rate": 1.0249807068481665e-06, "loss": 0.038467228412628174, "memory(GiB)": 122.96, "step": 61360, "token_acc": 0.985115677074907, "train_speed(iter/s)": 0.232161 }, { "epoch": 4.677566887720101, "grad_norm": 0.5216359496116638, "learning_rate": 1.0225701528802535e-06, "loss": 0.029366058111190797, "memory(GiB)": 122.96, "step": 61365, "token_acc": 0.9906449044585988, "train_speed(iter/s)": 0.232164 }, { "epoch": 4.677948014330361, "grad_norm": 0.6416053771972656, "learning_rate": 1.0201624075406646e-06, "loss": 0.02630408704280853, "memory(GiB)": 122.96, "step": 61370, "token_acc": 0.9892794376098418, "train_speed(iter/s)": 0.232167 }, { "epoch": 4.678329140940621, "grad_norm": 0.7418555617332458, "learning_rate": 1.017757470967473e-06, "loss": 0.02009653449058533, "memory(GiB)": 122.96, "step": 61375, "token_acc": 0.9917945058865502, "train_speed(iter/s)": 0.232171 }, { "epoch": 4.678710267550881, "grad_norm": 1.9875215291976929, "learning_rate": 1.0153553432985797e-06, "loss": 0.028872692584991456, "memory(GiB)": 122.96, "step": 61380, "token_acc": 0.9897869343194224, "train_speed(iter/s)": 0.232174 }, { "epoch": 4.679091394161141, "grad_norm": 0.8982096314430237, "learning_rate": 1.0129560246717408e-06, "loss": 0.02235586941242218, "memory(GiB)": 122.96, "step": 61385, "token_acc": 0.992741935483871, "train_speed(iter/s)": 0.232181 }, { "epoch": 4.6794725207714, "grad_norm": 4.214331150054932, "learning_rate": 1.0105595152245462e-06, "loss": 0.06918458342552185, "memory(GiB)": 122.96, "step": 61390, "token_acc": 0.9793353811784418, "train_speed(iter/s)": 0.232185 }, { "epoch": 4.67985364738166, "grad_norm": 1.053538203239441, "learning_rate": 1.008165815094425e-06, "loss": 0.034184446930885314, "memory(GiB)": 122.96, "step": 61395, "token_acc": 0.986873920552677, "train_speed(iter/s)": 0.232189 }, { "epoch": 4.68023477399192, "grad_norm": 1.1268681287765503, "learning_rate": 1.0057749244186455e-06, "loss": 0.024711443483829497, "memory(GiB)": 122.96, "step": 61400, "token_acc": 0.9901781635449978, "train_speed(iter/s)": 0.23219 }, { "epoch": 4.68023477399192, "eval_loss": 0.04719125106930733, "eval_runtime": 221.9428, "eval_samples_per_second": 2.388, "eval_steps_per_second": 2.388, "eval_token_acc": 0.9810704174447322, "step": 61400 }, { "epoch": 4.68061590060218, "grad_norm": 0.7273041605949402, "learning_rate": 1.0033868433343085e-06, "loss": 0.02062627673149109, "memory(GiB)": 122.96, "step": 61405, "token_acc": 0.9813245541514114, "train_speed(iter/s)": 0.232002 }, { "epoch": 4.68099702721244, "grad_norm": 1.6693743467330933, "learning_rate": 1.0010015719783717e-06, "loss": 0.028826135396957397, "memory(GiB)": 122.96, "step": 61410, "token_acc": 0.9868504772004242, "train_speed(iter/s)": 0.232005 }, { "epoch": 4.6813781538227, "grad_norm": 1.104300618171692, "learning_rate": 9.986191104876086e-07, "loss": 0.024704959988594056, "memory(GiB)": 122.96, "step": 61415, "token_acc": 0.9870359457866824, "train_speed(iter/s)": 0.232009 }, { "epoch": 4.68175928043296, "grad_norm": 0.456901878118515, "learning_rate": 9.962394589986435e-07, "loss": 0.029898211359977722, "memory(GiB)": 122.96, "step": 61420, "token_acc": 0.9906170031276657, "train_speed(iter/s)": 0.232014 }, { "epoch": 4.68214040704322, "grad_norm": 1.2525732517242432, "learning_rate": 9.938626176479393e-07, "loss": 0.01822896897792816, "memory(GiB)": 122.96, "step": 61425, "token_acc": 0.9933143907738593, "train_speed(iter/s)": 0.232017 }, { "epoch": 4.68252153365348, "grad_norm": 0.6975488662719727, "learning_rate": 9.914885865718038e-07, "loss": 0.059270888566970825, "memory(GiB)": 122.96, "step": 61430, "token_acc": 0.9832319579155022, "train_speed(iter/s)": 0.232019 }, { "epoch": 4.682902660263739, "grad_norm": 1.7342591285705566, "learning_rate": 9.891173659063667e-07, "loss": 0.050550436973571776, "memory(GiB)": 122.96, "step": 61435, "token_acc": 0.9802338530066815, "train_speed(iter/s)": 0.232024 }, { "epoch": 4.683283786873999, "grad_norm": 0.8611802458763123, "learning_rate": 9.867489557876085e-07, "loss": 0.0369092583656311, "memory(GiB)": 122.96, "step": 61440, "token_acc": 0.9866776703607639, "train_speed(iter/s)": 0.232025 }, { "epoch": 4.683664913484259, "grad_norm": 1.1343817710876465, "learning_rate": 9.843833563513537e-07, "loss": 0.022960680723190307, "memory(GiB)": 122.96, "step": 61445, "token_acc": 0.990264679038637, "train_speed(iter/s)": 0.232031 }, { "epoch": 4.684046040094519, "grad_norm": 2.3431215286254883, "learning_rate": 9.820205677332439e-07, "loss": 0.04980856478214264, "memory(GiB)": 122.96, "step": 61450, "token_acc": 0.9771542090886516, "train_speed(iter/s)": 0.232035 }, { "epoch": 4.684427166704779, "grad_norm": 0.9385314583778381, "learning_rate": 9.796605900687927e-07, "loss": 0.03279307186603546, "memory(GiB)": 122.96, "step": 61455, "token_acc": 0.9879109538552263, "train_speed(iter/s)": 0.232037 }, { "epoch": 4.684808293315039, "grad_norm": 1.1598470211029053, "learning_rate": 9.773034234933198e-07, "loss": 0.029305845499038696, "memory(GiB)": 122.96, "step": 61460, "token_acc": 0.9882713821725009, "train_speed(iter/s)": 0.23204 }, { "epoch": 4.685189419925299, "grad_norm": 1.0944265127182007, "learning_rate": 9.74949068142006e-07, "loss": 0.03499045968055725, "memory(GiB)": 122.96, "step": 61465, "token_acc": 0.9843612334801762, "train_speed(iter/s)": 0.232045 }, { "epoch": 4.685570546535559, "grad_norm": 1.1009502410888672, "learning_rate": 9.725975241498597e-07, "loss": 0.015461921691894531, "memory(GiB)": 122.96, "step": 61470, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.23205 }, { "epoch": 4.685951673145819, "grad_norm": 0.6325932741165161, "learning_rate": 9.702487916517234e-07, "loss": 0.02363167405128479, "memory(GiB)": 122.96, "step": 61475, "token_acc": 0.9899982453061941, "train_speed(iter/s)": 0.232052 }, { "epoch": 4.686332799756079, "grad_norm": 0.751190721988678, "learning_rate": 9.679028707822945e-07, "loss": 0.029121458530426025, "memory(GiB)": 122.96, "step": 61480, "token_acc": 0.9870707070707071, "train_speed(iter/s)": 0.232055 }, { "epoch": 4.686713926366339, "grad_norm": 1.291039228439331, "learning_rate": 9.655597616761048e-07, "loss": 0.058616673946380614, "memory(GiB)": 122.96, "step": 61485, "token_acc": 0.9804778554778555, "train_speed(iter/s)": 0.232061 }, { "epoch": 4.6870950529765985, "grad_norm": 0.7196748852729797, "learning_rate": 9.63219464467513e-07, "loss": 0.04002287685871124, "memory(GiB)": 122.96, "step": 61490, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.232063 }, { "epoch": 4.6874761795868585, "grad_norm": 1.1394920349121094, "learning_rate": 9.60881979290723e-07, "loss": 0.031638532876968384, "memory(GiB)": 122.96, "step": 61495, "token_acc": 0.9886167644015178, "train_speed(iter/s)": 0.232068 }, { "epoch": 4.6878573061971185, "grad_norm": 1.2487534284591675, "learning_rate": 9.58547306279789e-07, "loss": 0.03853621780872345, "memory(GiB)": 122.96, "step": 61500, "token_acc": 0.9831132944427388, "train_speed(iter/s)": 0.232071 }, { "epoch": 4.6882384328073785, "grad_norm": 1.2918245792388916, "learning_rate": 9.562154455685813e-07, "loss": 0.022726066410541534, "memory(GiB)": 122.96, "step": 61505, "token_acc": 0.9902723735408561, "train_speed(iter/s)": 0.232073 }, { "epoch": 4.6886195594176385, "grad_norm": 1.2484921216964722, "learning_rate": 9.538863972908262e-07, "loss": 0.04696458578109741, "memory(GiB)": 122.96, "step": 61510, "token_acc": 0.9863518197573656, "train_speed(iter/s)": 0.232076 }, { "epoch": 4.6890006860278985, "grad_norm": 0.9017997980117798, "learning_rate": 9.515601615800895e-07, "loss": 0.035561764240264894, "memory(GiB)": 122.96, "step": 61515, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.232082 }, { "epoch": 4.689381812638159, "grad_norm": 0.7909307479858398, "learning_rate": 9.492367385697587e-07, "loss": 0.019288481771945955, "memory(GiB)": 122.96, "step": 61520, "token_acc": 0.9908616187989556, "train_speed(iter/s)": 0.232087 }, { "epoch": 4.689762939248419, "grad_norm": 0.697950005531311, "learning_rate": 9.469161283930827e-07, "loss": 0.022962921857833864, "memory(GiB)": 122.96, "step": 61525, "token_acc": 0.9920708095150286, "train_speed(iter/s)": 0.232091 }, { "epoch": 4.690144065858679, "grad_norm": 1.0322718620300293, "learning_rate": 9.445983311831275e-07, "loss": 0.02409725487232208, "memory(GiB)": 122.96, "step": 61530, "token_acc": 0.9911178509532063, "train_speed(iter/s)": 0.232095 }, { "epoch": 4.690525192468938, "grad_norm": 0.8391053080558777, "learning_rate": 9.422833470728142e-07, "loss": 0.026560297608375548, "memory(GiB)": 122.96, "step": 61535, "token_acc": 0.9891854578923148, "train_speed(iter/s)": 0.2321 }, { "epoch": 4.690906319079198, "grad_norm": 0.858946681022644, "learning_rate": 9.399711761949037e-07, "loss": 0.0274686336517334, "memory(GiB)": 122.96, "step": 61540, "token_acc": 0.9875753012048193, "train_speed(iter/s)": 0.232101 }, { "epoch": 4.691287445689458, "grad_norm": 0.8050089478492737, "learning_rate": 9.37661818681973e-07, "loss": 0.03988372981548309, "memory(GiB)": 122.96, "step": 61545, "token_acc": 0.9874794745484401, "train_speed(iter/s)": 0.232105 }, { "epoch": 4.691668572299718, "grad_norm": 0.7987444996833801, "learning_rate": 9.353552746664551e-07, "loss": 0.016304290294647215, "memory(GiB)": 122.96, "step": 61550, "token_acc": 0.9921760391198045, "train_speed(iter/s)": 0.232107 }, { "epoch": 4.692049698909978, "grad_norm": 0.6347752213478088, "learning_rate": 9.330515442806331e-07, "loss": 0.03597458600997925, "memory(GiB)": 122.96, "step": 61555, "token_acc": 0.9870211429767637, "train_speed(iter/s)": 0.23211 }, { "epoch": 4.692430825520238, "grad_norm": 1.4847877025604248, "learning_rate": 9.307506276566014e-07, "loss": 0.041729781031608584, "memory(GiB)": 122.96, "step": 61560, "token_acc": 0.9824852507374632, "train_speed(iter/s)": 0.232113 }, { "epoch": 4.692811952130498, "grad_norm": 0.13778835535049438, "learning_rate": 9.284525249263154e-07, "loss": 0.01844359189271927, "memory(GiB)": 122.96, "step": 61565, "token_acc": 0.9899512789281364, "train_speed(iter/s)": 0.232118 }, { "epoch": 4.693193078740758, "grad_norm": 0.5915724635124207, "learning_rate": 9.261572362215587e-07, "loss": 0.018096770346164703, "memory(GiB)": 122.96, "step": 61570, "token_acc": 0.9925816023738873, "train_speed(iter/s)": 0.232121 }, { "epoch": 4.693574205351018, "grad_norm": 0.4606468975543976, "learning_rate": 9.238647616739483e-07, "loss": 0.034654590487480166, "memory(GiB)": 122.96, "step": 61575, "token_acc": 0.985, "train_speed(iter/s)": 0.232125 }, { "epoch": 4.693955331961278, "grad_norm": 0.6657158732414246, "learning_rate": 9.215751014149566e-07, "loss": 0.02218567132949829, "memory(GiB)": 122.96, "step": 61580, "token_acc": 0.9892358646794417, "train_speed(iter/s)": 0.232124 }, { "epoch": 4.694336458571538, "grad_norm": 0.9543592929840088, "learning_rate": 9.192882555758841e-07, "loss": 0.05128744840621948, "memory(GiB)": 122.96, "step": 61585, "token_acc": 0.984805510534846, "train_speed(iter/s)": 0.232127 }, { "epoch": 4.694717585181797, "grad_norm": 1.2065224647521973, "learning_rate": 9.170042242878596e-07, "loss": 0.014747908711433411, "memory(GiB)": 122.96, "step": 61590, "token_acc": 0.9949991934182932, "train_speed(iter/s)": 0.232131 }, { "epoch": 4.695098711792057, "grad_norm": 1.5719512701034546, "learning_rate": 9.147230076818725e-07, "loss": 0.015516871213912964, "memory(GiB)": 122.96, "step": 61595, "token_acc": 0.9927184466019418, "train_speed(iter/s)": 0.232137 }, { "epoch": 4.695479838402317, "grad_norm": 4.407609939575195, "learning_rate": 9.124446058887459e-07, "loss": 0.029218369722366334, "memory(GiB)": 122.96, "step": 61600, "token_acc": 0.9917369901547117, "train_speed(iter/s)": 0.232139 }, { "epoch": 4.695479838402317, "eval_loss": 0.047350626438856125, "eval_runtime": 219.8368, "eval_samples_per_second": 2.411, "eval_steps_per_second": 2.411, "eval_token_acc": 0.9810402987771821, "step": 61600 }, { "epoch": 4.695860965012577, "grad_norm": 1.0861201286315918, "learning_rate": 9.101690190391198e-07, "loss": 0.030333444476127625, "memory(GiB)": 122.96, "step": 61605, "token_acc": 0.9812552653748947, "train_speed(iter/s)": 0.231951 }, { "epoch": 4.696242091622837, "grad_norm": 0.7025170922279358, "learning_rate": 9.078962472634955e-07, "loss": 0.04480908215045929, "memory(GiB)": 122.96, "step": 61610, "token_acc": 0.9876783398184177, "train_speed(iter/s)": 0.231951 }, { "epoch": 4.696623218233097, "grad_norm": 1.9182344675064087, "learning_rate": 9.056262906922075e-07, "loss": 0.028415021300315858, "memory(GiB)": 122.96, "step": 61615, "token_acc": 0.988774341351661, "train_speed(iter/s)": 0.231956 }, { "epoch": 4.697004344843357, "grad_norm": 0.88592129945755, "learning_rate": 9.033591494554238e-07, "loss": 0.026820510625839233, "memory(GiB)": 122.96, "step": 61620, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.231961 }, { "epoch": 4.697385471453617, "grad_norm": 1.183349370956421, "learning_rate": 9.010948236831629e-07, "loss": 0.021524593234062195, "memory(GiB)": 122.96, "step": 61625, "token_acc": 0.9920017773828038, "train_speed(iter/s)": 0.231965 }, { "epoch": 4.697766598063877, "grad_norm": 1.3756611347198486, "learning_rate": 8.988333135052596e-07, "loss": 0.032225769758224485, "memory(GiB)": 122.96, "step": 61630, "token_acc": 0.9855916615573268, "train_speed(iter/s)": 0.23197 }, { "epoch": 4.698147724674136, "grad_norm": 0.15146058797836304, "learning_rate": 8.965746190514102e-07, "loss": 0.018706586956977845, "memory(GiB)": 122.96, "step": 61635, "token_acc": 0.989058039961941, "train_speed(iter/s)": 0.231977 }, { "epoch": 4.698528851284396, "grad_norm": 0.6771263480186462, "learning_rate": 8.943187404511444e-07, "loss": 0.03512516319751739, "memory(GiB)": 122.96, "step": 61640, "token_acc": 0.9879594423320659, "train_speed(iter/s)": 0.231982 }, { "epoch": 4.698909977894656, "grad_norm": 0.5780419707298279, "learning_rate": 8.920656778338144e-07, "loss": 0.031663113832473756, "memory(GiB)": 122.96, "step": 61645, "token_acc": 0.9877041823730031, "train_speed(iter/s)": 0.23198 }, { "epoch": 4.699291104504916, "grad_norm": 0.9707927703857422, "learning_rate": 8.898154313286277e-07, "loss": 0.020829975605010986, "memory(GiB)": 122.96, "step": 61650, "token_acc": 0.9905869324473976, "train_speed(iter/s)": 0.231983 }, { "epoch": 4.699672231115176, "grad_norm": 1.0742472410202026, "learning_rate": 8.875680010646314e-07, "loss": 0.03369962573051453, "memory(GiB)": 122.96, "step": 61655, "token_acc": 0.9812974868497955, "train_speed(iter/s)": 0.231987 }, { "epoch": 4.700053357725436, "grad_norm": 1.8568872213363647, "learning_rate": 8.853233871707001e-07, "loss": 0.03152759671211243, "memory(GiB)": 122.96, "step": 61660, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.231992 }, { "epoch": 4.700434484335696, "grad_norm": 0.8744524717330933, "learning_rate": 8.830815897755474e-07, "loss": 0.02084215134382248, "memory(GiB)": 122.96, "step": 61665, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.231999 }, { "epoch": 4.7008156109459565, "grad_norm": 1.023655652999878, "learning_rate": 8.808426090077426e-07, "loss": 0.030477452278137206, "memory(GiB)": 122.96, "step": 61670, "token_acc": 0.9873314926100374, "train_speed(iter/s)": 0.232 }, { "epoch": 4.7011967375562165, "grad_norm": 1.085789680480957, "learning_rate": 8.786064449956721e-07, "loss": 0.03263699114322662, "memory(GiB)": 122.96, "step": 61675, "token_acc": 0.9828211952576821, "train_speed(iter/s)": 0.232004 }, { "epoch": 4.701577864166476, "grad_norm": 1.58330237865448, "learning_rate": 8.763730978675721e-07, "loss": 0.02944439947605133, "memory(GiB)": 122.96, "step": 61680, "token_acc": 0.9859845830413455, "train_speed(iter/s)": 0.232009 }, { "epoch": 4.7019589907767365, "grad_norm": 1.9860066175460815, "learning_rate": 8.74142567751507e-07, "loss": 0.028304296731948852, "memory(GiB)": 122.96, "step": 61685, "token_acc": 0.9914371673223791, "train_speed(iter/s)": 0.232013 }, { "epoch": 4.702340117386996, "grad_norm": 2.2179653644561768, "learning_rate": 8.719148547753964e-07, "loss": 0.027554517984390257, "memory(GiB)": 122.96, "step": 61690, "token_acc": 0.9900526623756583, "train_speed(iter/s)": 0.232017 }, { "epoch": 4.702721243997256, "grad_norm": 1.190682291984558, "learning_rate": 8.696899590669938e-07, "loss": 0.027781599760055543, "memory(GiB)": 122.96, "step": 61695, "token_acc": 0.9909824394874229, "train_speed(iter/s)": 0.232022 }, { "epoch": 4.703102370607516, "grad_norm": 1.3976812362670898, "learning_rate": 8.674678807538694e-07, "loss": 0.03505501449108124, "memory(GiB)": 122.96, "step": 61700, "token_acc": 0.9808084932625561, "train_speed(iter/s)": 0.232027 }, { "epoch": 4.703483497217776, "grad_norm": 0.7081575393676758, "learning_rate": 8.652486199634657e-07, "loss": 0.02512039840221405, "memory(GiB)": 122.96, "step": 61705, "token_acc": 0.9892121608368748, "train_speed(iter/s)": 0.232029 }, { "epoch": 4.703864623828036, "grad_norm": 1.7217961549758911, "learning_rate": 8.630321768230421e-07, "loss": 0.043262803554534913, "memory(GiB)": 122.96, "step": 61710, "token_acc": 0.9819187089061566, "train_speed(iter/s)": 0.232031 }, { "epoch": 4.704245750438296, "grad_norm": 0.8707626461982727, "learning_rate": 8.608185514596912e-07, "loss": 0.02333463281393051, "memory(GiB)": 122.96, "step": 61715, "token_acc": 0.9899713467048711, "train_speed(iter/s)": 0.232036 }, { "epoch": 4.704626877048556, "grad_norm": 0.9587823152542114, "learning_rate": 8.58607744000367e-07, "loss": 0.028771862387657166, "memory(GiB)": 122.96, "step": 61720, "token_acc": 0.9912095639943741, "train_speed(iter/s)": 0.232042 }, { "epoch": 4.705008003658816, "grad_norm": 0.624072790145874, "learning_rate": 8.563997545718517e-07, "loss": 0.01867756396532059, "memory(GiB)": 122.96, "step": 61725, "token_acc": 0.9889698640929683, "train_speed(iter/s)": 0.232045 }, { "epoch": 4.705389130269076, "grad_norm": 1.561194658279419, "learning_rate": 8.541945833007492e-07, "loss": 0.03948388397693634, "memory(GiB)": 122.96, "step": 61730, "token_acc": 0.9867585461424132, "train_speed(iter/s)": 0.232046 }, { "epoch": 4.705770256879335, "grad_norm": 0.7397366762161255, "learning_rate": 8.519922303135198e-07, "loss": 0.022575873136520385, "memory(GiB)": 122.96, "step": 61735, "token_acc": 0.9916150928329007, "train_speed(iter/s)": 0.23205 }, { "epoch": 4.706151383489595, "grad_norm": 0.5449602007865906, "learning_rate": 8.497926957364677e-07, "loss": 0.024910880625247954, "memory(GiB)": 122.96, "step": 61740, "token_acc": 0.9904204364023417, "train_speed(iter/s)": 0.232049 }, { "epoch": 4.706532510099855, "grad_norm": 3.4783735275268555, "learning_rate": 8.475959796957145e-07, "loss": 0.05982747077941895, "memory(GiB)": 122.96, "step": 61745, "token_acc": 0.9815133276010318, "train_speed(iter/s)": 0.232053 }, { "epoch": 4.706913636710115, "grad_norm": 1.1235932111740112, "learning_rate": 8.454020823172426e-07, "loss": 0.03244302570819855, "memory(GiB)": 122.96, "step": 61750, "token_acc": 0.9890047393364929, "train_speed(iter/s)": 0.232056 }, { "epoch": 4.707294763320375, "grad_norm": 1.931552529335022, "learning_rate": 8.432110037268515e-07, "loss": 0.03795554637908936, "memory(GiB)": 122.96, "step": 61755, "token_acc": 0.985227841762644, "train_speed(iter/s)": 0.232058 }, { "epoch": 4.707675889930635, "grad_norm": 1.5592424869537354, "learning_rate": 8.410227440501961e-07, "loss": 0.039103978872299196, "memory(GiB)": 122.96, "step": 61760, "token_acc": 0.986105750675415, "train_speed(iter/s)": 0.232063 }, { "epoch": 4.708057016540895, "grad_norm": 1.4020404815673828, "learning_rate": 8.388373034127595e-07, "loss": 0.01750762164592743, "memory(GiB)": 122.96, "step": 61765, "token_acc": 0.9926995457495134, "train_speed(iter/s)": 0.232066 }, { "epoch": 4.708438143151155, "grad_norm": 1.8466473817825317, "learning_rate": 8.366546819398746e-07, "loss": 0.061489599943161014, "memory(GiB)": 122.96, "step": 61770, "token_acc": 0.9742037378257437, "train_speed(iter/s)": 0.232072 }, { "epoch": 4.708819269761415, "grad_norm": 1.1521086692810059, "learning_rate": 8.344748797566859e-07, "loss": 0.03179409503936768, "memory(GiB)": 122.96, "step": 61775, "token_acc": 0.9908214777420835, "train_speed(iter/s)": 0.232076 }, { "epoch": 4.709200396371674, "grad_norm": 1.9039260149002075, "learning_rate": 8.322978969882156e-07, "loss": 0.03324616849422455, "memory(GiB)": 122.96, "step": 61780, "token_acc": 0.9880516170144974, "train_speed(iter/s)": 0.232077 }, { "epoch": 4.709581522981934, "grad_norm": 1.1952173709869385, "learning_rate": 8.301237337592916e-07, "loss": 0.03460843563079834, "memory(GiB)": 122.96, "step": 61785, "token_acc": 0.9872380952380952, "train_speed(iter/s)": 0.23208 }, { "epoch": 4.709962649592194, "grad_norm": 0.4792175889015198, "learning_rate": 8.279523901945918e-07, "loss": 0.032402992248535156, "memory(GiB)": 122.96, "step": 61790, "token_acc": 0.9921168560166937, "train_speed(iter/s)": 0.232084 }, { "epoch": 4.710343776202454, "grad_norm": 1.7554985284805298, "learning_rate": 8.257838664186446e-07, "loss": 0.03487452268600464, "memory(GiB)": 122.96, "step": 61795, "token_acc": 0.9820396366639141, "train_speed(iter/s)": 0.232089 }, { "epoch": 4.710724902812714, "grad_norm": 2.2758843898773193, "learning_rate": 8.236181625557893e-07, "loss": 0.031564533710479736, "memory(GiB)": 122.96, "step": 61800, "token_acc": 0.9899726526891522, "train_speed(iter/s)": 0.232092 }, { "epoch": 4.710724902812714, "eval_loss": 0.0472443550825119, "eval_runtime": 220.4037, "eval_samples_per_second": 2.405, "eval_steps_per_second": 2.405, "eval_token_acc": 0.9808746461056563, "step": 61800 }, { "epoch": 4.711106029422974, "grad_norm": 1.0591243505477905, "learning_rate": 8.21455278730232e-07, "loss": 0.019861635565757752, "memory(GiB)": 122.96, "step": 61805, "token_acc": 0.9811944976493122, "train_speed(iter/s)": 0.231905 }, { "epoch": 4.711487156033234, "grad_norm": 1.2226675748825073, "learning_rate": 8.192952150659961e-07, "loss": 0.028653019666671754, "memory(GiB)": 122.96, "step": 61810, "token_acc": 0.9873732542567438, "train_speed(iter/s)": 0.231909 }, { "epoch": 4.711868282643494, "grad_norm": 1.3626633882522583, "learning_rate": 8.171379716869488e-07, "loss": 0.0284088134765625, "memory(GiB)": 122.96, "step": 61815, "token_acc": 0.9881245798790051, "train_speed(iter/s)": 0.231914 }, { "epoch": 4.712249409253754, "grad_norm": 0.7712843418121338, "learning_rate": 8.149835487168078e-07, "loss": 0.024213385581970216, "memory(GiB)": 122.96, "step": 61820, "token_acc": 0.991579990094106, "train_speed(iter/s)": 0.231917 }, { "epoch": 4.712630535864014, "grad_norm": 0.611932635307312, "learning_rate": 8.128319462791134e-07, "loss": 0.04592408537864685, "memory(GiB)": 122.96, "step": 61825, "token_acc": 0.9840764331210191, "train_speed(iter/s)": 0.231919 }, { "epoch": 4.713011662474274, "grad_norm": 1.0983195304870605, "learning_rate": 8.106831644972501e-07, "loss": 0.021586798131465912, "memory(GiB)": 122.96, "step": 61830, "token_acc": 0.9896490785155264, "train_speed(iter/s)": 0.231924 }, { "epoch": 4.7133927890845335, "grad_norm": 1.0040957927703857, "learning_rate": 8.08537203494436e-07, "loss": 0.03197257518768311, "memory(GiB)": 122.96, "step": 61835, "token_acc": 0.9860413176996091, "train_speed(iter/s)": 0.231927 }, { "epoch": 4.7137739156947935, "grad_norm": 1.2614765167236328, "learning_rate": 8.06394063393745e-07, "loss": 0.02559064030647278, "memory(GiB)": 122.96, "step": 61840, "token_acc": 0.9900651971437442, "train_speed(iter/s)": 0.231933 }, { "epoch": 4.7141550423050536, "grad_norm": 1.1566534042358398, "learning_rate": 8.042537443180675e-07, "loss": 0.02578798830509186, "memory(GiB)": 122.96, "step": 61845, "token_acc": 0.9892870807294988, "train_speed(iter/s)": 0.231935 }, { "epoch": 4.714536168915314, "grad_norm": 0.9257664680480957, "learning_rate": 8.021162463901388e-07, "loss": 0.023386830091476442, "memory(GiB)": 122.96, "step": 61850, "token_acc": 0.9897189856065799, "train_speed(iter/s)": 0.23194 }, { "epoch": 4.714917295525574, "grad_norm": 0.9253749251365662, "learning_rate": 7.999815697325386e-07, "loss": 0.017361581325531006, "memory(GiB)": 122.96, "step": 61855, "token_acc": 0.9924504561182762, "train_speed(iter/s)": 0.231942 }, { "epoch": 4.715298422135834, "grad_norm": 1.7043178081512451, "learning_rate": 7.978497144676745e-07, "loss": 0.027774921059608458, "memory(GiB)": 122.96, "step": 61860, "token_acc": 0.9905096660808436, "train_speed(iter/s)": 0.231946 }, { "epoch": 4.715679548746094, "grad_norm": 0.7902039885520935, "learning_rate": 7.957206807178097e-07, "loss": 0.020238834619522094, "memory(GiB)": 122.96, "step": 61865, "token_acc": 0.99105288384726, "train_speed(iter/s)": 0.231949 }, { "epoch": 4.716060675356354, "grad_norm": 0.4455850422382355, "learning_rate": 7.935944686050245e-07, "loss": 0.024988940358161925, "memory(GiB)": 122.96, "step": 61870, "token_acc": 0.9835197174808711, "train_speed(iter/s)": 0.231953 }, { "epoch": 4.716441801966614, "grad_norm": 0.9301992058753967, "learning_rate": 7.914710782512491e-07, "loss": 0.025735464692115784, "memory(GiB)": 122.96, "step": 61875, "token_acc": 0.9868352676251516, "train_speed(iter/s)": 0.231955 }, { "epoch": 4.716822928576873, "grad_norm": 0.8572659492492676, "learning_rate": 7.893505097782583e-07, "loss": 0.03395584225654602, "memory(GiB)": 122.96, "step": 61880, "token_acc": 0.98595726122983, "train_speed(iter/s)": 0.231954 }, { "epoch": 4.717204055187133, "grad_norm": 0.45193105936050415, "learning_rate": 7.872327633076437e-07, "loss": 0.023128990828990937, "memory(GiB)": 122.96, "step": 61885, "token_acc": 0.9918436441982148, "train_speed(iter/s)": 0.231958 }, { "epoch": 4.717585181797393, "grad_norm": 0.9684525728225708, "learning_rate": 7.851178389608582e-07, "loss": 0.02137288749217987, "memory(GiB)": 122.96, "step": 61890, "token_acc": 0.9910390441647109, "train_speed(iter/s)": 0.231963 }, { "epoch": 4.717966308407653, "grad_norm": 0.4051115810871124, "learning_rate": 7.830057368591826e-07, "loss": 0.02309268116950989, "memory(GiB)": 122.96, "step": 61895, "token_acc": 0.9907755380936112, "train_speed(iter/s)": 0.231968 }, { "epoch": 4.718347435017913, "grad_norm": 1.2723939418792725, "learning_rate": 7.808964571237309e-07, "loss": 0.029468932747840883, "memory(GiB)": 122.96, "step": 61900, "token_acc": 0.9871428571428571, "train_speed(iter/s)": 0.231973 }, { "epoch": 4.718728561628173, "grad_norm": 0.7696756720542908, "learning_rate": 7.787899998754566e-07, "loss": 0.025675442814826966, "memory(GiB)": 122.96, "step": 61905, "token_acc": 0.9905471732412289, "train_speed(iter/s)": 0.231977 }, { "epoch": 4.719109688238433, "grad_norm": 1.297554850578308, "learning_rate": 7.766863652351741e-07, "loss": 0.03240588903427124, "memory(GiB)": 122.96, "step": 61910, "token_acc": 0.9907881269191402, "train_speed(iter/s)": 0.231981 }, { "epoch": 4.719490814848693, "grad_norm": 0.7695630192756653, "learning_rate": 7.745855533234924e-07, "loss": 0.01970825493335724, "memory(GiB)": 122.96, "step": 61915, "token_acc": 0.9911683068382539, "train_speed(iter/s)": 0.231986 }, { "epoch": 4.719871941458953, "grad_norm": 0.5283740758895874, "learning_rate": 7.724875642608986e-07, "loss": 0.03007328510284424, "memory(GiB)": 122.96, "step": 61920, "token_acc": 0.9871692060946271, "train_speed(iter/s)": 0.231988 }, { "epoch": 4.720253068069213, "grad_norm": 0.6999237537384033, "learning_rate": 7.70392398167702e-07, "loss": 0.0332852303981781, "memory(GiB)": 122.96, "step": 61925, "token_acc": 0.9888136127304238, "train_speed(iter/s)": 0.23199 }, { "epoch": 4.720634194679473, "grad_norm": 1.4386382102966309, "learning_rate": 7.683000551640451e-07, "loss": 0.0416686624288559, "memory(GiB)": 122.96, "step": 61930, "token_acc": 0.9795313762456235, "train_speed(iter/s)": 0.231995 }, { "epoch": 4.721015321289732, "grad_norm": 0.9754907488822937, "learning_rate": 7.662105353699212e-07, "loss": 0.02248874306678772, "memory(GiB)": 122.96, "step": 61935, "token_acc": 0.991166077738516, "train_speed(iter/s)": 0.232001 }, { "epoch": 4.721396447899992, "grad_norm": 2.04880428314209, "learning_rate": 7.641238389051508e-07, "loss": 0.03315885066986084, "memory(GiB)": 122.96, "step": 61940, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.232004 }, { "epoch": 4.721777574510252, "grad_norm": 1.6568114757537842, "learning_rate": 7.620399658893884e-07, "loss": 0.04028114080429077, "memory(GiB)": 122.96, "step": 61945, "token_acc": 0.9835351089588378, "train_speed(iter/s)": 0.232009 }, { "epoch": 4.722158701120512, "grad_norm": 3.1231231689453125, "learning_rate": 7.599589164421495e-07, "loss": 0.04070684313774109, "memory(GiB)": 122.96, "step": 61950, "token_acc": 0.9869264923532314, "train_speed(iter/s)": 0.232013 }, { "epoch": 4.722539827730772, "grad_norm": 0.20124852657318115, "learning_rate": 7.578806906827607e-07, "loss": 0.01038898304104805, "memory(GiB)": 122.96, "step": 61955, "token_acc": 0.9953115842938074, "train_speed(iter/s)": 0.232018 }, { "epoch": 4.722920954341032, "grad_norm": 0.8577485680580139, "learning_rate": 7.558052887304046e-07, "loss": 0.025547531247138978, "memory(GiB)": 122.96, "step": 61960, "token_acc": 0.9878391019644528, "train_speed(iter/s)": 0.232023 }, { "epoch": 4.723302080951292, "grad_norm": 0.1241108626127243, "learning_rate": 7.53732710704097e-07, "loss": 0.01273319274187088, "memory(GiB)": 122.96, "step": 61965, "token_acc": 0.9936102236421726, "train_speed(iter/s)": 0.232031 }, { "epoch": 4.723683207561552, "grad_norm": 1.4321353435516357, "learning_rate": 7.516629567226874e-07, "loss": 0.018410694599151612, "memory(GiB)": 122.96, "step": 61970, "token_acc": 0.9926273458445041, "train_speed(iter/s)": 0.232036 }, { "epoch": 4.724064334171812, "grad_norm": 1.2552728652954102, "learning_rate": 7.495960269048641e-07, "loss": 0.03941004276275635, "memory(GiB)": 122.96, "step": 61975, "token_acc": 0.9866814650388457, "train_speed(iter/s)": 0.232041 }, { "epoch": 4.724445460782071, "grad_norm": 1.2214080095291138, "learning_rate": 7.4753192136916e-07, "loss": 0.03132750988006592, "memory(GiB)": 122.96, "step": 61980, "token_acc": 0.9875259875259875, "train_speed(iter/s)": 0.232044 }, { "epoch": 4.724826587392331, "grad_norm": 0.0722636729478836, "learning_rate": 7.454706402339362e-07, "loss": 0.040295711159706114, "memory(GiB)": 122.96, "step": 61985, "token_acc": 0.9833091436865021, "train_speed(iter/s)": 0.232048 }, { "epoch": 4.725207714002591, "grad_norm": 0.8022904396057129, "learning_rate": 7.434121836174091e-07, "loss": 0.046335083246231076, "memory(GiB)": 122.96, "step": 61990, "token_acc": 0.9857839155158408, "train_speed(iter/s)": 0.232051 }, { "epoch": 4.7255888406128514, "grad_norm": 0.1450691968202591, "learning_rate": 7.413565516376119e-07, "loss": 0.02963472008705139, "memory(GiB)": 122.96, "step": 61995, "token_acc": 0.9846732215153268, "train_speed(iter/s)": 0.232056 }, { "epoch": 4.7259699672231115, "grad_norm": 0.8683902621269226, "learning_rate": 7.393037444124285e-07, "loss": 0.048889583349227904, "memory(GiB)": 122.96, "step": 62000, "token_acc": 0.9884383650871194, "train_speed(iter/s)": 0.232059 }, { "epoch": 4.7259699672231115, "eval_loss": 0.04725060239434242, "eval_runtime": 220.7027, "eval_samples_per_second": 2.401, "eval_steps_per_second": 2.401, "eval_token_acc": 0.9810553581109571, "step": 62000 }, { "epoch": 4.7263510938333715, "grad_norm": 0.742735743522644, "learning_rate": 7.372537620595754e-07, "loss": 0.016405805945396423, "memory(GiB)": 122.96, "step": 62005, "token_acc": 0.9814223064548119, "train_speed(iter/s)": 0.231871 }, { "epoch": 4.7267322204436315, "grad_norm": 0.8343555331230164, "learning_rate": 7.352066046966199e-07, "loss": 0.021360822021961212, "memory(GiB)": 122.96, "step": 62010, "token_acc": 0.9913946587537092, "train_speed(iter/s)": 0.231873 }, { "epoch": 4.7271133470538915, "grad_norm": 0.9106143116950989, "learning_rate": 7.331622724409403e-07, "loss": 0.02023686170578003, "memory(GiB)": 122.96, "step": 62015, "token_acc": 0.9909214707217431, "train_speed(iter/s)": 0.231878 }, { "epoch": 4.7274944736641515, "grad_norm": 1.317388653755188, "learning_rate": 7.311207654097874e-07, "loss": 0.03119143843650818, "memory(GiB)": 122.96, "step": 62020, "token_acc": 0.9839615076182838, "train_speed(iter/s)": 0.231881 }, { "epoch": 4.727875600274411, "grad_norm": 0.48401129245758057, "learning_rate": 7.290820837202172e-07, "loss": 0.04561898708343506, "memory(GiB)": 122.96, "step": 62025, "token_acc": 0.9885466034755134, "train_speed(iter/s)": 0.231883 }, { "epoch": 4.728256726884672, "grad_norm": 3.983164072036743, "learning_rate": 7.270462274891421e-07, "loss": 0.02207397371530533, "memory(GiB)": 122.96, "step": 62030, "token_acc": 0.9930452061599603, "train_speed(iter/s)": 0.231889 }, { "epoch": 4.728637853494931, "grad_norm": 1.903393268585205, "learning_rate": 7.250131968333185e-07, "loss": 0.04355311691761017, "memory(GiB)": 122.96, "step": 62035, "token_acc": 0.983143197684318, "train_speed(iter/s)": 0.231892 }, { "epoch": 4.729018980105191, "grad_norm": 0.7180707454681396, "learning_rate": 7.229829918693143e-07, "loss": 0.03705786466598511, "memory(GiB)": 122.96, "step": 62040, "token_acc": 0.9851101846337106, "train_speed(iter/s)": 0.231897 }, { "epoch": 4.729400106715451, "grad_norm": 0.9231045842170715, "learning_rate": 7.209556127135697e-07, "loss": 0.026156502962112426, "memory(GiB)": 122.96, "step": 62045, "token_acc": 0.9872916322825549, "train_speed(iter/s)": 0.231899 }, { "epoch": 4.729781233325711, "grad_norm": 2.432751178741455, "learning_rate": 7.189310594823362e-07, "loss": 0.034845370054245, "memory(GiB)": 122.96, "step": 62050, "token_acc": 0.9818136522172397, "train_speed(iter/s)": 0.231904 }, { "epoch": 4.730162359935971, "grad_norm": 1.170448899269104, "learning_rate": 7.169093322917097e-07, "loss": 0.025478962063789367, "memory(GiB)": 122.96, "step": 62055, "token_acc": 0.9912359550561798, "train_speed(iter/s)": 0.231907 }, { "epoch": 4.730543486546231, "grad_norm": 0.9166926741600037, "learning_rate": 7.148904312576366e-07, "loss": 0.01753988265991211, "memory(GiB)": 122.96, "step": 62060, "token_acc": 0.9888449082403742, "train_speed(iter/s)": 0.231912 }, { "epoch": 4.730924613156491, "grad_norm": 1.763756513595581, "learning_rate": 7.128743564958851e-07, "loss": 0.03528833985328674, "memory(GiB)": 122.96, "step": 62065, "token_acc": 0.9846235418875928, "train_speed(iter/s)": 0.231916 }, { "epoch": 4.731305739766751, "grad_norm": 0.44966134428977966, "learning_rate": 7.108611081220685e-07, "loss": 0.02063930928707123, "memory(GiB)": 122.96, "step": 62070, "token_acc": 0.9940381558028617, "train_speed(iter/s)": 0.231921 }, { "epoch": 4.731686866377011, "grad_norm": 0.7246293425559998, "learning_rate": 7.088506862516442e-07, "loss": 0.01986888349056244, "memory(GiB)": 122.96, "step": 62075, "token_acc": 0.9907940161104718, "train_speed(iter/s)": 0.231924 }, { "epoch": 4.73206799298727, "grad_norm": 0.41684481501579285, "learning_rate": 7.068430909998869e-07, "loss": 0.03042006492614746, "memory(GiB)": 122.96, "step": 62080, "token_acc": 0.9891567984020545, "train_speed(iter/s)": 0.231927 }, { "epoch": 4.73244911959753, "grad_norm": 0.737126350402832, "learning_rate": 7.048383224819321e-07, "loss": 0.02299324721097946, "memory(GiB)": 122.96, "step": 62085, "token_acc": 0.9889682024659312, "train_speed(iter/s)": 0.231932 }, { "epoch": 4.73283024620779, "grad_norm": 0.9655119776725769, "learning_rate": 7.028363808127436e-07, "loss": 0.03909604549407959, "memory(GiB)": 122.96, "step": 62090, "token_acc": 0.9872423945044161, "train_speed(iter/s)": 0.231936 }, { "epoch": 4.73321137281805, "grad_norm": 0.5992923974990845, "learning_rate": 7.008372661071183e-07, "loss": 0.02086440473794937, "memory(GiB)": 122.96, "step": 62095, "token_acc": 0.9922493403693932, "train_speed(iter/s)": 0.231939 }, { "epoch": 4.73359249942831, "grad_norm": 1.1325461864471436, "learning_rate": 6.988409784797034e-07, "loss": 0.027878275513648985, "memory(GiB)": 122.96, "step": 62100, "token_acc": 0.991883416343848, "train_speed(iter/s)": 0.231942 }, { "epoch": 4.73397362603857, "grad_norm": 0.7047910094261169, "learning_rate": 6.968475180449741e-07, "loss": 0.03279573023319245, "memory(GiB)": 122.96, "step": 62105, "token_acc": 0.981534910559723, "train_speed(iter/s)": 0.231946 }, { "epoch": 4.73435475264883, "grad_norm": 2.023902177810669, "learning_rate": 6.948568849172387e-07, "loss": 0.041151690483093264, "memory(GiB)": 122.96, "step": 62110, "token_acc": 0.9860228716645489, "train_speed(iter/s)": 0.23195 }, { "epoch": 4.73473587925909, "grad_norm": 1.647334098815918, "learning_rate": 6.928690792106618e-07, "loss": 0.052065491676330566, "memory(GiB)": 122.96, "step": 62115, "token_acc": 0.9784560143626571, "train_speed(iter/s)": 0.231955 }, { "epoch": 4.73511700586935, "grad_norm": 1.0370899438858032, "learning_rate": 6.908841010392298e-07, "loss": 0.027405565977096556, "memory(GiB)": 122.96, "step": 62120, "token_acc": 0.989188231123715, "train_speed(iter/s)": 0.231959 }, { "epoch": 4.735498132479609, "grad_norm": 1.0596392154693604, "learning_rate": 6.889019505167737e-07, "loss": 0.03533194363117218, "memory(GiB)": 122.96, "step": 62125, "token_acc": 0.986470051687443, "train_speed(iter/s)": 0.231961 }, { "epoch": 4.735879259089869, "grad_norm": 1.191657304763794, "learning_rate": 6.86922627756953e-07, "loss": 0.022111250460147856, "memory(GiB)": 122.96, "step": 62130, "token_acc": 0.990917590490183, "train_speed(iter/s)": 0.231964 }, { "epoch": 4.736260385700129, "grad_norm": 0.6441670656204224, "learning_rate": 6.849461328732875e-07, "loss": 0.028660926222801208, "memory(GiB)": 122.96, "step": 62135, "token_acc": 0.9874314037882811, "train_speed(iter/s)": 0.231967 }, { "epoch": 4.736641512310389, "grad_norm": 2.099799871444702, "learning_rate": 6.829724659791092e-07, "loss": 0.03742862343788147, "memory(GiB)": 122.96, "step": 62140, "token_acc": 0.9850778085696014, "train_speed(iter/s)": 0.231971 }, { "epoch": 4.737022638920649, "grad_norm": 1.9841886758804321, "learning_rate": 6.810016271875941e-07, "loss": 0.03053494691848755, "memory(GiB)": 122.96, "step": 62145, "token_acc": 0.9889473684210527, "train_speed(iter/s)": 0.231976 }, { "epoch": 4.737403765530909, "grad_norm": 3.202604055404663, "learning_rate": 6.790336166117794e-07, "loss": 0.0362687349319458, "memory(GiB)": 122.96, "step": 62150, "token_acc": 0.9858316221765914, "train_speed(iter/s)": 0.23198 }, { "epoch": 4.737784892141169, "grad_norm": 1.1023616790771484, "learning_rate": 6.770684343644973e-07, "loss": 0.035684362053871155, "memory(GiB)": 122.96, "step": 62155, "token_acc": 0.9886484568996098, "train_speed(iter/s)": 0.231986 }, { "epoch": 4.738166018751429, "grad_norm": 1.1412627696990967, "learning_rate": 6.75106080558463e-07, "loss": 0.0259348064661026, "memory(GiB)": 122.96, "step": 62160, "token_acc": 0.988914373088685, "train_speed(iter/s)": 0.231991 }, { "epoch": 4.738547145361689, "grad_norm": 2.470303773880005, "learning_rate": 6.731465553061977e-07, "loss": 0.024385052919387817, "memory(GiB)": 122.96, "step": 62165, "token_acc": 0.9915230291042667, "train_speed(iter/s)": 0.231996 }, { "epoch": 4.738928271971949, "grad_norm": 1.459538221359253, "learning_rate": 6.711898587200671e-07, "loss": 0.045340290665626524, "memory(GiB)": 122.96, "step": 62170, "token_acc": 0.9804674457429048, "train_speed(iter/s)": 0.232 }, { "epoch": 4.7393093985822095, "grad_norm": 0.9652634859085083, "learning_rate": 6.69235990912287e-07, "loss": 0.020407013595104218, "memory(GiB)": 122.96, "step": 62175, "token_acc": 0.9925634909499088, "train_speed(iter/s)": 0.232001 }, { "epoch": 4.739690525192469, "grad_norm": 2.8477938175201416, "learning_rate": 6.672849519949065e-07, "loss": 0.04225781559944153, "memory(GiB)": 122.96, "step": 62180, "token_acc": 0.984360625574977, "train_speed(iter/s)": 0.232007 }, { "epoch": 4.740071651802729, "grad_norm": 0.9222937226295471, "learning_rate": 6.653367420797974e-07, "loss": 0.029726028442382812, "memory(GiB)": 122.96, "step": 62185, "token_acc": 0.9878296146044625, "train_speed(iter/s)": 0.232012 }, { "epoch": 4.740452778412989, "grad_norm": 2.9480388164520264, "learning_rate": 6.633913612786813e-07, "loss": 0.035613083839416505, "memory(GiB)": 122.96, "step": 62190, "token_acc": 0.9886769964243146, "train_speed(iter/s)": 0.232016 }, { "epoch": 4.740833905023249, "grad_norm": 1.0153138637542725, "learning_rate": 6.614488097031246e-07, "loss": 0.02535497546195984, "memory(GiB)": 122.96, "step": 62195, "token_acc": 0.9892114554727344, "train_speed(iter/s)": 0.232019 }, { "epoch": 4.741215031633509, "grad_norm": 2.354051351547241, "learning_rate": 6.595090874645216e-07, "loss": 0.050149714946746825, "memory(GiB)": 122.96, "step": 62200, "token_acc": 0.983763305069457, "train_speed(iter/s)": 0.232023 }, { "epoch": 4.741215031633509, "eval_loss": 0.04706527292728424, "eval_runtime": 217.7774, "eval_samples_per_second": 2.434, "eval_steps_per_second": 2.434, "eval_token_acc": 0.9809047647732064, "step": 62200 }, { "epoch": 4.741596158243769, "grad_norm": 1.193485975265503, "learning_rate": 6.575721946741053e-07, "loss": 0.02844291627407074, "memory(GiB)": 122.96, "step": 62205, "token_acc": 0.9811991279069767, "train_speed(iter/s)": 0.231839 }, { "epoch": 4.741977284854029, "grad_norm": 1.3738884925842285, "learning_rate": 6.556381314429427e-07, "loss": 0.02180907428264618, "memory(GiB)": 122.96, "step": 62210, "token_acc": 0.9878337276106793, "train_speed(iter/s)": 0.231845 }, { "epoch": 4.742358411464289, "grad_norm": 0.9453949332237244, "learning_rate": 6.537068978819505e-07, "loss": 0.0493392288684845, "memory(GiB)": 122.96, "step": 62215, "token_acc": 0.9843191742755062, "train_speed(iter/s)": 0.231849 }, { "epoch": 4.742739538074549, "grad_norm": 1.9585646390914917, "learning_rate": 6.517784941018735e-07, "loss": 0.02582308053970337, "memory(GiB)": 122.96, "step": 62220, "token_acc": 0.9876685934489403, "train_speed(iter/s)": 0.231854 }, { "epoch": 4.743120664684808, "grad_norm": 1.083376169204712, "learning_rate": 6.498529202132897e-07, "loss": 0.024546247720718384, "memory(GiB)": 122.96, "step": 62225, "token_acc": 0.9814413857098669, "train_speed(iter/s)": 0.23186 }, { "epoch": 4.743501791295068, "grad_norm": 1.9965709447860718, "learning_rate": 6.479301763266332e-07, "loss": 0.04541406333446503, "memory(GiB)": 122.96, "step": 62230, "token_acc": 0.9854271356783919, "train_speed(iter/s)": 0.231864 }, { "epoch": 4.743882917905328, "grad_norm": 0.9628795981407166, "learning_rate": 6.460102625521657e-07, "loss": 0.027686893939971924, "memory(GiB)": 122.96, "step": 62235, "token_acc": 0.9883527454242929, "train_speed(iter/s)": 0.231869 }, { "epoch": 4.744264044515588, "grad_norm": 1.109751582145691, "learning_rate": 6.440931789999716e-07, "loss": 0.032413291931152347, "memory(GiB)": 122.96, "step": 62240, "token_acc": 0.9844231117551169, "train_speed(iter/s)": 0.231872 }, { "epoch": 4.744645171125848, "grad_norm": 0.21415045857429504, "learning_rate": 6.42178925779996e-07, "loss": 0.028750473260879518, "memory(GiB)": 122.96, "step": 62245, "token_acc": 0.9864546525323911, "train_speed(iter/s)": 0.231879 }, { "epoch": 4.745026297736108, "grad_norm": 1.9899805784225464, "learning_rate": 6.402675030020066e-07, "loss": 0.026098412275314332, "memory(GiB)": 122.96, "step": 62250, "token_acc": 0.9902749205161773, "train_speed(iter/s)": 0.231881 }, { "epoch": 4.745407424346368, "grad_norm": 0.8632193207740784, "learning_rate": 6.383589107756216e-07, "loss": 0.021792301535606386, "memory(GiB)": 122.96, "step": 62255, "token_acc": 0.9886055344546935, "train_speed(iter/s)": 0.231885 }, { "epoch": 4.745788550956628, "grad_norm": 0.87612384557724, "learning_rate": 6.364531492102921e-07, "loss": 0.020595601201057433, "memory(GiB)": 122.96, "step": 62260, "token_acc": 0.9898338705678156, "train_speed(iter/s)": 0.231889 }, { "epoch": 4.746169677566888, "grad_norm": 1.709390640258789, "learning_rate": 6.345502184152974e-07, "loss": 0.028952884674072265, "memory(GiB)": 122.96, "step": 62265, "token_acc": 0.9883619071455387, "train_speed(iter/s)": 0.231891 }, { "epoch": 4.746550804177148, "grad_norm": 1.0851967334747314, "learning_rate": 6.326501184997613e-07, "loss": 0.026143833994865417, "memory(GiB)": 122.96, "step": 62270, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.231897 }, { "epoch": 4.746931930787408, "grad_norm": 1.135430097579956, "learning_rate": 6.307528495726467e-07, "loss": 0.02856021523475647, "memory(GiB)": 122.96, "step": 62275, "token_acc": 0.9907373101148573, "train_speed(iter/s)": 0.231903 }, { "epoch": 4.747313057397667, "grad_norm": 0.7441526055335999, "learning_rate": 6.288584117427609e-07, "loss": 0.02938147187232971, "memory(GiB)": 122.96, "step": 62280, "token_acc": 0.9894372959477626, "train_speed(iter/s)": 0.231905 }, { "epoch": 4.747694184007927, "grad_norm": 2.0306737422943115, "learning_rate": 6.269668051187283e-07, "loss": 0.0263720840215683, "memory(GiB)": 122.96, "step": 62285, "token_acc": 0.9859293193717278, "train_speed(iter/s)": 0.231911 }, { "epoch": 4.748075310618187, "grad_norm": 1.9786263704299927, "learning_rate": 6.250780298090342e-07, "loss": 0.024472638964653015, "memory(GiB)": 122.96, "step": 62290, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.231915 }, { "epoch": 4.748456437228447, "grad_norm": 1.4458848237991333, "learning_rate": 6.231920859219864e-07, "loss": 0.04496138095855713, "memory(GiB)": 122.96, "step": 62295, "token_acc": 0.979024099970247, "train_speed(iter/s)": 0.231918 }, { "epoch": 4.748837563838707, "grad_norm": 0.8253783583641052, "learning_rate": 6.213089735657374e-07, "loss": 0.018313558399677278, "memory(GiB)": 122.96, "step": 62300, "token_acc": 0.9904588823262154, "train_speed(iter/s)": 0.231921 }, { "epoch": 4.749218690448967, "grad_norm": 0.7312617897987366, "learning_rate": 6.194286928482785e-07, "loss": 0.02432798743247986, "memory(GiB)": 122.96, "step": 62305, "token_acc": 0.9907038512616202, "train_speed(iter/s)": 0.231919 }, { "epoch": 4.749599817059227, "grad_norm": 1.6132622957229614, "learning_rate": 6.175512438774234e-07, "loss": 0.037587675452232364, "memory(GiB)": 122.96, "step": 62310, "token_acc": 0.9844393592677345, "train_speed(iter/s)": 0.231925 }, { "epoch": 4.749980943669487, "grad_norm": 1.4922758340835571, "learning_rate": 6.156766267608416e-07, "loss": 0.019921346008777617, "memory(GiB)": 122.96, "step": 62315, "token_acc": 0.9896346644010195, "train_speed(iter/s)": 0.231928 }, { "epoch": 4.750362070279747, "grad_norm": 0.0005492289201356471, "learning_rate": 6.13804841606036e-07, "loss": 0.034427300095558167, "memory(GiB)": 122.96, "step": 62320, "token_acc": 0.9877697841726619, "train_speed(iter/s)": 0.231933 }, { "epoch": 4.7507431968900065, "grad_norm": 3.0645153522491455, "learning_rate": 6.119358885203375e-07, "loss": 0.03334043025970459, "memory(GiB)": 122.96, "step": 62325, "token_acc": 0.9867610837438424, "train_speed(iter/s)": 0.231939 }, { "epoch": 4.7511243235002665, "grad_norm": 0.8321848511695862, "learning_rate": 6.100697676109379e-07, "loss": 0.06582321524620056, "memory(GiB)": 122.96, "step": 62330, "token_acc": 0.975, "train_speed(iter/s)": 0.231944 }, { "epoch": 4.7515054501105265, "grad_norm": 1.699718713760376, "learning_rate": 6.082064789848296e-07, "loss": 0.03961111307144165, "memory(GiB)": 122.96, "step": 62335, "token_acc": 0.9844375277901289, "train_speed(iter/s)": 0.231948 }, { "epoch": 4.7518865767207865, "grad_norm": 3.1555092334747314, "learning_rate": 6.063460227488771e-07, "loss": 0.049031776189804074, "memory(GiB)": 122.96, "step": 62340, "token_acc": 0.9841780638240816, "train_speed(iter/s)": 0.231953 }, { "epoch": 4.7522677033310465, "grad_norm": 1.4447624683380127, "learning_rate": 6.044883990097727e-07, "loss": 0.024416552484035493, "memory(GiB)": 122.96, "step": 62345, "token_acc": 0.9906646751306946, "train_speed(iter/s)": 0.231959 }, { "epoch": 4.7526488299413066, "grad_norm": 1.1554878950119019, "learning_rate": 6.026336078740258e-07, "loss": 0.041811487078666686, "memory(GiB)": 122.96, "step": 62350, "token_acc": 0.991238670694864, "train_speed(iter/s)": 0.231965 }, { "epoch": 4.753029956551567, "grad_norm": 1.2215365171432495, "learning_rate": 6.007816494480123e-07, "loss": 0.014797374606132507, "memory(GiB)": 122.96, "step": 62355, "token_acc": 0.9905476786210731, "train_speed(iter/s)": 0.23197 }, { "epoch": 4.753411083161827, "grad_norm": 2.3872244358062744, "learning_rate": 5.989325238379362e-07, "loss": 0.030013573169708253, "memory(GiB)": 122.96, "step": 62360, "token_acc": 0.9905581782838101, "train_speed(iter/s)": 0.231972 }, { "epoch": 4.753792209772087, "grad_norm": 1.1459513902664185, "learning_rate": 5.970862311498237e-07, "loss": 0.047540485858917236, "memory(GiB)": 122.96, "step": 62365, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.231974 }, { "epoch": 4.754173336382346, "grad_norm": 0.9639091491699219, "learning_rate": 5.952427714895626e-07, "loss": 0.027018123865127565, "memory(GiB)": 122.96, "step": 62370, "token_acc": 0.9865269461077845, "train_speed(iter/s)": 0.231978 }, { "epoch": 4.754554462992606, "grad_norm": 0.9635127186775208, "learning_rate": 5.934021449628624e-07, "loss": 0.03779637813568115, "memory(GiB)": 122.96, "step": 62375, "token_acc": 0.9905349794238684, "train_speed(iter/s)": 0.231982 }, { "epoch": 4.754935589602866, "grad_norm": 1.6690819263458252, "learning_rate": 5.915643516752723e-07, "loss": 0.04272338449954986, "memory(GiB)": 122.96, "step": 62380, "token_acc": 0.9817818817070127, "train_speed(iter/s)": 0.231985 }, { "epoch": 4.755316716213126, "grad_norm": 0.7876395583152771, "learning_rate": 5.897293917321856e-07, "loss": 0.024808910489082337, "memory(GiB)": 122.96, "step": 62385, "token_acc": 0.9896311066799601, "train_speed(iter/s)": 0.231989 }, { "epoch": 4.755697842823386, "grad_norm": 0.6003465056419373, "learning_rate": 5.878972652388237e-07, "loss": 0.0286358505487442, "memory(GiB)": 122.96, "step": 62390, "token_acc": 0.9842873176206509, "train_speed(iter/s)": 0.231995 }, { "epoch": 4.756078969433646, "grad_norm": 0.9249973297119141, "learning_rate": 5.860679723002582e-07, "loss": 0.031125855445861817, "memory(GiB)": 122.96, "step": 62395, "token_acc": 0.9853469726292508, "train_speed(iter/s)": 0.231999 }, { "epoch": 4.756460096043906, "grad_norm": 1.4878966808319092, "learning_rate": 5.84241513021383e-07, "loss": 0.0350125253200531, "memory(GiB)": 122.96, "step": 62400, "token_acc": 0.9916911891985459, "train_speed(iter/s)": 0.232002 }, { "epoch": 4.756460096043906, "eval_loss": 0.047160789370536804, "eval_runtime": 220.1925, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "eval_token_acc": 0.9809800614420818, "step": 62400 }, { "epoch": 4.756841222654166, "grad_norm": 1.021369218826294, "learning_rate": 5.824178875069419e-07, "loss": 0.029633811116218566, "memory(GiB)": 122.96, "step": 62405, "token_acc": 0.9813989648495145, "train_speed(iter/s)": 0.231815 }, { "epoch": 4.757222349264426, "grad_norm": 0.5650991201400757, "learning_rate": 5.80597095861507e-07, "loss": 0.03304466009140015, "memory(GiB)": 122.96, "step": 62410, "token_acc": 0.9910361191668864, "train_speed(iter/s)": 0.23182 }, { "epoch": 4.757603475874686, "grad_norm": 0.7109672427177429, "learning_rate": 5.787791381894947e-07, "loss": 0.050838303565979, "memory(GiB)": 122.96, "step": 62415, "token_acc": 0.9868449901337426, "train_speed(iter/s)": 0.231823 }, { "epoch": 4.757984602484946, "grad_norm": 1.0059800148010254, "learning_rate": 5.769640145951549e-07, "loss": 0.03181655406951904, "memory(GiB)": 122.96, "step": 62420, "token_acc": 0.9907884042264968, "train_speed(iter/s)": 0.231828 }, { "epoch": 4.758365729095205, "grad_norm": 1.9135833978652954, "learning_rate": 5.751517251825822e-07, "loss": 0.02930476367473602, "memory(GiB)": 122.96, "step": 62425, "token_acc": 0.9870586580391053, "train_speed(iter/s)": 0.231831 }, { "epoch": 4.758746855705465, "grad_norm": 1.1706748008728027, "learning_rate": 5.73342270055699e-07, "loss": 0.013683287799358368, "memory(GiB)": 122.96, "step": 62430, "token_acc": 0.9937808277932662, "train_speed(iter/s)": 0.231835 }, { "epoch": 4.759127982315725, "grad_norm": 0.6838061213493347, "learning_rate": 5.715356493182611e-07, "loss": 0.03754611611366272, "memory(GiB)": 122.96, "step": 62435, "token_acc": 0.9866758557316793, "train_speed(iter/s)": 0.231834 }, { "epoch": 4.759509108925985, "grad_norm": 0.9137806296348572, "learning_rate": 5.697318630738857e-07, "loss": 0.02070358544588089, "memory(GiB)": 122.96, "step": 62440, "token_acc": 0.991672753834916, "train_speed(iter/s)": 0.231836 }, { "epoch": 4.759890235536245, "grad_norm": 2.2764229774475098, "learning_rate": 5.679309114260011e-07, "loss": 0.0347255140542984, "memory(GiB)": 122.96, "step": 62445, "token_acc": 0.9869461298032727, "train_speed(iter/s)": 0.231839 }, { "epoch": 4.760271362146505, "grad_norm": 1.6856176853179932, "learning_rate": 5.661327944778861e-07, "loss": 0.041247588396072385, "memory(GiB)": 122.96, "step": 62450, "token_acc": 0.9817474566128067, "train_speed(iter/s)": 0.231844 }, { "epoch": 4.760652488756765, "grad_norm": 0.9990968704223633, "learning_rate": 5.643375123326522e-07, "loss": 0.027240318059921265, "memory(GiB)": 122.96, "step": 62455, "token_acc": 0.9877312952534192, "train_speed(iter/s)": 0.231848 }, { "epoch": 4.761033615367025, "grad_norm": 1.72262704372406, "learning_rate": 5.625450650932507e-07, "loss": 0.04734380543231964, "memory(GiB)": 122.96, "step": 62460, "token_acc": 0.9840462833099579, "train_speed(iter/s)": 0.231851 }, { "epoch": 4.761414741977285, "grad_norm": 2.6365325450897217, "learning_rate": 5.607554528624715e-07, "loss": 0.027806589007377626, "memory(GiB)": 122.96, "step": 62465, "token_acc": 0.9924208649130629, "train_speed(iter/s)": 0.231855 }, { "epoch": 4.761795868587544, "grad_norm": 0.6551743745803833, "learning_rate": 5.589686757429491e-07, "loss": 0.03544677197933197, "memory(GiB)": 122.96, "step": 62470, "token_acc": 0.9896640826873385, "train_speed(iter/s)": 0.231857 }, { "epoch": 4.762176995197804, "grad_norm": 1.0703436136245728, "learning_rate": 5.571847338371295e-07, "loss": 0.025857603549957274, "memory(GiB)": 122.96, "step": 62475, "token_acc": 0.990614485567558, "train_speed(iter/s)": 0.23186 }, { "epoch": 4.762558121808064, "grad_norm": 1.1262619495391846, "learning_rate": 5.554036272473306e-07, "loss": 0.022240528464317323, "memory(GiB)": 122.96, "step": 62480, "token_acc": 0.9910696566779122, "train_speed(iter/s)": 0.231864 }, { "epoch": 4.762939248418324, "grad_norm": 0.9627217650413513, "learning_rate": 5.536253560756821e-07, "loss": 0.04562720060348511, "memory(GiB)": 122.96, "step": 62485, "token_acc": 0.9809459762385115, "train_speed(iter/s)": 0.231869 }, { "epoch": 4.763320375028584, "grad_norm": 2.3767354488372803, "learning_rate": 5.518499204241523e-07, "loss": 0.01992850750684738, "memory(GiB)": 122.96, "step": 62490, "token_acc": 0.9863739591218774, "train_speed(iter/s)": 0.231874 }, { "epoch": 4.763701501638844, "grad_norm": 1.881363034248352, "learning_rate": 5.500773203945708e-07, "loss": 0.031372097134590146, "memory(GiB)": 122.96, "step": 62495, "token_acc": 0.9870156636438582, "train_speed(iter/s)": 0.231878 }, { "epoch": 4.7640826282491044, "grad_norm": 1.9561997652053833, "learning_rate": 5.483075560885731e-07, "loss": 0.02145341634750366, "memory(GiB)": 122.96, "step": 62500, "token_acc": 0.9926522043386984, "train_speed(iter/s)": 0.231884 }, { "epoch": 4.7644637548593645, "grad_norm": 2.389164686203003, "learning_rate": 5.465406276076557e-07, "loss": 0.05595534443855286, "memory(GiB)": 122.96, "step": 62505, "token_acc": 0.9775713638423199, "train_speed(iter/s)": 0.231888 }, { "epoch": 4.7648448814696245, "grad_norm": 1.020013689994812, "learning_rate": 5.447765350531431e-07, "loss": 0.03650898039340973, "memory(GiB)": 122.96, "step": 62510, "token_acc": 0.9824790794979079, "train_speed(iter/s)": 0.231893 }, { "epoch": 4.7652260080798845, "grad_norm": 1.5467352867126465, "learning_rate": 5.430152785261932e-07, "loss": 0.0366679698228836, "memory(GiB)": 122.96, "step": 62515, "token_acc": 0.9844236760124611, "train_speed(iter/s)": 0.231898 }, { "epoch": 4.7656071346901445, "grad_norm": 1.3167610168457031, "learning_rate": 5.412568581278088e-07, "loss": 0.034946206212043765, "memory(GiB)": 122.96, "step": 62520, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.231902 }, { "epoch": 4.765988261300404, "grad_norm": 0.9978244304656982, "learning_rate": 5.395012739588312e-07, "loss": 0.017975199222564697, "memory(GiB)": 122.96, "step": 62525, "token_acc": 0.9890648567119156, "train_speed(iter/s)": 0.231908 }, { "epoch": 4.766369387910664, "grad_norm": 0.7265720963478088, "learning_rate": 5.377485261199244e-07, "loss": 0.017236940562725067, "memory(GiB)": 122.96, "step": 62530, "token_acc": 0.9937788480834838, "train_speed(iter/s)": 0.231911 }, { "epoch": 4.766750514520924, "grad_norm": 0.6945503950119019, "learning_rate": 5.359986147116136e-07, "loss": 0.013528251647949218, "memory(GiB)": 122.96, "step": 62535, "token_acc": 0.993869370431502, "train_speed(iter/s)": 0.231915 }, { "epoch": 4.767131641131184, "grad_norm": 0.36731958389282227, "learning_rate": 5.342515398342351e-07, "loss": 0.014740860462188721, "memory(GiB)": 122.96, "step": 62540, "token_acc": 0.9944361224053071, "train_speed(iter/s)": 0.231919 }, { "epoch": 4.767512767741444, "grad_norm": 0.9458820223808289, "learning_rate": 5.325073015879867e-07, "loss": 0.035223832726478575, "memory(GiB)": 122.96, "step": 62545, "token_acc": 0.987010904425914, "train_speed(iter/s)": 0.231922 }, { "epoch": 4.767893894351704, "grad_norm": 0.7575409412384033, "learning_rate": 5.307659000728827e-07, "loss": 0.035723942518234256, "memory(GiB)": 122.96, "step": 62550, "token_acc": 0.9890052356020942, "train_speed(iter/s)": 0.231927 }, { "epoch": 4.768275020961964, "grad_norm": 1.0945377349853516, "learning_rate": 5.290273353887876e-07, "loss": 0.025836312770843507, "memory(GiB)": 122.96, "step": 62555, "token_acc": 0.9892446834514789, "train_speed(iter/s)": 0.23193 }, { "epoch": 4.768656147572224, "grad_norm": 0.9755048751831055, "learning_rate": 5.272916076354106e-07, "loss": 0.02612437605857849, "memory(GiB)": 122.96, "step": 62560, "token_acc": 0.9901497241922774, "train_speed(iter/s)": 0.231935 }, { "epoch": 4.769037274182484, "grad_norm": 0.7875927686691284, "learning_rate": 5.255587169122722e-07, "loss": 0.03251497447490692, "memory(GiB)": 122.96, "step": 62565, "token_acc": 0.9842209072978304, "train_speed(iter/s)": 0.23194 }, { "epoch": 4.769418400792743, "grad_norm": 1.1808918714523315, "learning_rate": 5.238286633187483e-07, "loss": 0.03806655704975128, "memory(GiB)": 122.96, "step": 62570, "token_acc": 0.9864029666254636, "train_speed(iter/s)": 0.231944 }, { "epoch": 4.769799527403003, "grad_norm": 1.1637760400772095, "learning_rate": 5.221014469540597e-07, "loss": 0.029971325397491456, "memory(GiB)": 122.96, "step": 62575, "token_acc": 0.9876579488686453, "train_speed(iter/s)": 0.231947 }, { "epoch": 4.770180654013263, "grad_norm": 0.5817446112632751, "learning_rate": 5.203770679172437e-07, "loss": 0.03605890572071076, "memory(GiB)": 122.96, "step": 62580, "token_acc": 0.9885284101093386, "train_speed(iter/s)": 0.23195 }, { "epoch": 4.770561780623523, "grad_norm": 1.4201160669326782, "learning_rate": 5.186555263071935e-07, "loss": 0.03081439733505249, "memory(GiB)": 122.96, "step": 62585, "token_acc": 0.9882000374601986, "train_speed(iter/s)": 0.231953 }, { "epoch": 4.770942907233783, "grad_norm": 0.8010106682777405, "learning_rate": 5.169368222226189e-07, "loss": 0.029424139857292177, "memory(GiB)": 122.96, "step": 62590, "token_acc": 0.9883365200764819, "train_speed(iter/s)": 0.231957 }, { "epoch": 4.771324033844043, "grad_norm": 1.048891544342041, "learning_rate": 5.152209557620969e-07, "loss": 0.03727588951587677, "memory(GiB)": 122.96, "step": 62595, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.231962 }, { "epoch": 4.771705160454303, "grad_norm": 1.6480255126953125, "learning_rate": 5.135079270240151e-07, "loss": 0.023561863601207732, "memory(GiB)": 122.96, "step": 62600, "token_acc": 0.9907876554583142, "train_speed(iter/s)": 0.231966 }, { "epoch": 4.771705160454303, "eval_loss": 0.046934016048908234, "eval_runtime": 219.4664, "eval_samples_per_second": 2.415, "eval_steps_per_second": 2.415, "eval_token_acc": 0.9810628877778447, "step": 62600 }, { "epoch": 4.772086287064563, "grad_norm": 2.224191427230835, "learning_rate": 5.117977361066006e-07, "loss": 0.045066073536872864, "memory(GiB)": 122.96, "step": 62605, "token_acc": 0.9812285343486272, "train_speed(iter/s)": 0.231778 }, { "epoch": 4.772467413674823, "grad_norm": 1.2512331008911133, "learning_rate": 5.100903831079362e-07, "loss": 0.03577404618263245, "memory(GiB)": 122.96, "step": 62610, "token_acc": 0.9840017532325225, "train_speed(iter/s)": 0.231783 }, { "epoch": 4.772848540285082, "grad_norm": 1.0179688930511475, "learning_rate": 5.083858681259324e-07, "loss": 0.02062276899814606, "memory(GiB)": 122.96, "step": 62615, "token_acc": 0.993305231837342, "train_speed(iter/s)": 0.231787 }, { "epoch": 4.773229666895343, "grad_norm": 0.8549711108207703, "learning_rate": 5.066841912583276e-07, "loss": 0.03325316309928894, "memory(GiB)": 122.96, "step": 62620, "token_acc": 0.9863851919840906, "train_speed(iter/s)": 0.231789 }, { "epoch": 4.773610793505602, "grad_norm": 3.8969483375549316, "learning_rate": 5.049853526027049e-07, "loss": 0.03270209431648254, "memory(GiB)": 122.96, "step": 62625, "token_acc": 0.9924103637791154, "train_speed(iter/s)": 0.231794 }, { "epoch": 4.773991920115862, "grad_norm": 0.6585058569908142, "learning_rate": 5.032893522564919e-07, "loss": 0.024587245285511018, "memory(GiB)": 122.96, "step": 62630, "token_acc": 0.9905340491664312, "train_speed(iter/s)": 0.231796 }, { "epoch": 4.774373046726122, "grad_norm": 0.8218967914581299, "learning_rate": 5.015961903169331e-07, "loss": 0.0219271719455719, "memory(GiB)": 122.96, "step": 62635, "token_acc": 0.9918912745545911, "train_speed(iter/s)": 0.231798 }, { "epoch": 4.774754173336382, "grad_norm": 0.8402374982833862, "learning_rate": 4.999058668811396e-07, "loss": 0.027888554334640502, "memory(GiB)": 122.96, "step": 62640, "token_acc": 0.987873754152824, "train_speed(iter/s)": 0.231801 }, { "epoch": 4.775135299946642, "grad_norm": 1.5269091129302979, "learning_rate": 4.982183820460339e-07, "loss": 0.016513562202453612, "memory(GiB)": 122.96, "step": 62645, "token_acc": 0.9923566878980892, "train_speed(iter/s)": 0.231808 }, { "epoch": 4.775516426556902, "grad_norm": 2.2441885471343994, "learning_rate": 4.965337359083888e-07, "loss": 0.04495645761489868, "memory(GiB)": 122.96, "step": 62650, "token_acc": 0.9873496474491912, "train_speed(iter/s)": 0.231813 }, { "epoch": 4.775897553167162, "grad_norm": 0.9190105199813843, "learning_rate": 4.948519285648101e-07, "loss": 0.024235151708126068, "memory(GiB)": 122.96, "step": 62655, "token_acc": 0.9887140902872777, "train_speed(iter/s)": 0.231815 }, { "epoch": 4.776278679777422, "grad_norm": 1.3544459342956543, "learning_rate": 4.931729601117374e-07, "loss": 0.02572435438632965, "memory(GiB)": 122.96, "step": 62660, "token_acc": 0.990108803165183, "train_speed(iter/s)": 0.231818 }, { "epoch": 4.776659806387682, "grad_norm": 4.327854633331299, "learning_rate": 4.914968306454604e-07, "loss": 0.03743477761745453, "memory(GiB)": 122.96, "step": 62665, "token_acc": 0.9886527550726031, "train_speed(iter/s)": 0.231818 }, { "epoch": 4.7770409329979415, "grad_norm": 0.6562339067459106, "learning_rate": 4.89823540262091e-07, "loss": 0.020708820223808287, "memory(GiB)": 122.96, "step": 62670, "token_acc": 0.9898463941681853, "train_speed(iter/s)": 0.231823 }, { "epoch": 4.7774220596082015, "grad_norm": 2.04366397857666, "learning_rate": 4.881530890575859e-07, "loss": 0.029667758941650392, "memory(GiB)": 122.96, "step": 62675, "token_acc": 0.9889442541264404, "train_speed(iter/s)": 0.231826 }, { "epoch": 4.777803186218462, "grad_norm": 1.1642143726348877, "learning_rate": 4.86485477127735e-07, "loss": 0.029621019959449768, "memory(GiB)": 122.96, "step": 62680, "token_acc": 0.9878362907842015, "train_speed(iter/s)": 0.231827 }, { "epoch": 4.778184312828722, "grad_norm": 1.4233492612838745, "learning_rate": 4.848207045681785e-07, "loss": 0.024975875020027162, "memory(GiB)": 122.96, "step": 62685, "token_acc": 0.9862165401518178, "train_speed(iter/s)": 0.231829 }, { "epoch": 4.778565439438982, "grad_norm": 0.22307443618774414, "learning_rate": 4.831587714743679e-07, "loss": 0.01600743681192398, "memory(GiB)": 122.96, "step": 62690, "token_acc": 0.9903713892709766, "train_speed(iter/s)": 0.231833 }, { "epoch": 4.778946566049242, "grad_norm": 0.7013185024261475, "learning_rate": 4.814996779416214e-07, "loss": 0.023009638488292693, "memory(GiB)": 122.96, "step": 62695, "token_acc": 0.9923132356473697, "train_speed(iter/s)": 0.231838 }, { "epoch": 4.779327692659502, "grad_norm": 1.3341270685195923, "learning_rate": 4.79843424065074e-07, "loss": 0.029384291172027587, "memory(GiB)": 122.96, "step": 62700, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.23184 }, { "epoch": 4.779708819269762, "grad_norm": 1.5046217441558838, "learning_rate": 4.781900099396996e-07, "loss": 0.02776591181755066, "memory(GiB)": 122.96, "step": 62705, "token_acc": 0.9900604432505037, "train_speed(iter/s)": 0.231842 }, { "epoch": 4.780089945880022, "grad_norm": 1.2191438674926758, "learning_rate": 4.7653943566032255e-07, "loss": 0.03564045131206513, "memory(GiB)": 122.96, "step": 62710, "token_acc": 0.9856436808660861, "train_speed(iter/s)": 0.231846 }, { "epoch": 4.780471072490281, "grad_norm": 0.94449782371521, "learning_rate": 4.7489170132159476e-07, "loss": 0.04566242098808289, "memory(GiB)": 122.96, "step": 62715, "token_acc": 0.9814202487797198, "train_speed(iter/s)": 0.231849 }, { "epoch": 4.780852199100541, "grad_norm": 0.6896898746490479, "learning_rate": 4.7324680701799626e-07, "loss": 0.03482694625854492, "memory(GiB)": 122.96, "step": 62720, "token_acc": 0.9869578404610252, "train_speed(iter/s)": 0.231851 }, { "epoch": 4.781233325710801, "grad_norm": 0.9160072207450867, "learning_rate": 4.7160475284386827e-07, "loss": 0.021392405033111572, "memory(GiB)": 122.96, "step": 62725, "token_acc": 0.9916094584286804, "train_speed(iter/s)": 0.231853 }, { "epoch": 4.781614452321061, "grad_norm": 0.9346680045127869, "learning_rate": 4.6996553889336325e-07, "loss": 0.03498615622520447, "memory(GiB)": 122.96, "step": 62730, "token_acc": 0.9850498338870431, "train_speed(iter/s)": 0.231857 }, { "epoch": 4.781995578931321, "grad_norm": 1.2975705862045288, "learning_rate": 4.683291652604893e-07, "loss": 0.031201893091201784, "memory(GiB)": 122.96, "step": 62735, "token_acc": 0.9835255354200988, "train_speed(iter/s)": 0.231862 }, { "epoch": 4.782376705541581, "grad_norm": 1.0884557962417603, "learning_rate": 4.666956320390825e-07, "loss": 0.02988581657409668, "memory(GiB)": 122.96, "step": 62740, "token_acc": 0.9881566113975199, "train_speed(iter/s)": 0.231865 }, { "epoch": 4.782757832151841, "grad_norm": 1.6932110786437988, "learning_rate": 4.650649393228179e-07, "loss": 0.04001253843307495, "memory(GiB)": 122.96, "step": 62745, "token_acc": 0.9848267964500429, "train_speed(iter/s)": 0.231865 }, { "epoch": 4.783138958762101, "grad_norm": 1.3907928466796875, "learning_rate": 4.6343708720520405e-07, "loss": 0.04118227660655975, "memory(GiB)": 122.96, "step": 62750, "token_acc": 0.9805112272277927, "train_speed(iter/s)": 0.231867 }, { "epoch": 4.783520085372361, "grad_norm": 0.6844531297683716, "learning_rate": 4.618120757796052e-07, "loss": 0.024432250857353212, "memory(GiB)": 122.96, "step": 62755, "token_acc": 0.9889033942558747, "train_speed(iter/s)": 0.231869 }, { "epoch": 4.783901211982621, "grad_norm": 1.2198781967163086, "learning_rate": 4.6018990513919114e-07, "loss": 0.02533690333366394, "memory(GiB)": 122.96, "step": 62760, "token_acc": 0.9890194865450046, "train_speed(iter/s)": 0.231873 }, { "epoch": 4.784282338592881, "grad_norm": 1.9371570348739624, "learning_rate": 4.5857057537699867e-07, "loss": 0.023909792304039, "memory(GiB)": 122.96, "step": 62765, "token_acc": 0.9932513858761147, "train_speed(iter/s)": 0.231878 }, { "epoch": 4.78466346520314, "grad_norm": 2.163282632827759, "learning_rate": 4.569540865858812e-07, "loss": 0.025183388590812684, "memory(GiB)": 122.96, "step": 62770, "token_acc": 0.9883214568487728, "train_speed(iter/s)": 0.231882 }, { "epoch": 4.7850445918134, "grad_norm": 1.7538942098617554, "learning_rate": 4.553404388585369e-07, "loss": 0.04961448311805725, "memory(GiB)": 122.96, "step": 62775, "token_acc": 0.9830022075055188, "train_speed(iter/s)": 0.231885 }, { "epoch": 4.78542571842366, "grad_norm": 0.8211517930030823, "learning_rate": 4.5372963228750285e-07, "loss": 0.02752041518688202, "memory(GiB)": 122.96, "step": 62780, "token_acc": 0.987410071942446, "train_speed(iter/s)": 0.231889 }, { "epoch": 4.78580684503392, "grad_norm": 1.928106665611267, "learning_rate": 4.5212166696515514e-07, "loss": 0.01908148229122162, "memory(GiB)": 122.96, "step": 62785, "token_acc": 0.9922495274102079, "train_speed(iter/s)": 0.231893 }, { "epoch": 4.78618797164418, "grad_norm": 1.5437453985214233, "learning_rate": 4.5051654298369773e-07, "loss": 0.02234601676464081, "memory(GiB)": 122.96, "step": 62790, "token_acc": 0.9922212618841832, "train_speed(iter/s)": 0.231897 }, { "epoch": 4.78656909825444, "grad_norm": 2.175292491912842, "learning_rate": 4.489142604351848e-07, "loss": 0.033027869462966916, "memory(GiB)": 122.96, "step": 62795, "token_acc": 0.986705663387397, "train_speed(iter/s)": 0.231902 }, { "epoch": 4.7869502248647, "grad_norm": 0.9385145306587219, "learning_rate": 4.4731481941148735e-07, "loss": 0.019714036583900453, "memory(GiB)": 122.96, "step": 62800, "token_acc": 0.9927128188141768, "train_speed(iter/s)": 0.231908 }, { "epoch": 4.7869502248647, "eval_loss": 0.04687352478504181, "eval_runtime": 220.9384, "eval_samples_per_second": 2.399, "eval_steps_per_second": 2.399, "eval_token_acc": 0.980912294440094, "step": 62800 }, { "epoch": 4.78733135147496, "grad_norm": 0.6770663857460022, "learning_rate": 4.4571822000433194e-07, "loss": 0.021140208840370177, "memory(GiB)": 122.96, "step": 62805, "token_acc": 0.9814591776008161, "train_speed(iter/s)": 0.231721 }, { "epoch": 4.78771247808522, "grad_norm": 1.3421344757080078, "learning_rate": 4.441244623052787e-07, "loss": 0.03847982883453369, "memory(GiB)": 122.96, "step": 62810, "token_acc": 0.9834224598930481, "train_speed(iter/s)": 0.231727 }, { "epoch": 4.788093604695479, "grad_norm": 0.693310022354126, "learning_rate": 4.425335464057212e-07, "loss": 0.022837018966674803, "memory(GiB)": 122.96, "step": 62815, "token_acc": 0.988318863456985, "train_speed(iter/s)": 0.23173 }, { "epoch": 4.788474731305739, "grad_norm": 2.5987956523895264, "learning_rate": 4.409454723968864e-07, "loss": 0.014039571583271026, "memory(GiB)": 122.96, "step": 62820, "token_acc": 0.9939197405755978, "train_speed(iter/s)": 0.231735 }, { "epoch": 4.788855857915999, "grad_norm": 1.7249819040298462, "learning_rate": 4.393602403698516e-07, "loss": 0.04708206653594971, "memory(GiB)": 122.96, "step": 62825, "token_acc": 0.9813925570228091, "train_speed(iter/s)": 0.231739 }, { "epoch": 4.7892369845262595, "grad_norm": 1.1998162269592285, "learning_rate": 4.3777785041551056e-07, "loss": 0.0169865220785141, "memory(GiB)": 122.96, "step": 62830, "token_acc": 0.9927426160337552, "train_speed(iter/s)": 0.231742 }, { "epoch": 4.7896181111365195, "grad_norm": 0.19533002376556396, "learning_rate": 4.361983026246186e-07, "loss": 0.021781469881534576, "memory(GiB)": 122.96, "step": 62835, "token_acc": 0.9894580549368968, "train_speed(iter/s)": 0.231746 }, { "epoch": 4.7899992377467795, "grad_norm": 1.0102574825286865, "learning_rate": 4.346215970877476e-07, "loss": 0.02966471016407013, "memory(GiB)": 122.96, "step": 62840, "token_acc": 0.9899841855561413, "train_speed(iter/s)": 0.23175 }, { "epoch": 4.7903803643570395, "grad_norm": 0.8018893003463745, "learning_rate": 4.330477338953198e-07, "loss": 0.028153863549232484, "memory(GiB)": 122.96, "step": 62845, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.231755 }, { "epoch": 4.7907614909672995, "grad_norm": 1.1174206733703613, "learning_rate": 4.3147671313757963e-07, "loss": 0.02002035826444626, "memory(GiB)": 122.96, "step": 62850, "token_acc": 0.9910184714455177, "train_speed(iter/s)": 0.231758 }, { "epoch": 4.7911426175775595, "grad_norm": 2.8240368366241455, "learning_rate": 4.2990853490463277e-07, "loss": 0.05269259214401245, "memory(GiB)": 122.96, "step": 62855, "token_acc": 0.9813506640293869, "train_speed(iter/s)": 0.231764 }, { "epoch": 4.79152374418782, "grad_norm": 0.9259799122810364, "learning_rate": 4.283431992863851e-07, "loss": 0.036946064233779906, "memory(GiB)": 122.96, "step": 62860, "token_acc": 0.9850223072020395, "train_speed(iter/s)": 0.231769 }, { "epoch": 4.79190487079808, "grad_norm": 1.4255354404449463, "learning_rate": 4.267807063726259e-07, "loss": 0.025675690174102782, "memory(GiB)": 122.96, "step": 62865, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.231773 }, { "epoch": 4.792285997408339, "grad_norm": 0.47609105706214905, "learning_rate": 4.252210562529391e-07, "loss": 0.04000739455223083, "memory(GiB)": 122.96, "step": 62870, "token_acc": 0.9834745762711864, "train_speed(iter/s)": 0.231776 }, { "epoch": 4.792667124018599, "grad_norm": 1.2595787048339844, "learning_rate": 4.2366424901677525e-07, "loss": 0.048571014404296876, "memory(GiB)": 122.96, "step": 62875, "token_acc": 0.9776721108124871, "train_speed(iter/s)": 0.23178 }, { "epoch": 4.793048250628859, "grad_norm": 0.798064649105072, "learning_rate": 4.2211028475340196e-07, "loss": 0.016861100494861603, "memory(GiB)": 122.96, "step": 62880, "token_acc": 0.9935117599351176, "train_speed(iter/s)": 0.231784 }, { "epoch": 4.793429377239119, "grad_norm": 0.67025226354599, "learning_rate": 4.2055916355193126e-07, "loss": 0.03081633448600769, "memory(GiB)": 122.96, "step": 62885, "token_acc": 0.9887742718446602, "train_speed(iter/s)": 0.231789 }, { "epoch": 4.793810503849379, "grad_norm": 0.08056657016277313, "learning_rate": 4.190108855013197e-07, "loss": 0.016903056204319, "memory(GiB)": 122.96, "step": 62890, "token_acc": 0.9932203389830508, "train_speed(iter/s)": 0.231794 }, { "epoch": 4.794191630459639, "grad_norm": 0.6369702219963074, "learning_rate": 4.1746545069034636e-07, "loss": 0.017359675467014314, "memory(GiB)": 122.96, "step": 62895, "token_acc": 0.9921663151551672, "train_speed(iter/s)": 0.231799 }, { "epoch": 4.794572757069899, "grad_norm": 1.533560872077942, "learning_rate": 4.159228592076403e-07, "loss": 0.013130754232406616, "memory(GiB)": 122.96, "step": 62900, "token_acc": 0.9941728714794432, "train_speed(iter/s)": 0.231804 }, { "epoch": 4.794953883680159, "grad_norm": 1.611738920211792, "learning_rate": 4.143831111416585e-07, "loss": 0.03364244699478149, "memory(GiB)": 122.96, "step": 62905, "token_acc": 0.9820426487093153, "train_speed(iter/s)": 0.231808 }, { "epoch": 4.795335010290419, "grad_norm": 1.980904459953308, "learning_rate": 4.1284620658070816e-07, "loss": 0.03443012535572052, "memory(GiB)": 122.96, "step": 62910, "token_acc": 0.9902006532897807, "train_speed(iter/s)": 0.231813 }, { "epoch": 4.795716136900678, "grad_norm": 0.9902762770652771, "learning_rate": 4.1131214561290763e-07, "loss": 0.0279181569814682, "memory(GiB)": 122.96, "step": 62915, "token_acc": 0.9908866730996033, "train_speed(iter/s)": 0.231813 }, { "epoch": 4.796097263510938, "grad_norm": 0.7366730570793152, "learning_rate": 4.097809283262366e-07, "loss": 0.021202768385410308, "memory(GiB)": 122.96, "step": 62920, "token_acc": 0.9920582395764395, "train_speed(iter/s)": 0.23182 }, { "epoch": 4.796478390121198, "grad_norm": 0.3260992169380188, "learning_rate": 4.0825255480850255e-07, "loss": 0.01669573187828064, "memory(GiB)": 122.96, "step": 62925, "token_acc": 0.9912779673871824, "train_speed(iter/s)": 0.231825 }, { "epoch": 4.796859516731458, "grad_norm": 0.9112858176231384, "learning_rate": 4.0672702514735207e-07, "loss": 0.02739083766937256, "memory(GiB)": 122.96, "step": 62930, "token_acc": 0.9869470630891951, "train_speed(iter/s)": 0.231829 }, { "epoch": 4.797240643341718, "grad_norm": 2.9881927967071533, "learning_rate": 4.052043394302651e-07, "loss": 0.037132936716079715, "memory(GiB)": 122.96, "step": 62935, "token_acc": 0.9854838709677419, "train_speed(iter/s)": 0.231833 }, { "epoch": 4.797621769951978, "grad_norm": 1.567980408668518, "learning_rate": 4.0368449774456084e-07, "loss": 0.039171501994132996, "memory(GiB)": 122.96, "step": 62940, "token_acc": 0.9785658612626656, "train_speed(iter/s)": 0.231837 }, { "epoch": 4.798002896562238, "grad_norm": 0.31848493218421936, "learning_rate": 4.021675001773972e-07, "loss": 0.02820260226726532, "memory(GiB)": 122.96, "step": 62945, "token_acc": 0.9888996138996139, "train_speed(iter/s)": 0.231843 }, { "epoch": 4.798384023172498, "grad_norm": 1.1921637058258057, "learning_rate": 4.0065334681576027e-07, "loss": 0.04333561360836029, "memory(GiB)": 122.96, "step": 62950, "token_acc": 0.983283393786469, "train_speed(iter/s)": 0.231847 }, { "epoch": 4.798765149782758, "grad_norm": 1.0458654165267944, "learning_rate": 3.991420377464916e-07, "loss": 0.06382641792297364, "memory(GiB)": 122.96, "step": 62955, "token_acc": 0.9730911537167843, "train_speed(iter/s)": 0.231851 }, { "epoch": 4.799146276393017, "grad_norm": 0.9163231253623962, "learning_rate": 3.9763357305624414e-07, "loss": 0.030017131567001344, "memory(GiB)": 122.96, "step": 62960, "token_acc": 0.98898628905372, "train_speed(iter/s)": 0.231855 }, { "epoch": 4.799527403003278, "grad_norm": 1.6784805059432983, "learning_rate": 3.9612795283153206e-07, "loss": 0.03238446712493896, "memory(GiB)": 122.96, "step": 62965, "token_acc": 0.9881413359148112, "train_speed(iter/s)": 0.23186 }, { "epoch": 4.799908529613537, "grad_norm": 1.1986920833587646, "learning_rate": 3.946251771586973e-07, "loss": 0.039999520778656004, "memory(GiB)": 122.96, "step": 62970, "token_acc": 0.9842956852791879, "train_speed(iter/s)": 0.231863 }, { "epoch": 4.800289656223797, "grad_norm": 1.2301034927368164, "learning_rate": 3.9312524612390434e-07, "loss": 0.02826564908027649, "memory(GiB)": 122.96, "step": 62975, "token_acc": 0.9890613451589061, "train_speed(iter/s)": 0.231865 }, { "epoch": 4.800670782834057, "grad_norm": 1.8153774738311768, "learning_rate": 3.916281598131788e-07, "loss": 0.025593915581703187, "memory(GiB)": 122.96, "step": 62980, "token_acc": 0.9889901290812453, "train_speed(iter/s)": 0.23187 }, { "epoch": 4.801051909444317, "grad_norm": 1.4484559297561646, "learning_rate": 3.9013391831236313e-07, "loss": 0.02503419518470764, "memory(GiB)": 122.96, "step": 62985, "token_acc": 0.9906987836870975, "train_speed(iter/s)": 0.231875 }, { "epoch": 4.801433036054577, "grad_norm": 0.8772055506706238, "learning_rate": 3.8864252170714435e-07, "loss": 0.04700168967247009, "memory(GiB)": 122.96, "step": 62990, "token_acc": 0.9783399209486165, "train_speed(iter/s)": 0.231878 }, { "epoch": 4.801814162664837, "grad_norm": 1.037616491317749, "learning_rate": 3.871539700830651e-07, "loss": 0.03213859498500824, "memory(GiB)": 122.96, "step": 62995, "token_acc": 0.9880224883891469, "train_speed(iter/s)": 0.231883 }, { "epoch": 4.802195289275097, "grad_norm": 1.2666484117507935, "learning_rate": 3.856682635254627e-07, "loss": 0.02355342507362366, "memory(GiB)": 122.96, "step": 63000, "token_acc": 0.9887527278831627, "train_speed(iter/s)": 0.231886 }, { "epoch": 4.802195289275097, "eval_loss": 0.046976033598184586, "eval_runtime": 219.2182, "eval_samples_per_second": 2.418, "eval_steps_per_second": 2.418, "eval_token_acc": 0.9810026504427444, "step": 63000 }, { "epoch": 4.802576415885357, "grad_norm": 0.9564261436462402, "learning_rate": 3.8418540211955235e-07, "loss": 0.018975776433944703, "memory(GiB)": 122.96, "step": 63005, "token_acc": 0.9812516484482607, "train_speed(iter/s)": 0.231702 }, { "epoch": 4.8029575424956175, "grad_norm": 0.6300713419914246, "learning_rate": 3.82705385950366e-07, "loss": 0.03789075016975403, "memory(GiB)": 122.96, "step": 63010, "token_acc": 0.9870918754745635, "train_speed(iter/s)": 0.231704 }, { "epoch": 4.803338669105877, "grad_norm": 1.6559646129608154, "learning_rate": 3.812282151027691e-07, "loss": 0.022215460240840913, "memory(GiB)": 122.96, "step": 63015, "token_acc": 0.9905131522207848, "train_speed(iter/s)": 0.231711 }, { "epoch": 4.803719795716137, "grad_norm": 1.1685469150543213, "learning_rate": 3.7975388966147717e-07, "loss": 0.01466372311115265, "memory(GiB)": 122.96, "step": 63020, "token_acc": 0.9918856259659969, "train_speed(iter/s)": 0.231717 }, { "epoch": 4.804100922326397, "grad_norm": 0.9703885912895203, "learning_rate": 3.782824097110338e-07, "loss": 0.03989015817642212, "memory(GiB)": 122.96, "step": 63025, "token_acc": 0.9860248447204969, "train_speed(iter/s)": 0.23172 }, { "epoch": 4.804482048936657, "grad_norm": 0.9476400017738342, "learning_rate": 3.768137753358159e-07, "loss": 0.021323683857917785, "memory(GiB)": 122.96, "step": 63030, "token_acc": 0.9921875, "train_speed(iter/s)": 0.231724 }, { "epoch": 4.804863175546917, "grad_norm": 1.2856132984161377, "learning_rate": 3.75347986620056e-07, "loss": 0.04156841933727264, "memory(GiB)": 122.96, "step": 63035, "token_acc": 0.9873517786561264, "train_speed(iter/s)": 0.23173 }, { "epoch": 4.805244302157177, "grad_norm": 2.2435648441314697, "learning_rate": 3.738850436477981e-07, "loss": 0.030949196219444274, "memory(GiB)": 122.96, "step": 63040, "token_acc": 0.9887556221889056, "train_speed(iter/s)": 0.231736 }, { "epoch": 4.805625428767437, "grad_norm": 0.887566864490509, "learning_rate": 3.724249465029417e-07, "loss": 0.0371663510799408, "memory(GiB)": 122.96, "step": 63045, "token_acc": 0.9876506024096385, "train_speed(iter/s)": 0.231741 }, { "epoch": 4.806006555377697, "grad_norm": 0.6292742490768433, "learning_rate": 3.709676952692143e-07, "loss": 0.016661980748176576, "memory(GiB)": 122.96, "step": 63050, "token_acc": 0.9934289127837514, "train_speed(iter/s)": 0.231746 }, { "epoch": 4.806387681987957, "grad_norm": 1.110206127166748, "learning_rate": 3.695132900301823e-07, "loss": 0.015166878700256348, "memory(GiB)": 122.96, "step": 63055, "token_acc": 0.9928656361474435, "train_speed(iter/s)": 0.231752 }, { "epoch": 4.806768808598216, "grad_norm": 0.55885910987854, "learning_rate": 3.6806173086924575e-07, "loss": 0.01713033616542816, "memory(GiB)": 122.96, "step": 63060, "token_acc": 0.9887459807073955, "train_speed(iter/s)": 0.231753 }, { "epoch": 4.807149935208476, "grad_norm": 0.00022453462588600814, "learning_rate": 3.666130178696547e-07, "loss": 0.013344967365264892, "memory(GiB)": 122.96, "step": 63065, "token_acc": 0.993766404199475, "train_speed(iter/s)": 0.231759 }, { "epoch": 4.807531061818736, "grad_norm": 0.9472525119781494, "learning_rate": 3.651671511144761e-07, "loss": 0.029626345634460448, "memory(GiB)": 122.96, "step": 63070, "token_acc": 0.9894995531724754, "train_speed(iter/s)": 0.231763 }, { "epoch": 4.807912188428996, "grad_norm": 1.0764795541763306, "learning_rate": 3.637241306866268e-07, "loss": 0.03310932219028473, "memory(GiB)": 122.96, "step": 63075, "token_acc": 0.9825987965522849, "train_speed(iter/s)": 0.231767 }, { "epoch": 4.808293315039256, "grad_norm": 0.723978579044342, "learning_rate": 3.6228395666886294e-07, "loss": 0.04004574418067932, "memory(GiB)": 122.96, "step": 63080, "token_acc": 0.9914417379855168, "train_speed(iter/s)": 0.231771 }, { "epoch": 4.808674441649516, "grad_norm": 1.5451291799545288, "learning_rate": 3.608466291437629e-07, "loss": 0.030081966519355775, "memory(GiB)": 122.96, "step": 63085, "token_acc": 0.9887429643527205, "train_speed(iter/s)": 0.231776 }, { "epoch": 4.809055568259776, "grad_norm": 0.9197754263877869, "learning_rate": 3.594121481937551e-07, "loss": 0.02376446723937988, "memory(GiB)": 122.96, "step": 63090, "token_acc": 0.9906617945594803, "train_speed(iter/s)": 0.231779 }, { "epoch": 4.809436694870036, "grad_norm": 1.2862337827682495, "learning_rate": 3.5798051390110164e-07, "loss": 0.03707510232925415, "memory(GiB)": 122.96, "step": 63095, "token_acc": 0.9845050215208034, "train_speed(iter/s)": 0.231781 }, { "epoch": 4.809817821480296, "grad_norm": 0.3562486171722412, "learning_rate": 3.565517263478979e-07, "loss": 0.025219646096229554, "memory(GiB)": 122.96, "step": 63100, "token_acc": 0.9894753627810556, "train_speed(iter/s)": 0.231784 }, { "epoch": 4.810198948090556, "grad_norm": 0.8712477087974548, "learning_rate": 3.5512578561607834e-07, "loss": 0.03258058130741119, "memory(GiB)": 122.96, "step": 63105, "token_acc": 0.9835796387520526, "train_speed(iter/s)": 0.231788 }, { "epoch": 4.810580074700816, "grad_norm": 0.4659501612186432, "learning_rate": 3.5370269178741646e-07, "loss": 0.020153559744358063, "memory(GiB)": 122.96, "step": 63110, "token_acc": 0.9930896121266162, "train_speed(iter/s)": 0.231792 }, { "epoch": 4.810961201311075, "grad_norm": 1.564516305923462, "learning_rate": 3.5228244494351916e-07, "loss": 0.038122183084487914, "memory(GiB)": 122.96, "step": 63115, "token_acc": 0.9852549298276781, "train_speed(iter/s)": 0.231796 }, { "epoch": 4.811342327921335, "grad_norm": 0.31255125999450684, "learning_rate": 3.5086504516582686e-07, "loss": 0.02219025194644928, "memory(GiB)": 122.96, "step": 63120, "token_acc": 0.9960638530505139, "train_speed(iter/s)": 0.2318 }, { "epoch": 4.811723454531595, "grad_norm": 0.9172451496124268, "learning_rate": 3.4945049253562455e-07, "loss": 0.03771839141845703, "memory(GiB)": 122.96, "step": 63125, "token_acc": 0.9880580039806653, "train_speed(iter/s)": 0.231801 }, { "epoch": 4.812104581141855, "grad_norm": 3.185112237930298, "learning_rate": 3.480387871340307e-07, "loss": 0.05779516100883484, "memory(GiB)": 122.96, "step": 63130, "token_acc": 0.9833714140644834, "train_speed(iter/s)": 0.231803 }, { "epoch": 4.812485707752115, "grad_norm": 1.875710368156433, "learning_rate": 3.4662992904200276e-07, "loss": 0.062228846549987796, "memory(GiB)": 122.96, "step": 63135, "token_acc": 0.9801128728836335, "train_speed(iter/s)": 0.231808 }, { "epoch": 4.812866834362375, "grad_norm": 0.5117953419685364, "learning_rate": 3.452239183403205e-07, "loss": 0.030853748321533203, "memory(GiB)": 122.96, "step": 63140, "token_acc": 0.988309790550414, "train_speed(iter/s)": 0.231811 }, { "epoch": 4.813247960972635, "grad_norm": 0.7803714275360107, "learning_rate": 3.4382075510962507e-07, "loss": 0.03281426131725311, "memory(GiB)": 122.96, "step": 63145, "token_acc": 0.9872895344998349, "train_speed(iter/s)": 0.231814 }, { "epoch": 4.813629087582895, "grad_norm": 1.8475613594055176, "learning_rate": 3.4242043943037985e-07, "loss": 0.027683475613594057, "memory(GiB)": 122.96, "step": 63150, "token_acc": 0.9906654622101777, "train_speed(iter/s)": 0.231817 }, { "epoch": 4.814010214193155, "grad_norm": 0.9337578415870667, "learning_rate": 3.410229713828761e-07, "loss": 0.024223875999450684, "memory(GiB)": 122.96, "step": 63155, "token_acc": 0.9909397387273493, "train_speed(iter/s)": 0.23182 }, { "epoch": 4.8143913408034145, "grad_norm": 0.59360671043396, "learning_rate": 3.3962835104726087e-07, "loss": 0.01840486526489258, "memory(GiB)": 122.96, "step": 63160, "token_acc": 0.9894979508196722, "train_speed(iter/s)": 0.231821 }, { "epoch": 4.8147724674136745, "grad_norm": 0.020730547606945038, "learning_rate": 3.382365785035146e-07, "loss": 0.02042759656906128, "memory(GiB)": 122.96, "step": 63165, "token_acc": 0.9886653895274585, "train_speed(iter/s)": 0.231823 }, { "epoch": 4.8151535940239345, "grad_norm": 1.3015488386154175, "learning_rate": 3.368476538314347e-07, "loss": 0.026781582832336427, "memory(GiB)": 122.96, "step": 63170, "token_acc": 0.9918330308529946, "train_speed(iter/s)": 0.231828 }, { "epoch": 4.8155347206341945, "grad_norm": 0.398786723613739, "learning_rate": 3.354615771106739e-07, "loss": 0.028829434514045717, "memory(GiB)": 122.96, "step": 63175, "token_acc": 0.9880542619963555, "train_speed(iter/s)": 0.231832 }, { "epoch": 4.8159158472444545, "grad_norm": 3.616356134414673, "learning_rate": 3.340783484207299e-07, "loss": 0.05064421892166138, "memory(GiB)": 122.96, "step": 63180, "token_acc": 0.9860576923076924, "train_speed(iter/s)": 0.231838 }, { "epoch": 4.816296973854715, "grad_norm": 0.9188841581344604, "learning_rate": 3.3269796784090587e-07, "loss": 0.02438846677541733, "memory(GiB)": 122.96, "step": 63185, "token_acc": 0.9914995640802092, "train_speed(iter/s)": 0.231842 }, { "epoch": 4.816678100464975, "grad_norm": 0.9154105186462402, "learning_rate": 3.3132043545037183e-07, "loss": 0.04107388257980347, "memory(GiB)": 122.96, "step": 63190, "token_acc": 0.9840905614196115, "train_speed(iter/s)": 0.231845 }, { "epoch": 4.817059227075235, "grad_norm": 1.4026626348495483, "learning_rate": 3.2994575132812013e-07, "loss": 0.03948026299476624, "memory(GiB)": 122.96, "step": 63195, "token_acc": 0.9864069506726457, "train_speed(iter/s)": 0.231847 }, { "epoch": 4.817440353685495, "grad_norm": 1.0218368768692017, "learning_rate": 3.285739155529821e-07, "loss": 0.028913941979408265, "memory(GiB)": 122.96, "step": 63200, "token_acc": 0.9895212285456187, "train_speed(iter/s)": 0.23185 }, { "epoch": 4.817440353685495, "eval_loss": 0.046944618225097656, "eval_runtime": 216.6095, "eval_samples_per_second": 2.447, "eval_steps_per_second": 2.447, "eval_token_acc": 0.9809725317751943, "step": 63200 }, { "epoch": 4.817821480295755, "grad_norm": 2.01212215423584, "learning_rate": 3.2720492820362826e-07, "loss": 0.042246705293655394, "memory(GiB)": 122.96, "step": 63205, "token_acc": 0.9809797289796401, "train_speed(iter/s)": 0.231672 }, { "epoch": 4.818202606906015, "grad_norm": 0.7240539789199829, "learning_rate": 3.258387893585624e-07, "loss": 0.01544800102710724, "memory(GiB)": 122.96, "step": 63210, "token_acc": 0.9941520467836257, "train_speed(iter/s)": 0.231676 }, { "epoch": 4.818583733516274, "grad_norm": 0.6905646920204163, "learning_rate": 3.2447549909612186e-07, "loss": 0.028396591544151306, "memory(GiB)": 122.96, "step": 63215, "token_acc": 0.9868391701985277, "train_speed(iter/s)": 0.231682 }, { "epoch": 4.818964860126534, "grad_norm": 3.066343307495117, "learning_rate": 3.231150574944941e-07, "loss": 0.030277109146118163, "memory(GiB)": 122.96, "step": 63220, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.231686 }, { "epoch": 4.819345986736794, "grad_norm": 1.7845511436462402, "learning_rate": 3.2175746463168345e-07, "loss": 0.04549559950828552, "memory(GiB)": 122.96, "step": 63225, "token_acc": 0.9847161572052402, "train_speed(iter/s)": 0.231688 }, { "epoch": 4.819727113347054, "grad_norm": 2.5665321350097656, "learning_rate": 3.2040272058554975e-07, "loss": 0.026473623514175416, "memory(GiB)": 122.96, "step": 63230, "token_acc": 0.994447015270708, "train_speed(iter/s)": 0.231694 }, { "epoch": 4.820108239957314, "grad_norm": 42.414817810058594, "learning_rate": 3.190508254337754e-07, "loss": 0.054391014575958255, "memory(GiB)": 122.96, "step": 63235, "token_acc": 0.9769794209975584, "train_speed(iter/s)": 0.2317 }, { "epoch": 4.820489366567574, "grad_norm": 2.351365089416504, "learning_rate": 3.1770177925389277e-07, "loss": 0.0252038836479187, "memory(GiB)": 122.96, "step": 63240, "token_acc": 0.9915824915824916, "train_speed(iter/s)": 0.231704 }, { "epoch": 4.820870493177834, "grad_norm": 0.7023640275001526, "learning_rate": 3.1635558212325666e-07, "loss": 0.024034537374973297, "memory(GiB)": 122.96, "step": 63245, "token_acc": 0.9895712630359212, "train_speed(iter/s)": 0.23171 }, { "epoch": 4.821251619788094, "grad_norm": 0.7927979826927185, "learning_rate": 3.15012234119072e-07, "loss": 0.02616577744483948, "memory(GiB)": 122.96, "step": 63250, "token_acc": 0.991190765492102, "train_speed(iter/s)": 0.231712 }, { "epoch": 4.821632746398354, "grad_norm": 0.593925952911377, "learning_rate": 3.136717353183605e-07, "loss": 0.03302351534366608, "memory(GiB)": 122.96, "step": 63255, "token_acc": 0.9901333333333333, "train_speed(iter/s)": 0.231716 }, { "epoch": 4.822013873008613, "grad_norm": 0.2578660547733307, "learning_rate": 3.123340857980106e-07, "loss": 0.032762521505355836, "memory(GiB)": 122.96, "step": 63260, "token_acc": 0.9832203936753792, "train_speed(iter/s)": 0.231721 }, { "epoch": 4.822394999618873, "grad_norm": 0.08026441931724548, "learning_rate": 3.109992856347166e-07, "loss": 0.016461023688316347, "memory(GiB)": 122.96, "step": 63265, "token_acc": 0.9871939736346516, "train_speed(iter/s)": 0.231727 }, { "epoch": 4.822776126229133, "grad_norm": 1.1899210214614868, "learning_rate": 3.096673349050283e-07, "loss": 0.02368229180574417, "memory(GiB)": 122.96, "step": 63270, "token_acc": 0.988822652757079, "train_speed(iter/s)": 0.231731 }, { "epoch": 4.823157252839393, "grad_norm": 1.7817083597183228, "learning_rate": 3.083382336853291e-07, "loss": 0.049903833866119386, "memory(GiB)": 122.96, "step": 63275, "token_acc": 0.9853372434017595, "train_speed(iter/s)": 0.231734 }, { "epoch": 4.823538379449653, "grad_norm": 0.5186770558357239, "learning_rate": 3.070119820518358e-07, "loss": 0.017431795597076416, "memory(GiB)": 122.96, "step": 63280, "token_acc": 0.9948738170347003, "train_speed(iter/s)": 0.23174 }, { "epoch": 4.823919506059913, "grad_norm": 1.6400145292282104, "learning_rate": 3.0568858008059864e-07, "loss": 0.02420383244752884, "memory(GiB)": 122.96, "step": 63285, "token_acc": 0.9908128619932095, "train_speed(iter/s)": 0.231743 }, { "epoch": 4.824300632670173, "grad_norm": 1.0496143102645874, "learning_rate": 3.0436802784751804e-07, "loss": 0.04476313889026642, "memory(GiB)": 122.96, "step": 63290, "token_acc": 0.9834080717488789, "train_speed(iter/s)": 0.231746 }, { "epoch": 4.824681759280433, "grad_norm": 0.7308018803596497, "learning_rate": 3.0305032542830567e-07, "loss": 0.02051415145397186, "memory(GiB)": 122.96, "step": 63295, "token_acc": 0.9926873857404022, "train_speed(iter/s)": 0.231751 }, { "epoch": 4.825062885890693, "grad_norm": 0.8635321855545044, "learning_rate": 3.0173547289854e-07, "loss": 0.02273241728544235, "memory(GiB)": 122.96, "step": 63300, "token_acc": 0.9869832893579595, "train_speed(iter/s)": 0.231755 }, { "epoch": 4.825444012500952, "grad_norm": 1.0389147996902466, "learning_rate": 3.0042347033362174e-07, "loss": 0.02302742749452591, "memory(GiB)": 122.96, "step": 63305, "token_acc": 0.9919544083137781, "train_speed(iter/s)": 0.23176 }, { "epoch": 4.825825139111212, "grad_norm": 1.4147989749908447, "learning_rate": 2.991143178087741e-07, "loss": 0.03196639716625214, "memory(GiB)": 122.96, "step": 63310, "token_acc": 0.9846775909768036, "train_speed(iter/s)": 0.231765 }, { "epoch": 4.826206265721472, "grad_norm": 0.956030547618866, "learning_rate": 2.97808015399087e-07, "loss": 0.018962310254573823, "memory(GiB)": 122.96, "step": 63315, "token_acc": 0.9930390181351896, "train_speed(iter/s)": 0.231768 }, { "epoch": 4.826587392331732, "grad_norm": 0.38244718313217163, "learning_rate": 2.9650456317946163e-07, "loss": 0.024062004685401917, "memory(GiB)": 122.96, "step": 63320, "token_acc": 0.9893211289092296, "train_speed(iter/s)": 0.231773 }, { "epoch": 4.826968518941992, "grad_norm": 0.9873067736625671, "learning_rate": 2.952039612246438e-07, "loss": 0.026440274715423585, "memory(GiB)": 122.96, "step": 63325, "token_acc": 0.9921777777777778, "train_speed(iter/s)": 0.231776 }, { "epoch": 4.827349645552252, "grad_norm": 1.0325442552566528, "learning_rate": 2.9390620960922376e-07, "loss": 0.034022435545921326, "memory(GiB)": 122.96, "step": 63330, "token_acc": 0.9886573463466104, "train_speed(iter/s)": 0.23178 }, { "epoch": 4.8277307721625125, "grad_norm": 1.3539836406707764, "learning_rate": 2.9261130840761984e-07, "loss": 0.038401469588279724, "memory(GiB)": 122.96, "step": 63335, "token_acc": 0.9851037417981912, "train_speed(iter/s)": 0.231784 }, { "epoch": 4.8281118987727725, "grad_norm": 1.1062259674072266, "learning_rate": 2.9131925769408373e-07, "loss": 0.018070292472839356, "memory(GiB)": 122.96, "step": 63340, "token_acc": 0.9922680412371134, "train_speed(iter/s)": 0.231786 }, { "epoch": 4.8284930253830325, "grad_norm": 0.7680717706680298, "learning_rate": 2.9003005754271175e-07, "loss": 0.04154585003852844, "memory(GiB)": 122.96, "step": 63345, "token_acc": 0.9844474199196621, "train_speed(iter/s)": 0.231787 }, { "epoch": 4.8288741519932925, "grad_norm": 2.320725679397583, "learning_rate": 2.887437080274391e-07, "loss": 0.027806323766708375, "memory(GiB)": 122.96, "step": 63350, "token_acc": 0.9897546508492855, "train_speed(iter/s)": 0.231792 }, { "epoch": 4.8292552786035525, "grad_norm": 0.6801755428314209, "learning_rate": 2.87460209222018e-07, "loss": 0.0517522394657135, "memory(GiB)": 122.96, "step": 63355, "token_acc": 0.9872792392244041, "train_speed(iter/s)": 0.231793 }, { "epoch": 4.829636405213812, "grad_norm": 1.2667820453643799, "learning_rate": 2.8617956120006726e-07, "loss": 0.03218090534210205, "memory(GiB)": 122.96, "step": 63360, "token_acc": 0.9860732232591529, "train_speed(iter/s)": 0.231797 }, { "epoch": 4.830017531824072, "grad_norm": 4.037550449371338, "learning_rate": 2.849017640350171e-07, "loss": 0.06372233629226684, "memory(GiB)": 122.96, "step": 63365, "token_acc": 0.9853493613824192, "train_speed(iter/s)": 0.231799 }, { "epoch": 4.830398658434332, "grad_norm": 1.708094596862793, "learning_rate": 2.8362681780013665e-07, "loss": 0.01915057599544525, "memory(GiB)": 122.96, "step": 63370, "token_acc": 0.9920243959652827, "train_speed(iter/s)": 0.231803 }, { "epoch": 4.830779785044592, "grad_norm": 0.7303031086921692, "learning_rate": 2.823547225685563e-07, "loss": 0.03380066752433777, "memory(GiB)": 122.96, "step": 63375, "token_acc": 0.9857094249744811, "train_speed(iter/s)": 0.231806 }, { "epoch": 4.831160911654852, "grad_norm": 1.4593260288238525, "learning_rate": 2.8108547841320665e-07, "loss": 0.028407156467437744, "memory(GiB)": 122.96, "step": 63380, "token_acc": 0.98000768935025, "train_speed(iter/s)": 0.231812 }, { "epoch": 4.831542038265112, "grad_norm": 1.0783460140228271, "learning_rate": 2.7981908540688497e-07, "loss": 0.03029269278049469, "memory(GiB)": 122.96, "step": 63385, "token_acc": 0.9859758472925594, "train_speed(iter/s)": 0.231818 }, { "epoch": 4.831923164875372, "grad_norm": 1.0439966917037964, "learning_rate": 2.7855554362221093e-07, "loss": 0.03331114947795868, "memory(GiB)": 122.96, "step": 63390, "token_acc": 0.987305158283786, "train_speed(iter/s)": 0.231821 }, { "epoch": 4.832304291485632, "grad_norm": 1.0250555276870728, "learning_rate": 2.772948531316322e-07, "loss": 0.02863774299621582, "memory(GiB)": 122.96, "step": 63395, "token_acc": 0.9889867841409692, "train_speed(iter/s)": 0.231824 }, { "epoch": 4.832685418095892, "grad_norm": 1.1258635520935059, "learning_rate": 2.7603701400745754e-07, "loss": 0.030116668343544005, "memory(GiB)": 122.96, "step": 63400, "token_acc": 0.9873859887444207, "train_speed(iter/s)": 0.231828 }, { "epoch": 4.832685418095892, "eval_loss": 0.04672357812523842, "eval_runtime": 218.6935, "eval_samples_per_second": 2.423, "eval_steps_per_second": 2.423, "eval_token_acc": 0.9810553581109571, "step": 63400 }, { "epoch": 4.833066544706151, "grad_norm": 0.8140780925750732, "learning_rate": 2.747820263218126e-07, "loss": 0.019103607535362242, "memory(GiB)": 122.96, "step": 63405, "token_acc": 0.9813506806267346, "train_speed(iter/s)": 0.231648 }, { "epoch": 4.833447671316411, "grad_norm": 0.5592483878135681, "learning_rate": 2.7352989014666763e-07, "loss": 0.024830490350723267, "memory(GiB)": 122.96, "step": 63410, "token_acc": 0.9903766840802859, "train_speed(iter/s)": 0.23165 }, { "epoch": 4.833828797926671, "grad_norm": 0.8610469102859497, "learning_rate": 2.722806055538207e-07, "loss": 0.01745525449514389, "memory(GiB)": 122.96, "step": 63415, "token_acc": 0.9926233287229138, "train_speed(iter/s)": 0.231654 }, { "epoch": 4.834209924536931, "grad_norm": 0.4673379361629486, "learning_rate": 2.7103417261492014e-07, "loss": 0.02346380054950714, "memory(GiB)": 122.96, "step": 63420, "token_acc": 0.9924188580904999, "train_speed(iter/s)": 0.231659 }, { "epoch": 4.834591051147191, "grad_norm": 1.0860482454299927, "learning_rate": 2.6979059140143645e-07, "loss": 0.0491860568523407, "memory(GiB)": 122.96, "step": 63425, "token_acc": 0.9833762584874737, "train_speed(iter/s)": 0.231663 }, { "epoch": 4.834972177757451, "grad_norm": 2.1303327083587646, "learning_rate": 2.685498619846849e-07, "loss": 0.03133138418197632, "memory(GiB)": 122.96, "step": 63430, "token_acc": 0.986905916585839, "train_speed(iter/s)": 0.231667 }, { "epoch": 4.835353304367711, "grad_norm": 1.088713526725769, "learning_rate": 2.673119844358196e-07, "loss": 0.05537205934524536, "memory(GiB)": 122.96, "step": 63435, "token_acc": 0.9839827506545511, "train_speed(iter/s)": 0.231671 }, { "epoch": 4.835734430977971, "grad_norm": 1.5280109643936157, "learning_rate": 2.6607695882582275e-07, "loss": 0.04109824299812317, "memory(GiB)": 122.96, "step": 63440, "token_acc": 0.9798051793775243, "train_speed(iter/s)": 0.231675 }, { "epoch": 4.836115557588231, "grad_norm": 0.07485716044902802, "learning_rate": 2.648447852255209e-07, "loss": 0.0327631413936615, "memory(GiB)": 122.96, "step": 63445, "token_acc": 0.9833156779661016, "train_speed(iter/s)": 0.23168 }, { "epoch": 4.836496684198491, "grad_norm": 0.00063524697907269, "learning_rate": 2.636154637055688e-07, "loss": 0.036380958557128903, "memory(GiB)": 122.96, "step": 63450, "token_acc": 0.9855889724310777, "train_speed(iter/s)": 0.231683 }, { "epoch": 4.836877810808751, "grad_norm": 0.8382189273834229, "learning_rate": 2.623889943364599e-07, "loss": 0.026345273852348326, "memory(GiB)": 122.96, "step": 63455, "token_acc": 0.988, "train_speed(iter/s)": 0.231688 }, { "epoch": 4.83725893741901, "grad_norm": 2.322080612182617, "learning_rate": 2.6116537718853806e-07, "loss": 0.022588518261909486, "memory(GiB)": 122.96, "step": 63460, "token_acc": 0.9889089502192416, "train_speed(iter/s)": 0.231692 }, { "epoch": 4.83764006402927, "grad_norm": 0.6515766978263855, "learning_rate": 2.599446123319638e-07, "loss": 0.019360694289207458, "memory(GiB)": 122.96, "step": 63465, "token_acc": 0.992723778143877, "train_speed(iter/s)": 0.231694 }, { "epoch": 4.83802119063953, "grad_norm": 0.5862978100776672, "learning_rate": 2.5872669983674767e-07, "loss": 0.03466072380542755, "memory(GiB)": 122.96, "step": 63470, "token_acc": 0.9862514493953951, "train_speed(iter/s)": 0.231697 }, { "epoch": 4.83840231724979, "grad_norm": 1.1850827932357788, "learning_rate": 2.5751163977272285e-07, "loss": 0.03136466443538666, "memory(GiB)": 122.96, "step": 63475, "token_acc": 0.9865782932891466, "train_speed(iter/s)": 0.2317 }, { "epoch": 4.83878344386005, "grad_norm": 0.8709152936935425, "learning_rate": 2.562994322095724e-07, "loss": 0.01974187046289444, "memory(GiB)": 122.96, "step": 63480, "token_acc": 0.9873811864962307, "train_speed(iter/s)": 0.231703 }, { "epoch": 4.83916457047031, "grad_norm": 1.023867130279541, "learning_rate": 2.55090077216813e-07, "loss": 0.02091221362352371, "memory(GiB)": 122.96, "step": 63485, "token_acc": 0.9921599372794982, "train_speed(iter/s)": 0.231707 }, { "epoch": 4.83954569708057, "grad_norm": 1.3771401643753052, "learning_rate": 2.5388357486379466e-07, "loss": 0.030989474058151244, "memory(GiB)": 122.96, "step": 63490, "token_acc": 0.9907578558225508, "train_speed(iter/s)": 0.23171 }, { "epoch": 4.83992682369083, "grad_norm": 1.307253360748291, "learning_rate": 2.526799252196954e-07, "loss": 0.02163785994052887, "memory(GiB)": 122.96, "step": 63495, "token_acc": 0.9908749329039184, "train_speed(iter/s)": 0.231715 }, { "epoch": 4.84030795030109, "grad_norm": 1.20551598072052, "learning_rate": 2.5147912835355445e-07, "loss": 0.03606013059616089, "memory(GiB)": 122.96, "step": 63500, "token_acc": 0.9856468366383381, "train_speed(iter/s)": 0.231717 }, { "epoch": 4.8406890769113495, "grad_norm": 0.6759769320487976, "learning_rate": 2.502811843342223e-07, "loss": 0.031311073899269105, "memory(GiB)": 122.96, "step": 63505, "token_acc": 0.9840805123513267, "train_speed(iter/s)": 0.231721 }, { "epoch": 4.8410702035216095, "grad_norm": 0.9417913556098938, "learning_rate": 2.49086093230394e-07, "loss": 0.031794705986976625, "memory(GiB)": 122.96, "step": 63510, "token_acc": 0.9863108248015645, "train_speed(iter/s)": 0.231722 }, { "epoch": 4.84145133013187, "grad_norm": 1.2885313034057617, "learning_rate": 2.4789385511060915e-07, "loss": 0.030968889594078064, "memory(GiB)": 122.96, "step": 63515, "token_acc": 0.9890770070999454, "train_speed(iter/s)": 0.231728 }, { "epoch": 4.84183245674213, "grad_norm": 0.6319652199745178, "learning_rate": 2.467044700432297e-07, "loss": 0.02667335569858551, "memory(GiB)": 122.96, "step": 63520, "token_acc": 0.9903600793875815, "train_speed(iter/s)": 0.23173 }, { "epoch": 4.84221358335239, "grad_norm": 1.489200472831726, "learning_rate": 2.455179380964678e-07, "loss": 0.03151824176311493, "memory(GiB)": 122.96, "step": 63525, "token_acc": 0.9847467968273338, "train_speed(iter/s)": 0.231736 }, { "epoch": 4.84259470996265, "grad_norm": 1.3481451272964478, "learning_rate": 2.443342593383635e-07, "loss": 0.025184664130210876, "memory(GiB)": 122.96, "step": 63530, "token_acc": 0.9898107714701602, "train_speed(iter/s)": 0.23174 }, { "epoch": 4.84297583657291, "grad_norm": 1.2058234214782715, "learning_rate": 2.431534338367958e-07, "loss": 0.03211406767368317, "memory(GiB)": 122.96, "step": 63535, "token_acc": 0.9861607142857143, "train_speed(iter/s)": 0.231745 }, { "epoch": 4.84335696318317, "grad_norm": 1.3236397504806519, "learning_rate": 2.4197546165947714e-07, "loss": 0.047597482800483704, "memory(GiB)": 122.96, "step": 63540, "token_acc": 0.9788450543167524, "train_speed(iter/s)": 0.23175 }, { "epoch": 4.84373808979343, "grad_norm": 1.6006097793579102, "learning_rate": 2.408003428739647e-07, "loss": 0.022249022126197816, "memory(GiB)": 122.96, "step": 63545, "token_acc": 0.9886107800291352, "train_speed(iter/s)": 0.231752 }, { "epoch": 4.844119216403689, "grad_norm": 1.3932478427886963, "learning_rate": 2.396280775476378e-07, "loss": 0.024757683277130127, "memory(GiB)": 122.96, "step": 63550, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.231758 }, { "epoch": 4.84450034301395, "grad_norm": 1.3240060806274414, "learning_rate": 2.3845866574772056e-07, "loss": 0.027435886859893798, "memory(GiB)": 122.96, "step": 63555, "token_acc": 0.987279843444227, "train_speed(iter/s)": 0.231761 }, { "epoch": 4.844881469624209, "grad_norm": 1.7725639343261719, "learning_rate": 2.3729210754128706e-07, "loss": 0.02540752589702606, "memory(GiB)": 122.96, "step": 63560, "token_acc": 0.9906781635982288, "train_speed(iter/s)": 0.231764 }, { "epoch": 4.845262596234469, "grad_norm": 1.044792652130127, "learning_rate": 2.361284029952171e-07, "loss": 0.03956393897533417, "memory(GiB)": 122.96, "step": 63565, "token_acc": 0.9863409930901494, "train_speed(iter/s)": 0.231768 }, { "epoch": 4.845643722844729, "grad_norm": 0.6337283253669739, "learning_rate": 2.3496755217624623e-07, "loss": 0.03137912154197693, "memory(GiB)": 122.96, "step": 63570, "token_acc": 0.986905281536447, "train_speed(iter/s)": 0.231774 }, { "epoch": 4.846024849454989, "grad_norm": 0.10651111602783203, "learning_rate": 2.3380955515096004e-07, "loss": 0.024837365746498107, "memory(GiB)": 122.96, "step": 63575, "token_acc": 0.9916839916839917, "train_speed(iter/s)": 0.23178 }, { "epoch": 4.846405976065249, "grad_norm": 0.7044525146484375, "learning_rate": 2.3265441198574433e-07, "loss": 0.019390700757503508, "memory(GiB)": 122.96, "step": 63580, "token_acc": 0.9923241699393074, "train_speed(iter/s)": 0.231783 }, { "epoch": 4.846787102675509, "grad_norm": 0.5159116387367249, "learning_rate": 2.3150212274685168e-07, "loss": 0.03701809048652649, "memory(GiB)": 122.96, "step": 63585, "token_acc": 0.9833309588260436, "train_speed(iter/s)": 0.231785 }, { "epoch": 4.847168229285769, "grad_norm": 0.7864515781402588, "learning_rate": 2.30352687500357e-07, "loss": 0.029854807257652282, "memory(GiB)": 122.96, "step": 63590, "token_acc": 0.988050784167289, "train_speed(iter/s)": 0.231784 }, { "epoch": 4.847549355896029, "grad_norm": 0.19863025844097137, "learning_rate": 2.292061063121742e-07, "loss": 0.017898136377334596, "memory(GiB)": 122.96, "step": 63595, "token_acc": 0.9923224568138196, "train_speed(iter/s)": 0.23179 }, { "epoch": 4.847930482506289, "grad_norm": 0.14372771978378296, "learning_rate": 2.2806237924806185e-07, "loss": 0.025042295455932617, "memory(GiB)": 122.96, "step": 63600, "token_acc": 0.9869731800766284, "train_speed(iter/s)": 0.231794 }, { "epoch": 4.847930482506289, "eval_loss": 0.046677686274051666, "eval_runtime": 220.0074, "eval_samples_per_second": 2.409, "eval_steps_per_second": 2.409, "eval_token_acc": 0.9810402987771821, "step": 63600 }, { "epoch": 4.848311609116548, "grad_norm": 0.8813661336898804, "learning_rate": 2.2692150637360078e-07, "loss": 0.033819186687469485, "memory(GiB)": 122.96, "step": 63605, "token_acc": 0.9811962071480671, "train_speed(iter/s)": 0.231612 }, { "epoch": 4.848692735726808, "grad_norm": 0.9819487929344177, "learning_rate": 2.2578348775421642e-07, "loss": 0.028145435452461242, "memory(GiB)": 122.96, "step": 63610, "token_acc": 0.9898255813953488, "train_speed(iter/s)": 0.231615 }, { "epoch": 4.849073862337068, "grad_norm": 0.11429768055677414, "learning_rate": 2.2464832345516773e-07, "loss": 0.01513756662607193, "memory(GiB)": 122.96, "step": 63615, "token_acc": 0.9946658299341073, "train_speed(iter/s)": 0.23162 }, { "epoch": 4.849454988947328, "grad_norm": 0.9345092177391052, "learning_rate": 2.2351601354154704e-07, "loss": 0.03259872794151306, "memory(GiB)": 122.96, "step": 63620, "token_acc": 0.9852832965415746, "train_speed(iter/s)": 0.231621 }, { "epoch": 4.849836115557588, "grad_norm": 1.1775275468826294, "learning_rate": 2.223865580782969e-07, "loss": 0.03991618454456329, "memory(GiB)": 122.96, "step": 63625, "token_acc": 0.9848682494129924, "train_speed(iter/s)": 0.231624 }, { "epoch": 4.850217242167848, "grad_norm": 1.2756683826446533, "learning_rate": 2.212599571301821e-07, "loss": 0.029350200295448305, "memory(GiB)": 122.96, "step": 63630, "token_acc": 0.9872966029117899, "train_speed(iter/s)": 0.231626 }, { "epoch": 4.850598368778108, "grad_norm": 0.7096001505851746, "learning_rate": 2.2013621076180658e-07, "loss": 0.028466662764549254, "memory(GiB)": 122.96, "step": 63635, "token_acc": 0.9905233685117381, "train_speed(iter/s)": 0.231631 }, { "epoch": 4.850979495388368, "grad_norm": 0.49754393100738525, "learning_rate": 2.1901531903760763e-07, "loss": 0.03962083458900452, "memory(GiB)": 122.96, "step": 63640, "token_acc": 0.9842302385176425, "train_speed(iter/s)": 0.231634 }, { "epoch": 4.851360621998628, "grad_norm": 2.165450096130371, "learning_rate": 2.1789728202187277e-07, "loss": 0.04339950680732727, "memory(GiB)": 122.96, "step": 63645, "token_acc": 0.9830665024630542, "train_speed(iter/s)": 0.23164 }, { "epoch": 4.851741748608887, "grad_norm": 0.7543947100639343, "learning_rate": 2.1678209977871178e-07, "loss": 0.039947924017906186, "memory(GiB)": 122.96, "step": 63650, "token_acc": 0.9869337979094077, "train_speed(iter/s)": 0.231642 }, { "epoch": 4.852122875219147, "grad_norm": 0.912329912185669, "learning_rate": 2.1566977237207353e-07, "loss": 0.025211113691329955, "memory(GiB)": 122.96, "step": 63655, "token_acc": 0.9875031879622546, "train_speed(iter/s)": 0.231648 }, { "epoch": 4.852504001829407, "grad_norm": 2.6631243228912354, "learning_rate": 2.1456029986574587e-07, "loss": 0.03371896743774414, "memory(GiB)": 122.96, "step": 63660, "token_acc": 0.988391376451078, "train_speed(iter/s)": 0.231652 }, { "epoch": 4.8528851284396675, "grad_norm": 0.7513755559921265, "learning_rate": 2.134536823233557e-07, "loss": 0.05090123414993286, "memory(GiB)": 122.96, "step": 63665, "token_acc": 0.9797921478060047, "train_speed(iter/s)": 0.231657 }, { "epoch": 4.8532662550499275, "grad_norm": 0.47310882806777954, "learning_rate": 2.1234991980835784e-07, "loss": 0.021261148154735565, "memory(GiB)": 122.96, "step": 63670, "token_acc": 0.9896251673360107, "train_speed(iter/s)": 0.231662 }, { "epoch": 4.8536473816601875, "grad_norm": 1.6551554203033447, "learning_rate": 2.1124901238404604e-07, "loss": 0.028561824560165407, "memory(GiB)": 122.96, "step": 63675, "token_acc": 0.9875553680658089, "train_speed(iter/s)": 0.231666 }, { "epoch": 4.8540285082704475, "grad_norm": 0.5902779698371887, "learning_rate": 2.1015096011355318e-07, "loss": 0.019939391314983367, "memory(GiB)": 122.96, "step": 63680, "token_acc": 0.9922687283391096, "train_speed(iter/s)": 0.23167 }, { "epoch": 4.8544096348807075, "grad_norm": 1.874626874923706, "learning_rate": 2.0905576305985665e-07, "loss": 0.026715266704559325, "memory(GiB)": 122.96, "step": 63685, "token_acc": 0.9894259818731118, "train_speed(iter/s)": 0.231675 }, { "epoch": 4.8547907614909676, "grad_norm": 2.076921224594116, "learning_rate": 2.0796342128575063e-07, "loss": 0.04649493098258972, "memory(GiB)": 122.96, "step": 63690, "token_acc": 0.9788373278939316, "train_speed(iter/s)": 0.231679 }, { "epoch": 4.855171888101228, "grad_norm": 1.7890684604644775, "learning_rate": 2.0687393485387952e-07, "loss": 0.021721091866493226, "memory(GiB)": 122.96, "step": 63695, "token_acc": 0.9931717309662, "train_speed(iter/s)": 0.231685 }, { "epoch": 4.855553014711488, "grad_norm": 1.1155509948730469, "learning_rate": 2.0578730382672107e-07, "loss": 0.03151506185531616, "memory(GiB)": 122.96, "step": 63700, "token_acc": 0.9836755301245372, "train_speed(iter/s)": 0.231688 }, { "epoch": 4.855934141321747, "grad_norm": 0.8792484402656555, "learning_rate": 2.0470352826658656e-07, "loss": 0.019628126919269562, "memory(GiB)": 122.96, "step": 63705, "token_acc": 0.9937388193202147, "train_speed(iter/s)": 0.231692 }, { "epoch": 4.856315267932007, "grad_norm": 1.2249419689178467, "learning_rate": 2.0362260823562628e-07, "loss": 0.024875304102897643, "memory(GiB)": 122.96, "step": 63710, "token_acc": 0.9901774943994486, "train_speed(iter/s)": 0.231695 }, { "epoch": 4.856696394542267, "grad_norm": 1.651612639427185, "learning_rate": 2.0254454379582398e-07, "loss": 0.02446194738149643, "memory(GiB)": 122.96, "step": 63715, "token_acc": 0.9904376609047444, "train_speed(iter/s)": 0.231701 }, { "epoch": 4.857077521152527, "grad_norm": 0.3486618399620056, "learning_rate": 2.01469335009008e-07, "loss": 0.026943543553352357, "memory(GiB)": 122.96, "step": 63720, "token_acc": 0.987611837577426, "train_speed(iter/s)": 0.231706 }, { "epoch": 4.857458647762787, "grad_norm": 0.4851253926753998, "learning_rate": 2.0039698193682898e-07, "loss": 0.02305719703435898, "memory(GiB)": 122.96, "step": 63725, "token_acc": 0.9908478673804179, "train_speed(iter/s)": 0.23171 }, { "epoch": 4.857839774373047, "grad_norm": 1.1019747257232666, "learning_rate": 1.993274846407822e-07, "loss": 0.0324232816696167, "memory(GiB)": 122.96, "step": 63730, "token_acc": 0.9865528833721231, "train_speed(iter/s)": 0.231715 }, { "epoch": 4.858220900983307, "grad_norm": 1.0173248052597046, "learning_rate": 1.982608431822075e-07, "loss": 0.022532182931900024, "memory(GiB)": 122.96, "step": 63735, "token_acc": 0.9890590809628009, "train_speed(iter/s)": 0.231719 }, { "epoch": 4.858602027593567, "grad_norm": 1.4136961698532104, "learning_rate": 1.971970576222615e-07, "loss": 0.025222840905189513, "memory(GiB)": 122.96, "step": 63740, "token_acc": 0.9899435426958363, "train_speed(iter/s)": 0.231721 }, { "epoch": 4.858983154203827, "grad_norm": 0.6944568157196045, "learning_rate": 1.961361280219509e-07, "loss": 0.028235653042793275, "memory(GiB)": 122.96, "step": 63745, "token_acc": 0.9902310727033627, "train_speed(iter/s)": 0.231724 }, { "epoch": 4.859364280814086, "grad_norm": 1.8777761459350586, "learning_rate": 1.9507805444211603e-07, "loss": 0.030285876989364625, "memory(GiB)": 122.96, "step": 63750, "token_acc": 0.9868714391875155, "train_speed(iter/s)": 0.231729 }, { "epoch": 4.859745407424346, "grad_norm": 0.9192717671394348, "learning_rate": 1.940228369434305e-07, "loss": 0.025669777393341066, "memory(GiB)": 122.96, "step": 63755, "token_acc": 0.9903653151344841, "train_speed(iter/s)": 0.23173 }, { "epoch": 4.860126534034606, "grad_norm": 1.3644462823867798, "learning_rate": 1.9297047558640701e-07, "loss": 0.03496770858764649, "memory(GiB)": 122.96, "step": 63760, "token_acc": 0.986640385457731, "train_speed(iter/s)": 0.231734 }, { "epoch": 4.860507660644866, "grad_norm": 0.9638566970825195, "learning_rate": 1.919209704313918e-07, "loss": 0.022020496428012848, "memory(GiB)": 122.96, "step": 63765, "token_acc": 0.9885288790501107, "train_speed(iter/s)": 0.231737 }, { "epoch": 4.860888787255126, "grad_norm": 0.6192764043807983, "learning_rate": 1.9087432153857554e-07, "loss": 0.02205311059951782, "memory(GiB)": 122.96, "step": 63770, "token_acc": 0.9948917538311847, "train_speed(iter/s)": 0.231741 }, { "epoch": 4.861269913865386, "grad_norm": 0.5418553948402405, "learning_rate": 1.8983052896797693e-07, "loss": 0.02168118953704834, "memory(GiB)": 122.96, "step": 63775, "token_acc": 0.9910347403810236, "train_speed(iter/s)": 0.231745 }, { "epoch": 4.861651040475646, "grad_norm": 0.6758118271827698, "learning_rate": 1.8878959277944254e-07, "loss": 0.030096563696861266, "memory(GiB)": 122.96, "step": 63780, "token_acc": 0.9875019287147045, "train_speed(iter/s)": 0.231749 }, { "epoch": 4.862032167085906, "grad_norm": 1.7213093042373657, "learning_rate": 1.8775151303268013e-07, "loss": 0.017012296617031096, "memory(GiB)": 122.96, "step": 63785, "token_acc": 0.9926421404682274, "train_speed(iter/s)": 0.231756 }, { "epoch": 4.862413293696166, "grad_norm": 1.1647675037384033, "learning_rate": 1.8671628978720323e-07, "loss": 0.04599955677986145, "memory(GiB)": 122.96, "step": 63790, "token_acc": 0.9831697054698457, "train_speed(iter/s)": 0.23176 }, { "epoch": 4.862794420306426, "grad_norm": 1.945186972618103, "learning_rate": 1.8568392310239214e-07, "loss": 0.020744654536247253, "memory(GiB)": 122.96, "step": 63795, "token_acc": 0.9909663865546219, "train_speed(iter/s)": 0.231763 }, { "epoch": 4.863175546916686, "grad_norm": 0.9581672549247742, "learning_rate": 1.8465441303743835e-07, "loss": 0.017987486720085145, "memory(GiB)": 122.96, "step": 63800, "token_acc": 0.9900670474298485, "train_speed(iter/s)": 0.231767 }, { "epoch": 4.863175546916686, "eval_loss": 0.04679752141237259, "eval_runtime": 219.9157, "eval_samples_per_second": 2.41, "eval_steps_per_second": 2.41, "eval_token_acc": 0.9810930064453949, "step": 63800 }, { "epoch": 4.863556673526945, "grad_norm": 1.1121246814727783, "learning_rate": 1.83627759651378e-07, "loss": 0.0197479709982872, "memory(GiB)": 122.96, "step": 63805, "token_acc": 0.9813944264077213, "train_speed(iter/s)": 0.231587 }, { "epoch": 4.863937800137205, "grad_norm": 0.9048646092414856, "learning_rate": 1.8260396300309734e-07, "loss": 0.023024699091911315, "memory(GiB)": 122.96, "step": 63810, "token_acc": 0.9935461364032793, "train_speed(iter/s)": 0.231589 }, { "epoch": 4.864318926747465, "grad_norm": 2.3069543838500977, "learning_rate": 1.8158302315129382e-07, "loss": 0.03786337971687317, "memory(GiB)": 122.96, "step": 63815, "token_acc": 0.9811039283938339, "train_speed(iter/s)": 0.231595 }, { "epoch": 4.864700053357725, "grad_norm": 1.3953595161437988, "learning_rate": 1.8056494015452064e-07, "loss": 0.041260254383087155, "memory(GiB)": 122.96, "step": 63820, "token_acc": 0.9862501668669069, "train_speed(iter/s)": 0.231597 }, { "epoch": 4.865081179967985, "grad_norm": 0.8397042155265808, "learning_rate": 1.7954971407115883e-07, "loss": 0.03047110140323639, "memory(GiB)": 122.96, "step": 63825, "token_acc": 0.9848650335465752, "train_speed(iter/s)": 0.231601 }, { "epoch": 4.865462306578245, "grad_norm": 0.8198574185371399, "learning_rate": 1.7853734495942298e-07, "loss": 0.033782586455345154, "memory(GiB)": 122.96, "step": 63830, "token_acc": 0.9859075535512966, "train_speed(iter/s)": 0.231605 }, { "epoch": 4.865843433188505, "grad_norm": 2.7345728874206543, "learning_rate": 1.7752783287737218e-07, "loss": 0.012765756249427796, "memory(GiB)": 122.96, "step": 63835, "token_acc": 0.9969751966122202, "train_speed(iter/s)": 0.231611 }, { "epoch": 4.8662245597987654, "grad_norm": 1.5577542781829834, "learning_rate": 1.7652117788289347e-07, "loss": 0.027393531799316407, "memory(GiB)": 122.96, "step": 63840, "token_acc": 0.9867086393844001, "train_speed(iter/s)": 0.231617 }, { "epoch": 4.8666056864090255, "grad_norm": 1.5931227207183838, "learning_rate": 1.7551738003371842e-07, "loss": 0.026637130975723268, "memory(GiB)": 122.96, "step": 63845, "token_acc": 0.98698224852071, "train_speed(iter/s)": 0.231622 }, { "epoch": 4.866986813019285, "grad_norm": 0.10796253383159637, "learning_rate": 1.745164393874066e-07, "loss": 0.031009498238563537, "memory(GiB)": 122.96, "step": 63850, "token_acc": 0.9856305539799584, "train_speed(iter/s)": 0.231626 }, { "epoch": 4.867367939629545, "grad_norm": 1.1273672580718994, "learning_rate": 1.73518356001362e-07, "loss": 0.029703986644744874, "memory(GiB)": 122.96, "step": 63855, "token_acc": 0.9893599334995844, "train_speed(iter/s)": 0.231629 }, { "epoch": 4.867749066239805, "grad_norm": 1.78102707862854, "learning_rate": 1.7252312993281116e-07, "loss": 0.036250603199005124, "memory(GiB)": 122.96, "step": 63860, "token_acc": 0.9872944693572496, "train_speed(iter/s)": 0.231631 }, { "epoch": 4.868130192850065, "grad_norm": 1.0938230752944946, "learning_rate": 1.715307612388306e-07, "loss": 0.028365698456764222, "memory(GiB)": 122.96, "step": 63865, "token_acc": 0.9881748071979435, "train_speed(iter/s)": 0.231632 }, { "epoch": 4.868511319460325, "grad_norm": 1.4246264696121216, "learning_rate": 1.705412499763359e-07, "loss": 0.01668952703475952, "memory(GiB)": 122.96, "step": 63870, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.231639 }, { "epoch": 4.868892446070585, "grad_norm": 0.5927743911743164, "learning_rate": 1.695545962020595e-07, "loss": 0.03373218774795532, "memory(GiB)": 122.96, "step": 63875, "token_acc": 0.9851324337831084, "train_speed(iter/s)": 0.231639 }, { "epoch": 4.869273572680845, "grad_norm": 1.538282871246338, "learning_rate": 1.6857079997258384e-07, "loss": 0.026500028371810914, "memory(GiB)": 122.96, "step": 63880, "token_acc": 0.989687768547694, "train_speed(iter/s)": 0.231644 }, { "epoch": 4.869654699291105, "grad_norm": 2.1982321739196777, "learning_rate": 1.6758986134432496e-07, "loss": 0.04125989079475403, "memory(GiB)": 122.96, "step": 63885, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.23165 }, { "epoch": 4.870035825901365, "grad_norm": 1.888222575187683, "learning_rate": 1.6661178037354342e-07, "loss": 0.03261609673500061, "memory(GiB)": 122.96, "step": 63890, "token_acc": 0.9872419392252377, "train_speed(iter/s)": 0.231654 }, { "epoch": 4.870416952511624, "grad_norm": 2.4966673851013184, "learning_rate": 1.6563655711631655e-07, "loss": 0.0238148033618927, "memory(GiB)": 122.96, "step": 63895, "token_acc": 0.9944649446494465, "train_speed(iter/s)": 0.23166 }, { "epoch": 4.870798079121885, "grad_norm": 0.541151225566864, "learning_rate": 1.6466419162857182e-07, "loss": 0.023106947541236877, "memory(GiB)": 122.96, "step": 63900, "token_acc": 0.9910161708923937, "train_speed(iter/s)": 0.231664 }, { "epoch": 4.871179205732144, "grad_norm": 1.0560071468353271, "learning_rate": 1.6369468396607578e-07, "loss": 0.032941436767578124, "memory(GiB)": 122.96, "step": 63905, "token_acc": 0.9893822393822393, "train_speed(iter/s)": 0.231669 }, { "epoch": 4.871560332342404, "grad_norm": 1.6914194822311401, "learning_rate": 1.627280341844173e-07, "loss": 0.024679920077323912, "memory(GiB)": 122.96, "step": 63910, "token_acc": 0.9909536373916321, "train_speed(iter/s)": 0.231674 }, { "epoch": 4.871941458952664, "grad_norm": 1.0863217115402222, "learning_rate": 1.617642423390353e-07, "loss": 0.01988966017961502, "memory(GiB)": 122.96, "step": 63915, "token_acc": 0.9923728813559322, "train_speed(iter/s)": 0.23168 }, { "epoch": 4.872322585562924, "grad_norm": 0.8507096171379089, "learning_rate": 1.6080330848519677e-07, "loss": 0.023190774023532867, "memory(GiB)": 122.96, "step": 63920, "token_acc": 0.990491539081386, "train_speed(iter/s)": 0.231681 }, { "epoch": 4.872703712173184, "grad_norm": 1.3213520050048828, "learning_rate": 1.5984523267800756e-07, "loss": 0.03189520537853241, "memory(GiB)": 122.96, "step": 63925, "token_acc": 0.9874349308921199, "train_speed(iter/s)": 0.231681 }, { "epoch": 4.873084838783444, "grad_norm": 0.09521715342998505, "learning_rate": 1.588900149724015e-07, "loss": 0.03810795247554779, "memory(GiB)": 122.96, "step": 63930, "token_acc": 0.9869743734956817, "train_speed(iter/s)": 0.231684 }, { "epoch": 4.873465965393704, "grad_norm": 0.3573542833328247, "learning_rate": 1.579376554231682e-07, "loss": 0.01369284838438034, "memory(GiB)": 122.96, "step": 63935, "token_acc": 0.9962256238205074, "train_speed(iter/s)": 0.231688 }, { "epoch": 4.873847092003964, "grad_norm": 1.1519255638122559, "learning_rate": 1.5698815408491384e-07, "loss": 0.037502944469451904, "memory(GiB)": 122.96, "step": 63940, "token_acc": 0.9841031793641272, "train_speed(iter/s)": 0.231691 }, { "epoch": 4.874228218614224, "grad_norm": 1.0848225355148315, "learning_rate": 1.5604151101208387e-07, "loss": 0.023704853653907777, "memory(GiB)": 122.96, "step": 63945, "token_acc": 0.9868596881959911, "train_speed(iter/s)": 0.231696 }, { "epoch": 4.874609345224483, "grad_norm": 1.0592962503433228, "learning_rate": 1.5509772625897367e-07, "loss": 0.021315036714076994, "memory(GiB)": 122.96, "step": 63950, "token_acc": 0.9884734326679786, "train_speed(iter/s)": 0.231701 }, { "epoch": 4.874990471834743, "grad_norm": 1.991119623184204, "learning_rate": 1.541567998797011e-07, "loss": 0.03387020826339722, "memory(GiB)": 122.96, "step": 63955, "token_acc": 0.988271741535738, "train_speed(iter/s)": 0.231704 }, { "epoch": 4.875371598445003, "grad_norm": 1.090657114982605, "learning_rate": 1.53218731928223e-07, "loss": 0.020071226358413696, "memory(GiB)": 122.96, "step": 63960, "token_acc": 0.9939925312550739, "train_speed(iter/s)": 0.231707 }, { "epoch": 4.875752725055263, "grad_norm": 0.5233426690101624, "learning_rate": 1.5228352245832966e-07, "loss": 0.016641002893447877, "memory(GiB)": 122.96, "step": 63965, "token_acc": 0.9953314659197012, "train_speed(iter/s)": 0.231712 }, { "epoch": 4.876133851665523, "grad_norm": 1.0501010417938232, "learning_rate": 1.5135117152365597e-07, "loss": 0.030105790495872496, "memory(GiB)": 122.96, "step": 63970, "token_acc": 0.990158026060438, "train_speed(iter/s)": 0.231714 }, { "epoch": 4.876514978275783, "grad_norm": 0.983324408531189, "learning_rate": 1.5042167917767024e-07, "loss": 0.026026269793510436, "memory(GiB)": 122.96, "step": 63975, "token_acc": 0.9900184842883549, "train_speed(iter/s)": 0.231718 }, { "epoch": 4.876896104886043, "grad_norm": 0.7305537462234497, "learning_rate": 1.4949504547366877e-07, "loss": 0.0293489009141922, "memory(GiB)": 122.96, "step": 63980, "token_acc": 0.9877316293929712, "train_speed(iter/s)": 0.23172 }, { "epoch": 4.877277231496303, "grad_norm": 1.25846529006958, "learning_rate": 1.4857127046479235e-07, "loss": 0.028020796179771424, "memory(GiB)": 122.96, "step": 63985, "token_acc": 0.9890199934447722, "train_speed(iter/s)": 0.231723 }, { "epoch": 4.877658358106563, "grad_norm": 1.5212522745132446, "learning_rate": 1.4765035420400975e-07, "loss": 0.04660446047782898, "memory(GiB)": 122.96, "step": 63990, "token_acc": 0.9765942202053977, "train_speed(iter/s)": 0.231727 }, { "epoch": 4.8780394847168225, "grad_norm": 1.873464584350586, "learning_rate": 1.4673229674414535e-07, "loss": 0.028292939066886902, "memory(GiB)": 122.96, "step": 63995, "token_acc": 0.9902446345490019, "train_speed(iter/s)": 0.231729 }, { "epoch": 4.8784206113270825, "grad_norm": 2.4227077960968018, "learning_rate": 1.4581709813782928e-07, "loss": 0.05091788172721863, "memory(GiB)": 122.96, "step": 64000, "token_acc": 0.9844656559985027, "train_speed(iter/s)": 0.23173 }, { "epoch": 4.8784206113270825, "eval_loss": 0.046654511243104935, "eval_runtime": 220.0789, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "eval_token_acc": 0.981123125112945, "step": 64000 }, { "epoch": 4.8788017379373425, "grad_norm": 1.1746586561203003, "learning_rate": 1.449047584375529e-07, "loss": 0.02137819230556488, "memory(GiB)": 122.96, "step": 64005, "token_acc": 0.9813413945161455, "train_speed(iter/s)": 0.231549 }, { "epoch": 4.8791828645476025, "grad_norm": 1.5716578960418701, "learning_rate": 1.4399527769562992e-07, "loss": 0.03729550242424011, "memory(GiB)": 122.96, "step": 64010, "token_acc": 0.9873509215757138, "train_speed(iter/s)": 0.231554 }, { "epoch": 4.8795639911578625, "grad_norm": 1.1390552520751953, "learning_rate": 1.4308865596422415e-07, "loss": 0.014447665214538575, "memory(GiB)": 122.96, "step": 64015, "token_acc": 0.9925394548063128, "train_speed(iter/s)": 0.231559 }, { "epoch": 4.879945117768123, "grad_norm": 1.4348002672195435, "learning_rate": 1.421848932953107e-07, "loss": 0.02098757177591324, "memory(GiB)": 122.96, "step": 64020, "token_acc": 0.9908975059166212, "train_speed(iter/s)": 0.231562 }, { "epoch": 4.880326244378383, "grad_norm": 0.9525677561759949, "learning_rate": 1.4128398974073142e-07, "loss": 0.027804243564605712, "memory(GiB)": 122.96, "step": 64025, "token_acc": 0.9892968878643175, "train_speed(iter/s)": 0.231567 }, { "epoch": 4.880707370988643, "grad_norm": 1.3490206003189087, "learning_rate": 1.40385945352145e-07, "loss": 0.04998818933963776, "memory(GiB)": 122.96, "step": 64030, "token_acc": 0.9844184752365053, "train_speed(iter/s)": 0.231571 }, { "epoch": 4.881088497598903, "grad_norm": 1.3587148189544678, "learning_rate": 1.3949076018104357e-07, "loss": 0.027815648913383485, "memory(GiB)": 122.96, "step": 64035, "token_acc": 0.986819199204178, "train_speed(iter/s)": 0.231575 }, { "epoch": 4.881469624209163, "grad_norm": 1.980865478515625, "learning_rate": 1.3859843427876386e-07, "loss": 0.03404979705810547, "memory(GiB)": 122.96, "step": 64040, "token_acc": 0.9852117409814026, "train_speed(iter/s)": 0.231578 }, { "epoch": 4.881850750819423, "grad_norm": 1.8850866556167603, "learning_rate": 1.3770896769648157e-07, "loss": 0.03142691850662231, "memory(GiB)": 122.96, "step": 64045, "token_acc": 0.9781239642028505, "train_speed(iter/s)": 0.231583 }, { "epoch": 4.882231877429682, "grad_norm": 0.5378733277320862, "learning_rate": 1.3682236048520037e-07, "loss": 0.036781692504882814, "memory(GiB)": 122.96, "step": 64050, "token_acc": 0.9881456392887383, "train_speed(iter/s)": 0.231586 }, { "epoch": 4.882613004039942, "grad_norm": 3.7554686069488525, "learning_rate": 1.3593861269576846e-07, "loss": 0.0701400876045227, "memory(GiB)": 122.96, "step": 64055, "token_acc": 0.978195164075993, "train_speed(iter/s)": 0.23159 }, { "epoch": 4.882994130650202, "grad_norm": 0.7262171506881714, "learning_rate": 1.3505772437885089e-07, "loss": 0.02781137526035309, "memory(GiB)": 122.96, "step": 64060, "token_acc": 0.9899250839576337, "train_speed(iter/s)": 0.231591 }, { "epoch": 4.883375257260462, "grad_norm": 0.8623680472373962, "learning_rate": 1.341796955849739e-07, "loss": 0.017824490368366242, "memory(GiB)": 122.96, "step": 64065, "token_acc": 0.9938309685379395, "train_speed(iter/s)": 0.231596 }, { "epoch": 4.883756383870722, "grad_norm": 1.250404715538025, "learning_rate": 1.333045263644861e-07, "loss": 0.019313928484916688, "memory(GiB)": 122.96, "step": 64070, "token_acc": 0.9929055143502096, "train_speed(iter/s)": 0.231601 }, { "epoch": 4.884137510480982, "grad_norm": 2.2878899574279785, "learning_rate": 1.324322167675751e-07, "loss": 0.03352259397506714, "memory(GiB)": 122.96, "step": 64075, "token_acc": 0.9844077961019491, "train_speed(iter/s)": 0.231607 }, { "epoch": 4.884518637091242, "grad_norm": 3.9819135665893555, "learning_rate": 1.315627668442676e-07, "loss": 0.03370629847049713, "memory(GiB)": 122.96, "step": 64080, "token_acc": 0.9908079342041606, "train_speed(iter/s)": 0.231612 }, { "epoch": 4.884899763701502, "grad_norm": 0.6362810730934143, "learning_rate": 1.3069617664440702e-07, "loss": 0.02924657166004181, "memory(GiB)": 122.96, "step": 64085, "token_acc": 0.9854431545973834, "train_speed(iter/s)": 0.231615 }, { "epoch": 4.885280890311762, "grad_norm": 0.7824097275733948, "learning_rate": 1.2983244621770364e-07, "loss": 0.023427146673202514, "memory(GiB)": 122.96, "step": 64090, "token_acc": 0.9906893464637422, "train_speed(iter/s)": 0.231617 }, { "epoch": 4.885662016922021, "grad_norm": 0.36623910069465637, "learning_rate": 1.2897157561369e-07, "loss": 0.028702008724212646, "memory(GiB)": 122.96, "step": 64095, "token_acc": 0.9882990867579908, "train_speed(iter/s)": 0.231622 }, { "epoch": 4.886043143532281, "grad_norm": 0.550358772277832, "learning_rate": 1.2811356488171554e-07, "loss": 0.031884977221488954, "memory(GiB)": 122.96, "step": 64100, "token_acc": 0.990593204069879, "train_speed(iter/s)": 0.231626 }, { "epoch": 4.886424270142541, "grad_norm": 0.4522779583930969, "learning_rate": 1.2725841407100204e-07, "loss": 0.030427432060241698, "memory(GiB)": 122.96, "step": 64105, "token_acc": 0.988527724665392, "train_speed(iter/s)": 0.231627 }, { "epoch": 4.886805396752801, "grad_norm": 2.0121238231658936, "learning_rate": 1.2640612323057687e-07, "loss": 0.029360902309417725, "memory(GiB)": 122.96, "step": 64110, "token_acc": 0.987662132236634, "train_speed(iter/s)": 0.231629 }, { "epoch": 4.887186523363061, "grad_norm": 1.2736997604370117, "learning_rate": 1.2555669240932323e-07, "loss": 0.037063497304916385, "memory(GiB)": 122.96, "step": 64115, "token_acc": 0.9879821129122415, "train_speed(iter/s)": 0.231634 }, { "epoch": 4.887567649973321, "grad_norm": 1.0322515964508057, "learning_rate": 1.24710121655941e-07, "loss": 0.05292032361030578, "memory(GiB)": 122.96, "step": 64120, "token_acc": 0.9769998308811094, "train_speed(iter/s)": 0.231637 }, { "epoch": 4.887948776583581, "grad_norm": 0.5778435468673706, "learning_rate": 1.2386641101899131e-07, "loss": 0.015937414765357972, "memory(GiB)": 122.96, "step": 64125, "token_acc": 0.9942320017090366, "train_speed(iter/s)": 0.231641 }, { "epoch": 4.888329903193841, "grad_norm": 0.7822463512420654, "learning_rate": 1.230255605468411e-07, "loss": 0.02904028594493866, "memory(GiB)": 122.96, "step": 64130, "token_acc": 0.9883081155433288, "train_speed(iter/s)": 0.231647 }, { "epoch": 4.888711029804101, "grad_norm": 0.7912140488624573, "learning_rate": 1.2218757028772397e-07, "loss": 0.046209341287612914, "memory(GiB)": 122.96, "step": 64135, "token_acc": 0.9815151515151516, "train_speed(iter/s)": 0.231652 }, { "epoch": 4.889092156414361, "grad_norm": 1.3622865676879883, "learning_rate": 1.2135244028968484e-07, "loss": 0.028918224573135375, "memory(GiB)": 122.96, "step": 64140, "token_acc": 0.9932421295533213, "train_speed(iter/s)": 0.231654 }, { "epoch": 4.889473283024621, "grad_norm": 1.586916446685791, "learning_rate": 1.205201706006187e-07, "loss": 0.027198830246925355, "memory(GiB)": 122.96, "step": 64145, "token_acc": 0.9906880496637351, "train_speed(iter/s)": 0.23166 }, { "epoch": 4.88985440963488, "grad_norm": 1.4979556798934937, "learning_rate": 1.1969076126825408e-07, "loss": 0.023404639959335328, "memory(GiB)": 122.96, "step": 64150, "token_acc": 0.9906803355079217, "train_speed(iter/s)": 0.231665 }, { "epoch": 4.89023553624514, "grad_norm": 0.9755077362060547, "learning_rate": 1.1886421234015288e-07, "loss": 0.048777458071708676, "memory(GiB)": 122.96, "step": 64155, "token_acc": 0.9837764758900406, "train_speed(iter/s)": 0.231667 }, { "epoch": 4.8906166628554, "grad_norm": 0.45193055272102356, "learning_rate": 1.1804052386370501e-07, "loss": 0.03770902752876282, "memory(GiB)": 122.96, "step": 64160, "token_acc": 0.9889870836165874, "train_speed(iter/s)": 0.231667 }, { "epoch": 4.89099778946566, "grad_norm": 1.710532307624817, "learning_rate": 1.1721969588616155e-07, "loss": 0.04038097858428955, "memory(GiB)": 122.96, "step": 64165, "token_acc": 0.9866787085120795, "train_speed(iter/s)": 0.231672 }, { "epoch": 4.8913789160759205, "grad_norm": 0.7619187831878662, "learning_rate": 1.1640172845457931e-07, "loss": 0.020853692293167116, "memory(GiB)": 122.96, "step": 64170, "token_acc": 0.9876025842500437, "train_speed(iter/s)": 0.231674 }, { "epoch": 4.8917600426861805, "grad_norm": 0.8132439255714417, "learning_rate": 1.155866216158652e-07, "loss": 0.02986462414264679, "memory(GiB)": 122.96, "step": 64175, "token_acc": 0.9892274027941425, "train_speed(iter/s)": 0.231677 }, { "epoch": 4.8921411692964405, "grad_norm": 0.937916100025177, "learning_rate": 1.1477437541677626e-07, "loss": 0.02019456624984741, "memory(GiB)": 122.96, "step": 64180, "token_acc": 0.9906713005304555, "train_speed(iter/s)": 0.23168 }, { "epoch": 4.8925222959067005, "grad_norm": 0.5252156257629395, "learning_rate": 1.1396498990387528e-07, "loss": 0.024926219880580903, "memory(GiB)": 122.96, "step": 64185, "token_acc": 0.9926836406204272, "train_speed(iter/s)": 0.231684 }, { "epoch": 4.8929034225169605, "grad_norm": 1.0079171657562256, "learning_rate": 1.1315846512358063e-07, "loss": 0.05386235117912293, "memory(GiB)": 122.96, "step": 64190, "token_acc": 0.9854432450114491, "train_speed(iter/s)": 0.231687 }, { "epoch": 4.89328454912722, "grad_norm": 0.5369539856910706, "learning_rate": 1.123548011221498e-07, "loss": 0.018352425098419188, "memory(GiB)": 122.96, "step": 64195, "token_acc": 0.9905632984901278, "train_speed(iter/s)": 0.231688 }, { "epoch": 4.89366567573748, "grad_norm": 1.6985254287719727, "learning_rate": 1.1155399794565701e-07, "loss": 0.030277884006500243, "memory(GiB)": 122.96, "step": 64200, "token_acc": 0.9835203366058906, "train_speed(iter/s)": 0.231694 }, { "epoch": 4.89366567573748, "eval_loss": 0.04665606468915939, "eval_runtime": 221.393, "eval_samples_per_second": 2.394, "eval_steps_per_second": 2.394, "eval_token_acc": 0.9811607734473827, "step": 64200 }, { "epoch": 4.89404680234774, "grad_norm": 0.8977580666542053, "learning_rate": 1.1075605564003777e-07, "loss": 0.01863901913166046, "memory(GiB)": 122.96, "step": 64205, "token_acc": 0.9815808556925308, "train_speed(iter/s)": 0.231512 }, { "epoch": 4.894427928958, "grad_norm": 0.6453624367713928, "learning_rate": 1.0996097425104435e-07, "loss": 0.016907230019569397, "memory(GiB)": 122.96, "step": 64210, "token_acc": 0.9926997431391105, "train_speed(iter/s)": 0.231513 }, { "epoch": 4.89480905556826, "grad_norm": 0.5604943037033081, "learning_rate": 1.0916875382426806e-07, "loss": 0.012651169300079345, "memory(GiB)": 122.96, "step": 64215, "token_acc": 0.996234309623431, "train_speed(iter/s)": 0.23152 }, { "epoch": 4.89519018217852, "grad_norm": 0.854701817035675, "learning_rate": 1.0837939440514477e-07, "loss": 0.022519244253635405, "memory(GiB)": 122.96, "step": 64220, "token_acc": 0.9883040935672515, "train_speed(iter/s)": 0.231523 }, { "epoch": 4.89557130878878, "grad_norm": 0.42718201875686646, "learning_rate": 1.075928960389383e-07, "loss": 0.021795514225959777, "memory(GiB)": 122.96, "step": 64225, "token_acc": 0.9924337957124842, "train_speed(iter/s)": 0.231528 }, { "epoch": 4.89595243539904, "grad_norm": 1.2956572771072388, "learning_rate": 1.0680925877074587e-07, "loss": 0.03028526306152344, "memory(GiB)": 122.96, "step": 64230, "token_acc": 0.9885304659498207, "train_speed(iter/s)": 0.231532 }, { "epoch": 4.8963335620093, "grad_norm": 1.3366303443908691, "learning_rate": 1.060284826455149e-07, "loss": 0.02601749002933502, "memory(GiB)": 122.96, "step": 64235, "token_acc": 0.9897054165346848, "train_speed(iter/s)": 0.231533 }, { "epoch": 4.896714688619559, "grad_norm": 1.0870394706726074, "learning_rate": 1.0525056770800956e-07, "loss": 0.01694463938474655, "memory(GiB)": 122.96, "step": 64240, "token_acc": 0.99092513234182, "train_speed(iter/s)": 0.231538 }, { "epoch": 4.897095815229819, "grad_norm": 1.3748048543930054, "learning_rate": 1.0447551400284972e-07, "loss": 0.03651675283908844, "memory(GiB)": 122.96, "step": 64245, "token_acc": 0.9880232941435008, "train_speed(iter/s)": 0.231538 }, { "epoch": 4.897476941840079, "grad_norm": 1.2045592069625854, "learning_rate": 1.0370332157447205e-07, "loss": 0.0320104718208313, "memory(GiB)": 122.96, "step": 64250, "token_acc": 0.9874323279195669, "train_speed(iter/s)": 0.231542 }, { "epoch": 4.897858068450339, "grad_norm": 1.7498600482940674, "learning_rate": 1.0293399046716334e-07, "loss": 0.03226054608821869, "memory(GiB)": 122.96, "step": 64255, "token_acc": 0.9916016796640672, "train_speed(iter/s)": 0.231547 }, { "epoch": 4.898239195060599, "grad_norm": 1.103106141090393, "learning_rate": 1.0216752072503832e-07, "loss": 0.022823716700077056, "memory(GiB)": 122.96, "step": 64260, "token_acc": 0.9893446989877464, "train_speed(iter/s)": 0.231549 }, { "epoch": 4.898620321670859, "grad_norm": 0.8260906934738159, "learning_rate": 1.0140391239205072e-07, "loss": 0.030225256085395814, "memory(GiB)": 122.96, "step": 64265, "token_acc": 0.9831836734693877, "train_speed(iter/s)": 0.231553 }, { "epoch": 4.899001448281119, "grad_norm": 1.2039344310760498, "learning_rate": 1.0064316551199326e-07, "loss": 0.0339770644903183, "memory(GiB)": 122.96, "step": 64270, "token_acc": 0.9863768115942029, "train_speed(iter/s)": 0.231557 }, { "epoch": 4.899382574891379, "grad_norm": 0.9829592108726501, "learning_rate": 9.988528012848663e-08, "loss": 0.013434669375419617, "memory(GiB)": 122.96, "step": 64275, "token_acc": 0.9956188389923329, "train_speed(iter/s)": 0.231563 }, { "epoch": 4.899763701501639, "grad_norm": 0.730282187461853, "learning_rate": 9.913025628499606e-08, "loss": 0.017526018619537353, "memory(GiB)": 122.96, "step": 64280, "token_acc": 0.9908918406072106, "train_speed(iter/s)": 0.231566 }, { "epoch": 4.900144828111899, "grad_norm": 1.8883883953094482, "learning_rate": 9.837809402481468e-08, "loss": 0.02817840874195099, "memory(GiB)": 122.96, "step": 64285, "token_acc": 0.9904212045663299, "train_speed(iter/s)": 0.231568 }, { "epoch": 4.900525954722159, "grad_norm": 0.5347760915756226, "learning_rate": 9.762879339108022e-08, "loss": 0.02192070484161377, "memory(GiB)": 122.96, "step": 64290, "token_acc": 0.9926315789473684, "train_speed(iter/s)": 0.231572 }, { "epoch": 4.900907081332418, "grad_norm": 0.6599332690238953, "learning_rate": 9.68823544267583e-08, "loss": 0.02078361064195633, "memory(GiB)": 122.96, "step": 64295, "token_acc": 0.9915814792543596, "train_speed(iter/s)": 0.231577 }, { "epoch": 4.901288207942678, "grad_norm": 2.0364482402801514, "learning_rate": 9.613877717465358e-08, "loss": 0.03194462060928345, "memory(GiB)": 122.96, "step": 64300, "token_acc": 0.9926854754440961, "train_speed(iter/s)": 0.231581 }, { "epoch": 4.901669334552938, "grad_norm": 0.9433983564376831, "learning_rate": 9.539806167740972e-08, "loss": 0.04009949564933777, "memory(GiB)": 122.96, "step": 64305, "token_acc": 0.9784727863525589, "train_speed(iter/s)": 0.231586 }, { "epoch": 4.902050461163198, "grad_norm": 0.8805011510848999, "learning_rate": 9.466020797750385e-08, "loss": 0.03706354796886444, "memory(GiB)": 122.96, "step": 64310, "token_acc": 0.9863945578231292, "train_speed(iter/s)": 0.231589 }, { "epoch": 4.902431587773458, "grad_norm": 2.1648404598236084, "learning_rate": 9.392521611724104e-08, "loss": 0.03462998270988464, "memory(GiB)": 122.96, "step": 64315, "token_acc": 0.984786557674841, "train_speed(iter/s)": 0.231594 }, { "epoch": 4.902812714383718, "grad_norm": 1.214913249015808, "learning_rate": 9.319308613877642e-08, "loss": 0.03989481329917908, "memory(GiB)": 122.96, "step": 64320, "token_acc": 0.9846912298910223, "train_speed(iter/s)": 0.231598 }, { "epoch": 4.903193840993978, "grad_norm": 0.8736729025840759, "learning_rate": 9.24638180840931e-08, "loss": 0.02207944691181183, "memory(GiB)": 122.96, "step": 64325, "token_acc": 0.9917864476386037, "train_speed(iter/s)": 0.231603 }, { "epoch": 4.903574967604238, "grad_norm": 2.5189058780670166, "learning_rate": 9.173741199500762e-08, "loss": 0.023525960743427277, "memory(GiB)": 122.96, "step": 64330, "token_acc": 0.9901780233271946, "train_speed(iter/s)": 0.231607 }, { "epoch": 4.903956094214498, "grad_norm": 0.8430429100990295, "learning_rate": 9.101386791318112e-08, "loss": 0.02384883165359497, "memory(GiB)": 122.96, "step": 64335, "token_acc": 0.990771259063942, "train_speed(iter/s)": 0.23161 }, { "epoch": 4.9043372208247575, "grad_norm": 0.0005045531434006989, "learning_rate": 9.02931858801026e-08, "loss": 0.018063436448574065, "memory(GiB)": 122.96, "step": 64340, "token_acc": 0.9880704412043174, "train_speed(iter/s)": 0.231612 }, { "epoch": 4.9047183474350176, "grad_norm": 0.790397584438324, "learning_rate": 8.957536593710014e-08, "loss": 0.015913563966751098, "memory(GiB)": 122.96, "step": 64345, "token_acc": 0.9942928482254325, "train_speed(iter/s)": 0.231616 }, { "epoch": 4.905099474045278, "grad_norm": 2.102175235748291, "learning_rate": 8.88604081253408e-08, "loss": 0.041238969564437865, "memory(GiB)": 122.96, "step": 64350, "token_acc": 0.9841193455245428, "train_speed(iter/s)": 0.231622 }, { "epoch": 4.905480600655538, "grad_norm": 0.6328693628311157, "learning_rate": 8.814831248581957e-08, "loss": 0.022477823495864867, "memory(GiB)": 122.96, "step": 64355, "token_acc": 0.9927383462169126, "train_speed(iter/s)": 0.231626 }, { "epoch": 4.905861727265798, "grad_norm": 1.40575110912323, "learning_rate": 8.743907905937598e-08, "loss": 0.03008895218372345, "memory(GiB)": 122.96, "step": 64360, "token_acc": 0.9834387232761217, "train_speed(iter/s)": 0.231631 }, { "epoch": 4.906242853876058, "grad_norm": 1.0178332328796387, "learning_rate": 8.673270788667198e-08, "loss": 0.02452968955039978, "memory(GiB)": 122.96, "step": 64365, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.231634 }, { "epoch": 4.906623980486318, "grad_norm": 1.0000109672546387, "learning_rate": 8.602919900822514e-08, "loss": 0.04501459896564484, "memory(GiB)": 122.96, "step": 64370, "token_acc": 0.981675392670157, "train_speed(iter/s)": 0.231639 }, { "epoch": 4.907005107096578, "grad_norm": 2.4417669773101807, "learning_rate": 8.532855246437544e-08, "loss": 0.05304419994354248, "memory(GiB)": 122.96, "step": 64375, "token_acc": 0.9853707995365005, "train_speed(iter/s)": 0.231641 }, { "epoch": 4.907386233706838, "grad_norm": 0.601804256439209, "learning_rate": 8.463076829530182e-08, "loss": 0.041284358501434325, "memory(GiB)": 122.96, "step": 64380, "token_acc": 0.9865947359816903, "train_speed(iter/s)": 0.231644 }, { "epoch": 4.907767360317098, "grad_norm": 1.2536370754241943, "learning_rate": 8.393584654101671e-08, "loss": 0.041166120767593385, "memory(GiB)": 122.96, "step": 64385, "token_acc": 0.9800664451827242, "train_speed(iter/s)": 0.231649 }, { "epoch": 4.908148486927358, "grad_norm": 1.2991529703140259, "learning_rate": 8.32437872413716e-08, "loss": 0.04201065003871918, "memory(GiB)": 122.96, "step": 64390, "token_acc": 0.9821586292174528, "train_speed(iter/s)": 0.231652 }, { "epoch": 4.908529613537617, "grad_norm": 1.560674786567688, "learning_rate": 8.25545904360514e-08, "loss": 0.03325926661491394, "memory(GiB)": 122.96, "step": 64395, "token_acc": 0.9835345773874863, "train_speed(iter/s)": 0.231657 }, { "epoch": 4.908910740147877, "grad_norm": 0.6735882759094238, "learning_rate": 8.186825616458005e-08, "loss": 0.02837896943092346, "memory(GiB)": 122.96, "step": 64400, "token_acc": 0.9854651162790697, "train_speed(iter/s)": 0.231661 }, { "epoch": 4.908910740147877, "eval_loss": 0.046845316886901855, "eval_runtime": 221.7702, "eval_samples_per_second": 2.39, "eval_steps_per_second": 2.39, "eval_token_acc": 0.9809875911089694, "step": 64400 }, { "epoch": 4.909291866758137, "grad_norm": 0.7552636861801147, "learning_rate": 8.11847844663205e-08, "loss": 0.04875497221946716, "memory(GiB)": 122.96, "step": 64405, "token_acc": 0.9811974949255629, "train_speed(iter/s)": 0.231479 }, { "epoch": 4.909672993368397, "grad_norm": 1.599703073501587, "learning_rate": 8.050417538045807e-08, "loss": 0.02993651032447815, "memory(GiB)": 122.96, "step": 64410, "token_acc": 0.9866975130133024, "train_speed(iter/s)": 0.231483 }, { "epoch": 4.910054119978657, "grad_norm": 0.823034942150116, "learning_rate": 7.982642894602821e-08, "loss": 0.04323030412197113, "memory(GiB)": 122.96, "step": 64415, "token_acc": 0.9823442864149092, "train_speed(iter/s)": 0.231487 }, { "epoch": 4.910435246588917, "grad_norm": 1.6516269445419312, "learning_rate": 7.915154520189427e-08, "loss": 0.03099621832370758, "memory(GiB)": 122.96, "step": 64420, "token_acc": 0.9874055415617129, "train_speed(iter/s)": 0.231491 }, { "epoch": 4.910816373199177, "grad_norm": 1.363691806793213, "learning_rate": 7.847952418675863e-08, "loss": 0.04162880778312683, "memory(GiB)": 122.96, "step": 64425, "token_acc": 0.9785860956292168, "train_speed(iter/s)": 0.231496 }, { "epoch": 4.911197499809437, "grad_norm": 0.5435741543769836, "learning_rate": 7.781036593915713e-08, "loss": 0.041896003484725955, "memory(GiB)": 122.96, "step": 64430, "token_acc": 0.9912798874824191, "train_speed(iter/s)": 0.231501 }, { "epoch": 4.911578626419697, "grad_norm": 0.3774520456790924, "learning_rate": 7.714407049746464e-08, "loss": 0.025584176182746887, "memory(GiB)": 122.96, "step": 64435, "token_acc": 0.9910109431995832, "train_speed(iter/s)": 0.231502 }, { "epoch": 4.911959753029956, "grad_norm": 1.443864107131958, "learning_rate": 7.648063789988391e-08, "loss": 0.02326509952545166, "memory(GiB)": 122.96, "step": 64440, "token_acc": 0.989755529685681, "train_speed(iter/s)": 0.231505 }, { "epoch": 4.912340879640216, "grad_norm": 1.39468252658844, "learning_rate": 7.582006818447341e-08, "loss": 0.04066857099533081, "memory(GiB)": 122.96, "step": 64445, "token_acc": 0.9800918836140888, "train_speed(iter/s)": 0.231508 }, { "epoch": 4.912722006250476, "grad_norm": 1.185280680656433, "learning_rate": 7.516236138910282e-08, "loss": 0.0232534259557724, "memory(GiB)": 122.96, "step": 64450, "token_acc": 0.9912810194500336, "train_speed(iter/s)": 0.231512 }, { "epoch": 4.913103132860736, "grad_norm": 1.2230703830718994, "learning_rate": 7.450751755148644e-08, "loss": 0.030050212144851686, "memory(GiB)": 122.96, "step": 64455, "token_acc": 0.9882168106834249, "train_speed(iter/s)": 0.231515 }, { "epoch": 4.913484259470996, "grad_norm": 2.9290802478790283, "learning_rate": 7.385553670918865e-08, "loss": 0.03916605114936829, "memory(GiB)": 122.96, "step": 64460, "token_acc": 0.9832084374132667, "train_speed(iter/s)": 0.231518 }, { "epoch": 4.913865386081256, "grad_norm": 1.8706047534942627, "learning_rate": 7.320641889958513e-08, "loss": 0.041625994443893435, "memory(GiB)": 122.96, "step": 64465, "token_acc": 0.9871713985278654, "train_speed(iter/s)": 0.231519 }, { "epoch": 4.914246512691516, "grad_norm": 0.974461019039154, "learning_rate": 7.256016415990719e-08, "loss": 0.05081263780593872, "memory(GiB)": 122.96, "step": 64470, "token_acc": 0.9825370675453048, "train_speed(iter/s)": 0.231522 }, { "epoch": 4.914627639301776, "grad_norm": 0.11970564723014832, "learning_rate": 7.191677252721407e-08, "loss": 0.03673213422298431, "memory(GiB)": 122.96, "step": 64475, "token_acc": 0.9814229983280699, "train_speed(iter/s)": 0.231524 }, { "epoch": 4.915008765912036, "grad_norm": 0.6540882587432861, "learning_rate": 7.127624403839295e-08, "loss": 0.030864232778549196, "memory(GiB)": 122.96, "step": 64480, "token_acc": 0.9882878717830175, "train_speed(iter/s)": 0.231528 }, { "epoch": 4.915389892522295, "grad_norm": 1.0574827194213867, "learning_rate": 7.063857873018665e-08, "loss": 0.037467995285987855, "memory(GiB)": 122.96, "step": 64485, "token_acc": 0.9823588709677419, "train_speed(iter/s)": 0.231534 }, { "epoch": 4.915771019132556, "grad_norm": 0.7093335390090942, "learning_rate": 7.000377663916036e-08, "loss": 0.03385821878910065, "memory(GiB)": 122.96, "step": 64490, "token_acc": 0.9856887298747764, "train_speed(iter/s)": 0.231537 }, { "epoch": 4.9161521457428154, "grad_norm": 1.1035047769546509, "learning_rate": 6.93718378017072e-08, "loss": 0.030837732553482055, "memory(GiB)": 122.96, "step": 64495, "token_acc": 0.9893721386527142, "train_speed(iter/s)": 0.231541 }, { "epoch": 4.9165332723530755, "grad_norm": 1.9608557224273682, "learning_rate": 6.874276225407594e-08, "loss": 0.03197336494922638, "memory(GiB)": 122.96, "step": 64500, "token_acc": 0.9867167196072769, "train_speed(iter/s)": 0.231545 }, { "epoch": 4.9169143989633355, "grad_norm": 0.34979119896888733, "learning_rate": 6.811655003233774e-08, "loss": 0.03345221281051636, "memory(GiB)": 122.96, "step": 64505, "token_acc": 0.9884165411791961, "train_speed(iter/s)": 0.231546 }, { "epoch": 4.9172955255735955, "grad_norm": 1.3707996606826782, "learning_rate": 6.749320117240277e-08, "loss": 0.022626328468322753, "memory(GiB)": 122.96, "step": 64510, "token_acc": 0.9902589079723148, "train_speed(iter/s)": 0.231551 }, { "epoch": 4.9176766521838555, "grad_norm": 0.8020222187042236, "learning_rate": 6.687271571002018e-08, "loss": 0.05901910066604614, "memory(GiB)": 122.96, "step": 64515, "token_acc": 0.9828571428571429, "train_speed(iter/s)": 0.231556 }, { "epoch": 4.9180577787941155, "grad_norm": 0.5597627758979797, "learning_rate": 6.62550936807671e-08, "loss": 0.055349808931350705, "memory(GiB)": 122.96, "step": 64520, "token_acc": 0.9905298759864712, "train_speed(iter/s)": 0.23156 }, { "epoch": 4.918438905404376, "grad_norm": 3.046227216720581, "learning_rate": 6.564033512006518e-08, "loss": 0.04920347332954407, "memory(GiB)": 122.96, "step": 64525, "token_acc": 0.9817826935588809, "train_speed(iter/s)": 0.231565 }, { "epoch": 4.918820032014636, "grad_norm": 1.1145527362823486, "learning_rate": 6.5028440063164e-08, "loss": 0.020717234909534456, "memory(GiB)": 122.96, "step": 64530, "token_acc": 0.9928789420142421, "train_speed(iter/s)": 0.23157 }, { "epoch": 4.919201158624896, "grad_norm": 0.6510210633277893, "learning_rate": 6.441940854515217e-08, "loss": 0.02747705578804016, "memory(GiB)": 122.96, "step": 64535, "token_acc": 0.9917428487171925, "train_speed(iter/s)": 0.231572 }, { "epoch": 4.919582285235155, "grad_norm": 0.9933061599731445, "learning_rate": 6.381324060096284e-08, "loss": 0.028990226984024047, "memory(GiB)": 122.96, "step": 64540, "token_acc": 0.987933047878552, "train_speed(iter/s)": 0.231578 }, { "epoch": 4.919963411845415, "grad_norm": 0.9668512344360352, "learning_rate": 6.320993626535155e-08, "loss": 0.024772673845291138, "memory(GiB)": 122.96, "step": 64545, "token_acc": 0.9896313364055299, "train_speed(iter/s)": 0.231583 }, { "epoch": 4.920344538455675, "grad_norm": 0.42658889293670654, "learning_rate": 6.260949557291285e-08, "loss": 0.029283928871154784, "memory(GiB)": 122.96, "step": 64550, "token_acc": 0.9892177357245021, "train_speed(iter/s)": 0.231582 }, { "epoch": 4.920725665065935, "grad_norm": 0.5815668106079102, "learning_rate": 6.201191855808586e-08, "loss": 0.04231945276260376, "memory(GiB)": 122.96, "step": 64555, "token_acc": 0.9818563789152025, "train_speed(iter/s)": 0.231586 }, { "epoch": 4.921106791676195, "grad_norm": 0.7054473161697388, "learning_rate": 6.141720525513206e-08, "loss": 0.03468429148197174, "memory(GiB)": 122.96, "step": 64560, "token_acc": 0.9862932061978545, "train_speed(iter/s)": 0.23159 }, { "epoch": 4.921487918286455, "grad_norm": 0.7855212092399597, "learning_rate": 6.08253556981575e-08, "loss": 0.027456405758857726, "memory(GiB)": 122.96, "step": 64565, "token_acc": 0.9886585149743045, "train_speed(iter/s)": 0.231593 }, { "epoch": 4.921869044896715, "grad_norm": 0.9582118988037109, "learning_rate": 6.023636992110172e-08, "loss": 0.018627263605594635, "memory(GiB)": 122.96, "step": 64570, "token_acc": 0.9903213317847465, "train_speed(iter/s)": 0.231597 }, { "epoch": 4.922250171506975, "grad_norm": 0.5846145749092102, "learning_rate": 5.965024795774876e-08, "loss": 0.03103383779525757, "memory(GiB)": 122.96, "step": 64575, "token_acc": 0.9894820400873189, "train_speed(iter/s)": 0.231599 }, { "epoch": 4.922631298117235, "grad_norm": 2.1260106563568115, "learning_rate": 5.906698984169401e-08, "loss": 0.037701737880706784, "memory(GiB)": 122.96, "step": 64580, "token_acc": 0.9843997286909337, "train_speed(iter/s)": 0.231603 }, { "epoch": 4.923012424727494, "grad_norm": 0.9373419284820557, "learning_rate": 5.848659560639402e-08, "loss": 0.03209035396575928, "memory(GiB)": 122.96, "step": 64585, "token_acc": 0.9846475924633635, "train_speed(iter/s)": 0.231606 }, { "epoch": 4.923393551337754, "grad_norm": 1.3691716194152832, "learning_rate": 5.790906528513329e-08, "loss": 0.01878657639026642, "memory(GiB)": 122.96, "step": 64590, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.231611 }, { "epoch": 4.923774677948014, "grad_norm": 0.8006615042686462, "learning_rate": 5.7334398911029764e-08, "loss": 0.025621595978736877, "memory(GiB)": 122.96, "step": 64595, "token_acc": 0.9883259397618491, "train_speed(iter/s)": 0.231614 }, { "epoch": 4.924155804558274, "grad_norm": 1.2499167919158936, "learning_rate": 5.676259651703486e-08, "loss": 0.024955277144908906, "memory(GiB)": 122.96, "step": 64600, "token_acc": 0.9908485856905158, "train_speed(iter/s)": 0.231619 }, { "epoch": 4.924155804558274, "eval_loss": 0.04685001075267792, "eval_runtime": 221.9456, "eval_samples_per_second": 2.388, "eval_steps_per_second": 2.388, "eval_token_acc": 0.9810553581109571, "step": 64600 }, { "epoch": 4.924536931168534, "grad_norm": 0.3286164402961731, "learning_rate": 5.619365813593902e-08, "loss": 0.02209991216659546, "memory(GiB)": 122.96, "step": 64605, "token_acc": 0.9813955848033744, "train_speed(iter/s)": 0.231438 }, { "epoch": 4.924918057778794, "grad_norm": 0.8896494507789612, "learning_rate": 5.5627583800366146e-08, "loss": 0.0258838415145874, "memory(GiB)": 122.96, "step": 64610, "token_acc": 0.9905514847666795, "train_speed(iter/s)": 0.231442 }, { "epoch": 4.925299184389054, "grad_norm": 0.839924156665802, "learning_rate": 5.506437354279026e-08, "loss": 0.023374438285827637, "memory(GiB)": 122.96, "step": 64615, "token_acc": 0.9912296729398867, "train_speed(iter/s)": 0.231445 }, { "epoch": 4.925680310999314, "grad_norm": 1.0565558671951294, "learning_rate": 5.45040273954911e-08, "loss": 0.02859618067741394, "memory(GiB)": 122.96, "step": 64620, "token_acc": 0.9913710450623202, "train_speed(iter/s)": 0.231448 }, { "epoch": 4.926061437609574, "grad_norm": 0.9764311909675598, "learning_rate": 5.3946545390615163e-08, "loss": 0.025259491801261903, "memory(GiB)": 122.96, "step": 64625, "token_acc": 0.9910356666030898, "train_speed(iter/s)": 0.231452 }, { "epoch": 4.926442564219834, "grad_norm": 0.5672560930252075, "learning_rate": 5.339192756012579e-08, "loss": 0.016481542587280275, "memory(GiB)": 122.96, "step": 64630, "token_acc": 0.9917231000752446, "train_speed(iter/s)": 0.231456 }, { "epoch": 4.926823690830094, "grad_norm": 0.8351079821586609, "learning_rate": 5.28401739358253e-08, "loss": 0.032684749364852904, "memory(GiB)": 122.96, "step": 64635, "token_acc": 0.9870953387473579, "train_speed(iter/s)": 0.231455 }, { "epoch": 4.927204817440353, "grad_norm": 0.8930631279945374, "learning_rate": 5.229128454936061e-08, "loss": 0.01562105566263199, "memory(GiB)": 122.96, "step": 64640, "token_acc": 0.9930216329378926, "train_speed(iter/s)": 0.231462 }, { "epoch": 4.927585944050613, "grad_norm": 0.805168628692627, "learning_rate": 5.174525943220654e-08, "loss": 0.022696293890476227, "memory(GiB)": 122.96, "step": 64645, "token_acc": 0.9879539733908665, "train_speed(iter/s)": 0.231465 }, { "epoch": 4.927967070660873, "grad_norm": 1.4699281454086304, "learning_rate": 5.120209861567138e-08, "loss": 0.028155219554901124, "memory(GiB)": 122.96, "step": 64650, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.231468 }, { "epoch": 4.928348197271133, "grad_norm": 1.7721388339996338, "learning_rate": 5.0661802130907986e-08, "loss": 0.043795999884605405, "memory(GiB)": 122.96, "step": 64655, "token_acc": 0.9836493303183163, "train_speed(iter/s)": 0.231472 }, { "epoch": 4.928729323881393, "grad_norm": 1.917946457862854, "learning_rate": 5.012437000889714e-08, "loss": 0.028987228870391846, "memory(GiB)": 122.96, "step": 64660, "token_acc": 0.9877464038359084, "train_speed(iter/s)": 0.231477 }, { "epoch": 4.929110450491653, "grad_norm": 0.7101659774780273, "learning_rate": 4.958980228045307e-08, "loss": 0.04030443131923676, "memory(GiB)": 122.96, "step": 64665, "token_acc": 0.9865301724137931, "train_speed(iter/s)": 0.231481 }, { "epoch": 4.929491577101913, "grad_norm": 0.8892742991447449, "learning_rate": 4.905809897624014e-08, "loss": 0.03716340363025665, "memory(GiB)": 122.96, "step": 64670, "token_acc": 0.9859951175639213, "train_speed(iter/s)": 0.231483 }, { "epoch": 4.9298727037121735, "grad_norm": 2.0488574504852295, "learning_rate": 4.852926012673953e-08, "loss": 0.02908652424812317, "memory(GiB)": 122.96, "step": 64675, "token_acc": 0.9907012785741961, "train_speed(iter/s)": 0.231486 }, { "epoch": 4.9302538303224335, "grad_norm": 1.6748040914535522, "learning_rate": 4.800328576228807e-08, "loss": 0.04813655912876129, "memory(GiB)": 122.96, "step": 64680, "token_acc": 0.9808150119030948, "train_speed(iter/s)": 0.231488 }, { "epoch": 4.930634956932693, "grad_norm": 1.5027719736099243, "learning_rate": 4.748017591303944e-08, "loss": 0.02259388267993927, "memory(GiB)": 122.96, "step": 64685, "token_acc": 0.9938238453276047, "train_speed(iter/s)": 0.231493 }, { "epoch": 4.931016083542953, "grad_norm": 1.0981311798095703, "learning_rate": 4.6959930608997395e-08, "loss": 0.020912915468215942, "memory(GiB)": 122.96, "step": 64690, "token_acc": 0.992108229988726, "train_speed(iter/s)": 0.231495 }, { "epoch": 4.931397210153213, "grad_norm": 2.204334259033203, "learning_rate": 4.644254987999363e-08, "loss": 0.021035978198051454, "memory(GiB)": 122.96, "step": 64695, "token_acc": 0.9933804060017652, "train_speed(iter/s)": 0.2315 }, { "epoch": 4.931778336763473, "grad_norm": 0.48203086853027344, "learning_rate": 4.592803375569332e-08, "loss": 0.016952426731586458, "memory(GiB)": 122.96, "step": 64700, "token_acc": 0.9929185634800203, "train_speed(iter/s)": 0.231506 }, { "epoch": 4.932159463373733, "grad_norm": 1.0587024688720703, "learning_rate": 4.541638226560618e-08, "loss": 0.020977646112442017, "memory(GiB)": 122.96, "step": 64705, "token_acc": 0.9911569638909359, "train_speed(iter/s)": 0.231512 }, { "epoch": 4.932540589983993, "grad_norm": 2.013866424560547, "learning_rate": 4.490759543906986e-08, "loss": 0.038837355375289914, "memory(GiB)": 122.96, "step": 64710, "token_acc": 0.9857560262965669, "train_speed(iter/s)": 0.231515 }, { "epoch": 4.932921716594253, "grad_norm": 3.0383195877075195, "learning_rate": 4.4401673305266566e-08, "loss": 0.035406842827796936, "memory(GiB)": 122.96, "step": 64715, "token_acc": 0.9897182025894897, "train_speed(iter/s)": 0.231521 }, { "epoch": 4.933302843204513, "grad_norm": 0.41020554304122925, "learning_rate": 4.389861589320643e-08, "loss": 0.020345111191272736, "memory(GiB)": 122.96, "step": 64720, "token_acc": 0.9908883826879271, "train_speed(iter/s)": 0.231524 }, { "epoch": 4.933683969814773, "grad_norm": 1.6570876836776733, "learning_rate": 4.339842323173304e-08, "loss": 0.031781095266342166, "memory(GiB)": 122.96, "step": 64725, "token_acc": 0.9875915682463746, "train_speed(iter/s)": 0.231527 }, { "epoch": 4.934065096425033, "grad_norm": 0.6656976938247681, "learning_rate": 4.290109534952902e-08, "loss": 0.037928760051727295, "memory(GiB)": 122.96, "step": 64730, "token_acc": 0.9834376150165623, "train_speed(iter/s)": 0.231532 }, { "epoch": 4.934446223035293, "grad_norm": 1.591727375984192, "learning_rate": 4.240663227512154e-08, "loss": 0.06975325345993041, "memory(GiB)": 122.96, "step": 64735, "token_acc": 0.9796355841371919, "train_speed(iter/s)": 0.231537 }, { "epoch": 4.934827349645552, "grad_norm": 2.1580207347869873, "learning_rate": 4.19150340368657e-08, "loss": 0.0288266122341156, "memory(GiB)": 122.96, "step": 64740, "token_acc": 0.9881899871630295, "train_speed(iter/s)": 0.231541 }, { "epoch": 4.935208476255812, "grad_norm": 1.2183821201324463, "learning_rate": 4.1426300662944504e-08, "loss": 0.03715924024581909, "memory(GiB)": 122.96, "step": 64745, "token_acc": 0.989095694679451, "train_speed(iter/s)": 0.231545 }, { "epoch": 4.935589602866072, "grad_norm": 0.6309221386909485, "learning_rate": 4.09404321813911e-08, "loss": 0.0403788685798645, "memory(GiB)": 122.96, "step": 64750, "token_acc": 0.9912137975919297, "train_speed(iter/s)": 0.23155 }, { "epoch": 4.935970729476332, "grad_norm": 0.679709255695343, "learning_rate": 4.045742862006652e-08, "loss": 0.02808076739311218, "memory(GiB)": 122.96, "step": 64755, "token_acc": 0.9914043442908584, "train_speed(iter/s)": 0.231552 }, { "epoch": 4.936351856086592, "grad_norm": 2.4173686504364014, "learning_rate": 3.997729000666528e-08, "loss": 0.030645695328712464, "memory(GiB)": 122.96, "step": 64760, "token_acc": 0.9867603350445825, "train_speed(iter/s)": 0.231557 }, { "epoch": 4.936732982696852, "grad_norm": 0.5623695850372314, "learning_rate": 3.9500016368720914e-08, "loss": 0.022415249049663542, "memory(GiB)": 122.96, "step": 64765, "token_acc": 0.9924707194645845, "train_speed(iter/s)": 0.231562 }, { "epoch": 4.937114109307112, "grad_norm": 2.1018929481506348, "learning_rate": 3.902560773361152e-08, "loss": 0.025396701693534852, "memory(GiB)": 122.96, "step": 64770, "token_acc": 0.9913955364345254, "train_speed(iter/s)": 0.231566 }, { "epoch": 4.937495235917372, "grad_norm": 0.9693512320518494, "learning_rate": 3.855406412853202e-08, "loss": 0.021274706721305846, "memory(GiB)": 122.96, "step": 64775, "token_acc": 0.9858757062146892, "train_speed(iter/s)": 0.23157 }, { "epoch": 4.937876362527632, "grad_norm": 1.0732938051223755, "learning_rate": 3.808538558053298e-08, "loss": 0.02112864553928375, "memory(GiB)": 122.96, "step": 64780, "token_acc": 0.9914114406093015, "train_speed(iter/s)": 0.231574 }, { "epoch": 4.938257489137891, "grad_norm": 1.4696190357208252, "learning_rate": 3.761957211648182e-08, "loss": 0.03221611678600311, "memory(GiB)": 122.96, "step": 64785, "token_acc": 0.9887034659820283, "train_speed(iter/s)": 0.231577 }, { "epoch": 4.938638615748151, "grad_norm": 0.8049118518829346, "learning_rate": 3.715662376309048e-08, "loss": 0.020142216980457307, "memory(GiB)": 122.96, "step": 64790, "token_acc": 0.9911591355599214, "train_speed(iter/s)": 0.231582 }, { "epoch": 4.939019742358411, "grad_norm": 3.7555630207061768, "learning_rate": 3.6696540546915516e-08, "loss": 0.03407995700836182, "memory(GiB)": 122.96, "step": 64795, "token_acc": 0.9884437596302003, "train_speed(iter/s)": 0.231588 }, { "epoch": 4.939400868968671, "grad_norm": 0.3630980849266052, "learning_rate": 3.6239322494335814e-08, "loss": 0.011773510277271271, "memory(GiB)": 122.96, "step": 64800, "token_acc": 0.9982964224872232, "train_speed(iter/s)": 0.231595 }, { "epoch": 4.939400868968671, "eval_loss": 0.04665433615446091, "eval_runtime": 222.2225, "eval_samples_per_second": 2.385, "eval_steps_per_second": 2.385, "eval_token_acc": 0.9809875911089694, "step": 64800 }, { "epoch": 4.939781995578931, "grad_norm": 0.9517976641654968, "learning_rate": 3.57849696315693e-08, "loss": 0.02866886556148529, "memory(GiB)": 122.96, "step": 64805, "token_acc": 0.9811070110701107, "train_speed(iter/s)": 0.231417 }, { "epoch": 4.940163122189191, "grad_norm": 1.2492786645889282, "learning_rate": 3.5333481984672903e-08, "loss": 0.024470609426498414, "memory(GiB)": 122.96, "step": 64810, "token_acc": 0.99161820047896, "train_speed(iter/s)": 0.231421 }, { "epoch": 4.940544248799451, "grad_norm": 1.572933316230774, "learning_rate": 3.488485957954257e-08, "loss": 0.033518800139427186, "memory(GiB)": 122.96, "step": 64815, "token_acc": 0.984418901660281, "train_speed(iter/s)": 0.231425 }, { "epoch": 4.940925375409711, "grad_norm": 1.3725875616073608, "learning_rate": 3.443910244189108e-08, "loss": 0.014935889840126037, "memory(GiB)": 122.96, "step": 64820, "token_acc": 0.9959090909090909, "train_speed(iter/s)": 0.231431 }, { "epoch": 4.941306502019971, "grad_norm": 0.8445883393287659, "learning_rate": 3.399621059729241e-08, "loss": 0.01631130576133728, "memory(GiB)": 122.96, "step": 64825, "token_acc": 0.9905783826223502, "train_speed(iter/s)": 0.231436 }, { "epoch": 4.9416876286302305, "grad_norm": 2.0795211791992188, "learning_rate": 3.3556184071137366e-08, "loss": 0.04226732850074768, "memory(GiB)": 122.96, "step": 64830, "token_acc": 0.9836410376256135, "train_speed(iter/s)": 0.23144 }, { "epoch": 4.942068755240491, "grad_norm": 0.590971827507019, "learning_rate": 3.311902288866686e-08, "loss": 0.01906786412000656, "memory(GiB)": 122.96, "step": 64835, "token_acc": 0.9903737259343148, "train_speed(iter/s)": 0.231443 }, { "epoch": 4.9424498818507505, "grad_norm": 0.5386515259742737, "learning_rate": 3.2684727074944185e-08, "loss": 0.03901462554931641, "memory(GiB)": 122.96, "step": 64840, "token_acc": 0.9854525862068966, "train_speed(iter/s)": 0.231447 }, { "epoch": 4.9428310084610105, "grad_norm": 0.55223548412323, "learning_rate": 3.2253296654871644e-08, "loss": 0.033474120497703555, "memory(GiB)": 122.96, "step": 64845, "token_acc": 0.9863322530646752, "train_speed(iter/s)": 0.231449 }, { "epoch": 4.9432121350712706, "grad_norm": 1.6952707767486572, "learning_rate": 3.182473165319611e-08, "loss": 0.05130731463432312, "memory(GiB)": 122.96, "step": 64850, "token_acc": 0.9844879518072289, "train_speed(iter/s)": 0.231452 }, { "epoch": 4.943593261681531, "grad_norm": 1.3078672885894775, "learning_rate": 3.1399032094497906e-08, "loss": 0.02917306125164032, "memory(GiB)": 122.96, "step": 64855, "token_acc": 0.9859193815571508, "train_speed(iter/s)": 0.231456 }, { "epoch": 4.943974388291791, "grad_norm": 1.4002556800842285, "learning_rate": 3.097619800317419e-08, "loss": 0.04606362581253052, "memory(GiB)": 122.96, "step": 64860, "token_acc": 0.980106100795756, "train_speed(iter/s)": 0.23146 }, { "epoch": 4.944355514902051, "grad_norm": 1.5704371929168701, "learning_rate": 3.0556229403483346e-08, "loss": 0.028604754805564882, "memory(GiB)": 122.96, "step": 64865, "token_acc": 0.9887671232876712, "train_speed(iter/s)": 0.231464 }, { "epoch": 4.944736641512311, "grad_norm": 0.5281844735145569, "learning_rate": 3.0139126319506105e-08, "loss": 0.01679226607084274, "memory(GiB)": 122.96, "step": 64870, "token_acc": 0.9935779816513761, "train_speed(iter/s)": 0.231468 }, { "epoch": 4.945117768122571, "grad_norm": 2.8246917724609375, "learning_rate": 2.972488877516222e-08, "loss": 0.040416795015335086, "memory(GiB)": 122.96, "step": 64875, "token_acc": 0.9846712384025816, "train_speed(iter/s)": 0.231471 }, { "epoch": 4.945498894732831, "grad_norm": 0.8814321160316467, "learning_rate": 2.9313516794210462e-08, "loss": 0.031017646193504333, "memory(GiB)": 122.96, "step": 64880, "token_acc": 0.9885310472659871, "train_speed(iter/s)": 0.23147 }, { "epoch": 4.94588002134309, "grad_norm": 0.8296684622764587, "learning_rate": 2.8905010400231968e-08, "loss": 0.018955858051776887, "memory(GiB)": 122.96, "step": 64885, "token_acc": 0.9941972920696325, "train_speed(iter/s)": 0.231474 }, { "epoch": 4.94626114795335, "grad_norm": 0.6999309062957764, "learning_rate": 2.849936961665245e-08, "loss": 0.022743219137191774, "memory(GiB)": 122.96, "step": 64890, "token_acc": 0.9867639113992437, "train_speed(iter/s)": 0.231478 }, { "epoch": 4.94664227456361, "grad_norm": 1.6255453824996948, "learning_rate": 2.8096594466742176e-08, "loss": 0.025053304433822633, "memory(GiB)": 122.96, "step": 64895, "token_acc": 0.9897172236503856, "train_speed(iter/s)": 0.23148 }, { "epoch": 4.94702340117387, "grad_norm": 0.8039292693138123, "learning_rate": 2.769668497359379e-08, "loss": 0.04155232608318329, "memory(GiB)": 122.96, "step": 64900, "token_acc": 0.9843551797040169, "train_speed(iter/s)": 0.231483 }, { "epoch": 4.94740452778413, "grad_norm": 1.4935150146484375, "learning_rate": 2.7299641160144497e-08, "loss": 0.04283004403114319, "memory(GiB)": 122.96, "step": 64905, "token_acc": 0.9823300389338125, "train_speed(iter/s)": 0.231484 }, { "epoch": 4.94778565439439, "grad_norm": 1.4070243835449219, "learning_rate": 2.6905463049153868e-08, "loss": 0.024528226256370543, "memory(GiB)": 122.96, "step": 64910, "token_acc": 0.9876209447922595, "train_speed(iter/s)": 0.231487 }, { "epoch": 4.94816678100465, "grad_norm": 0.8641025424003601, "learning_rate": 2.6514150663231595e-08, "loss": 0.022143405675888062, "memory(GiB)": 122.96, "step": 64915, "token_acc": 0.991169130351315, "train_speed(iter/s)": 0.231491 }, { "epoch": 4.94854790761491, "grad_norm": 0.7885650992393494, "learning_rate": 2.612570402482084e-08, "loss": 0.02473823130130768, "memory(GiB)": 122.96, "step": 64920, "token_acc": 0.9905023744063984, "train_speed(iter/s)": 0.231492 }, { "epoch": 4.94892903422517, "grad_norm": 2.652489185333252, "learning_rate": 2.5740123156192674e-08, "loss": 0.02357115298509598, "memory(GiB)": 122.96, "step": 64925, "token_acc": 0.9921098772647574, "train_speed(iter/s)": 0.231497 }, { "epoch": 4.949310160835429, "grad_norm": 0.2939680516719818, "learning_rate": 2.535740807945719e-08, "loss": 0.01811075508594513, "memory(GiB)": 122.96, "step": 64930, "token_acc": 0.9930939226519337, "train_speed(iter/s)": 0.231504 }, { "epoch": 4.949691287445689, "grad_norm": 1.04830801486969, "learning_rate": 2.4977558816563496e-08, "loss": 0.02518024146556854, "memory(GiB)": 122.96, "step": 64935, "token_acc": 0.988150289017341, "train_speed(iter/s)": 0.231507 }, { "epoch": 4.950072414055949, "grad_norm": 0.5464163422584534, "learning_rate": 2.4600575389299717e-08, "loss": 0.02978017330169678, "memory(GiB)": 122.96, "step": 64940, "token_acc": 0.9869620075704472, "train_speed(iter/s)": 0.231509 }, { "epoch": 4.950453540666209, "grad_norm": 1.059490442276001, "learning_rate": 2.422645781927635e-08, "loss": 0.03299559354782104, "memory(GiB)": 122.96, "step": 64945, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.231515 }, { "epoch": 4.950834667276469, "grad_norm": 0.830332338809967, "learning_rate": 2.385520612794845e-08, "loss": 0.0242973193526268, "memory(GiB)": 122.96, "step": 64950, "token_acc": 0.9914586070959264, "train_speed(iter/s)": 0.23152 }, { "epoch": 4.951215793886729, "grad_norm": 1.2779712677001953, "learning_rate": 2.3486820336604543e-08, "loss": 0.036338013410568235, "memory(GiB)": 122.96, "step": 64955, "token_acc": 0.9873446847676024, "train_speed(iter/s)": 0.231524 }, { "epoch": 4.951596920496989, "grad_norm": 1.5079809427261353, "learning_rate": 2.3121300466377726e-08, "loss": 0.027551275491714478, "memory(GiB)": 122.96, "step": 64960, "token_acc": 0.9921414538310412, "train_speed(iter/s)": 0.231528 }, { "epoch": 4.951978047107249, "grad_norm": 2.7150604724884033, "learning_rate": 2.275864653821791e-08, "loss": 0.029328715801239014, "memory(GiB)": 122.96, "step": 64965, "token_acc": 0.9875621890547264, "train_speed(iter/s)": 0.231533 }, { "epoch": 4.952359173717509, "grad_norm": 0.8460292220115662, "learning_rate": 2.239885857293067e-08, "loss": 0.03708009421825409, "memory(GiB)": 122.96, "step": 64970, "token_acc": 0.9863630165007501, "train_speed(iter/s)": 0.231534 }, { "epoch": 4.952740300327769, "grad_norm": 1.4745982885360718, "learning_rate": 2.2041936591143952e-08, "loss": 0.023693555593490602, "memory(GiB)": 122.96, "step": 64975, "token_acc": 0.9927417891489748, "train_speed(iter/s)": 0.231537 }, { "epoch": 4.953121426938029, "grad_norm": 0.9538354277610779, "learning_rate": 2.168788061332472e-08, "loss": 0.013645447790622711, "memory(GiB)": 122.96, "step": 64980, "token_acc": 0.9915130231196957, "train_speed(iter/s)": 0.231542 }, { "epoch": 4.953502553548288, "grad_norm": 0.7509533166885376, "learning_rate": 2.1336690659778945e-08, "loss": 0.02005358785390854, "memory(GiB)": 122.96, "step": 64985, "token_acc": 0.9938235294117647, "train_speed(iter/s)": 0.231546 }, { "epoch": 4.953883680158548, "grad_norm": 0.809201180934906, "learning_rate": 2.098836675064053e-08, "loss": 0.022592173516750337, "memory(GiB)": 122.96, "step": 64990, "token_acc": 0.9896842763363551, "train_speed(iter/s)": 0.231549 }, { "epoch": 4.954264806768808, "grad_norm": 0.9203253984451294, "learning_rate": 2.0642908905893487e-08, "loss": 0.050018310546875, "memory(GiB)": 122.96, "step": 64995, "token_acc": 0.9858736059479554, "train_speed(iter/s)": 0.231553 }, { "epoch": 4.9546459333790684, "grad_norm": 0.3139859139919281, "learning_rate": 2.0300317145344195e-08, "loss": 0.030909261107444762, "memory(GiB)": 122.96, "step": 65000, "token_acc": 0.9871515151515151, "train_speed(iter/s)": 0.231557 }, { "epoch": 4.9546459333790684, "eval_loss": 0.04671008139848709, "eval_runtime": 221.4601, "eval_samples_per_second": 2.393, "eval_steps_per_second": 2.393, "eval_token_acc": 0.9812134811155955, "step": 65000 }, { "epoch": 4.9550270599893285, "grad_norm": 1.293513536453247, "learning_rate": 1.9960591488632497e-08, "loss": 0.0402255654335022, "memory(GiB)": 122.96, "step": 65005, "token_acc": 0.9813472180811996, "train_speed(iter/s)": 0.231378 }, { "epoch": 4.9554081865995885, "grad_norm": 0.9089357256889343, "learning_rate": 1.962373195524836e-08, "loss": 0.026728412508964537, "memory(GiB)": 122.96, "step": 65010, "token_acc": 0.9876802884615384, "train_speed(iter/s)": 0.23138 }, { "epoch": 4.9557893132098485, "grad_norm": 1.9416024684906006, "learning_rate": 1.928973856450411e-08, "loss": 0.03082018196582794, "memory(GiB)": 122.96, "step": 65015, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.231385 }, { "epoch": 4.9561704398201085, "grad_norm": 2.2913880348205566, "learning_rate": 1.8958611335556654e-08, "loss": 0.043072617053985594, "memory(GiB)": 122.96, "step": 65020, "token_acc": 0.9839765918907621, "train_speed(iter/s)": 0.231387 }, { "epoch": 4.9565515664303685, "grad_norm": 1.1580296754837036, "learning_rate": 1.8630350287390796e-08, "loss": 0.02130861282348633, "memory(GiB)": 122.96, "step": 65025, "token_acc": 0.9944488501189532, "train_speed(iter/s)": 0.231393 }, { "epoch": 4.956932693040628, "grad_norm": 1.5668610334396362, "learning_rate": 1.8304955438830372e-08, "loss": 0.028245702385902405, "memory(GiB)": 122.96, "step": 65030, "token_acc": 0.9873118914133963, "train_speed(iter/s)": 0.231398 }, { "epoch": 4.957313819650888, "grad_norm": 0.9121356010437012, "learning_rate": 1.7982426808543785e-08, "loss": 0.01952408254146576, "memory(GiB)": 122.96, "step": 65035, "token_acc": 0.9920174165457184, "train_speed(iter/s)": 0.231404 }, { "epoch": 4.957694946261148, "grad_norm": 1.4825537204742432, "learning_rate": 1.766276441501624e-08, "loss": 0.03305492103099823, "memory(GiB)": 122.96, "step": 65040, "token_acc": 0.9839606501283148, "train_speed(iter/s)": 0.231407 }, { "epoch": 4.958076072871408, "grad_norm": 1.6918951272964478, "learning_rate": 1.734596827658308e-08, "loss": 0.029897454380989074, "memory(GiB)": 122.96, "step": 65045, "token_acc": 0.9876018420120439, "train_speed(iter/s)": 0.23141 }, { "epoch": 4.958457199481668, "grad_norm": 0.3942674994468689, "learning_rate": 1.7032038411407547e-08, "loss": 0.02342854142189026, "memory(GiB)": 122.96, "step": 65050, "token_acc": 0.9892274546014158, "train_speed(iter/s)": 0.231415 }, { "epoch": 4.958838326091928, "grad_norm": 1.3899760246276855, "learning_rate": 1.672097483749746e-08, "loss": 0.030343794822692872, "memory(GiB)": 122.96, "step": 65055, "token_acc": 0.9930218446601942, "train_speed(iter/s)": 0.231421 }, { "epoch": 4.959219452702188, "grad_norm": 1.0190696716308594, "learning_rate": 1.6412777572694104e-08, "loss": 0.030113857984542847, "memory(GiB)": 122.96, "step": 65060, "token_acc": 0.9903660886319846, "train_speed(iter/s)": 0.231426 }, { "epoch": 4.959600579312448, "grad_norm": 1.6286569833755493, "learning_rate": 1.6107446634661128e-08, "loss": 0.05882562398910522, "memory(GiB)": 122.96, "step": 65065, "token_acc": 0.9810582664196006, "train_speed(iter/s)": 0.23143 }, { "epoch": 4.959981705922708, "grad_norm": 0.7291297316551208, "learning_rate": 1.5804982040912296e-08, "loss": 0.03664923906326294, "memory(GiB)": 122.96, "step": 65070, "token_acc": 0.9877014418999152, "train_speed(iter/s)": 0.231432 }, { "epoch": 4.960362832532968, "grad_norm": 0.8221415877342224, "learning_rate": 1.55053838087893e-08, "loss": 0.022282299399375916, "memory(GiB)": 122.96, "step": 65075, "token_acc": 0.9857717290442314, "train_speed(iter/s)": 0.231438 }, { "epoch": 4.960743959143228, "grad_norm": 2.0793137550354004, "learning_rate": 1.520865195548393e-08, "loss": 0.03262795209884643, "memory(GiB)": 122.96, "step": 65080, "token_acc": 0.9808314087759815, "train_speed(iter/s)": 0.231442 }, { "epoch": 4.961125085753487, "grad_norm": 0.5929187536239624, "learning_rate": 1.4914786497999266e-08, "loss": 0.02813912630081177, "memory(GiB)": 122.96, "step": 65085, "token_acc": 0.9915014164305949, "train_speed(iter/s)": 0.231446 }, { "epoch": 4.961506212363747, "grad_norm": 1.5217866897583008, "learning_rate": 1.4623787453194037e-08, "loss": 0.04026230573654175, "memory(GiB)": 122.96, "step": 65090, "token_acc": 0.983985297978472, "train_speed(iter/s)": 0.231451 }, { "epoch": 4.961887338974007, "grad_norm": 1.1844075918197632, "learning_rate": 1.4335654837754896e-08, "loss": 0.03660319745540619, "memory(GiB)": 122.96, "step": 65095, "token_acc": 0.9851679771265189, "train_speed(iter/s)": 0.231454 }, { "epoch": 4.962268465584267, "grad_norm": 0.24081404507160187, "learning_rate": 1.4050388668201963e-08, "loss": 0.013405577838420868, "memory(GiB)": 122.96, "step": 65100, "token_acc": 0.9964183381088825, "train_speed(iter/s)": 0.231461 }, { "epoch": 4.962649592194527, "grad_norm": 0.49624213576316833, "learning_rate": 1.3767988960899925e-08, "loss": 0.028308084607124327, "memory(GiB)": 122.96, "step": 65105, "token_acc": 0.9862641293461153, "train_speed(iter/s)": 0.231462 }, { "epoch": 4.963030718804787, "grad_norm": 0.8027380704879761, "learning_rate": 1.3488455732035831e-08, "loss": 0.024922049045562743, "memory(GiB)": 122.96, "step": 65110, "token_acc": 0.9900666415189237, "train_speed(iter/s)": 0.231463 }, { "epoch": 4.963411845415047, "grad_norm": 1.4113434553146362, "learning_rate": 1.3211788997641306e-08, "loss": 0.017991508543491363, "memory(GiB)": 122.96, "step": 65115, "token_acc": 0.9938159330665697, "train_speed(iter/s)": 0.231466 }, { "epoch": 4.963792972025307, "grad_norm": 1.2606244087219238, "learning_rate": 1.2937988773586984e-08, "loss": 0.03441023528575897, "memory(GiB)": 122.96, "step": 65120, "token_acc": 0.9856896551724138, "train_speed(iter/s)": 0.231469 }, { "epoch": 4.964174098635567, "grad_norm": 1.519629716873169, "learning_rate": 1.266705507557142e-08, "loss": 0.03796733021736145, "memory(GiB)": 122.96, "step": 65125, "token_acc": 0.9841165875225152, "train_speed(iter/s)": 0.231472 }, { "epoch": 4.964555225245826, "grad_norm": 0.6806376576423645, "learning_rate": 1.2398987919126636e-08, "loss": 0.014378158748149872, "memory(GiB)": 122.96, "step": 65130, "token_acc": 0.9928619986403807, "train_speed(iter/s)": 0.231475 }, { "epoch": 4.964936351856086, "grad_norm": 1.854337215423584, "learning_rate": 1.2133787319634771e-08, "loss": 0.029054483771324156, "memory(GiB)": 122.96, "step": 65135, "token_acc": 0.9893558451091944, "train_speed(iter/s)": 0.231478 }, { "epoch": 4.965317478466346, "grad_norm": 3.062816619873047, "learning_rate": 1.1871453292294776e-08, "loss": 0.031212151050567627, "memory(GiB)": 122.96, "step": 65140, "token_acc": 0.9901690238013108, "train_speed(iter/s)": 0.23148 }, { "epoch": 4.965698605076606, "grad_norm": 0.6915186643600464, "learning_rate": 1.1611985852150176e-08, "loss": 0.019060514867305756, "memory(GiB)": 122.96, "step": 65145, "token_acc": 0.9923907707412862, "train_speed(iter/s)": 0.231485 }, { "epoch": 4.966079731686866, "grad_norm": 0.6436535716056824, "learning_rate": 1.135538501408906e-08, "loss": 0.0318561851978302, "memory(GiB)": 122.96, "step": 65150, "token_acc": 0.989212354071228, "train_speed(iter/s)": 0.231487 }, { "epoch": 4.966460858297126, "grad_norm": 1.2644999027252197, "learning_rate": 1.1101650792821882e-08, "loss": 0.021271857619285583, "memory(GiB)": 122.96, "step": 65155, "token_acc": 0.9905097198836675, "train_speed(iter/s)": 0.231489 }, { "epoch": 4.966841984907386, "grad_norm": 0.9006397128105164, "learning_rate": 1.0850783202892567e-08, "loss": 0.021833418309688567, "memory(GiB)": 122.96, "step": 65160, "token_acc": 0.98819913952059, "train_speed(iter/s)": 0.23149 }, { "epoch": 4.967223111517646, "grad_norm": 0.687323272228241, "learning_rate": 1.0602782258695154e-08, "loss": 0.02468249499797821, "memory(GiB)": 122.96, "step": 65165, "token_acc": 0.9906591237995, "train_speed(iter/s)": 0.231492 }, { "epoch": 4.967604238127906, "grad_norm": 1.0897740125656128, "learning_rate": 1.0357647974451601e-08, "loss": 0.02184086889028549, "memory(GiB)": 122.96, "step": 65170, "token_acc": 0.990988567585743, "train_speed(iter/s)": 0.231493 }, { "epoch": 4.9679853647381655, "grad_norm": 1.7081990242004395, "learning_rate": 1.011538036421733e-08, "loss": 0.0493834912776947, "memory(GiB)": 122.96, "step": 65175, "token_acc": 0.9766069086139046, "train_speed(iter/s)": 0.231497 }, { "epoch": 4.968366491348426, "grad_norm": 1.278080701828003, "learning_rate": 9.875979441881233e-09, "loss": 0.024728986620903014, "memory(GiB)": 122.96, "step": 65180, "token_acc": 0.9862349444705146, "train_speed(iter/s)": 0.2315 }, { "epoch": 4.968747617958686, "grad_norm": 0.2256547063589096, "learning_rate": 9.639445221176769e-09, "loss": 0.022689932584762575, "memory(GiB)": 122.96, "step": 65185, "token_acc": 0.991049491049491, "train_speed(iter/s)": 0.231503 }, { "epoch": 4.969128744568946, "grad_norm": 1.1455553770065308, "learning_rate": 9.405777715665309e-09, "loss": 0.016859593987464904, "memory(GiB)": 122.96, "step": 65190, "token_acc": 0.9933018124507487, "train_speed(iter/s)": 0.231509 }, { "epoch": 4.969509871179206, "grad_norm": 0.7403997182846069, "learning_rate": 9.174976938747248e-09, "loss": 0.023007912933826445, "memory(GiB)": 122.96, "step": 65195, "token_acc": 0.9895405304445275, "train_speed(iter/s)": 0.23151 }, { "epoch": 4.969890997789466, "grad_norm": 1.192914605140686, "learning_rate": 8.947042903661996e-09, "loss": 0.03366773426532745, "memory(GiB)": 122.96, "step": 65200, "token_acc": 0.9885361552028219, "train_speed(iter/s)": 0.231516 }, { "epoch": 4.969890997789466, "eval_loss": 0.046652209013700485, "eval_runtime": 224.6805, "eval_samples_per_second": 2.359, "eval_steps_per_second": 2.359, "eval_token_acc": 0.9811381844467201, "step": 65200 }, { "epoch": 4.970272124399726, "grad_norm": 0.8586710691452026, "learning_rate": 8.721975623471323e-09, "loss": 0.029337078332901, "memory(GiB)": 122.96, "step": 65205, "token_acc": 0.9813436078681306, "train_speed(iter/s)": 0.231335 }, { "epoch": 4.970653251009986, "grad_norm": 2.8299784660339355, "learning_rate": 8.499775111092678e-09, "loss": 0.065387761592865, "memory(GiB)": 122.96, "step": 65210, "token_acc": 0.9810405643738977, "train_speed(iter/s)": 0.231337 }, { "epoch": 4.971034377620246, "grad_norm": 0.11697398871183395, "learning_rate": 8.280441379260318e-09, "loss": 0.035374969244003296, "memory(GiB)": 122.96, "step": 65215, "token_acc": 0.9855869523990138, "train_speed(iter/s)": 0.23134 }, { "epoch": 4.971415504230506, "grad_norm": 0.5981540083885193, "learning_rate": 8.06397444055862e-09, "loss": 0.03401350975036621, "memory(GiB)": 122.96, "step": 65220, "token_acc": 0.9916267942583732, "train_speed(iter/s)": 0.231341 }, { "epoch": 4.971796630840766, "grad_norm": 1.0130573511123657, "learning_rate": 7.850374307394326e-09, "loss": 0.030062052607536315, "memory(GiB)": 122.96, "step": 65225, "token_acc": 0.9867684478371501, "train_speed(iter/s)": 0.231343 }, { "epoch": 4.972177757451025, "grad_norm": 1.7282116413116455, "learning_rate": 7.639640992018748e-09, "loss": 0.043108826875686644, "memory(GiB)": 122.96, "step": 65230, "token_acc": 0.9866288492706645, "train_speed(iter/s)": 0.231347 }, { "epoch": 4.972558884061285, "grad_norm": 2.2964227199554443, "learning_rate": 7.4317745065166646e-09, "loss": 0.025778061151504515, "memory(GiB)": 122.96, "step": 65235, "token_acc": 0.9896309850564197, "train_speed(iter/s)": 0.231352 }, { "epoch": 4.972940010671545, "grad_norm": 1.087999939918518, "learning_rate": 7.226774862806318e-09, "loss": 0.020917908847332002, "memory(GiB)": 122.96, "step": 65240, "token_acc": 0.9897716127224324, "train_speed(iter/s)": 0.231354 }, { "epoch": 4.973321137281805, "grad_norm": 0.848408579826355, "learning_rate": 7.024642072650522e-09, "loss": 0.025107231736183167, "memory(GiB)": 122.96, "step": 65245, "token_acc": 0.9886627509903019, "train_speed(iter/s)": 0.231355 }, { "epoch": 4.973702263892065, "grad_norm": 1.5466257333755493, "learning_rate": 6.825376147628903e-09, "loss": 0.027141866087913514, "memory(GiB)": 122.96, "step": 65250, "token_acc": 0.9886883921357393, "train_speed(iter/s)": 0.23136 }, { "epoch": 4.974083390502325, "grad_norm": 1.3878365755081177, "learning_rate": 6.62897709918231e-09, "loss": 0.019205766916275024, "memory(GiB)": 122.96, "step": 65255, "token_acc": 0.9927253832164199, "train_speed(iter/s)": 0.231365 }, { "epoch": 4.974464517112585, "grad_norm": 0.5157955884933472, "learning_rate": 6.435444938562851e-09, "loss": 0.014418919384479523, "memory(GiB)": 122.96, "step": 65260, "token_acc": 0.9929252537680714, "train_speed(iter/s)": 0.231369 }, { "epoch": 4.974845643722845, "grad_norm": 2.8791165351867676, "learning_rate": 6.244779676872759e-09, "loss": 0.038768929243087766, "memory(GiB)": 122.96, "step": 65265, "token_acc": 0.9858767424798239, "train_speed(iter/s)": 0.231373 }, { "epoch": 4.975226770333105, "grad_norm": 0.640629768371582, "learning_rate": 6.056981325047728e-09, "loss": 0.03767937421798706, "memory(GiB)": 122.96, "step": 65270, "token_acc": 0.9906472128694351, "train_speed(iter/s)": 0.231376 }, { "epoch": 4.975607896943364, "grad_norm": 1.9037282466888428, "learning_rate": 5.872049893851373e-09, "loss": 0.03493503928184509, "memory(GiB)": 122.96, "step": 65275, "token_acc": 0.9866905045689313, "train_speed(iter/s)": 0.23138 }, { "epoch": 4.975989023553624, "grad_norm": 1.0967572927474976, "learning_rate": 5.689985393891872e-09, "loss": 0.015183040499687194, "memory(GiB)": 122.96, "step": 65280, "token_acc": 0.9933194154488518, "train_speed(iter/s)": 0.231384 }, { "epoch": 4.976370150163884, "grad_norm": 1.0153906345367432, "learning_rate": 5.5107878356108755e-09, "loss": 0.039233472943305966, "memory(GiB)": 122.96, "step": 65285, "token_acc": 0.9867708959711365, "train_speed(iter/s)": 0.231389 }, { "epoch": 4.976751276774144, "grad_norm": 0.0775807648897171, "learning_rate": 5.334457229283496e-09, "loss": 0.021889682114124297, "memory(GiB)": 122.96, "step": 65290, "token_acc": 0.9888549343879202, "train_speed(iter/s)": 0.231392 }, { "epoch": 4.977132403384404, "grad_norm": 2.409358263015747, "learning_rate": 5.1609935850238655e-09, "loss": 0.022383061051368714, "memory(GiB)": 122.96, "step": 65295, "token_acc": 0.9906442127773323, "train_speed(iter/s)": 0.231397 }, { "epoch": 4.977513529994664, "grad_norm": 1.1309925317764282, "learning_rate": 4.990396912774031e-09, "loss": 0.028367367386817933, "memory(GiB)": 122.96, "step": 65300, "token_acc": 0.9889941913787832, "train_speed(iter/s)": 0.231401 }, { "epoch": 4.977894656604924, "grad_norm": 2.4168503284454346, "learning_rate": 4.822667222315058e-09, "loss": 0.0213253915309906, "memory(GiB)": 122.96, "step": 65305, "token_acc": 0.9923488905891354, "train_speed(iter/s)": 0.231407 }, { "epoch": 4.978275783215184, "grad_norm": 0.7649396061897278, "learning_rate": 4.6578045232781305e-09, "loss": 0.0361462414264679, "memory(GiB)": 122.96, "step": 65310, "token_acc": 0.9838502947962061, "train_speed(iter/s)": 0.231411 }, { "epoch": 4.978656909825444, "grad_norm": 1.1200065612792969, "learning_rate": 4.495808825105696e-09, "loss": 0.021941231191158296, "memory(GiB)": 122.96, "step": 65315, "token_acc": 0.9975997599759976, "train_speed(iter/s)": 0.231415 }, { "epoch": 4.979038036435704, "grad_norm": 0.0934068113565445, "learning_rate": 4.336680137090321e-09, "loss": 0.013472728431224823, "memory(GiB)": 122.96, "step": 65320, "token_acc": 0.9926278240190249, "train_speed(iter/s)": 0.23142 }, { "epoch": 4.979419163045964, "grad_norm": 1.4806877374649048, "learning_rate": 4.180418468358038e-09, "loss": 0.026659953594207763, "memory(GiB)": 122.96, "step": 65325, "token_acc": 0.9874735356386732, "train_speed(iter/s)": 0.231422 }, { "epoch": 4.9798002896562235, "grad_norm": 1.2573615312576294, "learning_rate": 4.027023827873899e-09, "loss": 0.025554832816123963, "memory(GiB)": 122.96, "step": 65330, "token_acc": 0.9884304736956996, "train_speed(iter/s)": 0.231427 }, { "epoch": 4.9801814162664835, "grad_norm": 2.60612416267395, "learning_rate": 3.876496224425319e-09, "loss": 0.029011696577072144, "memory(GiB)": 122.96, "step": 65335, "token_acc": 0.9879979570990807, "train_speed(iter/s)": 0.231431 }, { "epoch": 4.9805625428767435, "grad_norm": 0.9691932201385498, "learning_rate": 3.728835666655384e-09, "loss": 0.022194311022758484, "memory(GiB)": 122.96, "step": 65340, "token_acc": 0.9901466544454629, "train_speed(iter/s)": 0.231434 }, { "epoch": 4.9809436694870035, "grad_norm": 0.5333957076072693, "learning_rate": 3.5840421630184417e-09, "loss": 0.024649661779403687, "memory(GiB)": 122.96, "step": 65345, "token_acc": 0.9919662582847961, "train_speed(iter/s)": 0.231438 }, { "epoch": 4.9813247960972635, "grad_norm": 1.3731346130371094, "learning_rate": 3.4421157218300635e-09, "loss": 0.03910989761352539, "memory(GiB)": 122.96, "step": 65350, "token_acc": 0.9831377087121839, "train_speed(iter/s)": 0.23144 }, { "epoch": 4.9817059227075235, "grad_norm": 1.4843852519989014, "learning_rate": 3.303056351222633e-09, "loss": 0.02757016122341156, "memory(GiB)": 122.96, "step": 65355, "token_acc": 0.9876651982378855, "train_speed(iter/s)": 0.231445 }, { "epoch": 4.982087049317784, "grad_norm": 0.5947726964950562, "learning_rate": 3.1668640591731025e-09, "loss": 0.017140640318393706, "memory(GiB)": 122.96, "step": 65360, "token_acc": 0.9933366238894373, "train_speed(iter/s)": 0.23145 }, { "epoch": 4.982468175928044, "grad_norm": 0.8116332292556763, "learning_rate": 3.0335388534863395e-09, "loss": 0.04749036431312561, "memory(GiB)": 122.96, "step": 65365, "token_acc": 0.9836641489548569, "train_speed(iter/s)": 0.231453 }, { "epoch": 4.982849302538304, "grad_norm": 0.7989891171455383, "learning_rate": 2.903080741817332e-09, "loss": 0.021308332681655884, "memory(GiB)": 122.96, "step": 65370, "token_acc": 0.991044776119403, "train_speed(iter/s)": 0.231459 }, { "epoch": 4.983230429148563, "grad_norm": 1.05201256275177, "learning_rate": 2.7754897316378814e-09, "loss": 0.04035530984401703, "memory(GiB)": 122.96, "step": 65375, "token_acc": 0.9829119850187266, "train_speed(iter/s)": 0.231463 }, { "epoch": 4.983611555758823, "grad_norm": 0.7579794526100159, "learning_rate": 2.650765830269908e-09, "loss": 0.03902249932289124, "memory(GiB)": 122.96, "step": 65380, "token_acc": 0.9788309636650869, "train_speed(iter/s)": 0.231468 }, { "epoch": 4.983992682369083, "grad_norm": 1.0315594673156738, "learning_rate": 2.528909044863248e-09, "loss": 0.03690943121910095, "memory(GiB)": 122.96, "step": 65385, "token_acc": 0.9830284744484254, "train_speed(iter/s)": 0.231472 }, { "epoch": 4.984373808979343, "grad_norm": 1.5889012813568115, "learning_rate": 2.4099193824067556e-09, "loss": 0.025462386012077332, "memory(GiB)": 122.96, "step": 65390, "token_acc": 0.9875307341060766, "train_speed(iter/s)": 0.231476 }, { "epoch": 4.984754935589603, "grad_norm": 1.0177972316741943, "learning_rate": 2.2937968497283025e-09, "loss": 0.029764628410339354, "memory(GiB)": 122.96, "step": 65395, "token_acc": 0.9880185519196083, "train_speed(iter/s)": 0.231478 }, { "epoch": 4.985136062199863, "grad_norm": 1.0992156267166138, "learning_rate": 2.180541453478124e-09, "loss": 0.03030233383178711, "memory(GiB)": 122.96, "step": 65400, "token_acc": 0.9860266315962518, "train_speed(iter/s)": 0.23148 }, { "epoch": 4.985136062199863, "eval_loss": 0.04671034961938858, "eval_runtime": 222.316, "eval_samples_per_second": 2.384, "eval_steps_per_second": 2.384, "eval_token_acc": 0.9810177097765195, "step": 65400 }, { "epoch": 4.985517188810123, "grad_norm": 0.2812090218067169, "learning_rate": 2.070153200156577e-09, "loss": 0.0283296138048172, "memory(GiB)": 122.96, "step": 65405, "token_acc": 0.9811667873450268, "train_speed(iter/s)": 0.231303 }, { "epoch": 4.985898315420383, "grad_norm": 0.8628404140472412, "learning_rate": 1.962632096097483e-09, "loss": 0.02814640998840332, "memory(GiB)": 122.96, "step": 65410, "token_acc": 0.9880294659300184, "train_speed(iter/s)": 0.231308 }, { "epoch": 4.986279442030643, "grad_norm": 0.9407519698143005, "learning_rate": 1.857978147457029e-09, "loss": 0.03356319069862366, "memory(GiB)": 122.96, "step": 65415, "token_acc": 0.9851110568708812, "train_speed(iter/s)": 0.231312 }, { "epoch": 4.986660568640902, "grad_norm": 1.055129051208496, "learning_rate": 1.7561913602415214e-09, "loss": 0.05196954011917114, "memory(GiB)": 122.96, "step": 65420, "token_acc": 0.9815860545052787, "train_speed(iter/s)": 0.231316 }, { "epoch": 4.987041695251163, "grad_norm": 0.7984076142311096, "learning_rate": 1.6572717402907334e-09, "loss": 0.035372763872146606, "memory(GiB)": 122.96, "step": 65425, "token_acc": 0.9860887096774194, "train_speed(iter/s)": 0.23132 }, { "epoch": 4.987422821861422, "grad_norm": 3.4746503829956055, "learning_rate": 1.5612192932779047e-09, "loss": 0.036388438940048215, "memory(GiB)": 122.96, "step": 65430, "token_acc": 0.983063063063063, "train_speed(iter/s)": 0.231325 }, { "epoch": 4.987803948471682, "grad_norm": 2.1452887058258057, "learning_rate": 1.46803402470419e-09, "loss": 0.049624505639076236, "memory(GiB)": 122.96, "step": 65435, "token_acc": 0.9812348668280871, "train_speed(iter/s)": 0.23133 }, { "epoch": 4.988185075081942, "grad_norm": 1.3940974473953247, "learning_rate": 1.3777159399153139e-09, "loss": 0.03781522512435913, "memory(GiB)": 122.96, "step": 65440, "token_acc": 0.9837442427526416, "train_speed(iter/s)": 0.231335 }, { "epoch": 4.988566201692202, "grad_norm": 1.258034110069275, "learning_rate": 1.2902650440960174e-09, "loss": 0.027615198493003847, "memory(GiB)": 122.96, "step": 65445, "token_acc": 0.9897191114374885, "train_speed(iter/s)": 0.231338 }, { "epoch": 4.988947328302462, "grad_norm": 0.6371923685073853, "learning_rate": 1.2056813422534063e-09, "loss": 0.029767253994941713, "memory(GiB)": 122.96, "step": 65450, "token_acc": 0.9889100126742713, "train_speed(iter/s)": 0.231342 }, { "epoch": 4.989328454912722, "grad_norm": 1.9030895233154297, "learning_rate": 1.1239648392447067e-09, "loss": 0.03673174977302551, "memory(GiB)": 122.96, "step": 65455, "token_acc": 0.9836512261580381, "train_speed(iter/s)": 0.231347 }, { "epoch": 4.989709581522982, "grad_norm": 0.5081695318222046, "learning_rate": 1.0451155397550594e-09, "loss": 0.017491374909877778, "memory(GiB)": 122.96, "step": 65460, "token_acc": 0.99354333789865, "train_speed(iter/s)": 0.23135 }, { "epoch": 4.990090708133242, "grad_norm": 0.5633696913719177, "learning_rate": 9.691334483030723e-10, "loss": 0.027231138944625855, "memory(GiB)": 122.96, "step": 65465, "token_acc": 0.9911186297314443, "train_speed(iter/s)": 0.231354 }, { "epoch": 4.990471834743502, "grad_norm": 0.6269640922546387, "learning_rate": 8.960185692463707e-10, "loss": 0.020356935262680054, "memory(GiB)": 122.96, "step": 65470, "token_acc": 0.9939037208324575, "train_speed(iter/s)": 0.231358 }, { "epoch": 4.990852961353761, "grad_norm": 0.8283225297927856, "learning_rate": 8.257709067815978e-10, "loss": 0.02714584469795227, "memory(GiB)": 122.96, "step": 65475, "token_acc": 0.98982763492512, "train_speed(iter/s)": 0.231362 }, { "epoch": 4.991234087964021, "grad_norm": 0.5034328699111938, "learning_rate": 7.583904649333118e-10, "loss": 0.01567468196153641, "memory(GiB)": 122.96, "step": 65480, "token_acc": 0.9931856899488927, "train_speed(iter/s)": 0.231368 }, { "epoch": 4.991615214574281, "grad_norm": 0.9832271337509155, "learning_rate": 6.93877247565089e-10, "loss": 0.029153388738632203, "memory(GiB)": 122.96, "step": 65485, "token_acc": 0.9879107781372382, "train_speed(iter/s)": 0.23137 }, { "epoch": 4.991996341184541, "grad_norm": 0.703941822052002, "learning_rate": 6.322312583795231e-10, "loss": 0.02367573082447052, "memory(GiB)": 122.96, "step": 65490, "token_acc": 0.9944579147904399, "train_speed(iter/s)": 0.231375 }, { "epoch": 4.992377467794801, "grad_norm": 1.9090490341186523, "learning_rate": 5.734525009071235e-10, "loss": 0.03463009297847748, "memory(GiB)": 122.96, "step": 65495, "token_acc": 0.9827463956511463, "train_speed(iter/s)": 0.23138 }, { "epoch": 4.992758594405061, "grad_norm": 0.5109768509864807, "learning_rate": 5.17540978528519e-10, "loss": 0.027520650625228883, "memory(GiB)": 122.96, "step": 65500, "token_acc": 0.9932960893854749, "train_speed(iter/s)": 0.231384 }, { "epoch": 4.993139721015321, "grad_norm": 1.6359907388687134, "learning_rate": 4.644966944356011e-10, "loss": 0.03614895045757294, "memory(GiB)": 122.96, "step": 65505, "token_acc": 0.9868319132455461, "train_speed(iter/s)": 0.231386 }, { "epoch": 4.9935208476255815, "grad_norm": 1.2480318546295166, "learning_rate": 4.143196516814829e-10, "loss": 0.026231345534324647, "memory(GiB)": 122.96, "step": 65510, "token_acc": 0.9941159988792378, "train_speed(iter/s)": 0.23139 }, { "epoch": 4.9939019742358415, "grad_norm": 0.6290131211280823, "learning_rate": 3.6700985313609106e-10, "loss": 0.025004053115844728, "memory(GiB)": 122.96, "step": 65515, "token_acc": 0.9889491242702252, "train_speed(iter/s)": 0.231394 }, { "epoch": 4.994283100846101, "grad_norm": 0.3121906518936157, "learning_rate": 3.225673015194719e-10, "loss": 0.017110726237297057, "memory(GiB)": 122.96, "step": 65520, "token_acc": 0.9913281606572342, "train_speed(iter/s)": 0.231399 }, { "epoch": 4.994664227456361, "grad_norm": 0.8594696521759033, "learning_rate": 2.809919993740362e-10, "loss": 0.029915162920951845, "memory(GiB)": 122.96, "step": 65525, "token_acc": 0.9885737604570496, "train_speed(iter/s)": 0.231404 }, { "epoch": 4.995045354066621, "grad_norm": 0.9535529613494873, "learning_rate": 2.4228394909231453e-10, "loss": 0.046113982796669006, "memory(GiB)": 122.96, "step": 65530, "token_acc": 0.9801051051051051, "train_speed(iter/s)": 0.231409 }, { "epoch": 4.995426480676881, "grad_norm": 0.6533511877059937, "learning_rate": 2.0644315288365078e-10, "loss": 0.031991952657699586, "memory(GiB)": 122.96, "step": 65535, "token_acc": 0.9886522346368715, "train_speed(iter/s)": 0.231412 }, { "epoch": 4.995807607287141, "grad_norm": 1.393473505973816, "learning_rate": 1.734696128075086e-10, "loss": 0.02318093478679657, "memory(GiB)": 122.96, "step": 65540, "token_acc": 0.9909272643395356, "train_speed(iter/s)": 0.231414 }, { "epoch": 4.996188733897401, "grad_norm": 1.2876241207122803, "learning_rate": 1.4336333075681828e-10, "loss": 0.03193975389003754, "memory(GiB)": 122.96, "step": 65545, "token_acc": 0.9888307984790875, "train_speed(iter/s)": 0.231418 }, { "epoch": 4.996569860507661, "grad_norm": 1.8820068836212158, "learning_rate": 1.1612430845797661e-10, "loss": 0.04155539870262146, "memory(GiB)": 122.96, "step": 65550, "token_acc": 0.985936621038815, "train_speed(iter/s)": 0.231422 }, { "epoch": 4.996950987117921, "grad_norm": 0.48003295063972473, "learning_rate": 9.175254747084694e-11, "loss": 0.024974866211414336, "memory(GiB)": 122.96, "step": 65555, "token_acc": 0.9895809739524348, "train_speed(iter/s)": 0.231425 }, { "epoch": 4.997332113728181, "grad_norm": 1.236311674118042, "learning_rate": 7.024804919431028e-11, "loss": 0.027466171979904176, "memory(GiB)": 122.96, "step": 65560, "token_acc": 0.9847109412326803, "train_speed(iter/s)": 0.23143 }, { "epoch": 4.997713240338441, "grad_norm": 0.39299336075782776, "learning_rate": 5.161081485516306e-11, "loss": 0.01858559399843216, "memory(GiB)": 122.96, "step": 65565, "token_acc": 0.993490054249548, "train_speed(iter/s)": 0.231436 }, { "epoch": 4.998094366948701, "grad_norm": 0.7008711695671082, "learning_rate": 3.5840845535872745e-11, "loss": 0.01804552674293518, "memory(GiB)": 122.96, "step": 65570, "token_acc": 0.9939855653568564, "train_speed(iter/s)": 0.23144 }, { "epoch": 4.99847549355896, "grad_norm": 1.4451603889465332, "learning_rate": 2.293814213016887e-11, "loss": 0.021957939863204955, "memory(GiB)": 122.96, "step": 65575, "token_acc": 0.9911764705882353, "train_speed(iter/s)": 0.231445 }, { "epoch": 4.99885662016922, "grad_norm": 4.2552690505981445, "learning_rate": 1.290270538190086e-11, "loss": 0.0438129335641861, "memory(GiB)": 122.96, "step": 65580, "token_acc": 0.9875300809450887, "train_speed(iter/s)": 0.231449 }, { "epoch": 4.99923774677948, "grad_norm": 0.9069608449935913, "learning_rate": 5.734535862833568e-12, "loss": 0.02502804696559906, "memory(GiB)": 122.96, "step": 65585, "token_acc": 0.989840106595603, "train_speed(iter/s)": 0.231448 }, { "epoch": 4.99961887338974, "grad_norm": 1.0553228855133057, "learning_rate": 1.4336339837495162e-12, "loss": 0.020407673716545106, "memory(GiB)": 122.96, "step": 65590, "token_acc": 0.9901986754966887, "train_speed(iter/s)": 0.23145 }, { "epoch": 5.0, "grad_norm": 0.8608609437942505, "learning_rate": 0.0, "loss": 0.01947695016860962, "memory(GiB)": 122.96, "step": 65595, "token_acc": 0.9901157865009884, "train_speed(iter/s)": 0.231456 }, { "epoch": 5.0, "eval_loss": 0.04670507833361626, "eval_runtime": 219.0908, "eval_samples_per_second": 2.419, "eval_steps_per_second": 2.419, "eval_token_acc": 0.9810402987771821, "step": 65595 } ], "logging_steps": 5, "max_steps": 65595, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3292660732573645e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }