{ "best_metric": 0.55531853, "best_model_checkpoint": "/root/workspace/myPharmHGT/KV_PLM/output/checkpoint-21050", "epoch": 0.9789018743878231, "eval_steps": 50, "global_step": 21050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.650365198992033e-05, "grad_norm": 11.30196762084961, "learning_rate": 9.29368029739777e-09, "loss": 5.595128536224365, "memory(GiB)": 16.79, "step": 1, "token_acc": 0.35786802030456855, "train_speed(iter/s)": 0.181377 }, { "epoch": 0.00023251825994960167, "grad_norm": 14.642141342163086, "learning_rate": 4.6468401486988856e-08, "loss": 5.128164291381836, "memory(GiB)": 16.8, "step": 5, "token_acc": 0.4304812834224599, "train_speed(iter/s)": 0.21625 }, { "epoch": 0.00046503651989920333, "grad_norm": 7.316074848175049, "learning_rate": 9.293680297397771e-08, "loss": 4.300411987304687, "memory(GiB)": 17.84, "step": 10, "token_acc": 0.4681737061273052, "train_speed(iter/s)": 0.219536 }, { "epoch": 0.000697554779848805, "grad_norm": 11.100884437561035, "learning_rate": 1.3940520446096655e-07, "loss": 4.639457321166992, "memory(GiB)": 19.03, "step": 15, "token_acc": 0.43668770887486075, "train_speed(iter/s)": 0.220308 }, { "epoch": 0.0009300730397984067, "grad_norm": 6.971445560455322, "learning_rate": 1.8587360594795542e-07, "loss": 5.257072448730469, "memory(GiB)": 19.04, "step": 20, "token_acc": 0.4142085583714167, "train_speed(iter/s)": 0.222155 }, { "epoch": 0.0011625912997480082, "grad_norm": 7.333343505859375, "learning_rate": 2.3234200743494425e-07, "loss": 4.158517074584961, "memory(GiB)": 19.04, "step": 25, "token_acc": 0.4433541480820696, "train_speed(iter/s)": 0.221422 }, { "epoch": 0.00139510955969761, "grad_norm": 7.327789783477783, "learning_rate": 2.788104089219331e-07, "loss": 3.946809768676758, "memory(GiB)": 20.29, "step": 30, "token_acc": 0.4627841793833697, "train_speed(iter/s)": 0.220406 }, { "epoch": 0.0016276278196472116, "grad_norm": 9.05079174041748, "learning_rate": 3.2527881040892197e-07, "loss": 4.110320281982422, "memory(GiB)": 22.09, "step": 35, "token_acc": 0.4564489112227806, "train_speed(iter/s)": 0.21968 }, { "epoch": 0.0018601460795968133, "grad_norm": 10.026455879211426, "learning_rate": 3.7174721189591085e-07, "loss": 4.707549285888672, "memory(GiB)": 22.09, "step": 40, "token_acc": 0.41740674955595025, "train_speed(iter/s)": 0.218784 }, { "epoch": 0.002092664339546415, "grad_norm": 7.746947288513184, "learning_rate": 4.1821561338289967e-07, "loss": 4.500521087646485, "memory(GiB)": 22.09, "step": 45, "token_acc": 0.4180354267310789, "train_speed(iter/s)": 0.219268 }, { "epoch": 0.0023251825994960165, "grad_norm": 11.605240821838379, "learning_rate": 4.646840148698885e-07, "loss": 5.464200210571289, "memory(GiB)": 22.09, "step": 50, "token_acc": 0.37989080982711554, "train_speed(iter/s)": 0.219704 }, { "epoch": 0.0023251825994960165, "eval_loss": 6.238847732543945, "eval_runtime": 290.2527, "eval_samples_per_second": 11.972, "eval_steps_per_second": 11.972, "step": 50 }, { "epoch": 0.0025577008594456184, "grad_norm": 10.126920700073242, "learning_rate": 5.111524163568774e-07, "loss": 4.355241012573242, "memory(GiB)": 22.09, "step": 55, "token_acc": 0.4303478219228375, "train_speed(iter/s)": 0.101729 }, { "epoch": 0.00279021911939522, "grad_norm": 10.698599815368652, "learning_rate": 5.576208178438662e-07, "loss": 4.348709487915039, "memory(GiB)": 22.09, "step": 60, "token_acc": 0.44151949350216596, "train_speed(iter/s)": 0.106489 }, { "epoch": 0.0030227373793448218, "grad_norm": 10.698352813720703, "learning_rate": 6.04089219330855e-07, "loss": 4.434131240844726, "memory(GiB)": 22.09, "step": 65, "token_acc": 0.4308273257809818, "train_speed(iter/s)": 0.110843 }, { "epoch": 0.0032552556392944233, "grad_norm": 11.780677795410156, "learning_rate": 6.505576208178439e-07, "loss": 4.880805969238281, "memory(GiB)": 22.09, "step": 70, "token_acc": 0.420174741858618, "train_speed(iter/s)": 0.114884 }, { "epoch": 0.003487773899244025, "grad_norm": 10.340365409851074, "learning_rate": 6.970260223048328e-07, "loss": 4.451810836791992, "memory(GiB)": 22.09, "step": 75, "token_acc": 0.4266284896206156, "train_speed(iter/s)": 0.118675 }, { "epoch": 0.0037202921591936266, "grad_norm": 8.917567253112793, "learning_rate": 7.434944237918217e-07, "loss": 4.337361145019531, "memory(GiB)": 22.09, "step": 80, "token_acc": 0.4407567208762031, "train_speed(iter/s)": 0.122245 }, { "epoch": 0.003952810419143228, "grad_norm": 7.564225673675537, "learning_rate": 7.899628252788105e-07, "loss": 4.301780319213867, "memory(GiB)": 22.09, "step": 85, "token_acc": 0.43113176236309325, "train_speed(iter/s)": 0.125462 }, { "epoch": 0.00418532867909283, "grad_norm": 9.511303901672363, "learning_rate": 8.364312267657993e-07, "loss": 4.201332473754883, "memory(GiB)": 22.09, "step": 90, "token_acc": 0.43982630272952855, "train_speed(iter/s)": 0.128436 }, { "epoch": 0.004417846939042432, "grad_norm": 11.19315242767334, "learning_rate": 8.828996282527883e-07, "loss": 4.037698745727539, "memory(GiB)": 22.09, "step": 95, "token_acc": 0.4582210242587601, "train_speed(iter/s)": 0.131168 }, { "epoch": 0.004650365198992033, "grad_norm": 10.009793281555176, "learning_rate": 9.29368029739777e-07, "loss": 4.3285572052001955, "memory(GiB)": 22.09, "step": 100, "token_acc": 0.44606819763395966, "train_speed(iter/s)": 0.133872 }, { "epoch": 0.004650365198992033, "eval_loss": 6.00515079498291, "eval_runtime": 293.5348, "eval_samples_per_second": 11.838, "eval_steps_per_second": 11.838, "step": 100 }, { "epoch": 0.004882883458941635, "grad_norm": 10.538952827453613, "learning_rate": 9.75836431226766e-07, "loss": 4.578464508056641, "memory(GiB)": 22.09, "step": 105, "token_acc": 0.4315021837560604, "train_speed(iter/s)": 0.098632 }, { "epoch": 0.005115401718891237, "grad_norm": 8.359787940979004, "learning_rate": 1.0223048327137547e-06, "loss": 3.992523193359375, "memory(GiB)": 22.09, "step": 110, "token_acc": 0.45879120879120877, "train_speed(iter/s)": 0.10116 }, { "epoch": 0.005347919978840839, "grad_norm": 9.403013229370117, "learning_rate": 1.0687732342007436e-06, "loss": 4.678741836547852, "memory(GiB)": 22.09, "step": 115, "token_acc": 0.4170796797560046, "train_speed(iter/s)": 0.103556 }, { "epoch": 0.00558043823879044, "grad_norm": 12.418402671813965, "learning_rate": 1.1152416356877324e-06, "loss": 4.810413360595703, "memory(GiB)": 22.09, "step": 120, "token_acc": 0.42337876910367617, "train_speed(iter/s)": 0.105865 }, { "epoch": 0.005812956498740042, "grad_norm": 10.668410301208496, "learning_rate": 1.1617100371747212e-06, "loss": 4.338812637329101, "memory(GiB)": 22.09, "step": 125, "token_acc": 0.444243301955105, "train_speed(iter/s)": 0.108127 }, { "epoch": 0.0060454747586896436, "grad_norm": 9.335319519042969, "learning_rate": 1.20817843866171e-06, "loss": 4.58795051574707, "memory(GiB)": 22.09, "step": 130, "token_acc": 0.42386831275720166, "train_speed(iter/s)": 0.110288 }, { "epoch": 0.006277993018639245, "grad_norm": 7.6158881187438965, "learning_rate": 1.2546468401486989e-06, "loss": 4.130472183227539, "memory(GiB)": 22.09, "step": 135, "token_acc": 0.4362486828240253, "train_speed(iter/s)": 0.112356 }, { "epoch": 0.0065105112785888465, "grad_norm": 9.206437110900879, "learning_rate": 1.3011152416356879e-06, "loss": 4.286816787719727, "memory(GiB)": 22.09, "step": 140, "token_acc": 0.42203258339798294, "train_speed(iter/s)": 0.11431 }, { "epoch": 0.006743029538538448, "grad_norm": 8.033815383911133, "learning_rate": 1.3475836431226765e-06, "loss": 3.698823165893555, "memory(GiB)": 22.09, "step": 145, "token_acc": 0.4792787092692186, "train_speed(iter/s)": 0.116227 }, { "epoch": 0.00697554779848805, "grad_norm": 9.180941581726074, "learning_rate": 1.3940520446096655e-06, "loss": 3.7941364288330077, "memory(GiB)": 22.09, "step": 150, "token_acc": 0.4449166394246486, "train_speed(iter/s)": 0.118057 }, { "epoch": 0.00697554779848805, "eval_loss": 4.911423206329346, "eval_runtime": 292.8063, "eval_samples_per_second": 11.868, "eval_steps_per_second": 11.868, "step": 150 }, { "epoch": 0.007208066058437651, "grad_norm": 8.536662101745605, "learning_rate": 1.4405204460966544e-06, "loss": 3.2620067596435547, "memory(GiB)": 22.09, "step": 155, "token_acc": 0.45824087550603265, "train_speed(iter/s)": 0.097604 }, { "epoch": 0.007440584318387253, "grad_norm": 12.196755409240723, "learning_rate": 1.4869888475836434e-06, "loss": 3.9180809020996095, "memory(GiB)": 22.09, "step": 160, "token_acc": 0.4449484536082474, "train_speed(iter/s)": 0.09936 }, { "epoch": 0.007673102578336855, "grad_norm": 4.950088977813721, "learning_rate": 1.533457249070632e-06, "loss": 3.5914573669433594, "memory(GiB)": 22.09, "step": 165, "token_acc": 0.4844632768361582, "train_speed(iter/s)": 0.101032 }, { "epoch": 0.007905620838286456, "grad_norm": 6.352217197418213, "learning_rate": 1.579925650557621e-06, "loss": 3.5214935302734376, "memory(GiB)": 22.09, "step": 170, "token_acc": 0.49245647969052225, "train_speed(iter/s)": 0.102611 }, { "epoch": 0.008138139098236059, "grad_norm": 7.197726249694824, "learning_rate": 1.6263940520446099e-06, "loss": 3.174709701538086, "memory(GiB)": 22.09, "step": 175, "token_acc": 0.5147719974309569, "train_speed(iter/s)": 0.1042 }, { "epoch": 0.00837065735818566, "grad_norm": 6.746879577636719, "learning_rate": 1.6728624535315987e-06, "loss": 3.4479164123535155, "memory(GiB)": 22.09, "step": 180, "token_acc": 0.47926447574334896, "train_speed(iter/s)": 0.105701 }, { "epoch": 0.008603175618135261, "grad_norm": 5.178398132324219, "learning_rate": 1.7193308550185875e-06, "loss": 3.3821487426757812, "memory(GiB)": 22.09, "step": 185, "token_acc": 0.489778534923339, "train_speed(iter/s)": 0.107214 }, { "epoch": 0.008835693878084864, "grad_norm": 4.9685540199279785, "learning_rate": 1.7657992565055765e-06, "loss": 2.7297746658325197, "memory(GiB)": 22.09, "step": 190, "token_acc": 0.5216201423097975, "train_speed(iter/s)": 0.108622 }, { "epoch": 0.009068212138034465, "grad_norm": 3.6990842819213867, "learning_rate": 1.8122676579925652e-06, "loss": 2.962936782836914, "memory(GiB)": 22.09, "step": 195, "token_acc": 0.5222845823704193, "train_speed(iter/s)": 0.110023 }, { "epoch": 0.009300730397984066, "grad_norm": 4.623185634613037, "learning_rate": 1.858736059479554e-06, "loss": 2.6473554611206054, "memory(GiB)": 22.09, "step": 200, "token_acc": 0.5438311688311688, "train_speed(iter/s)": 0.111444 }, { "epoch": 0.009300730397984066, "eval_loss": 3.5655155181884766, "eval_runtime": 293.694, "eval_samples_per_second": 11.832, "eval_steps_per_second": 11.832, "step": 200 }, { "epoch": 0.009533248657933669, "grad_norm": 6.069931507110596, "learning_rate": 1.905204460966543e-06, "loss": 2.718964767456055, "memory(GiB)": 22.09, "step": 205, "token_acc": 0.5408020470992764, "train_speed(iter/s)": 0.097091 }, { "epoch": 0.00976576691788327, "grad_norm": 18.15644645690918, "learning_rate": 1.951672862453532e-06, "loss": 2.583760070800781, "memory(GiB)": 22.09, "step": 210, "token_acc": 0.5504587155963303, "train_speed(iter/s)": 0.098378 }, { "epoch": 0.009998285177832873, "grad_norm": 6.019149303436279, "learning_rate": 1.9981412639405207e-06, "loss": 2.5243818283081056, "memory(GiB)": 22.09, "step": 215, "token_acc": 0.549553208773355, "train_speed(iter/s)": 0.099675 }, { "epoch": 0.010230803437782474, "grad_norm": 8.37928581237793, "learning_rate": 2.0446096654275095e-06, "loss": 2.481988525390625, "memory(GiB)": 22.09, "step": 220, "token_acc": 0.5509568313306631, "train_speed(iter/s)": 0.100916 }, { "epoch": 0.010463321697732075, "grad_norm": 12.24028205871582, "learning_rate": 2.0910780669144983e-06, "loss": 2.3563426971435546, "memory(GiB)": 22.09, "step": 225, "token_acc": 0.5385551948051948, "train_speed(iter/s)": 0.102149 }, { "epoch": 0.010695839957681677, "grad_norm": 6.658745288848877, "learning_rate": 2.137546468401487e-06, "loss": 2.2650516510009764, "memory(GiB)": 22.09, "step": 230, "token_acc": 0.538664323374341, "train_speed(iter/s)": 0.10335 }, { "epoch": 0.010928358217631278, "grad_norm": 4.5154008865356445, "learning_rate": 2.184014869888476e-06, "loss": 2.082209587097168, "memory(GiB)": 22.09, "step": 235, "token_acc": 0.5826681870011402, "train_speed(iter/s)": 0.1045 }, { "epoch": 0.01116087647758088, "grad_norm": 3.9229602813720703, "learning_rate": 2.2304832713754648e-06, "loss": 2.032619857788086, "memory(GiB)": 22.09, "step": 240, "token_acc": 0.5845122859270291, "train_speed(iter/s)": 0.105659 }, { "epoch": 0.011393394737530482, "grad_norm": 4.006194591522217, "learning_rate": 2.276951672862454e-06, "loss": 2.0263885498046874, "memory(GiB)": 22.09, "step": 245, "token_acc": 0.6218020022246941, "train_speed(iter/s)": 0.106808 }, { "epoch": 0.011625912997480083, "grad_norm": 3.4309797286987305, "learning_rate": 2.3234200743494424e-06, "loss": 1.9709346771240235, "memory(GiB)": 22.09, "step": 250, "token_acc": 0.5988117573483427, "train_speed(iter/s)": 0.107935 }, { "epoch": 0.011625912997480083, "eval_loss": 1.9419108629226685, "eval_runtime": 292.9372, "eval_samples_per_second": 11.863, "eval_steps_per_second": 11.863, "step": 250 }, { "epoch": 0.011858431257429684, "grad_norm": 4.214391231536865, "learning_rate": 2.3698884758364317e-06, "loss": 1.9560165405273438, "memory(GiB)": 22.09, "step": 255, "token_acc": 0.6185187256789907, "train_speed(iter/s)": 0.096879 }, { "epoch": 0.012090949517379287, "grad_norm": 2.496126651763916, "learning_rate": 2.41635687732342e-06, "loss": 1.8916751861572265, "memory(GiB)": 22.09, "step": 260, "token_acc": 0.6213592233009708, "train_speed(iter/s)": 0.097938 }, { "epoch": 0.012323467777328888, "grad_norm": 2.3947010040283203, "learning_rate": 2.462825278810409e-06, "loss": 1.8349943161010742, "memory(GiB)": 22.09, "step": 265, "token_acc": 0.6115827774408733, "train_speed(iter/s)": 0.098953 }, { "epoch": 0.01255598603727849, "grad_norm": 2.534743070602417, "learning_rate": 2.5092936802973977e-06, "loss": 1.7406538009643555, "memory(GiB)": 22.09, "step": 270, "token_acc": 0.6380839739798936, "train_speed(iter/s)": 0.099965 }, { "epoch": 0.012788504297228092, "grad_norm": 3.6293578147888184, "learning_rate": 2.555762081784387e-06, "loss": 1.8015171051025392, "memory(GiB)": 22.09, "step": 275, "token_acc": 0.6511627906976745, "train_speed(iter/s)": 0.100974 }, { "epoch": 0.013021022557177693, "grad_norm": 4.162702560424805, "learning_rate": 2.6022304832713758e-06, "loss": 1.8374807357788085, "memory(GiB)": 22.09, "step": 280, "token_acc": 0.6273263731275533, "train_speed(iter/s)": 0.101975 }, { "epoch": 0.013253540817127296, "grad_norm": 4.197765827178955, "learning_rate": 2.6486988847583646e-06, "loss": 1.6645938873291015, "memory(GiB)": 22.09, "step": 285, "token_acc": 0.6414484319430973, "train_speed(iter/s)": 0.102915 }, { "epoch": 0.013486059077076897, "grad_norm": 2.419771671295166, "learning_rate": 2.695167286245353e-06, "loss": 1.7501996994018554, "memory(GiB)": 22.09, "step": 290, "token_acc": 0.6264450867052023, "train_speed(iter/s)": 0.103873 }, { "epoch": 0.013718577337026498, "grad_norm": 2.622260808944702, "learning_rate": 2.7416356877323423e-06, "loss": 1.5766103744506836, "memory(GiB)": 22.09, "step": 295, "token_acc": 0.6576110392410521, "train_speed(iter/s)": 0.104818 }, { "epoch": 0.0139510955969761, "grad_norm": 3.4051737785339355, "learning_rate": 2.788104089219331e-06, "loss": 1.6507225036621094, "memory(GiB)": 22.09, "step": 300, "token_acc": 0.6394230769230769, "train_speed(iter/s)": 0.105736 }, { "epoch": 0.0139510955969761, "eval_loss": 1.518836498260498, "eval_runtime": 288.759, "eval_samples_per_second": 12.034, "eval_steps_per_second": 12.034, "step": 300 }, { "epoch": 0.014183613856925702, "grad_norm": 2.1325795650482178, "learning_rate": 2.83457249070632e-06, "loss": 1.4976882934570312, "memory(GiB)": 22.09, "step": 305, "token_acc": 0.6439182878445585, "train_speed(iter/s)": 0.096836 }, { "epoch": 0.014416132116875303, "grad_norm": 2.5515503883361816, "learning_rate": 2.8810408921933087e-06, "loss": 1.582894992828369, "memory(GiB)": 22.09, "step": 310, "token_acc": 0.6342662632375189, "train_speed(iter/s)": 0.097711 }, { "epoch": 0.014648650376824906, "grad_norm": 2.6792547702789307, "learning_rate": 2.927509293680298e-06, "loss": 1.6397327423095702, "memory(GiB)": 22.09, "step": 315, "token_acc": 0.6391833529642716, "train_speed(iter/s)": 0.098569 }, { "epoch": 0.014881168636774507, "grad_norm": 3.859010934829712, "learning_rate": 2.973977695167287e-06, "loss": 1.3939047813415528, "memory(GiB)": 22.09, "step": 320, "token_acc": 0.6882701962574167, "train_speed(iter/s)": 0.099432 }, { "epoch": 0.015113686896724108, "grad_norm": 2.2595622539520264, "learning_rate": 3.020446096654275e-06, "loss": 1.4366521835327148, "memory(GiB)": 22.09, "step": 325, "token_acc": 0.6654362416107382, "train_speed(iter/s)": 0.100268 }, { "epoch": 0.01534620515667371, "grad_norm": 3.0457279682159424, "learning_rate": 3.066914498141264e-06, "loss": 1.526663589477539, "memory(GiB)": 24.06, "step": 330, "token_acc": 0.6629547141796585, "train_speed(iter/s)": 0.101072 }, { "epoch": 0.015578723416623311, "grad_norm": 9.594057083129883, "learning_rate": 3.113382899628253e-06, "loss": 1.3833000183105468, "memory(GiB)": 24.06, "step": 335, "token_acc": 0.6841689696012633, "train_speed(iter/s)": 0.101881 }, { "epoch": 0.015811241676572912, "grad_norm": 3.075023889541626, "learning_rate": 3.159851301115242e-06, "loss": 1.5897629737854004, "memory(GiB)": 24.06, "step": 340, "token_acc": 0.66429418742586, "train_speed(iter/s)": 0.102694 }, { "epoch": 0.016043759936522514, "grad_norm": 5.14961051940918, "learning_rate": 3.206319702602231e-06, "loss": 1.5434186935424805, "memory(GiB)": 24.06, "step": 345, "token_acc": 0.6542893725992317, "train_speed(iter/s)": 0.10349 }, { "epoch": 0.016276278196472118, "grad_norm": 2.473466157913208, "learning_rate": 3.2527881040892197e-06, "loss": 1.457532024383545, "memory(GiB)": 24.06, "step": 350, "token_acc": 0.668398533007335, "train_speed(iter/s)": 0.104276 }, { "epoch": 0.016276278196472118, "eval_loss": 1.3259131908416748, "eval_runtime": 288.2494, "eval_samples_per_second": 12.056, "eval_steps_per_second": 12.056, "step": 350 }, { "epoch": 0.01650879645642172, "grad_norm": 2.0314037799835205, "learning_rate": 3.299256505576208e-06, "loss": 1.3253539085388184, "memory(GiB)": 24.06, "step": 355, "token_acc": 0.6744063535139881, "train_speed(iter/s)": 0.096782 }, { "epoch": 0.01674131471637132, "grad_norm": 2.7340409755706787, "learning_rate": 3.3457249070631974e-06, "loss": 1.3673904418945313, "memory(GiB)": 24.06, "step": 360, "token_acc": 0.7089552238805971, "train_speed(iter/s)": 0.097525 }, { "epoch": 0.01697383297632092, "grad_norm": 1.6114614009857178, "learning_rate": 3.392193308550186e-06, "loss": 1.3874520301818847, "memory(GiB)": 24.06, "step": 365, "token_acc": 0.6815522020326455, "train_speed(iter/s)": 0.098261 }, { "epoch": 0.017206351236270522, "grad_norm": 3.4921016693115234, "learning_rate": 3.438661710037175e-06, "loss": 1.558394718170166, "memory(GiB)": 24.06, "step": 370, "token_acc": 0.6628183361629881, "train_speed(iter/s)": 0.098969 }, { "epoch": 0.017438869496220127, "grad_norm": 3.3175108432769775, "learning_rate": 3.485130111524164e-06, "loss": 1.4499250411987306, "memory(GiB)": 24.06, "step": 375, "token_acc": 0.6738898756660746, "train_speed(iter/s)": 0.09971 }, { "epoch": 0.017671387756169728, "grad_norm": 1.9265453815460205, "learning_rate": 3.531598513011153e-06, "loss": 1.455325222015381, "memory(GiB)": 24.06, "step": 380, "token_acc": 0.6769176387416048, "train_speed(iter/s)": 0.100433 }, { "epoch": 0.01790390601611933, "grad_norm": 2.1353354454040527, "learning_rate": 3.5780669144981415e-06, "loss": 1.4541678428649902, "memory(GiB)": 26.73, "step": 385, "token_acc": 0.6730624529721595, "train_speed(iter/s)": 0.101103 }, { "epoch": 0.01813642427606893, "grad_norm": 3.1384456157684326, "learning_rate": 3.6245353159851303e-06, "loss": 1.4149900436401368, "memory(GiB)": 26.73, "step": 390, "token_acc": 0.6772802653399669, "train_speed(iter/s)": 0.101775 }, { "epoch": 0.01836894253601853, "grad_norm": 2.3894238471984863, "learning_rate": 3.671003717472119e-06, "loss": 1.4391159057617187, "memory(GiB)": 26.73, "step": 395, "token_acc": 0.6768361581920904, "train_speed(iter/s)": 0.102484 }, { "epoch": 0.018601460795968132, "grad_norm": 2.095750093460083, "learning_rate": 3.717472118959108e-06, "loss": 1.2132759094238281, "memory(GiB)": 26.73, "step": 400, "token_acc": 0.7314036725801432, "train_speed(iter/s)": 0.103155 }, { "epoch": 0.018601460795968132, "eval_loss": 1.243235468864441, "eval_runtime": 290.9778, "eval_samples_per_second": 11.942, "eval_steps_per_second": 11.942, "step": 400 }, { "epoch": 0.018833979055917736, "grad_norm": 2.0978684425354004, "learning_rate": 3.763940520446097e-06, "loss": 1.314230728149414, "memory(GiB)": 26.73, "step": 405, "token_acc": 0.6890068657193038, "train_speed(iter/s)": 0.096617 }, { "epoch": 0.019066497315867337, "grad_norm": 2.425307273864746, "learning_rate": 3.810408921933086e-06, "loss": 1.3469379425048829, "memory(GiB)": 26.73, "step": 410, "token_acc": 0.6947027901334412, "train_speed(iter/s)": 0.09728 }, { "epoch": 0.01929901557581694, "grad_norm": 2.1733996868133545, "learning_rate": 3.8568773234200744e-06, "loss": 1.4006044387817382, "memory(GiB)": 26.73, "step": 415, "token_acc": 0.6759921925829538, "train_speed(iter/s)": 0.097936 }, { "epoch": 0.01953153383576654, "grad_norm": 2.709667444229126, "learning_rate": 3.903345724907064e-06, "loss": 1.3892633438110351, "memory(GiB)": 26.73, "step": 420, "token_acc": 0.6919795221843004, "train_speed(iter/s)": 0.098593 }, { "epoch": 0.01976405209571614, "grad_norm": 2.0785586833953857, "learning_rate": 3.949814126394053e-06, "loss": 1.3460134506225585, "memory(GiB)": 26.73, "step": 425, "token_acc": 0.695500387897595, "train_speed(iter/s)": 0.099222 }, { "epoch": 0.019996570355665745, "grad_norm": 2.9196395874023438, "learning_rate": 3.996282527881041e-06, "loss": 1.4188276290893556, "memory(GiB)": 26.73, "step": 430, "token_acc": 0.6826769132244637, "train_speed(iter/s)": 0.099863 }, { "epoch": 0.020229088615615346, "grad_norm": 2.977738618850708, "learning_rate": 4.04275092936803e-06, "loss": 1.3545875549316406, "memory(GiB)": 26.73, "step": 435, "token_acc": 0.6918604651162791, "train_speed(iter/s)": 0.100495 }, { "epoch": 0.020461606875564947, "grad_norm": 2.716374635696411, "learning_rate": 4.089219330855019e-06, "loss": 1.4137415885925293, "memory(GiB)": 26.73, "step": 440, "token_acc": 0.6859414528370076, "train_speed(iter/s)": 0.101102 }, { "epoch": 0.020694125135514548, "grad_norm": 2.3097028732299805, "learning_rate": 4.135687732342008e-06, "loss": 1.4122305870056153, "memory(GiB)": 26.73, "step": 445, "token_acc": 0.6715374841168996, "train_speed(iter/s)": 0.101712 }, { "epoch": 0.02092664339546415, "grad_norm": 2.968329429626465, "learning_rate": 4.182156133828997e-06, "loss": 1.3420489311218262, "memory(GiB)": 26.73, "step": 450, "token_acc": 0.7025730484081989, "train_speed(iter/s)": 0.102313 }, { "epoch": 0.02092664339546415, "eval_loss": 1.1884208917617798, "eval_runtime": 290.2048, "eval_samples_per_second": 11.974, "eval_steps_per_second": 11.974, "step": 450 }, { "epoch": 0.02115916165541375, "grad_norm": 2.2978663444519043, "learning_rate": 4.228624535315986e-06, "loss": 1.2818711280822754, "memory(GiB)": 26.73, "step": 455, "token_acc": 0.6983738667434163, "train_speed(iter/s)": 0.096555 }, { "epoch": 0.021391679915363355, "grad_norm": 3.826552629470825, "learning_rate": 4.275092936802974e-06, "loss": 1.373377227783203, "memory(GiB)": 26.73, "step": 460, "token_acc": 0.6942446043165468, "train_speed(iter/s)": 0.097138 }, { "epoch": 0.021624198175312956, "grad_norm": 2.246628999710083, "learning_rate": 4.321561338289963e-06, "loss": 1.3337231636047364, "memory(GiB)": 26.73, "step": 465, "token_acc": 0.7166531932093775, "train_speed(iter/s)": 0.097731 }, { "epoch": 0.021856716435262557, "grad_norm": 3.411140203475952, "learning_rate": 4.368029739776952e-06, "loss": 1.408590030670166, "memory(GiB)": 26.73, "step": 470, "token_acc": 0.6903353057199211, "train_speed(iter/s)": 0.098298 }, { "epoch": 0.022089234695212158, "grad_norm": 3.054403781890869, "learning_rate": 4.414498141263941e-06, "loss": 1.229485321044922, "memory(GiB)": 26.73, "step": 475, "token_acc": 0.7275797373358349, "train_speed(iter/s)": 0.09886 }, { "epoch": 0.02232175295516176, "grad_norm": 2.294562578201294, "learning_rate": 4.4609665427509296e-06, "loss": 1.3114431381225586, "memory(GiB)": 26.73, "step": 480, "token_acc": 0.713089802130898, "train_speed(iter/s)": 0.099429 }, { "epoch": 0.02255427121511136, "grad_norm": 2.637377977371216, "learning_rate": 4.507434944237919e-06, "loss": 1.3108051300048829, "memory(GiB)": 26.73, "step": 485, "token_acc": 0.6991845363938387, "train_speed(iter/s)": 0.099973 }, { "epoch": 0.022786789475060965, "grad_norm": 2.643446922302246, "learning_rate": 4.553903345724908e-06, "loss": 1.2051572799682617, "memory(GiB)": 26.73, "step": 490, "token_acc": 0.7252704791344667, "train_speed(iter/s)": 0.100528 }, { "epoch": 0.023019307735010566, "grad_norm": 2.522397994995117, "learning_rate": 4.6003717472118964e-06, "loss": 1.3973845481872558, "memory(GiB)": 26.73, "step": 495, "token_acc": 0.6806872037914692, "train_speed(iter/s)": 0.101073 }, { "epoch": 0.023251825994960167, "grad_norm": 2.2974302768707275, "learning_rate": 4.646840148698885e-06, "loss": 1.3299295425415039, "memory(GiB)": 26.73, "step": 500, "token_acc": 0.7066365007541479, "train_speed(iter/s)": 0.101625 }, { "epoch": 0.023251825994960167, "eval_loss": 1.1440743207931519, "eval_runtime": 291.5453, "eval_samples_per_second": 11.919, "eval_steps_per_second": 11.919, "step": 500 }, { "epoch": 0.023484344254909768, "grad_norm": 2.4379918575286865, "learning_rate": 4.693308550185874e-06, "loss": 1.2742385864257812, "memory(GiB)": 26.73, "step": 505, "token_acc": 0.706128955128716, "train_speed(iter/s)": 0.096465 }, { "epoch": 0.02371686251485937, "grad_norm": 3.101616621017456, "learning_rate": 4.739776951672863e-06, "loss": 1.1919514656066894, "memory(GiB)": 26.73, "step": 510, "token_acc": 0.7258243793997777, "train_speed(iter/s)": 0.096999 }, { "epoch": 0.023949380774808973, "grad_norm": 2.3769919872283936, "learning_rate": 4.786245353159852e-06, "loss": 1.3223759651184082, "memory(GiB)": 26.73, "step": 515, "token_acc": 0.7008708822415751, "train_speed(iter/s)": 0.097522 }, { "epoch": 0.024181899034758574, "grad_norm": 2.893718719482422, "learning_rate": 4.83271375464684e-06, "loss": 1.3019302368164063, "memory(GiB)": 26.73, "step": 520, "token_acc": 0.6975822433610781, "train_speed(iter/s)": 0.098045 }, { "epoch": 0.024414417294708175, "grad_norm": 3.2003087997436523, "learning_rate": 4.879182156133829e-06, "loss": 1.221653938293457, "memory(GiB)": 26.73, "step": 525, "token_acc": 0.723404255319149, "train_speed(iter/s)": 0.098566 }, { "epoch": 0.024646935554657776, "grad_norm": 3.078279495239258, "learning_rate": 4.925650557620818e-06, "loss": 1.1143360137939453, "memory(GiB)": 26.73, "step": 530, "token_acc": 0.7328652624540287, "train_speed(iter/s)": 0.099081 }, { "epoch": 0.024879453814607377, "grad_norm": 2.866652488708496, "learning_rate": 4.972118959107807e-06, "loss": 1.2642970085144043, "memory(GiB)": 26.73, "step": 535, "token_acc": 0.7042682926829268, "train_speed(iter/s)": 0.09959 }, { "epoch": 0.02511197207455698, "grad_norm": 2.440160036087036, "learning_rate": 5.0185873605947954e-06, "loss": 1.2780232429504395, "memory(GiB)": 26.73, "step": 540, "token_acc": 0.6966232385003989, "train_speed(iter/s)": 0.100089 }, { "epoch": 0.025344490334506583, "grad_norm": 2.839524030685425, "learning_rate": 5.0650557620817855e-06, "loss": 1.1742261886596679, "memory(GiB)": 26.73, "step": 545, "token_acc": 0.7220434432823813, "train_speed(iter/s)": 0.100589 }, { "epoch": 0.025577008594456184, "grad_norm": 4.023037910461426, "learning_rate": 5.111524163568774e-06, "loss": 1.1907401084899902, "memory(GiB)": 26.73, "step": 550, "token_acc": 0.7297198981447799, "train_speed(iter/s)": 0.101092 }, { "epoch": 0.025577008594456184, "eval_loss": 1.100506067276001, "eval_runtime": 289.3509, "eval_samples_per_second": 12.01, "eval_steps_per_second": 12.01, "step": 550 }, { "epoch": 0.025809526854405785, "grad_norm": 2.9395487308502197, "learning_rate": 5.157992565055762e-06, "loss": 1.2625597953796386, "memory(GiB)": 26.73, "step": 555, "token_acc": 0.7145255684543798, "train_speed(iter/s)": 0.096472 }, { "epoch": 0.026042045114355386, "grad_norm": 2.969301462173462, "learning_rate": 5.2044609665427516e-06, "loss": 1.2307548522949219, "memory(GiB)": 26.73, "step": 560, "token_acc": 0.7089829250185598, "train_speed(iter/s)": 0.09695 }, { "epoch": 0.026274563374304987, "grad_norm": 2.336460828781128, "learning_rate": 5.25092936802974e-06, "loss": 1.2285655975341796, "memory(GiB)": 26.73, "step": 565, "token_acc": 0.7056101792943898, "train_speed(iter/s)": 0.097424 }, { "epoch": 0.02650708163425459, "grad_norm": 2.536938428878784, "learning_rate": 5.297397769516729e-06, "loss": 1.2232088088989257, "memory(GiB)": 26.73, "step": 570, "token_acc": 0.7129063405214033, "train_speed(iter/s)": 0.09789 }, { "epoch": 0.026739599894204193, "grad_norm": 2.696225643157959, "learning_rate": 5.343866171003718e-06, "loss": 1.1694017410278321, "memory(GiB)": 26.73, "step": 575, "token_acc": 0.7356643356643356, "train_speed(iter/s)": 0.098353 }, { "epoch": 0.026972118154153794, "grad_norm": 2.478032350540161, "learning_rate": 5.390334572490706e-06, "loss": 1.1354408264160156, "memory(GiB)": 26.73, "step": 580, "token_acc": 0.7360194511983328, "train_speed(iter/s)": 0.098816 }, { "epoch": 0.027204636414103395, "grad_norm": 3.790090799331665, "learning_rate": 5.436802973977695e-06, "loss": 1.2879012107849122, "memory(GiB)": 26.73, "step": 585, "token_acc": 0.7038508557457213, "train_speed(iter/s)": 0.099279 }, { "epoch": 0.027437154674052996, "grad_norm": 3.797004461288452, "learning_rate": 5.4832713754646845e-06, "loss": 1.3572250366210938, "memory(GiB)": 26.73, "step": 590, "token_acc": 0.6939364773820982, "train_speed(iter/s)": 0.099741 }, { "epoch": 0.027669672934002597, "grad_norm": 3.488421678543091, "learning_rate": 5.529739776951674e-06, "loss": 1.2515945434570312, "memory(GiB)": 26.73, "step": 595, "token_acc": 0.7290346626910175, "train_speed(iter/s)": 0.1002 }, { "epoch": 0.0279021911939522, "grad_norm": 2.518224000930786, "learning_rate": 5.576208178438662e-06, "loss": 1.1993464469909667, "memory(GiB)": 26.73, "step": 600, "token_acc": 0.7091660923501034, "train_speed(iter/s)": 0.100654 }, { "epoch": 0.0279021911939522, "eval_loss": 1.0732550621032715, "eval_runtime": 290.72, "eval_samples_per_second": 11.953, "eval_steps_per_second": 11.953, "step": 600 }, { "epoch": 0.028134709453901802, "grad_norm": 2.8344390392303467, "learning_rate": 5.622676579925651e-06, "loss": 1.3822043418884278, "memory(GiB)": 26.73, "step": 605, "token_acc": 0.7207555612375351, "train_speed(iter/s)": 0.096411 }, { "epoch": 0.028367227713851403, "grad_norm": 2.5761542320251465, "learning_rate": 5.66914498141264e-06, "loss": 1.1615178108215332, "memory(GiB)": 26.73, "step": 610, "token_acc": 0.7227501397428732, "train_speed(iter/s)": 0.096857 }, { "epoch": 0.028599745973801004, "grad_norm": 3.259291172027588, "learning_rate": 5.715613382899628e-06, "loss": 1.300617504119873, "memory(GiB)": 26.73, "step": 615, "token_acc": 0.6939717334871647, "train_speed(iter/s)": 0.097287 }, { "epoch": 0.028832264233750605, "grad_norm": 2.453037738800049, "learning_rate": 5.7620817843866174e-06, "loss": 1.2526305198669434, "memory(GiB)": 26.73, "step": 620, "token_acc": 0.7125262841694202, "train_speed(iter/s)": 0.097716 }, { "epoch": 0.02906478249370021, "grad_norm": 4.138836860656738, "learning_rate": 5.808550185873606e-06, "loss": 1.1886103630065918, "memory(GiB)": 26.73, "step": 625, "token_acc": 0.7297491039426524, "train_speed(iter/s)": 0.098156 }, { "epoch": 0.02929730075364981, "grad_norm": 2.7090673446655273, "learning_rate": 5.855018587360596e-06, "loss": 1.2077580451965333, "memory(GiB)": 26.73, "step": 630, "token_acc": 0.733251633986928, "train_speed(iter/s)": 0.09859 }, { "epoch": 0.029529819013599412, "grad_norm": 3.340815544128418, "learning_rate": 5.901486988847584e-06, "loss": 1.2236101150512695, "memory(GiB)": 26.73, "step": 635, "token_acc": 0.7258354293927416, "train_speed(iter/s)": 0.099024 }, { "epoch": 0.029762337273549013, "grad_norm": 2.5718116760253906, "learning_rate": 5.947955390334574e-06, "loss": 1.2151712417602538, "memory(GiB)": 26.73, "step": 640, "token_acc": 0.7069377990430622, "train_speed(iter/s)": 0.099447 }, { "epoch": 0.029994855533498614, "grad_norm": 3.5712640285491943, "learning_rate": 5.994423791821562e-06, "loss": 1.1882932662963868, "memory(GiB)": 26.73, "step": 645, "token_acc": 0.7349031522977592, "train_speed(iter/s)": 0.099867 }, { "epoch": 0.030227373793448215, "grad_norm": 3.336963176727295, "learning_rate": 6.04089219330855e-06, "loss": 1.115128993988037, "memory(GiB)": 26.73, "step": 650, "token_acc": 0.7469618055555556, "train_speed(iter/s)": 0.100287 }, { "epoch": 0.030227373793448215, "eval_loss": 1.038861870765686, "eval_runtime": 294.2553, "eval_samples_per_second": 11.809, "eval_steps_per_second": 11.809, "step": 650 }, { "epoch": 0.03045989205339782, "grad_norm": 2.6979942321777344, "learning_rate": 6.08736059479554e-06, "loss": 1.1597721099853515, "memory(GiB)": 26.73, "step": 655, "token_acc": 0.7270051037550198, "train_speed(iter/s)": 0.09634 }, { "epoch": 0.03069241031334742, "grad_norm": 2.986121654510498, "learning_rate": 6.133828996282528e-06, "loss": 1.0470152854919434, "memory(GiB)": 26.73, "step": 660, "token_acc": 0.7729346173340091, "train_speed(iter/s)": 0.096749 }, { "epoch": 0.030924928573297022, "grad_norm": 2.93112850189209, "learning_rate": 6.180297397769517e-06, "loss": 1.149097728729248, "memory(GiB)": 26.73, "step": 665, "token_acc": 0.7351664254703328, "train_speed(iter/s)": 0.097163 }, { "epoch": 0.031157446833246623, "grad_norm": 3.3635780811309814, "learning_rate": 6.226765799256506e-06, "loss": 1.2041844367980956, "memory(GiB)": 26.73, "step": 670, "token_acc": 0.7335233345208407, "train_speed(iter/s)": 0.097577 }, { "epoch": 0.03138996509319623, "grad_norm": 3.214078664779663, "learning_rate": 6.273234200743496e-06, "loss": 1.1929821968078613, "memory(GiB)": 26.73, "step": 675, "token_acc": 0.7315741583257507, "train_speed(iter/s)": 0.097977 }, { "epoch": 0.031622483353145825, "grad_norm": 2.8574559688568115, "learning_rate": 6.319702602230484e-06, "loss": 1.0180715560913085, "memory(GiB)": 26.73, "step": 680, "token_acc": 0.7521399330107927, "train_speed(iter/s)": 0.098385 }, { "epoch": 0.03185500161309543, "grad_norm": 3.0687994956970215, "learning_rate": 6.3661710037174726e-06, "loss": 1.1894699096679688, "memory(GiB)": 26.73, "step": 685, "token_acc": 0.7381535947712419, "train_speed(iter/s)": 0.098779 }, { "epoch": 0.03208751987304503, "grad_norm": 2.5668647289276123, "learning_rate": 6.412639405204462e-06, "loss": 1.1381871223449707, "memory(GiB)": 26.73, "step": 690, "token_acc": 0.7289256198347107, "train_speed(iter/s)": 0.099181 }, { "epoch": 0.03232003813299463, "grad_norm": 3.334214687347412, "learning_rate": 6.45910780669145e-06, "loss": 1.1664478302001953, "memory(GiB)": 26.73, "step": 695, "token_acc": 0.7353790613718412, "train_speed(iter/s)": 0.099563 }, { "epoch": 0.032552556392944236, "grad_norm": 2.6217947006225586, "learning_rate": 6.5055762081784395e-06, "loss": 1.1068604469299317, "memory(GiB)": 26.73, "step": 700, "token_acc": 0.7076585231951689, "train_speed(iter/s)": 0.099951 }, { "epoch": 0.032552556392944236, "eval_loss": 1.0255147218704224, "eval_runtime": 293.8133, "eval_samples_per_second": 11.827, "eval_steps_per_second": 11.827, "step": 700 }, { "epoch": 0.032785074652893834, "grad_norm": 4.364160060882568, "learning_rate": 6.552044609665428e-06, "loss": 1.0353403091430664, "memory(GiB)": 26.73, "step": 705, "token_acc": 0.7327724745283322, "train_speed(iter/s)": 0.096298 }, { "epoch": 0.03301759291284344, "grad_norm": 3.8299734592437744, "learning_rate": 6.598513011152416e-06, "loss": 1.2472078323364257, "memory(GiB)": 26.73, "step": 710, "token_acc": 0.7108608790410461, "train_speed(iter/s)": 0.096683 }, { "epoch": 0.033250111172793036, "grad_norm": 2.9473071098327637, "learning_rate": 6.6449814126394055e-06, "loss": 1.0487863540649414, "memory(GiB)": 26.73, "step": 715, "token_acc": 0.7527955271565495, "train_speed(iter/s)": 0.097065 }, { "epoch": 0.03348262943274264, "grad_norm": 3.801494598388672, "learning_rate": 6.691449814126395e-06, "loss": 1.176010513305664, "memory(GiB)": 26.73, "step": 720, "token_acc": 0.7328456983629398, "train_speed(iter/s)": 0.097437 }, { "epoch": 0.033715147692692245, "grad_norm": 3.633549213409424, "learning_rate": 6.737918215613384e-06, "loss": 1.1107640266418457, "memory(GiB)": 26.73, "step": 725, "token_acc": 0.7462328767123287, "train_speed(iter/s)": 0.097817 }, { "epoch": 0.03394766595264184, "grad_norm": 2.499803304672241, "learning_rate": 6.784386617100372e-06, "loss": 1.1150272369384766, "memory(GiB)": 26.73, "step": 730, "token_acc": 0.738086576937068, "train_speed(iter/s)": 0.098192 }, { "epoch": 0.03418018421259145, "grad_norm": 4.188467979431152, "learning_rate": 6.830855018587361e-06, "loss": 1.223805046081543, "memory(GiB)": 26.73, "step": 735, "token_acc": 0.7213947190250508, "train_speed(iter/s)": 0.098545 }, { "epoch": 0.034412702472541044, "grad_norm": 3.4925193786621094, "learning_rate": 6.87732342007435e-06, "loss": 1.0611821174621583, "memory(GiB)": 26.73, "step": 740, "token_acc": 0.7538409013315125, "train_speed(iter/s)": 0.098917 }, { "epoch": 0.03464522073249065, "grad_norm": 2.734851121902466, "learning_rate": 6.9237918215613384e-06, "loss": 1.2451375007629395, "memory(GiB)": 26.73, "step": 745, "token_acc": 0.7303664921465969, "train_speed(iter/s)": 0.099281 }, { "epoch": 0.03487773899244025, "grad_norm": 3.9239959716796875, "learning_rate": 6.970260223048328e-06, "loss": 1.1344339370727539, "memory(GiB)": 26.73, "step": 750, "token_acc": 0.726453488372093, "train_speed(iter/s)": 0.099641 }, { "epoch": 0.03487773899244025, "eval_loss": 1.0062031745910645, "eval_runtime": 293.4158, "eval_samples_per_second": 11.843, "eval_steps_per_second": 11.843, "step": 750 }, { "epoch": 0.03511025725238985, "grad_norm": 2.1670944690704346, "learning_rate": 7.016728624535316e-06, "loss": 1.0993772506713868, "memory(GiB)": 26.73, "step": 755, "token_acc": 0.7372170460730637, "train_speed(iter/s)": 0.09626 }, { "epoch": 0.035342775512339455, "grad_norm": 3.950388193130493, "learning_rate": 7.063197026022306e-06, "loss": 1.2084269523620605, "memory(GiB)": 26.73, "step": 760, "token_acc": 0.731457800511509, "train_speed(iter/s)": 0.096612 }, { "epoch": 0.03557529377228905, "grad_norm": 3.085629940032959, "learning_rate": 7.109665427509295e-06, "loss": 1.115158462524414, "memory(GiB)": 26.73, "step": 765, "token_acc": 0.7291159772911597, "train_speed(iter/s)": 0.096965 }, { "epoch": 0.03580781203223866, "grad_norm": 3.8739664554595947, "learning_rate": 7.156133828996283e-06, "loss": 1.1317030906677246, "memory(GiB)": 26.73, "step": 770, "token_acc": 0.7414814814814815, "train_speed(iter/s)": 0.097318 }, { "epoch": 0.036040330292188255, "grad_norm": 3.862696409225464, "learning_rate": 7.202602230483272e-06, "loss": 1.1016357421875, "memory(GiB)": 26.73, "step": 775, "token_acc": 0.7436731742588576, "train_speed(iter/s)": 0.097674 }, { "epoch": 0.03627284855213786, "grad_norm": 3.5100576877593994, "learning_rate": 7.249070631970261e-06, "loss": 1.0728687286376952, "memory(GiB)": 26.73, "step": 780, "token_acc": 0.7553956834532374, "train_speed(iter/s)": 0.09803 }, { "epoch": 0.036505366812087464, "grad_norm": 3.369295358657837, "learning_rate": 7.29553903345725e-06, "loss": 1.1247928619384766, "memory(GiB)": 26.73, "step": 785, "token_acc": 0.7476307476307477, "train_speed(iter/s)": 0.098369 }, { "epoch": 0.03673788507203706, "grad_norm": 3.494028329849243, "learning_rate": 7.342007434944238e-06, "loss": 1.1848819732666016, "memory(GiB)": 26.73, "step": 790, "token_acc": 0.7192463982268194, "train_speed(iter/s)": 0.098712 }, { "epoch": 0.036970403331986666, "grad_norm": 4.516552925109863, "learning_rate": 7.388475836431227e-06, "loss": 1.0329771995544434, "memory(GiB)": 26.73, "step": 795, "token_acc": 0.7662447257383966, "train_speed(iter/s)": 0.099061 }, { "epoch": 0.037202921591936264, "grad_norm": 3.831563711166382, "learning_rate": 7.434944237918216e-06, "loss": 1.1165730476379394, "memory(GiB)": 26.73, "step": 800, "token_acc": 0.7447425670775925, "train_speed(iter/s)": 0.099387 }, { "epoch": 0.037202921591936264, "eval_loss": 0.9778481721878052, "eval_runtime": 294.7837, "eval_samples_per_second": 11.788, "eval_steps_per_second": 11.788, "step": 800 }, { "epoch": 0.03743543985188587, "grad_norm": 5.585468769073486, "learning_rate": 7.481412639405205e-06, "loss": 1.1400897979736329, "memory(GiB)": 26.73, "step": 805, "token_acc": 0.7408928271258075, "train_speed(iter/s)": 0.096205 }, { "epoch": 0.03766795811183547, "grad_norm": 3.9298033714294434, "learning_rate": 7.527881040892194e-06, "loss": 1.0923521995544434, "memory(GiB)": 26.73, "step": 810, "token_acc": 0.7488021902806297, "train_speed(iter/s)": 0.096544 }, { "epoch": 0.03790047637178507, "grad_norm": 3.8628060817718506, "learning_rate": 7.574349442379183e-06, "loss": 1.083217716217041, "memory(GiB)": 26.73, "step": 815, "token_acc": 0.7465940054495913, "train_speed(iter/s)": 0.096873 }, { "epoch": 0.038132994631734675, "grad_norm": 3.3381187915802, "learning_rate": 7.620817843866172e-06, "loss": 1.0980140686035156, "memory(GiB)": 26.73, "step": 820, "token_acc": 0.7514580529385375, "train_speed(iter/s)": 0.097186 }, { "epoch": 0.03836551289168427, "grad_norm": 2.6115517616271973, "learning_rate": 7.667286245353161e-06, "loss": 1.0684693336486817, "memory(GiB)": 26.73, "step": 825, "token_acc": 0.75678391959799, "train_speed(iter/s)": 0.097519 }, { "epoch": 0.03859803115163388, "grad_norm": 3.8252604007720947, "learning_rate": 7.713754646840149e-06, "loss": 1.1756773948669434, "memory(GiB)": 26.73, "step": 830, "token_acc": 0.727756114852889, "train_speed(iter/s)": 0.097846 }, { "epoch": 0.03883054941158348, "grad_norm": 3.1177918910980225, "learning_rate": 7.760223048327138e-06, "loss": 1.1260833740234375, "memory(GiB)": 26.73, "step": 835, "token_acc": 0.7389162561576355, "train_speed(iter/s)": 0.098165 }, { "epoch": 0.03906306767153308, "grad_norm": 3.329439878463745, "learning_rate": 7.806691449814127e-06, "loss": 1.0523783683776855, "memory(GiB)": 26.73, "step": 840, "token_acc": 0.7564054957296695, "train_speed(iter/s)": 0.098484 }, { "epoch": 0.039295585931482684, "grad_norm": 2.8450145721435547, "learning_rate": 7.853159851301115e-06, "loss": 1.1531224250793457, "memory(GiB)": 26.73, "step": 845, "token_acc": 0.7297872340425532, "train_speed(iter/s)": 0.098796 }, { "epoch": 0.03952810419143228, "grad_norm": 5.251894950866699, "learning_rate": 7.899628252788106e-06, "loss": 1.1336482048034668, "memory(GiB)": 26.73, "step": 850, "token_acc": 0.7409217877094972, "train_speed(iter/s)": 0.099117 }, { "epoch": 0.03952810419143228, "eval_loss": 0.9617792963981628, "eval_runtime": 292.5541, "eval_samples_per_second": 11.878, "eval_steps_per_second": 11.878, "step": 850 }, { "epoch": 0.039760622451381886, "grad_norm": 1.9178143739700317, "learning_rate": 7.946096654275093e-06, "loss": 1.1401193618774415, "memory(GiB)": 26.73, "step": 855, "token_acc": 0.7444372143219908, "train_speed(iter/s)": 0.09615 }, { "epoch": 0.03999314071133149, "grad_norm": 4.079286575317383, "learning_rate": 7.992565055762083e-06, "loss": 1.1457449913024902, "memory(GiB)": 26.73, "step": 860, "token_acc": 0.7299968122409946, "train_speed(iter/s)": 0.096459 }, { "epoch": 0.04022565897128109, "grad_norm": 3.180300712585449, "learning_rate": 8.039033457249072e-06, "loss": 1.1377723693847657, "memory(GiB)": 26.73, "step": 865, "token_acc": 0.7370562130177515, "train_speed(iter/s)": 0.096769 }, { "epoch": 0.04045817723123069, "grad_norm": 2.781759738922119, "learning_rate": 8.08550185873606e-06, "loss": 1.0756050109863282, "memory(GiB)": 26.73, "step": 870, "token_acc": 0.7453805198872534, "train_speed(iter/s)": 0.097086 }, { "epoch": 0.04069069549118029, "grad_norm": 3.273380994796753, "learning_rate": 8.131970260223049e-06, "loss": 1.1176755905151368, "memory(GiB)": 26.73, "step": 875, "token_acc": 0.7340090877315624, "train_speed(iter/s)": 0.097397 }, { "epoch": 0.040923213751129894, "grad_norm": 3.9136433601379395, "learning_rate": 8.178438661710038e-06, "loss": 1.1917973518371583, "memory(GiB)": 26.73, "step": 880, "token_acc": 0.7312151137827394, "train_speed(iter/s)": 0.097701 }, { "epoch": 0.04115573201107949, "grad_norm": 3.471822500228882, "learning_rate": 8.224907063197025e-06, "loss": 1.0876335144042968, "memory(GiB)": 26.73, "step": 885, "token_acc": 0.7307692307692307, "train_speed(iter/s)": 0.098007 }, { "epoch": 0.041388250271029096, "grad_norm": 3.936947822570801, "learning_rate": 8.271375464684016e-06, "loss": 1.0483864784240722, "memory(GiB)": 26.73, "step": 890, "token_acc": 0.7621787495205217, "train_speed(iter/s)": 0.098317 }, { "epoch": 0.0416207685309787, "grad_norm": 3.633143424987793, "learning_rate": 8.317843866171004e-06, "loss": 1.127341079711914, "memory(GiB)": 26.73, "step": 895, "token_acc": 0.7553226334752702, "train_speed(iter/s)": 0.098615 }, { "epoch": 0.0418532867909283, "grad_norm": 4.088837623596191, "learning_rate": 8.364312267657993e-06, "loss": 1.015018844604492, "memory(GiB)": 26.73, "step": 900, "token_acc": 0.7831031681559708, "train_speed(iter/s)": 0.098915 }, { "epoch": 0.0418532867909283, "eval_loss": 0.9545445442199707, "eval_runtime": 294.6021, "eval_samples_per_second": 11.796, "eval_steps_per_second": 11.796, "step": 900 }, { "epoch": 0.0420858050508779, "grad_norm": 3.2518317699432373, "learning_rate": 8.410780669144982e-06, "loss": 1.0016798019409179, "memory(GiB)": 26.73, "step": 905, "token_acc": 0.7486964923067095, "train_speed(iter/s)": 0.096115 }, { "epoch": 0.0423183233108275, "grad_norm": 3.467190980911255, "learning_rate": 8.457249070631972e-06, "loss": 1.0845005989074707, "memory(GiB)": 26.73, "step": 910, "token_acc": 0.7519889311656867, "train_speed(iter/s)": 0.096399 }, { "epoch": 0.042550841570777105, "grad_norm": 3.6865177154541016, "learning_rate": 8.50371747211896e-06, "loss": 1.2374433517456054, "memory(GiB)": 26.73, "step": 915, "token_acc": 0.7189285714285715, "train_speed(iter/s)": 0.096698 }, { "epoch": 0.04278335983072671, "grad_norm": 4.201014518737793, "learning_rate": 8.550185873605949e-06, "loss": 1.1373135566711425, "memory(GiB)": 26.73, "step": 920, "token_acc": 0.7400126823081801, "train_speed(iter/s)": 0.096992 }, { "epoch": 0.04301587809067631, "grad_norm": 2.818901300430298, "learning_rate": 8.596654275092938e-06, "loss": 1.017166519165039, "memory(GiB)": 26.73, "step": 925, "token_acc": 0.7844311377245509, "train_speed(iter/s)": 0.09729 }, { "epoch": 0.04324839635062591, "grad_norm": 4.298871040344238, "learning_rate": 8.643122676579925e-06, "loss": 0.9277063369750976, "memory(GiB)": 26.73, "step": 930, "token_acc": 0.7862513426423201, "train_speed(iter/s)": 0.097581 }, { "epoch": 0.04348091461057551, "grad_norm": 4.076528549194336, "learning_rate": 8.689591078066916e-06, "loss": 0.998668098449707, "memory(GiB)": 26.73, "step": 935, "token_acc": 0.7610944277610945, "train_speed(iter/s)": 0.097859 }, { "epoch": 0.043713432870525114, "grad_norm": 3.1914594173431396, "learning_rate": 8.736059479553904e-06, "loss": 1.0335095405578614, "memory(GiB)": 26.73, "step": 940, "token_acc": 0.7718513420509291, "train_speed(iter/s)": 0.098151 }, { "epoch": 0.04394595113047472, "grad_norm": 2.9357616901397705, "learning_rate": 8.782527881040893e-06, "loss": 1.0621576309204102, "memory(GiB)": 26.73, "step": 945, "token_acc": 0.7534029756251979, "train_speed(iter/s)": 0.098419 }, { "epoch": 0.044178469390424316, "grad_norm": 3.612863779067993, "learning_rate": 8.828996282527882e-06, "loss": 1.024215030670166, "memory(GiB)": 26.73, "step": 950, "token_acc": 0.7534153005464481, "train_speed(iter/s)": 0.098704 }, { "epoch": 0.044178469390424316, "eval_loss": 0.9331585764884949, "eval_runtime": 296.5324, "eval_samples_per_second": 11.719, "eval_steps_per_second": 11.719, "step": 950 }, { "epoch": 0.04441098765037392, "grad_norm": 3.4591856002807617, "learning_rate": 8.87546468401487e-06, "loss": 1.0047401428222655, "memory(GiB)": 26.73, "step": 955, "token_acc": 0.751922091235264, "train_speed(iter/s)": 0.096033 }, { "epoch": 0.04464350591032352, "grad_norm": 4.42740535736084, "learning_rate": 8.921933085501859e-06, "loss": 0.9573320388793946, "memory(GiB)": 26.73, "step": 960, "token_acc": 0.7904052165812762, "train_speed(iter/s)": 0.096312 }, { "epoch": 0.04487602417027312, "grad_norm": 4.089885234832764, "learning_rate": 8.968401486988848e-06, "loss": 1.1120384216308594, "memory(GiB)": 26.73, "step": 965, "token_acc": 0.7395833333333334, "train_speed(iter/s)": 0.096591 }, { "epoch": 0.04510854243022272, "grad_norm": 3.35774827003479, "learning_rate": 9.014869888475838e-06, "loss": 1.0453130722045898, "memory(GiB)": 26.73, "step": 970, "token_acc": 0.7602854743912678, "train_speed(iter/s)": 0.096867 }, { "epoch": 0.045341060690172325, "grad_norm": 3.1541173458099365, "learning_rate": 9.061338289962825e-06, "loss": 1.1856597900390624, "memory(GiB)": 26.73, "step": 975, "token_acc": 0.7269852424455376, "train_speed(iter/s)": 0.097145 }, { "epoch": 0.04557357895012193, "grad_norm": 3.100411891937256, "learning_rate": 9.107806691449816e-06, "loss": 0.913086986541748, "memory(GiB)": 26.73, "step": 980, "token_acc": 0.7782732063234697, "train_speed(iter/s)": 0.097422 }, { "epoch": 0.04580609721007153, "grad_norm": 3.1742074489593506, "learning_rate": 9.154275092936804e-06, "loss": 0.8818007469177246, "memory(GiB)": 26.73, "step": 985, "token_acc": 0.8005411673753382, "train_speed(iter/s)": 0.097697 }, { "epoch": 0.04603861547002113, "grad_norm": 4.51315450668335, "learning_rate": 9.200743494423793e-06, "loss": 0.9337103843688965, "memory(GiB)": 26.73, "step": 990, "token_acc": 0.7855678556785568, "train_speed(iter/s)": 0.097973 }, { "epoch": 0.04627113372997073, "grad_norm": 3.2544333934783936, "learning_rate": 9.247211895910782e-06, "loss": 1.1088324546813966, "memory(GiB)": 26.73, "step": 995, "token_acc": 0.7601842012043925, "train_speed(iter/s)": 0.098244 }, { "epoch": 0.04650365198992033, "grad_norm": 3.8300986289978027, "learning_rate": 9.29368029739777e-06, "loss": 1.0316854476928712, "memory(GiB)": 26.73, "step": 1000, "token_acc": 0.7476149176062445, "train_speed(iter/s)": 0.098513 }, { "epoch": 0.04650365198992033, "eval_loss": 0.9164021611213684, "eval_runtime": 290.9576, "eval_samples_per_second": 11.943, "eval_steps_per_second": 11.943, "step": 1000 }, { "epoch": 0.04673617024986994, "grad_norm": 4.002346038818359, "learning_rate": 9.340148698884759e-06, "loss": 1.068749237060547, "memory(GiB)": 26.73, "step": 1005, "token_acc": 0.7531827778133197, "train_speed(iter/s)": 0.096035 }, { "epoch": 0.046968688509819535, "grad_norm": 3.9563968181610107, "learning_rate": 9.386617100371748e-06, "loss": 0.9444809913635254, "memory(GiB)": 26.73, "step": 1010, "token_acc": 0.7721238938053098, "train_speed(iter/s)": 0.0963 }, { "epoch": 0.04720120676976914, "grad_norm": 4.915591716766357, "learning_rate": 9.433085501858736e-06, "loss": 0.9487569808959961, "memory(GiB)": 26.73, "step": 1015, "token_acc": 0.7838190517616355, "train_speed(iter/s)": 0.096566 }, { "epoch": 0.04743372502971874, "grad_norm": 5.050897598266602, "learning_rate": 9.479553903345727e-06, "loss": 1.0228119850158692, "memory(GiB)": 26.73, "step": 1020, "token_acc": 0.7647294589178357, "train_speed(iter/s)": 0.096826 }, { "epoch": 0.04766624328966834, "grad_norm": 2.706132650375366, "learning_rate": 9.526022304832714e-06, "loss": 1.0279125213623046, "memory(GiB)": 26.73, "step": 1025, "token_acc": 0.7501481920569057, "train_speed(iter/s)": 0.097091 }, { "epoch": 0.047898761549617946, "grad_norm": 3.322195529937744, "learning_rate": 9.572490706319703e-06, "loss": 1.0890856742858888, "memory(GiB)": 26.73, "step": 1030, "token_acc": 0.737261698440208, "train_speed(iter/s)": 0.097354 }, { "epoch": 0.048131279809567544, "grad_norm": 2.9123032093048096, "learning_rate": 9.618959107806693e-06, "loss": 1.077380657196045, "memory(GiB)": 26.73, "step": 1035, "token_acc": 0.7402678293366552, "train_speed(iter/s)": 0.097615 }, { "epoch": 0.04836379806951715, "grad_norm": 3.951082706451416, "learning_rate": 9.66542750929368e-06, "loss": 1.0775503158569335, "memory(GiB)": 26.73, "step": 1040, "token_acc": 0.7607033639143731, "train_speed(iter/s)": 0.09787 }, { "epoch": 0.048596316329466746, "grad_norm": 4.118011951446533, "learning_rate": 9.71189591078067e-06, "loss": 1.1944708824157715, "memory(GiB)": 26.73, "step": 1045, "token_acc": 0.7104085893229943, "train_speed(iter/s)": 0.098126 }, { "epoch": 0.04882883458941635, "grad_norm": 4.057410717010498, "learning_rate": 9.758364312267659e-06, "loss": 0.9629217147827148, "memory(GiB)": 26.73, "step": 1050, "token_acc": 0.762325448845825, "train_speed(iter/s)": 0.09838 }, { "epoch": 0.04882883458941635, "eval_loss": 0.9048006534576416, "eval_runtime": 290.1861, "eval_samples_per_second": 11.975, "eval_steps_per_second": 11.975, "step": 1050 }, { "epoch": 0.049061352849365955, "grad_norm": 3.130415439605713, "learning_rate": 9.804832713754648e-06, "loss": 0.9988020896911621, "memory(GiB)": 26.73, "step": 1055, "token_acc": 0.7562960773820459, "train_speed(iter/s)": 0.096023 }, { "epoch": 0.04929387110931555, "grad_norm": 4.304872989654541, "learning_rate": 9.851301115241636e-06, "loss": 1.0478652000427247, "memory(GiB)": 26.73, "step": 1060, "token_acc": 0.7515170871925902, "train_speed(iter/s)": 0.096272 }, { "epoch": 0.04952638936926516, "grad_norm": 3.4750263690948486, "learning_rate": 9.897769516728627e-06, "loss": 1.090310001373291, "memory(GiB)": 26.73, "step": 1065, "token_acc": 0.7461376404494382, "train_speed(iter/s)": 0.096527 }, { "epoch": 0.049758907629214755, "grad_norm": 3.2683334350585938, "learning_rate": 9.944237918215614e-06, "loss": 1.0517633438110352, "memory(GiB)": 26.73, "step": 1070, "token_acc": 0.7599436818021823, "train_speed(iter/s)": 0.09678 }, { "epoch": 0.04999142588916436, "grad_norm": 3.1977105140686035, "learning_rate": 9.990706319702603e-06, "loss": 0.9936330795288086, "memory(GiB)": 26.73, "step": 1075, "token_acc": 0.7530349013657056, "train_speed(iter/s)": 0.097031 }, { "epoch": 0.05022394414911396, "grad_norm": 3.543762445449829, "learning_rate": 9.999999053870585e-06, "loss": 0.9840545654296875, "memory(GiB)": 26.73, "step": 1080, "token_acc": 0.7613271494826971, "train_speed(iter/s)": 0.097282 }, { "epoch": 0.05045646240906356, "grad_norm": 4.944690227508545, "learning_rate": 9.999995210220447e-06, "loss": 1.022100067138672, "memory(GiB)": 26.73, "step": 1085, "token_acc": 0.7664176169878096, "train_speed(iter/s)": 0.097534 }, { "epoch": 0.050688980669013166, "grad_norm": 3.441666841506958, "learning_rate": 9.999988409918769e-06, "loss": 1.0619563102722167, "memory(GiB)": 29.49, "step": 1090, "token_acc": 0.7531625040544924, "train_speed(iter/s)": 0.097773 }, { "epoch": 0.05092149892896276, "grad_norm": 5.653553485870361, "learning_rate": 9.999978652969573e-06, "loss": 0.9953934669494628, "memory(GiB)": 29.49, "step": 1095, "token_acc": 0.764, "train_speed(iter/s)": 0.098024 }, { "epoch": 0.05115401718891237, "grad_norm": 3.904196262359619, "learning_rate": 9.999965939378626e-06, "loss": 0.9837069511413574, "memory(GiB)": 29.49, "step": 1100, "token_acc": 0.7674581005586593, "train_speed(iter/s)": 0.098275 }, { "epoch": 0.05115401718891237, "eval_loss": 0.8877241015434265, "eval_runtime": 293.9743, "eval_samples_per_second": 11.821, "eval_steps_per_second": 11.821, "step": 1100 }, { "epoch": 0.051386535448861966, "grad_norm": 3.1758522987365723, "learning_rate": 9.999950269153451e-06, "loss": 1.0526921272277832, "memory(GiB)": 29.49, "step": 1105, "token_acc": 0.7591879537940586, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.05161905370881157, "grad_norm": 3.341015577316284, "learning_rate": 9.999931642303309e-06, "loss": 1.084926223754883, "memory(GiB)": 29.49, "step": 1110, "token_acc": 0.7447089947089947, "train_speed(iter/s)": 0.09624 }, { "epoch": 0.051851571968761175, "grad_norm": 4.055420875549316, "learning_rate": 9.999910058839218e-06, "loss": 1.0594484329223632, "memory(GiB)": 29.49, "step": 1115, "token_acc": 0.7498171177761521, "train_speed(iter/s)": 0.096482 }, { "epoch": 0.05208409022871077, "grad_norm": 3.7540385723114014, "learning_rate": 9.999885518773939e-06, "loss": 1.0508157730102539, "memory(GiB)": 29.49, "step": 1120, "token_acc": 0.7722502914885347, "train_speed(iter/s)": 0.096719 }, { "epoch": 0.05231660848866038, "grad_norm": 3.7452423572540283, "learning_rate": 9.999858022121987e-06, "loss": 0.9673895835876465, "memory(GiB)": 29.49, "step": 1125, "token_acc": 0.7685028725920919, "train_speed(iter/s)": 0.096952 }, { "epoch": 0.052549126748609974, "grad_norm": 2.8961267471313477, "learning_rate": 9.999827568899615e-06, "loss": 1.046191120147705, "memory(GiB)": 29.49, "step": 1130, "token_acc": 0.7544949329846355, "train_speed(iter/s)": 0.097192 }, { "epoch": 0.05278164500855958, "grad_norm": 4.284615516662598, "learning_rate": 9.999794159124838e-06, "loss": 1.1306605339050293, "memory(GiB)": 29.49, "step": 1135, "token_acc": 0.7502649240551043, "train_speed(iter/s)": 0.097431 }, { "epoch": 0.05301416326850918, "grad_norm": 3.700716257095337, "learning_rate": 9.999757792817407e-06, "loss": 1.0605037689208985, "memory(GiB)": 29.49, "step": 1140, "token_acc": 0.7426001160766106, "train_speed(iter/s)": 0.097663 }, { "epoch": 0.05324668152845878, "grad_norm": 3.2940375804901123, "learning_rate": 9.999718469998829e-06, "loss": 1.070816421508789, "memory(GiB)": 29.49, "step": 1145, "token_acc": 0.7607361963190185, "train_speed(iter/s)": 0.097904 }, { "epoch": 0.053479199788408385, "grad_norm": 3.6671321392059326, "learning_rate": 9.999676190692356e-06, "loss": 1.008139419555664, "memory(GiB)": 29.49, "step": 1150, "token_acc": 0.7638326585695007, "train_speed(iter/s)": 0.09814 }, { "epoch": 0.053479199788408385, "eval_loss": 0.8825203776359558, "eval_runtime": 293.4828, "eval_samples_per_second": 11.841, "eval_steps_per_second": 11.841, "step": 1150 }, { "epoch": 0.05371171804835798, "grad_norm": 3.003434181213379, "learning_rate": 9.99963095492299e-06, "loss": 0.7977881908416748, "memory(GiB)": 29.49, "step": 1155, "token_acc": 0.7611431316042268, "train_speed(iter/s)": 0.09597 }, { "epoch": 0.05394423630830759, "grad_norm": 3.953749418258667, "learning_rate": 9.999582762717479e-06, "loss": 1.0369236946105957, "memory(GiB)": 29.49, "step": 1160, "token_acc": 0.7574827321565618, "train_speed(iter/s)": 0.096205 }, { "epoch": 0.05417675456825719, "grad_norm": 3.2749271392822266, "learning_rate": 9.999531614104323e-06, "loss": 1.0453920364379883, "memory(GiB)": 29.49, "step": 1165, "token_acc": 0.7414411477013368, "train_speed(iter/s)": 0.096432 }, { "epoch": 0.05440927282820679, "grad_norm": 3.203056812286377, "learning_rate": 9.999477509113764e-06, "loss": 1.0291184425354003, "memory(GiB)": 29.49, "step": 1170, "token_acc": 0.7581111468037263, "train_speed(iter/s)": 0.096664 }, { "epoch": 0.054641791088156394, "grad_norm": 3.7450356483459473, "learning_rate": 9.999420447777799e-06, "loss": 0.9435734748840332, "memory(GiB)": 29.49, "step": 1175, "token_acc": 0.7707606420097697, "train_speed(iter/s)": 0.096896 }, { "epoch": 0.05487430934810599, "grad_norm": 3.322979211807251, "learning_rate": 9.999360430130168e-06, "loss": 0.9902138710021973, "memory(GiB)": 29.49, "step": 1180, "token_acc": 0.7555851950650216, "train_speed(iter/s)": 0.097124 }, { "epoch": 0.055106827608055596, "grad_norm": 3.5927646160125732, "learning_rate": 9.999297456206363e-06, "loss": 0.9759355545043945, "memory(GiB)": 29.49, "step": 1185, "token_acc": 0.7508080155138979, "train_speed(iter/s)": 0.097349 }, { "epoch": 0.055339345868005194, "grad_norm": 3.642490863800049, "learning_rate": 9.99923152604362e-06, "loss": 1.0184328079223632, "memory(GiB)": 29.49, "step": 1190, "token_acc": 0.7636363636363637, "train_speed(iter/s)": 0.097575 }, { "epoch": 0.0555718641279548, "grad_norm": 3.2297916412353516, "learning_rate": 9.99916263968093e-06, "loss": 0.9956600189208984, "memory(GiB)": 29.49, "step": 1195, "token_acc": 0.7647887323943662, "train_speed(iter/s)": 0.097806 }, { "epoch": 0.0558043823879044, "grad_norm": 3.0777218341827393, "learning_rate": 9.999090797159024e-06, "loss": 1.0422906875610352, "memory(GiB)": 29.49, "step": 1200, "token_acc": 0.7535483870967742, "train_speed(iter/s)": 0.098027 }, { "epoch": 0.0558043823879044, "eval_loss": 0.8728676438331604, "eval_runtime": 295.1145, "eval_samples_per_second": 11.775, "eval_steps_per_second": 11.775, "step": 1200 }, { "epoch": 0.056036900647854, "grad_norm": 5.250125408172607, "learning_rate": 9.999015998520385e-06, "loss": 1.0027048110961914, "memory(GiB)": 29.49, "step": 1205, "token_acc": 0.7636625926668003, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.056269418907803605, "grad_norm": 4.09804630279541, "learning_rate": 9.998938243809244e-06, "loss": 0.8814007759094238, "memory(GiB)": 29.49, "step": 1210, "token_acc": 0.7876106194690266, "train_speed(iter/s)": 0.096168 }, { "epoch": 0.0565019371677532, "grad_norm": 3.9543261528015137, "learning_rate": 9.998857533071582e-06, "loss": 0.879784107208252, "memory(GiB)": 29.49, "step": 1215, "token_acc": 0.7943156320119671, "train_speed(iter/s)": 0.096389 }, { "epoch": 0.05673445542770281, "grad_norm": 4.678823947906494, "learning_rate": 9.998773866355123e-06, "loss": 0.9489768981933594, "memory(GiB)": 29.49, "step": 1220, "token_acc": 0.7849514563106796, "train_speed(iter/s)": 0.096614 }, { "epoch": 0.05696697368765241, "grad_norm": 3.336897611618042, "learning_rate": 9.998687243709342e-06, "loss": 0.9821072578430176, "memory(GiB)": 29.49, "step": 1225, "token_acc": 0.7741603838245373, "train_speed(iter/s)": 0.096829 }, { "epoch": 0.05719949194760201, "grad_norm": 4.5111002922058105, "learning_rate": 9.998597665185463e-06, "loss": 0.9603194236755371, "memory(GiB)": 29.49, "step": 1230, "token_acc": 0.7746188710341986, "train_speed(iter/s)": 0.097046 }, { "epoch": 0.05743201020755161, "grad_norm": 3.5741658210754395, "learning_rate": 9.998505130836456e-06, "loss": 0.976735782623291, "memory(GiB)": 29.49, "step": 1235, "token_acc": 0.7599756320438623, "train_speed(iter/s)": 0.097271 }, { "epoch": 0.05766452846750121, "grad_norm": 3.4121596813201904, "learning_rate": 9.998409640717038e-06, "loss": 0.8603731155395508, "memory(GiB)": 29.49, "step": 1240, "token_acc": 0.7960591133004926, "train_speed(iter/s)": 0.097495 }, { "epoch": 0.057897046727450815, "grad_norm": 3.4050662517547607, "learning_rate": 9.998311194883676e-06, "loss": 0.9685305595397949, "memory(GiB)": 29.49, "step": 1245, "token_acc": 0.7773972602739726, "train_speed(iter/s)": 0.097716 }, { "epoch": 0.05812956498740042, "grad_norm": 3.6689274311065674, "learning_rate": 9.998209793394586e-06, "loss": 0.973471736907959, "memory(GiB)": 29.49, "step": 1250, "token_acc": 0.7650891632373114, "train_speed(iter/s)": 0.097939 }, { "epoch": 0.05812956498740042, "eval_loss": 0.8615283966064453, "eval_runtime": 293.4868, "eval_samples_per_second": 11.84, "eval_steps_per_second": 11.84, "step": 1250 }, { "epoch": 0.05836208324735002, "grad_norm": 2.7972047328948975, "learning_rate": 9.99810543630973e-06, "loss": 0.9186818122863769, "memory(GiB)": 29.49, "step": 1255, "token_acc": 0.765333910006888, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.05859460150729962, "grad_norm": 3.5054550170898438, "learning_rate": 9.997998123690813e-06, "loss": 1.0190353393554688, "memory(GiB)": 29.49, "step": 1260, "token_acc": 0.7730008598452278, "train_speed(iter/s)": 0.096164 }, { "epoch": 0.05882711976724922, "grad_norm": 4.9383087158203125, "learning_rate": 9.997887855601296e-06, "loss": 0.8528729438781738, "memory(GiB)": 29.49, "step": 1265, "token_acc": 0.8101571946795647, "train_speed(iter/s)": 0.096374 }, { "epoch": 0.059059638027198824, "grad_norm": 3.853332996368408, "learning_rate": 9.997774632106384e-06, "loss": 0.9558304786682129, "memory(GiB)": 29.49, "step": 1270, "token_acc": 0.7764960346070656, "train_speed(iter/s)": 0.096588 }, { "epoch": 0.05929215628714842, "grad_norm": 3.9047439098358154, "learning_rate": 9.997658453273027e-06, "loss": 0.9520219802856446, "memory(GiB)": 29.49, "step": 1275, "token_acc": 0.7671997024916326, "train_speed(iter/s)": 0.096802 }, { "epoch": 0.059524674547098026, "grad_norm": 3.4936561584472656, "learning_rate": 9.997539319169928e-06, "loss": 0.982548999786377, "memory(GiB)": 29.49, "step": 1280, "token_acc": 0.7688804554079697, "train_speed(iter/s)": 0.097014 }, { "epoch": 0.05975719280704763, "grad_norm": 5.598224639892578, "learning_rate": 9.997417229867534e-06, "loss": 1.0808399200439454, "memory(GiB)": 29.49, "step": 1285, "token_acc": 0.7508747375787264, "train_speed(iter/s)": 0.097217 }, { "epoch": 0.05998971106699723, "grad_norm": 3.887967109680176, "learning_rate": 9.997292185438038e-06, "loss": 1.0092259407043458, "memory(GiB)": 29.49, "step": 1290, "token_acc": 0.7590950779578111, "train_speed(iter/s)": 0.097424 }, { "epoch": 0.06022222932694683, "grad_norm": 3.637932538986206, "learning_rate": 9.997164185955385e-06, "loss": 1.0872056007385253, "memory(GiB)": 29.49, "step": 1295, "token_acc": 0.7493606138107417, "train_speed(iter/s)": 0.097631 }, { "epoch": 0.06045474758689643, "grad_norm": 3.762218475341797, "learning_rate": 9.997033231495263e-06, "loss": 0.9506484985351562, "memory(GiB)": 29.49, "step": 1300, "token_acc": 0.7712395543175488, "train_speed(iter/s)": 0.097841 }, { "epoch": 0.06045474758689643, "eval_loss": 0.8519848585128784, "eval_runtime": 292.5996, "eval_samples_per_second": 11.876, "eval_steps_per_second": 11.876, "step": 1300 }, { "epoch": 0.060687265846846035, "grad_norm": 3.022722005844116, "learning_rate": 9.996899322135113e-06, "loss": 1.1480344772338866, "memory(GiB)": 29.49, "step": 1305, "token_acc": 0.7660458955700242, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.06091978410679564, "grad_norm": 3.4392518997192383, "learning_rate": 9.996762457954116e-06, "loss": 1.0385401725769043, "memory(GiB)": 29.49, "step": 1310, "token_acc": 0.7515671395579017, "train_speed(iter/s)": 0.096145 }, { "epoch": 0.06115230236674524, "grad_norm": 3.6800425052642822, "learning_rate": 9.996622639033206e-06, "loss": 0.9112386703491211, "memory(GiB)": 29.49, "step": 1315, "token_acc": 0.7730819245773732, "train_speed(iter/s)": 0.096354 }, { "epoch": 0.06138482062669484, "grad_norm": 4.807951927185059, "learning_rate": 9.996479865455063e-06, "loss": 0.9230103492736816, "memory(GiB)": 29.49, "step": 1320, "token_acc": 0.7816488444291135, "train_speed(iter/s)": 0.096556 }, { "epoch": 0.06161733888664444, "grad_norm": 3.865591287612915, "learning_rate": 9.996334137304111e-06, "loss": 1.008216381072998, "memory(GiB)": 29.49, "step": 1325, "token_acc": 0.7491337491337491, "train_speed(iter/s)": 0.096759 }, { "epoch": 0.061849857146594044, "grad_norm": 3.1500022411346436, "learning_rate": 9.996185454666525e-06, "loss": 0.9567780494689941, "memory(GiB)": 29.49, "step": 1330, "token_acc": 0.7723855092276145, "train_speed(iter/s)": 0.096965 }, { "epoch": 0.06208237540654365, "grad_norm": 4.783307075500488, "learning_rate": 9.996033817630224e-06, "loss": 0.8996448516845703, "memory(GiB)": 29.49, "step": 1335, "token_acc": 0.7988252569750367, "train_speed(iter/s)": 0.097167 }, { "epoch": 0.062314893666493246, "grad_norm": 3.412679433822632, "learning_rate": 9.995879226284878e-06, "loss": 0.874872875213623, "memory(GiB)": 29.49, "step": 1340, "token_acc": 0.7957110609480813, "train_speed(iter/s)": 0.097368 }, { "epoch": 0.06254741192644285, "grad_norm": 3.106863498687744, "learning_rate": 9.995721680721901e-06, "loss": 1.107049560546875, "memory(GiB)": 29.49, "step": 1345, "token_acc": 0.7413730475844533, "train_speed(iter/s)": 0.097566 }, { "epoch": 0.06277993018639245, "grad_norm": 3.066474437713623, "learning_rate": 9.995561181034454e-06, "loss": 0.8950592041015625, "memory(GiB)": 29.49, "step": 1350, "token_acc": 0.7750075097626915, "train_speed(iter/s)": 0.097766 }, { "epoch": 0.06277993018639245, "eval_loss": 0.8605217337608337, "eval_runtime": 294.6974, "eval_samples_per_second": 11.792, "eval_steps_per_second": 11.792, "step": 1350 }, { "epoch": 0.06301244844634205, "grad_norm": 4.106268405914307, "learning_rate": 9.995397727317447e-06, "loss": 1.0116336822509766, "memory(GiB)": 29.49, "step": 1355, "token_acc": 0.7674015773172834, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.06324496670629165, "grad_norm": 3.730259418487549, "learning_rate": 9.99523131966753e-06, "loss": 1.0562795639038085, "memory(GiB)": 29.49, "step": 1360, "token_acc": 0.7494160827494161, "train_speed(iter/s)": 0.096117 }, { "epoch": 0.06347748496624125, "grad_norm": 4.691348552703857, "learning_rate": 9.995061958183111e-06, "loss": 1.1530324935913085, "memory(GiB)": 29.49, "step": 1365, "token_acc": 0.7197295636140135, "train_speed(iter/s)": 0.096302 }, { "epoch": 0.06371000322619086, "grad_norm": 4.493622303009033, "learning_rate": 9.994889642964338e-06, "loss": 0.9282937049865723, "memory(GiB)": 29.49, "step": 1370, "token_acc": 0.753072625698324, "train_speed(iter/s)": 0.096499 }, { "epoch": 0.06394252148614046, "grad_norm": 3.885254144668579, "learning_rate": 9.994714374113104e-06, "loss": 0.9173580169677734, "memory(GiB)": 29.49, "step": 1375, "token_acc": 0.775804289544236, "train_speed(iter/s)": 0.096698 }, { "epoch": 0.06417503974609005, "grad_norm": 4.416103839874268, "learning_rate": 9.994536151733051e-06, "loss": 0.96749849319458, "memory(GiB)": 29.49, "step": 1380, "token_acc": 0.7750706214689266, "train_speed(iter/s)": 0.096894 }, { "epoch": 0.06440755800603966, "grad_norm": 4.307039260864258, "learning_rate": 9.994354975929567e-06, "loss": 0.9266422271728516, "memory(GiB)": 29.49, "step": 1385, "token_acc": 0.7925219941348973, "train_speed(iter/s)": 0.097093 }, { "epoch": 0.06464007626598926, "grad_norm": 3.3025949001312256, "learning_rate": 9.99417084680979e-06, "loss": 1.0316217422485352, "memory(GiB)": 29.49, "step": 1390, "token_acc": 0.7507598784194529, "train_speed(iter/s)": 0.097289 }, { "epoch": 0.06487259452593887, "grad_norm": 3.5625064373016357, "learning_rate": 9.993983764482598e-06, "loss": 0.9325406074523925, "memory(GiB)": 29.49, "step": 1395, "token_acc": 0.7747720364741641, "train_speed(iter/s)": 0.097481 }, { "epoch": 0.06510511278588847, "grad_norm": 2.98435640335083, "learning_rate": 9.99379372905862e-06, "loss": 0.9729434967041015, "memory(GiB)": 29.49, "step": 1400, "token_acc": 0.7669683257918553, "train_speed(iter/s)": 0.097676 }, { "epoch": 0.06510511278588847, "eval_loss": 0.8391401171684265, "eval_runtime": 294.1127, "eval_samples_per_second": 11.815, "eval_steps_per_second": 11.815, "step": 1400 }, { "epoch": 0.06533763104583806, "grad_norm": 3.670905828475952, "learning_rate": 9.99360074065023e-06, "loss": 0.9499552726745606, "memory(GiB)": 29.49, "step": 1405, "token_acc": 0.7702438440355989, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.06557014930578767, "grad_norm": 3.8291659355163574, "learning_rate": 9.993404799371548e-06, "loss": 0.8762431144714355, "memory(GiB)": 29.49, "step": 1410, "token_acc": 0.7836134453781513, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.06580266756573727, "grad_norm": 5.117160797119141, "learning_rate": 9.99320590533844e-06, "loss": 0.9930843353271485, "memory(GiB)": 29.49, "step": 1415, "token_acc": 0.7609187965059852, "train_speed(iter/s)": 0.096282 }, { "epoch": 0.06603518582568688, "grad_norm": 4.551426410675049, "learning_rate": 9.993004058668516e-06, "loss": 1.0043935775756836, "memory(GiB)": 29.49, "step": 1420, "token_acc": 0.7556226921785835, "train_speed(iter/s)": 0.096472 }, { "epoch": 0.06626770408563648, "grad_norm": 3.7161145210266113, "learning_rate": 9.992799259481136e-06, "loss": 1.058093547821045, "memory(GiB)": 29.49, "step": 1425, "token_acc": 0.745412078745412, "train_speed(iter/s)": 0.096662 }, { "epoch": 0.06650022234558607, "grad_norm": 3.9534449577331543, "learning_rate": 9.992591507897405e-06, "loss": 0.894005012512207, "memory(GiB)": 29.49, "step": 1430, "token_acc": 0.7950200088928413, "train_speed(iter/s)": 0.096848 }, { "epoch": 0.06673274060553568, "grad_norm": 4.306530475616455, "learning_rate": 9.992380804040172e-06, "loss": 0.9588717460632324, "memory(GiB)": 29.49, "step": 1435, "token_acc": 0.7775534921275736, "train_speed(iter/s)": 0.097039 }, { "epoch": 0.06696525886548528, "grad_norm": 4.580782890319824, "learning_rate": 9.992167148034033e-06, "loss": 0.8765263557434082, "memory(GiB)": 29.49, "step": 1440, "token_acc": 0.7939895470383276, "train_speed(iter/s)": 0.097225 }, { "epoch": 0.06719777712543488, "grad_norm": 3.7674219608306885, "learning_rate": 9.991950540005329e-06, "loss": 0.9246517181396484, "memory(GiB)": 29.49, "step": 1445, "token_acc": 0.7846655791190864, "train_speed(iter/s)": 0.097413 }, { "epoch": 0.06743029538538449, "grad_norm": 4.060750961303711, "learning_rate": 9.991730980082147e-06, "loss": 0.9379199028015137, "memory(GiB)": 29.49, "step": 1450, "token_acc": 0.7786173026067246, "train_speed(iter/s)": 0.097597 }, { "epoch": 0.06743029538538449, "eval_loss": 0.8332929015159607, "eval_runtime": 294.3264, "eval_samples_per_second": 11.807, "eval_steps_per_second": 11.807, "step": 1450 }, { "epoch": 0.06766281364533408, "grad_norm": 4.1459197998046875, "learning_rate": 9.99150846839432e-06, "loss": 0.8683028221130371, "memory(GiB)": 29.49, "step": 1455, "token_acc": 0.7721197916248606, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.06789533190528368, "grad_norm": 4.427006721496582, "learning_rate": 9.991283005073425e-06, "loss": 0.8849419593811035, "memory(GiB)": 29.49, "step": 1460, "token_acc": 0.7902208201892744, "train_speed(iter/s)": 0.096063 }, { "epoch": 0.06812785016523329, "grad_norm": 4.184934139251709, "learning_rate": 9.991054590252786e-06, "loss": 0.9550199508666992, "memory(GiB)": 29.49, "step": 1465, "token_acc": 0.7796402289452167, "train_speed(iter/s)": 0.096248 }, { "epoch": 0.0683603684251829, "grad_norm": 3.8598761558532715, "learning_rate": 9.990823224067473e-06, "loss": 0.9285133361816407, "memory(GiB)": 29.49, "step": 1470, "token_acc": 0.7871080139372822, "train_speed(iter/s)": 0.096435 }, { "epoch": 0.0685928866851325, "grad_norm": 4.542653560638428, "learning_rate": 9.990588906654296e-06, "loss": 0.8635785102844238, "memory(GiB)": 29.49, "step": 1475, "token_acc": 0.7989153108051731, "train_speed(iter/s)": 0.096622 }, { "epoch": 0.06882540494508209, "grad_norm": 4.145790100097656, "learning_rate": 9.99035163815182e-06, "loss": 0.8900579452514649, "memory(GiB)": 29.49, "step": 1480, "token_acc": 0.7928062944923192, "train_speed(iter/s)": 0.096806 }, { "epoch": 0.0690579232050317, "grad_norm": 4.27974271774292, "learning_rate": 9.990111418700345e-06, "loss": 0.9037236213684082, "memory(GiB)": 29.49, "step": 1485, "token_acc": 0.7942857142857143, "train_speed(iter/s)": 0.096986 }, { "epoch": 0.0692904414649813, "grad_norm": 4.4757399559021, "learning_rate": 9.989868248441922e-06, "loss": 0.9862874984741211, "memory(GiB)": 29.49, "step": 1490, "token_acc": 0.7700774561761109, "train_speed(iter/s)": 0.097165 }, { "epoch": 0.0695229597249309, "grad_norm": 5.238999366760254, "learning_rate": 9.989622127520345e-06, "loss": 1.1203701972961426, "memory(GiB)": 29.49, "step": 1495, "token_acc": 0.7371184771906794, "train_speed(iter/s)": 0.097343 }, { "epoch": 0.0697554779848805, "grad_norm": 4.224093437194824, "learning_rate": 9.989373056081151e-06, "loss": 0.9576206207275391, "memory(GiB)": 29.49, "step": 1500, "token_acc": 0.7671414375621066, "train_speed(iter/s)": 0.097524 }, { "epoch": 0.0697554779848805, "eval_loss": 0.829556941986084, "eval_runtime": 292.0969, "eval_samples_per_second": 11.897, "eval_steps_per_second": 11.897, "step": 1500 }, { "epoch": 0.0699879962448301, "grad_norm": 4.1649675369262695, "learning_rate": 9.989121034271625e-06, "loss": 0.9163863182067871, "memory(GiB)": 29.49, "step": 1505, "token_acc": 0.772836721143616, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.0702205145047797, "grad_norm": 5.037374019622803, "learning_rate": 9.988866062240796e-06, "loss": 0.9264779090881348, "memory(GiB)": 29.49, "step": 1510, "token_acc": 0.7805779569892473, "train_speed(iter/s)": 0.096059 }, { "epoch": 0.0704530327647293, "grad_norm": 4.991156578063965, "learning_rate": 9.988608140139436e-06, "loss": 0.9976637840270997, "memory(GiB)": 29.49, "step": 1515, "token_acc": 0.7648648648648648, "train_speed(iter/s)": 0.09624 }, { "epoch": 0.07068555102467891, "grad_norm": 3.67498517036438, "learning_rate": 9.988347268120062e-06, "loss": 0.9292027473449707, "memory(GiB)": 29.49, "step": 1520, "token_acc": 0.7783676177836761, "train_speed(iter/s)": 0.096419 }, { "epoch": 0.0709180692846285, "grad_norm": 4.408437728881836, "learning_rate": 9.988083446336936e-06, "loss": 0.9982312202453614, "memory(GiB)": 29.49, "step": 1525, "token_acc": 0.7497497497497497, "train_speed(iter/s)": 0.096596 }, { "epoch": 0.0711505875445781, "grad_norm": 4.5508832931518555, "learning_rate": 9.987816674946064e-06, "loss": 0.8968223571777344, "memory(GiB)": 29.49, "step": 1530, "token_acc": 0.7771679473106476, "train_speed(iter/s)": 0.096768 }, { "epoch": 0.07138310580452771, "grad_norm": 3.7403552532196045, "learning_rate": 9.987546954105198e-06, "loss": 0.8528156280517578, "memory(GiB)": 29.49, "step": 1535, "token_acc": 0.7941391941391941, "train_speed(iter/s)": 0.096944 }, { "epoch": 0.07161562406447732, "grad_norm": 3.6034836769104004, "learning_rate": 9.987274283973829e-06, "loss": 0.9096664428710938, "memory(GiB)": 29.49, "step": 1540, "token_acc": 0.7896147403685092, "train_speed(iter/s)": 0.097124 }, { "epoch": 0.07184814232442692, "grad_norm": 5.132664680480957, "learning_rate": 9.986998664713195e-06, "loss": 0.9881318092346192, "memory(GiB)": 29.49, "step": 1545, "token_acc": 0.7650876054510058, "train_speed(iter/s)": 0.097299 }, { "epoch": 0.07208066058437651, "grad_norm": 4.338294506072998, "learning_rate": 9.986720096486282e-06, "loss": 1.01829195022583, "memory(GiB)": 29.49, "step": 1550, "token_acc": 0.7651966626936829, "train_speed(iter/s)": 0.09748 }, { "epoch": 0.07208066058437651, "eval_loss": 0.8277249336242676, "eval_runtime": 290.3989, "eval_samples_per_second": 11.966, "eval_steps_per_second": 11.966, "step": 1550 }, { "epoch": 0.07231317884432611, "grad_norm": 13.258360862731934, "learning_rate": 9.986438579457813e-06, "loss": 0.8957409858703613, "memory(GiB)": 29.49, "step": 1555, "token_acc": 0.7736871986374213, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.07254569710427572, "grad_norm": 4.9179840087890625, "learning_rate": 9.98615411379426e-06, "loss": 0.9114449501037598, "memory(GiB)": 29.49, "step": 1560, "token_acc": 0.7850467289719626, "train_speed(iter/s)": 0.096075 }, { "epoch": 0.07277821536422532, "grad_norm": 3.7983877658843994, "learning_rate": 9.985866699663833e-06, "loss": 0.9112727165222168, "memory(GiB)": 29.49, "step": 1565, "token_acc": 0.7770820288040076, "train_speed(iter/s)": 0.096234 }, { "epoch": 0.07301073362417493, "grad_norm": 3.6494104862213135, "learning_rate": 9.98557633723649e-06, "loss": 0.99598388671875, "memory(GiB)": 29.49, "step": 1570, "token_acc": 0.7602203537257176, "train_speed(iter/s)": 0.096406 }, { "epoch": 0.07324325188412452, "grad_norm": 5.046865940093994, "learning_rate": 9.985283026683934e-06, "loss": 0.9579297065734863, "memory(GiB)": 29.49, "step": 1575, "token_acc": 0.7820121951219512, "train_speed(iter/s)": 0.096577 }, { "epoch": 0.07347577014407412, "grad_norm": 4.406998157501221, "learning_rate": 9.984986768179608e-06, "loss": 0.8625598907470703, "memory(GiB)": 29.49, "step": 1580, "token_acc": 0.8079896907216495, "train_speed(iter/s)": 0.096749 }, { "epoch": 0.07370828840402373, "grad_norm": 4.63735818862915, "learning_rate": 9.984687561898693e-06, "loss": 0.9187863349914551, "memory(GiB)": 29.49, "step": 1585, "token_acc": 0.7812375249500998, "train_speed(iter/s)": 0.096924 }, { "epoch": 0.07394080666397333, "grad_norm": 5.131118297576904, "learning_rate": 9.984385408018127e-06, "loss": 0.8392014503479004, "memory(GiB)": 29.49, "step": 1590, "token_acc": 0.7880321524263174, "train_speed(iter/s)": 0.097094 }, { "epoch": 0.07417332492392294, "grad_norm": 4.358776569366455, "learning_rate": 9.98408030671658e-06, "loss": 0.8473024368286133, "memory(GiB)": 29.49, "step": 1595, "token_acc": 0.8058499364137346, "train_speed(iter/s)": 0.09727 }, { "epoch": 0.07440584318387253, "grad_norm": 5.489317417144775, "learning_rate": 9.983772258174464e-06, "loss": 0.9494953155517578, "memory(GiB)": 29.49, "step": 1600, "token_acc": 0.7736131934032984, "train_speed(iter/s)": 0.097441 }, { "epoch": 0.07440584318387253, "eval_loss": 0.8192211985588074, "eval_runtime": 293.9639, "eval_samples_per_second": 11.821, "eval_steps_per_second": 11.821, "step": 1600 }, { "epoch": 0.07463836144382213, "grad_norm": 5.0594024658203125, "learning_rate": 9.98346126257394e-06, "loss": 1.0007355690002442, "memory(GiB)": 29.49, "step": 1605, "token_acc": 0.773707034500416, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.07487087970377174, "grad_norm": 10.87908935546875, "learning_rate": 9.983147320098914e-06, "loss": 0.9566534996032715, "memory(GiB)": 29.49, "step": 1610, "token_acc": 0.7540680473372781, "train_speed(iter/s)": 0.096063 }, { "epoch": 0.07510339796372134, "grad_norm": 3.9066755771636963, "learning_rate": 9.982830430935024e-06, "loss": 0.8844928741455078, "memory(GiB)": 29.49, "step": 1615, "token_acc": 0.7842778793418648, "train_speed(iter/s)": 0.096232 }, { "epoch": 0.07533591622367095, "grad_norm": 4.4570207595825195, "learning_rate": 9.982510595269658e-06, "loss": 0.9009759902954102, "memory(GiB)": 29.49, "step": 1620, "token_acc": 0.7826250470101542, "train_speed(iter/s)": 0.096398 }, { "epoch": 0.07556843448362054, "grad_norm": 4.537950038909912, "learning_rate": 9.982187813291944e-06, "loss": 0.9638691902160644, "memory(GiB)": 29.49, "step": 1625, "token_acc": 0.7793783169067475, "train_speed(iter/s)": 0.096565 }, { "epoch": 0.07580095274357014, "grad_norm": 3.762213945388794, "learning_rate": 9.981862085192756e-06, "loss": 0.8557533264160156, "memory(GiB)": 29.49, "step": 1630, "token_acc": 0.7996086105675146, "train_speed(iter/s)": 0.096724 }, { "epoch": 0.07603347100351975, "grad_norm": 3.7838034629821777, "learning_rate": 9.981533411164703e-06, "loss": 0.9432172775268555, "memory(GiB)": 29.49, "step": 1635, "token_acc": 0.7803448275862069, "train_speed(iter/s)": 0.096889 }, { "epoch": 0.07626598926346935, "grad_norm": 4.075239181518555, "learning_rate": 9.981201791402142e-06, "loss": 0.9049320220947266, "memory(GiB)": 29.49, "step": 1640, "token_acc": 0.7912266450040617, "train_speed(iter/s)": 0.097056 }, { "epoch": 0.07649850752341895, "grad_norm": 3.987454414367676, "learning_rate": 9.980867226101172e-06, "loss": 0.9458544731140137, "memory(GiB)": 29.49, "step": 1645, "token_acc": 0.7716707683893902, "train_speed(iter/s)": 0.097219 }, { "epoch": 0.07673102578336854, "grad_norm": 3.9305269718170166, "learning_rate": 9.980529715459628e-06, "loss": 0.9325850486755372, "memory(GiB)": 29.49, "step": 1650, "token_acc": 0.7838125665601704, "train_speed(iter/s)": 0.097384 }, { "epoch": 0.07673102578336854, "eval_loss": 0.8100156188011169, "eval_runtime": 294.3166, "eval_samples_per_second": 11.807, "eval_steps_per_second": 11.807, "step": 1650 }, { "epoch": 0.07696354404331815, "grad_norm": 4.639121055603027, "learning_rate": 9.980189259677093e-06, "loss": 0.9464486122131348, "memory(GiB)": 29.49, "step": 1655, "token_acc": 0.7757631985155918, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.07719606230326775, "grad_norm": 4.0201945304870605, "learning_rate": 9.979845858954889e-06, "loss": 0.9635875701904297, "memory(GiB)": 29.49, "step": 1660, "token_acc": 0.7598730606488011, "train_speed(iter/s)": 0.096034 }, { "epoch": 0.07742858056321736, "grad_norm": 3.4885239601135254, "learning_rate": 9.979499513496078e-06, "loss": 0.889770793914795, "memory(GiB)": 29.49, "step": 1665, "token_acc": 0.785060417429513, "train_speed(iter/s)": 0.096196 }, { "epoch": 0.07766109882316696, "grad_norm": 3.780503749847412, "learning_rate": 9.979150223505465e-06, "loss": 0.8421327590942382, "memory(GiB)": 29.49, "step": 1670, "token_acc": 0.7923784494086727, "train_speed(iter/s)": 0.096354 }, { "epoch": 0.07789361708311655, "grad_norm": 3.9452600479125977, "learning_rate": 9.978797989189598e-06, "loss": 0.9661630630493164, "memory(GiB)": 29.49, "step": 1675, "token_acc": 0.7715868361029652, "train_speed(iter/s)": 0.096511 }, { "epoch": 0.07812613534306616, "grad_norm": 4.459422588348389, "learning_rate": 9.978442810756761e-06, "loss": 0.8917196273803711, "memory(GiB)": 29.49, "step": 1680, "token_acc": 0.792498980839788, "train_speed(iter/s)": 0.096672 }, { "epoch": 0.07835865360301576, "grad_norm": 6.250596046447754, "learning_rate": 9.978084688416983e-06, "loss": 1.031515884399414, "memory(GiB)": 29.49, "step": 1685, "token_acc": 0.7625, "train_speed(iter/s)": 0.096831 }, { "epoch": 0.07859117186296537, "grad_norm": 4.471418857574463, "learning_rate": 9.977723622382034e-06, "loss": 0.8646291732788086, "memory(GiB)": 29.49, "step": 1690, "token_acc": 0.7904109589041096, "train_speed(iter/s)": 0.096988 }, { "epoch": 0.07882369012291497, "grad_norm": 4.630346775054932, "learning_rate": 9.977359612865424e-06, "loss": 0.9914566993713378, "memory(GiB)": 29.49, "step": 1695, "token_acc": 0.7661343978709249, "train_speed(iter/s)": 0.097147 }, { "epoch": 0.07905620838286456, "grad_norm": 3.786259651184082, "learning_rate": 9.9769926600824e-06, "loss": 0.8837844848632812, "memory(GiB)": 29.49, "step": 1700, "token_acc": 0.7803897309000928, "train_speed(iter/s)": 0.097304 }, { "epoch": 0.07905620838286456, "eval_loss": 0.8090683221817017, "eval_runtime": 294.3788, "eval_samples_per_second": 11.805, "eval_steps_per_second": 11.805, "step": 1700 }, { "epoch": 0.07928872664281417, "grad_norm": 4.507750511169434, "learning_rate": 9.976622764249956e-06, "loss": 0.9460622787475585, "memory(GiB)": 29.49, "step": 1705, "token_acc": 0.7773199512601808, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.07952124490276377, "grad_norm": 3.4761149883270264, "learning_rate": 9.97624992558682e-06, "loss": 0.8971858024597168, "memory(GiB)": 29.49, "step": 1710, "token_acc": 0.7684088940225238, "train_speed(iter/s)": 0.096007 }, { "epoch": 0.07975376316271338, "grad_norm": 3.712315559387207, "learning_rate": 9.975874144313465e-06, "loss": 1.006467914581299, "memory(GiB)": 29.49, "step": 1715, "token_acc": 0.7429280397022332, "train_speed(iter/s)": 0.096157 }, { "epoch": 0.07998628142266298, "grad_norm": 3.706094741821289, "learning_rate": 9.975495420652102e-06, "loss": 0.7353443622589111, "memory(GiB)": 29.49, "step": 1720, "token_acc": 0.8296415626258559, "train_speed(iter/s)": 0.096317 }, { "epoch": 0.08021879968261257, "grad_norm": 5.486309051513672, "learning_rate": 9.97511375482668e-06, "loss": 0.9426840782165528, "memory(GiB)": 29.49, "step": 1725, "token_acc": 0.7679036458333334, "train_speed(iter/s)": 0.096471 }, { "epoch": 0.08045131794256218, "grad_norm": 5.596253871917725, "learning_rate": 9.974729147062891e-06, "loss": 0.939633560180664, "memory(GiB)": 29.49, "step": 1730, "token_acc": 0.7654434250764526, "train_speed(iter/s)": 0.096628 }, { "epoch": 0.08068383620251178, "grad_norm": 3.46860671043396, "learning_rate": 9.974341597588166e-06, "loss": 0.8142014503479004, "memory(GiB)": 29.49, "step": 1735, "token_acc": 0.7989148864021702, "train_speed(iter/s)": 0.096779 }, { "epoch": 0.08091635446246138, "grad_norm": 4.415549278259277, "learning_rate": 9.973951106631672e-06, "loss": 0.8912237167358399, "memory(GiB)": 29.49, "step": 1740, "token_acc": 0.7891541885876163, "train_speed(iter/s)": 0.096933 }, { "epoch": 0.08114887272241098, "grad_norm": 3.439880847930908, "learning_rate": 9.973557674424324e-06, "loss": 0.9315021514892579, "memory(GiB)": 29.49, "step": 1745, "token_acc": 0.7705095771593784, "train_speed(iter/s)": 0.097084 }, { "epoch": 0.08138139098236058, "grad_norm": 4.43915319442749, "learning_rate": 9.973161301198766e-06, "loss": 0.945796012878418, "memory(GiB)": 29.49, "step": 1750, "token_acc": 0.7663170163170163, "train_speed(iter/s)": 0.097239 }, { "epoch": 0.08138139098236058, "eval_loss": 0.8076890110969543, "eval_runtime": 290.1412, "eval_samples_per_second": 11.977, "eval_steps_per_second": 11.977, "step": 1750 }, { "epoch": 0.08161390924231018, "grad_norm": 3.757079839706421, "learning_rate": 9.972761987189387e-06, "loss": 0.939511489868164, "memory(GiB)": 29.49, "step": 1755, "token_acc": 0.7777243512686125, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.08184642750225979, "grad_norm": 4.13332986831665, "learning_rate": 9.972359732632316e-06, "loss": 0.8005983352661132, "memory(GiB)": 29.49, "step": 1760, "token_acc": 0.8006837606837607, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.0820789457622094, "grad_norm": 5.232628345489502, "learning_rate": 9.971954537765414e-06, "loss": 0.8781660079956055, "memory(GiB)": 29.49, "step": 1765, "token_acc": 0.7958847736625514, "train_speed(iter/s)": 0.096157 }, { "epoch": 0.08231146402215898, "grad_norm": 4.8083696365356445, "learning_rate": 9.971546402828289e-06, "loss": 0.7852079391479492, "memory(GiB)": 29.49, "step": 1770, "token_acc": 0.8230055658627087, "train_speed(iter/s)": 0.096309 }, { "epoch": 0.08254398228210859, "grad_norm": 5.42658805847168, "learning_rate": 9.971135328062282e-06, "loss": 1.06929292678833, "memory(GiB)": 29.49, "step": 1775, "token_acc": 0.746515397082658, "train_speed(iter/s)": 0.096455 }, { "epoch": 0.08277650054205819, "grad_norm": 4.100043773651123, "learning_rate": 9.970721313710475e-06, "loss": 0.8212204933166504, "memory(GiB)": 29.49, "step": 1780, "token_acc": 0.8087121212121212, "train_speed(iter/s)": 0.096608 }, { "epoch": 0.0830090188020078, "grad_norm": 5.11602258682251, "learning_rate": 9.970304360017686e-06, "loss": 0.8928379058837891, "memory(GiB)": 29.49, "step": 1785, "token_acc": 0.7898375388869685, "train_speed(iter/s)": 0.096758 }, { "epoch": 0.0832415370619574, "grad_norm": 4.066575050354004, "learning_rate": 9.969884467230472e-06, "loss": 0.8791553497314453, "memory(GiB)": 29.49, "step": 1790, "token_acc": 0.7894538606403013, "train_speed(iter/s)": 0.096909 }, { "epoch": 0.08347405532190699, "grad_norm": 3.9396798610687256, "learning_rate": 9.969461635597134e-06, "loss": 0.8068005561828613, "memory(GiB)": 29.49, "step": 1795, "token_acc": 0.8042886317222601, "train_speed(iter/s)": 0.097058 }, { "epoch": 0.0837065735818566, "grad_norm": 4.261980056762695, "learning_rate": 9.9690358653677e-06, "loss": 0.9571780204772949, "memory(GiB)": 29.49, "step": 1800, "token_acc": 0.7771060056428859, "train_speed(iter/s)": 0.097209 }, { "epoch": 0.0837065735818566, "eval_loss": 0.8003239035606384, "eval_runtime": 291.1492, "eval_samples_per_second": 11.935, "eval_steps_per_second": 11.935, "step": 1800 }, { "epoch": 0.0839390918418062, "grad_norm": 4.323093414306641, "learning_rate": 9.968607156793944e-06, "loss": 0.8221240997314453, "memory(GiB)": 29.49, "step": 1805, "token_acc": 0.7793450881612091, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.0841716101017558, "grad_norm": 3.989581346511841, "learning_rate": 9.96817551012937e-06, "loss": 0.9691762924194336, "memory(GiB)": 29.49, "step": 1810, "token_acc": 0.7553009992688277, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.08440412836170541, "grad_norm": 3.824002265930176, "learning_rate": 9.96774092562923e-06, "loss": 0.8493466377258301, "memory(GiB)": 29.49, "step": 1815, "token_acc": 0.7916417910447762, "train_speed(iter/s)": 0.096142 }, { "epoch": 0.084636646621655, "grad_norm": 5.909646511077881, "learning_rate": 9.967303403550505e-06, "loss": 0.9302974700927734, "memory(GiB)": 29.49, "step": 1820, "token_acc": 0.7567913071268776, "train_speed(iter/s)": 0.096292 }, { "epoch": 0.0848691648816046, "grad_norm": 5.779644966125488, "learning_rate": 9.966862944151914e-06, "loss": 1.206606101989746, "memory(GiB)": 29.49, "step": 1825, "token_acc": 0.7205930807248765, "train_speed(iter/s)": 0.09644 }, { "epoch": 0.08510168314155421, "grad_norm": 5.22266960144043, "learning_rate": 9.966419547693915e-06, "loss": 0.8638523101806641, "memory(GiB)": 29.49, "step": 1830, "token_acc": 0.8013672616786935, "train_speed(iter/s)": 0.096588 }, { "epoch": 0.08533420140150381, "grad_norm": 6.209298610687256, "learning_rate": 9.965973214438702e-06, "loss": 0.8310011863708496, "memory(GiB)": 29.49, "step": 1835, "token_acc": 0.7985272459499264, "train_speed(iter/s)": 0.096731 }, { "epoch": 0.08556671966145342, "grad_norm": 4.538772106170654, "learning_rate": 9.965523944650206e-06, "loss": 0.8197231292724609, "memory(GiB)": 29.49, "step": 1840, "token_acc": 0.8025325119780972, "train_speed(iter/s)": 0.096881 }, { "epoch": 0.08579923792140301, "grad_norm": 4.979494571685791, "learning_rate": 9.965071738594095e-06, "loss": 0.8975962638854981, "memory(GiB)": 29.49, "step": 1845, "token_acc": 0.7914086687306502, "train_speed(iter/s)": 0.09703 }, { "epoch": 0.08603175618135261, "grad_norm": 4.453029155731201, "learning_rate": 9.964616596537768e-06, "loss": 0.8544903755187988, "memory(GiB)": 29.49, "step": 1850, "token_acc": 0.8063121487246001, "train_speed(iter/s)": 0.09718 }, { "epoch": 0.08603175618135261, "eval_loss": 0.8005815744400024, "eval_runtime": 292.6623, "eval_samples_per_second": 11.874, "eval_steps_per_second": 11.874, "step": 1850 }, { "epoch": 0.08626427444130222, "grad_norm": 4.928404331207275, "learning_rate": 9.96415851875037e-06, "loss": 0.8729582786560058, "memory(GiB)": 29.49, "step": 1855, "token_acc": 0.7795837812685525, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.08649679270125182, "grad_norm": 4.308002948760986, "learning_rate": 9.963697505502776e-06, "loss": 0.9543866157531739, "memory(GiB)": 29.49, "step": 1860, "token_acc": 0.7780277465316835, "train_speed(iter/s)": 0.096 }, { "epoch": 0.08672931096120143, "grad_norm": 4.0189595222473145, "learning_rate": 9.963233557067593e-06, "loss": 0.9054231643676758, "memory(GiB)": 29.49, "step": 1865, "token_acc": 0.8008409785932722, "train_speed(iter/s)": 0.096149 }, { "epoch": 0.08696182922115102, "grad_norm": 4.4174299240112305, "learning_rate": 9.962766673719172e-06, "loss": 0.8594171524047851, "memory(GiB)": 29.49, "step": 1870, "token_acc": 0.7968021319120586, "train_speed(iter/s)": 0.096292 }, { "epoch": 0.08719434748110062, "grad_norm": 4.788074016571045, "learning_rate": 9.962296855733594e-06, "loss": 1.0707283973693849, "memory(GiB)": 29.49, "step": 1875, "token_acc": 0.7581755593803786, "train_speed(iter/s)": 0.096436 }, { "epoch": 0.08742686574105023, "grad_norm": 4.352080345153809, "learning_rate": 9.961824103388678e-06, "loss": 0.9819015502929688, "memory(GiB)": 29.49, "step": 1880, "token_acc": 0.7637842723711962, "train_speed(iter/s)": 0.096579 }, { "epoch": 0.08765938400099983, "grad_norm": 3.994741201400757, "learning_rate": 9.961348416963974e-06, "loss": 0.8944009780883789, "memory(GiB)": 29.49, "step": 1885, "token_acc": 0.7813012575177692, "train_speed(iter/s)": 0.09672 }, { "epoch": 0.08789190226094944, "grad_norm": 4.152418613433838, "learning_rate": 9.960869796740776e-06, "loss": 0.9587595939636231, "memory(GiB)": 29.49, "step": 1890, "token_acc": 0.7640990371389271, "train_speed(iter/s)": 0.096864 }, { "epoch": 0.08812442052089903, "grad_norm": 5.073063850402832, "learning_rate": 9.9603882430021e-06, "loss": 0.9772052764892578, "memory(GiB)": 29.49, "step": 1895, "token_acc": 0.7661883094154708, "train_speed(iter/s)": 0.097008 }, { "epoch": 0.08835693878084863, "grad_norm": 4.3130059242248535, "learning_rate": 9.959903756032707e-06, "loss": 0.9528634071350097, "memory(GiB)": 29.49, "step": 1900, "token_acc": 0.7809652379345258, "train_speed(iter/s)": 0.097149 }, { "epoch": 0.08835693878084863, "eval_loss": 0.7958551049232483, "eval_runtime": 291.8739, "eval_samples_per_second": 11.906, "eval_steps_per_second": 11.906, "step": 1900 }, { "epoch": 0.08858945704079824, "grad_norm": 4.308380603790283, "learning_rate": 9.959416336119091e-06, "loss": 0.9988663673400879, "memory(GiB)": 29.49, "step": 1905, "token_acc": 0.7786771526197706, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.08882197530074784, "grad_norm": 4.396267414093018, "learning_rate": 9.958925983549475e-06, "loss": 0.7884073734283448, "memory(GiB)": 29.49, "step": 1910, "token_acc": 0.8175895765472313, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.08905449356069745, "grad_norm": 4.666249752044678, "learning_rate": 9.958432698613822e-06, "loss": 0.9932140350341797, "memory(GiB)": 29.49, "step": 1915, "token_acc": 0.7660390516039052, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.08928701182064704, "grad_norm": 3.9180893898010254, "learning_rate": 9.957936481603825e-06, "loss": 0.7315858364105224, "memory(GiB)": 29.49, "step": 1920, "token_acc": 0.8133279807306303, "train_speed(iter/s)": 0.096274 }, { "epoch": 0.08951953008059664, "grad_norm": 4.10879373550415, "learning_rate": 9.957437332812911e-06, "loss": 0.9591626167297364, "memory(GiB)": 29.49, "step": 1925, "token_acc": 0.7715167888846005, "train_speed(iter/s)": 0.096411 }, { "epoch": 0.08975204834054624, "grad_norm": 5.104907035827637, "learning_rate": 9.956935252536246e-06, "loss": 0.9636960029602051, "memory(GiB)": 29.49, "step": 1930, "token_acc": 0.7698966769058923, "train_speed(iter/s)": 0.096546 }, { "epoch": 0.08998456660049585, "grad_norm": 4.108262062072754, "learning_rate": 9.956430241070726e-06, "loss": 0.8493362426757812, "memory(GiB)": 29.49, "step": 1935, "token_acc": 0.7907935417382342, "train_speed(iter/s)": 0.096687 }, { "epoch": 0.09021708486044544, "grad_norm": 3.6644999980926514, "learning_rate": 9.955922298714974e-06, "loss": 1.0462658882141114, "memory(GiB)": 29.49, "step": 1940, "token_acc": 0.7325724861196792, "train_speed(iter/s)": 0.096823 }, { "epoch": 0.09044960312039504, "grad_norm": 5.004022598266602, "learning_rate": 9.955411425769357e-06, "loss": 0.9329165458679199, "memory(GiB)": 29.49, "step": 1945, "token_acc": 0.768247656915649, "train_speed(iter/s)": 0.09696 }, { "epoch": 0.09068212138034465, "grad_norm": 3.4844934940338135, "learning_rate": 9.954897622535969e-06, "loss": 0.9116199493408204, "memory(GiB)": 29.49, "step": 1950, "token_acc": 0.7900699766744419, "train_speed(iter/s)": 0.097095 }, { "epoch": 0.09068212138034465, "eval_loss": 0.7911024689674377, "eval_runtime": 290.3316, "eval_samples_per_second": 11.969, "eval_steps_per_second": 11.969, "step": 1950 }, { "epoch": 0.09091463964029425, "grad_norm": 5.0355753898620605, "learning_rate": 9.954380889318636e-06, "loss": 0.9376407623291015, "memory(GiB)": 29.49, "step": 1955, "token_acc": 0.7810113058315128, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.09114715790024386, "grad_norm": 5.319980144500732, "learning_rate": 9.95386122642292e-06, "loss": 0.9036301612854004, "memory(GiB)": 29.49, "step": 1960, "token_acc": 0.7804032766225583, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.09137967616019345, "grad_norm": 4.639838695526123, "learning_rate": 9.953338634156113e-06, "loss": 0.7657929420471191, "memory(GiB)": 29.49, "step": 1965, "token_acc": 0.810907786149982, "train_speed(iter/s)": 0.096118 }, { "epoch": 0.09161219442014305, "grad_norm": 4.133523941040039, "learning_rate": 9.95281311282724e-06, "loss": 0.9569430351257324, "memory(GiB)": 29.49, "step": 1970, "token_acc": 0.7655044739278001, "train_speed(iter/s)": 0.096255 }, { "epoch": 0.09184471268009266, "grad_norm": 6.160674571990967, "learning_rate": 9.95228466274706e-06, "loss": 0.9236304283142089, "memory(GiB)": 29.49, "step": 1975, "token_acc": 0.7819602272727273, "train_speed(iter/s)": 0.096393 }, { "epoch": 0.09207723094004226, "grad_norm": 4.117314338684082, "learning_rate": 9.951753284228058e-06, "loss": 0.9396333694458008, "memory(GiB)": 29.49, "step": 1980, "token_acc": 0.7647058823529411, "train_speed(iter/s)": 0.096529 }, { "epoch": 0.09230974919999187, "grad_norm": 3.8997716903686523, "learning_rate": 9.951218977584456e-06, "loss": 0.9236691474914551, "memory(GiB)": 29.49, "step": 1985, "token_acc": 0.7829246139872843, "train_speed(iter/s)": 0.096665 }, { "epoch": 0.09254226745994146, "grad_norm": 5.6404852867126465, "learning_rate": 9.950681743132209e-06, "loss": 0.8209335327148437, "memory(GiB)": 29.49, "step": 1990, "token_acc": 0.7989203778677463, "train_speed(iter/s)": 0.096799 }, { "epoch": 0.09277478571989106, "grad_norm": 5.0860209465026855, "learning_rate": 9.950141581188997e-06, "loss": 1.0448697090148926, "memory(GiB)": 29.49, "step": 1995, "token_acc": 0.7433050293925539, "train_speed(iter/s)": 0.096933 }, { "epoch": 0.09300730397984067, "grad_norm": 4.093280792236328, "learning_rate": 9.949598492074234e-06, "loss": 0.8636885643005371, "memory(GiB)": 29.49, "step": 2000, "token_acc": 0.7966202193892677, "train_speed(iter/s)": 0.097068 }, { "epoch": 0.09300730397984067, "eval_loss": 0.7961020469665527, "eval_runtime": 289.8869, "eval_samples_per_second": 11.987, "eval_steps_per_second": 11.987, "step": 2000 }, { "epoch": 0.09323982223979027, "grad_norm": 3.603029251098633, "learning_rate": 9.949052476109067e-06, "loss": 0.9437604904174804, "memory(GiB)": 29.49, "step": 2005, "token_acc": 0.7814692859768866, "train_speed(iter/s)": 0.095853 }, { "epoch": 0.09347234049973988, "grad_norm": 4.166951656341553, "learning_rate": 9.948503533616374e-06, "loss": 0.8821152687072754, "memory(GiB)": 29.49, "step": 2010, "token_acc": 0.780439121756487, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.09370485875968947, "grad_norm": 5.039173126220703, "learning_rate": 9.947951664920758e-06, "loss": 1.0004417419433593, "memory(GiB)": 29.49, "step": 2015, "token_acc": 0.7596216568819308, "train_speed(iter/s)": 0.096123 }, { "epoch": 0.09393737701963907, "grad_norm": 5.216615200042725, "learning_rate": 9.947396870348555e-06, "loss": 0.9004398345947265, "memory(GiB)": 29.49, "step": 2020, "token_acc": 0.7764618800888231, "train_speed(iter/s)": 0.096257 }, { "epoch": 0.09416989527958868, "grad_norm": 3.9363391399383545, "learning_rate": 9.946839150227838e-06, "loss": 0.8003036499023437, "memory(GiB)": 29.49, "step": 2025, "token_acc": 0.8057581573896353, "train_speed(iter/s)": 0.096393 }, { "epoch": 0.09440241353953828, "grad_norm": 3.8237192630767822, "learning_rate": 9.946278504888401e-06, "loss": 0.8571130752563476, "memory(GiB)": 29.49, "step": 2030, "token_acc": 0.79004329004329, "train_speed(iter/s)": 0.096526 }, { "epoch": 0.09463493179948788, "grad_norm": 5.707287788391113, "learning_rate": 9.945714934661767e-06, "loss": 0.8530313491821289, "memory(GiB)": 29.49, "step": 2035, "token_acc": 0.7946362968405584, "train_speed(iter/s)": 0.096662 }, { "epoch": 0.09486745005943747, "grad_norm": 4.582156658172607, "learning_rate": 9.9451484398812e-06, "loss": 0.8503716468811036, "memory(GiB)": 29.49, "step": 2040, "token_acc": 0.7962447844228094, "train_speed(iter/s)": 0.096795 }, { "epoch": 0.09509996831938708, "grad_norm": 5.141015529632568, "learning_rate": 9.94457902088168e-06, "loss": 0.8094500541687012, "memory(GiB)": 29.49, "step": 2045, "token_acc": 0.801535974130962, "train_speed(iter/s)": 0.09693 }, { "epoch": 0.09533248657933668, "grad_norm": 5.945651054382324, "learning_rate": 9.944006677999923e-06, "loss": 0.7684842109680176, "memory(GiB)": 29.49, "step": 2050, "token_acc": 0.8084388185654009, "train_speed(iter/s)": 0.097063 }, { "epoch": 0.09533248657933668, "eval_loss": 0.7812900543212891, "eval_runtime": 290.1308, "eval_samples_per_second": 11.977, "eval_steps_per_second": 11.977, "step": 2050 }, { "epoch": 0.09556500483928629, "grad_norm": 4.04123067855835, "learning_rate": 9.943431411574377e-06, "loss": 0.7828290462493896, "memory(GiB)": 29.49, "step": 2055, "token_acc": 0.7829557178160413, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.09579752309923589, "grad_norm": 5.821208477020264, "learning_rate": 9.942853221945208e-06, "loss": 0.9211565971374511, "memory(GiB)": 29.49, "step": 2060, "token_acc": 0.7775191220485533, "train_speed(iter/s)": 0.096006 }, { "epoch": 0.09603004135918548, "grad_norm": 4.410974502563477, "learning_rate": 9.942272109454322e-06, "loss": 0.78861083984375, "memory(GiB)": 29.49, "step": 2065, "token_acc": 0.7906564163217031, "train_speed(iter/s)": 0.096136 }, { "epoch": 0.09626255961913509, "grad_norm": 3.8728504180908203, "learning_rate": 9.94168807444535e-06, "loss": 0.8335811614990234, "memory(GiB)": 29.49, "step": 2070, "token_acc": 0.8082344213649851, "train_speed(iter/s)": 0.096268 }, { "epoch": 0.09649507787908469, "grad_norm": 4.204519271850586, "learning_rate": 9.941101117263648e-06, "loss": 0.8729138374328613, "memory(GiB)": 29.49, "step": 2075, "token_acc": 0.7818115144847818, "train_speed(iter/s)": 0.096399 }, { "epoch": 0.0967275961390343, "grad_norm": 5.9173383712768555, "learning_rate": 9.9405112382563e-06, "loss": 0.8117104530334472, "memory(GiB)": 29.49, "step": 2080, "token_acc": 0.8, "train_speed(iter/s)": 0.096531 }, { "epoch": 0.0969601143989839, "grad_norm": 4.552997589111328, "learning_rate": 9.939918437772122e-06, "loss": 0.9070040702819824, "memory(GiB)": 29.49, "step": 2085, "token_acc": 0.7808084127505751, "train_speed(iter/s)": 0.096658 }, { "epoch": 0.09719263265893349, "grad_norm": 3.836216449737549, "learning_rate": 9.939322716161654e-06, "loss": 0.8257838249206543, "memory(GiB)": 29.49, "step": 2090, "token_acc": 0.8128583128583129, "train_speed(iter/s)": 0.096791 }, { "epoch": 0.0974251509188831, "grad_norm": 4.201406002044678, "learning_rate": 9.938724073777167e-06, "loss": 0.9113821029663086, "memory(GiB)": 29.49, "step": 2095, "token_acc": 0.7799520766773163, "train_speed(iter/s)": 0.09692 }, { "epoch": 0.0976576691788327, "grad_norm": 4.389884948730469, "learning_rate": 9.938122510972652e-06, "loss": 0.9145685195922851, "memory(GiB)": 29.49, "step": 2100, "token_acc": 0.786472148541114, "train_speed(iter/s)": 0.097049 }, { "epoch": 0.0976576691788327, "eval_loss": 0.7848458290100098, "eval_runtime": 294.2657, "eval_samples_per_second": 11.809, "eval_steps_per_second": 11.809, "step": 2100 }, { "epoch": 0.0978901874387823, "grad_norm": 4.336113929748535, "learning_rate": 9.937518028103837e-06, "loss": 0.985959815979004, "memory(GiB)": 29.49, "step": 2105, "token_acc": 0.7828502454076004, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.09812270569873191, "grad_norm": 5.265202522277832, "learning_rate": 9.936910625528169e-06, "loss": 0.9030218124389648, "memory(GiB)": 29.49, "step": 2110, "token_acc": 0.7816537467700259, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.0983552239586815, "grad_norm": 5.062147617340088, "learning_rate": 9.936300303604823e-06, "loss": 0.8893616676330567, "memory(GiB)": 29.49, "step": 2115, "token_acc": 0.7948929159802306, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.0985877422186311, "grad_norm": 5.708126544952393, "learning_rate": 9.935687062694702e-06, "loss": 0.8678428649902343, "memory(GiB)": 29.49, "step": 2120, "token_acc": 0.7893436838390967, "train_speed(iter/s)": 0.096256 }, { "epoch": 0.09882026047858071, "grad_norm": 5.069558620452881, "learning_rate": 9.935070903160436e-06, "loss": 0.9081151962280274, "memory(GiB)": 29.49, "step": 2125, "token_acc": 0.7773660205245154, "train_speed(iter/s)": 0.096378 }, { "epoch": 0.09905277873853031, "grad_norm": 3.8125367164611816, "learning_rate": 9.934451825366375e-06, "loss": 0.846406364440918, "memory(GiB)": 29.49, "step": 2130, "token_acc": 0.7957292506043513, "train_speed(iter/s)": 0.096505 }, { "epoch": 0.0992852969984799, "grad_norm": 5.074999809265137, "learning_rate": 9.933829829678603e-06, "loss": 0.9698441505432129, "memory(GiB)": 29.49, "step": 2135, "token_acc": 0.7694581280788177, "train_speed(iter/s)": 0.096632 }, { "epoch": 0.09951781525842951, "grad_norm": 6.1027021408081055, "learning_rate": 9.933204916464922e-06, "loss": 0.8034770965576172, "memory(GiB)": 29.49, "step": 2140, "token_acc": 0.7895142636854279, "train_speed(iter/s)": 0.096757 }, { "epoch": 0.09975033351837911, "grad_norm": 6.004554748535156, "learning_rate": 9.932577086094866e-06, "loss": 0.8347911834716797, "memory(GiB)": 29.49, "step": 2145, "token_acc": 0.7937138728323699, "train_speed(iter/s)": 0.096885 }, { "epoch": 0.09998285177832872, "grad_norm": 5.652978897094727, "learning_rate": 9.931946338939688e-06, "loss": 0.8349695205688477, "memory(GiB)": 29.49, "step": 2150, "token_acc": 0.8160493827160494, "train_speed(iter/s)": 0.097009 }, { "epoch": 0.09998285177832872, "eval_loss": 0.7777920365333557, "eval_runtime": 291.8039, "eval_samples_per_second": 11.909, "eval_steps_per_second": 11.909, "step": 2150 }, { "epoch": 0.10021537003827832, "grad_norm": 5.332450866699219, "learning_rate": 9.931312675372368e-06, "loss": 0.7858267784118652, "memory(GiB)": 29.49, "step": 2155, "token_acc": 0.7848584531899057, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.10044788829822791, "grad_norm": 5.089801788330078, "learning_rate": 9.930676095767612e-06, "loss": 0.8572509765625, "memory(GiB)": 29.49, "step": 2160, "token_acc": 0.7963917525773195, "train_speed(iter/s)": 0.095999 }, { "epoch": 0.10068040655817752, "grad_norm": 4.407390594482422, "learning_rate": 9.93003660050185e-06, "loss": 0.8731700897216796, "memory(GiB)": 29.49, "step": 2165, "token_acc": 0.7905614881157423, "train_speed(iter/s)": 0.096124 }, { "epoch": 0.10091292481812712, "grad_norm": 4.490730285644531, "learning_rate": 9.929394189953232e-06, "loss": 0.8779221534729004, "memory(GiB)": 29.49, "step": 2170, "token_acc": 0.7994467496542186, "train_speed(iter/s)": 0.09625 }, { "epoch": 0.10114544307807673, "grad_norm": 4.566867828369141, "learning_rate": 9.92874886450164e-06, "loss": 0.8151620864868164, "memory(GiB)": 29.49, "step": 2175, "token_acc": 0.809959721713658, "train_speed(iter/s)": 0.096375 }, { "epoch": 0.10137796133802633, "grad_norm": 4.89373254776001, "learning_rate": 9.92810062452867e-06, "loss": 0.9058709144592285, "memory(GiB)": 29.49, "step": 2180, "token_acc": 0.7809948032665182, "train_speed(iter/s)": 0.096497 }, { "epoch": 0.10161047959797592, "grad_norm": 5.802387714385986, "learning_rate": 9.92744947041765e-06, "loss": 0.8258605003356934, "memory(GiB)": 29.49, "step": 2185, "token_acc": 0.7867144252686421, "train_speed(iter/s)": 0.09662 }, { "epoch": 0.10184299785792553, "grad_norm": 4.026666641235352, "learning_rate": 9.926795402553624e-06, "loss": 0.7584074974060059, "memory(GiB)": 29.49, "step": 2190, "token_acc": 0.8099924299772899, "train_speed(iter/s)": 0.096746 }, { "epoch": 0.10207551611787513, "grad_norm": 4.953476428985596, "learning_rate": 9.926138421323365e-06, "loss": 0.9181832313537598, "memory(GiB)": 29.49, "step": 2195, "token_acc": 0.7740170940170941, "train_speed(iter/s)": 0.09687 }, { "epoch": 0.10230803437782474, "grad_norm": 3.769212245941162, "learning_rate": 9.925478527115369e-06, "loss": 0.901462173461914, "memory(GiB)": 29.49, "step": 2200, "token_acc": 0.7800376647834275, "train_speed(iter/s)": 0.096994 }, { "epoch": 0.10230803437782474, "eval_loss": 0.7704504132270813, "eval_runtime": 292.4552, "eval_samples_per_second": 11.882, "eval_steps_per_second": 11.882, "step": 2200 }, { "epoch": 0.10254055263777434, "grad_norm": 4.067014694213867, "learning_rate": 9.924815720319845e-06, "loss": 0.876547908782959, "memory(GiB)": 29.49, "step": 2205, "token_acc": 0.7853665544045586, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.10277307089772393, "grad_norm": 4.733416557312012, "learning_rate": 9.924150001328736e-06, "loss": 0.8199963569641113, "memory(GiB)": 29.49, "step": 2210, "token_acc": 0.7988871224165341, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.10300558915767354, "grad_norm": 4.051647663116455, "learning_rate": 9.923481370535702e-06, "loss": 0.8695549011230469, "memory(GiB)": 29.49, "step": 2215, "token_acc": 0.7721365971592626, "train_speed(iter/s)": 0.096119 }, { "epoch": 0.10323810741762314, "grad_norm": 4.613498687744141, "learning_rate": 9.922809828336122e-06, "loss": 0.8630804061889649, "memory(GiB)": 29.49, "step": 2220, "token_acc": 0.7895302975977053, "train_speed(iter/s)": 0.09624 }, { "epoch": 0.10347062567757274, "grad_norm": 5.266293525695801, "learning_rate": 9.922135375127103e-06, "loss": 0.8668428421020508, "memory(GiB)": 29.49, "step": 2225, "token_acc": 0.7758620689655172, "train_speed(iter/s)": 0.096363 }, { "epoch": 0.10370314393752235, "grad_norm": 4.555314540863037, "learning_rate": 9.921458011307468e-06, "loss": 0.9019486427307128, "memory(GiB)": 29.49, "step": 2230, "token_acc": 0.776064610866373, "train_speed(iter/s)": 0.096484 }, { "epoch": 0.10393566219747194, "grad_norm": 4.664549350738525, "learning_rate": 9.920777737277764e-06, "loss": 0.8585104942321777, "memory(GiB)": 29.49, "step": 2235, "token_acc": 0.7947368421052632, "train_speed(iter/s)": 0.096605 }, { "epoch": 0.10416818045742154, "grad_norm": 3.9962921142578125, "learning_rate": 9.920094553440257e-06, "loss": 0.966301441192627, "memory(GiB)": 29.49, "step": 2240, "token_acc": 0.7736486486486487, "train_speed(iter/s)": 0.096721 }, { "epoch": 0.10440069871737115, "grad_norm": 5.359710216522217, "learning_rate": 9.919408460198937e-06, "loss": 0.8347146034240722, "memory(GiB)": 29.49, "step": 2245, "token_acc": 0.7948028673835126, "train_speed(iter/s)": 0.096841 }, { "epoch": 0.10463321697732075, "grad_norm": 4.495904445648193, "learning_rate": 9.918719457959509e-06, "loss": 0.9961088180541993, "memory(GiB)": 29.49, "step": 2250, "token_acc": 0.7679214402618658, "train_speed(iter/s)": 0.096961 }, { "epoch": 0.10463321697732075, "eval_loss": 0.771256685256958, "eval_runtime": 291.6294, "eval_samples_per_second": 11.916, "eval_steps_per_second": 11.916, "step": 2250 }, { "epoch": 0.10486573523727036, "grad_norm": 4.535305500030518, "learning_rate": 9.918027547129405e-06, "loss": 0.7924888134002686, "memory(GiB)": 29.49, "step": 2255, "token_acc": 0.7862682171411193, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.10509825349721995, "grad_norm": 4.197555065155029, "learning_rate": 9.91733272811777e-06, "loss": 0.8547355651855468, "memory(GiB)": 29.49, "step": 2260, "token_acc": 0.7923462986198243, "train_speed(iter/s)": 0.095994 }, { "epoch": 0.10533077175716955, "grad_norm": 4.672827243804932, "learning_rate": 9.916635001335473e-06, "loss": 0.8752650260925293, "memory(GiB)": 29.49, "step": 2265, "token_acc": 0.7856060606060606, "train_speed(iter/s)": 0.096114 }, { "epoch": 0.10556329001711916, "grad_norm": 5.364903926849365, "learning_rate": 9.9159343671951e-06, "loss": 0.8720294952392578, "memory(GiB)": 29.49, "step": 2270, "token_acc": 0.7919115105426893, "train_speed(iter/s)": 0.096231 }, { "epoch": 0.10579580827706876, "grad_norm": 4.762035846710205, "learning_rate": 9.915230826110962e-06, "loss": 0.8258625984191894, "memory(GiB)": 29.49, "step": 2275, "token_acc": 0.8075097108329736, "train_speed(iter/s)": 0.096349 }, { "epoch": 0.10602832653701837, "grad_norm": 4.820277690887451, "learning_rate": 9.91452437849908e-06, "loss": 0.8725085258483887, "memory(GiB)": 29.49, "step": 2280, "token_acc": 0.7912274736484189, "train_speed(iter/s)": 0.096465 }, { "epoch": 0.10626084479696796, "grad_norm": 4.2000555992126465, "learning_rate": 9.9138150247772e-06, "loss": 0.8382984161376953, "memory(GiB)": 29.49, "step": 2285, "token_acc": 0.811981234211476, "train_speed(iter/s)": 0.096584 }, { "epoch": 0.10649336305691756, "grad_norm": 4.270804405212402, "learning_rate": 9.913102765364786e-06, "loss": 0.871574592590332, "memory(GiB)": 29.49, "step": 2290, "token_acc": 0.7906079125120618, "train_speed(iter/s)": 0.096702 }, { "epoch": 0.10672588131686717, "grad_norm": 4.381374359130859, "learning_rate": 9.912387600683016e-06, "loss": 0.8795125007629394, "memory(GiB)": 29.49, "step": 2295, "token_acc": 0.7884333821376281, "train_speed(iter/s)": 0.096816 }, { "epoch": 0.10695839957681677, "grad_norm": 5.970802307128906, "learning_rate": 9.91166953115479e-06, "loss": 0.9401533126831054, "memory(GiB)": 29.49, "step": 2300, "token_acc": 0.7745098039215687, "train_speed(iter/s)": 0.096934 }, { "epoch": 0.10695839957681677, "eval_loss": 0.7655821442604065, "eval_runtime": 291.9701, "eval_samples_per_second": 11.902, "eval_steps_per_second": 11.902, "step": 2300 }, { "epoch": 0.10719091783676638, "grad_norm": 6.104898929595947, "learning_rate": 9.910948557204727e-06, "loss": 0.8890548706054687, "memory(GiB)": 29.49, "step": 2305, "token_acc": 0.7863885505481121, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.10742343609671597, "grad_norm": 3.8734943866729736, "learning_rate": 9.910224679259159e-06, "loss": 0.892047119140625, "memory(GiB)": 29.49, "step": 2310, "token_acc": 0.787044220325834, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.10765595435666557, "grad_norm": 5.029067039489746, "learning_rate": 9.909497897746139e-06, "loss": 0.9151164054870605, "memory(GiB)": 29.49, "step": 2315, "token_acc": 0.7836443032949583, "train_speed(iter/s)": 0.096095 }, { "epoch": 0.10788847261661517, "grad_norm": 3.318230628967285, "learning_rate": 9.908768213095432e-06, "loss": 0.8855794906616211, "memory(GiB)": 29.49, "step": 2320, "token_acc": 0.785736726358781, "train_speed(iter/s)": 0.096209 }, { "epoch": 0.10812099087656478, "grad_norm": 5.197476863861084, "learning_rate": 9.908035625738525e-06, "loss": 0.9641876220703125, "memory(GiB)": 29.49, "step": 2325, "token_acc": 0.7662420382165606, "train_speed(iter/s)": 0.096324 }, { "epoch": 0.10835350913651438, "grad_norm": 5.319194793701172, "learning_rate": 9.907300136108622e-06, "loss": 0.9029970169067383, "memory(GiB)": 29.49, "step": 2330, "token_acc": 0.7849344978165939, "train_speed(iter/s)": 0.096438 }, { "epoch": 0.10858602739646397, "grad_norm": 4.619373321533203, "learning_rate": 9.906561744640638e-06, "loss": 0.7368559837341309, "memory(GiB)": 29.49, "step": 2335, "token_acc": 0.8242205151378219, "train_speed(iter/s)": 0.096555 }, { "epoch": 0.10881854565641358, "grad_norm": 9.945032119750977, "learning_rate": 9.905820451771206e-06, "loss": 0.8624940872192383, "memory(GiB)": 29.49, "step": 2340, "token_acc": 0.8019303399076794, "train_speed(iter/s)": 0.096672 }, { "epoch": 0.10905106391636318, "grad_norm": 4.2016730308532715, "learning_rate": 9.905076257938677e-06, "loss": 0.8212512969970703, "memory(GiB)": 29.49, "step": 2345, "token_acc": 0.794137022397892, "train_speed(iter/s)": 0.09679 }, { "epoch": 0.10928358217631279, "grad_norm": 3.608508348464966, "learning_rate": 9.904329163583115e-06, "loss": 0.890407943725586, "memory(GiB)": 29.49, "step": 2350, "token_acc": 0.7811355311355311, "train_speed(iter/s)": 0.096907 }, { "epoch": 0.10928358217631279, "eval_loss": 0.768470823764801, "eval_runtime": 290.2717, "eval_samples_per_second": 11.972, "eval_steps_per_second": 11.972, "step": 2350 }, { "epoch": 0.10951610043626238, "grad_norm": 5.3882646560668945, "learning_rate": 9.903579169146302e-06, "loss": 0.8198015213012695, "memory(GiB)": 29.49, "step": 2355, "token_acc": 0.7874883825273211, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.10974861869621198, "grad_norm": 4.694791793823242, "learning_rate": 9.90282627507173e-06, "loss": 0.9625364303588867, "memory(GiB)": 29.49, "step": 2360, "token_acc": 0.771689497716895, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.10998113695616159, "grad_norm": 4.6519694328308105, "learning_rate": 9.902070481804608e-06, "loss": 0.9932247161865234, "memory(GiB)": 29.49, "step": 2365, "token_acc": 0.7187100076785258, "train_speed(iter/s)": 0.0961 }, { "epoch": 0.11021365521611119, "grad_norm": 8.811151504516602, "learning_rate": 9.901311789791863e-06, "loss": 0.9222234725952149, "memory(GiB)": 29.49, "step": 2370, "token_acc": 0.7835302293259208, "train_speed(iter/s)": 0.09621 }, { "epoch": 0.1104461734760608, "grad_norm": 5.532461643218994, "learning_rate": 9.900550199482132e-06, "loss": 0.8486194610595703, "memory(GiB)": 29.49, "step": 2375, "token_acc": 0.7865168539325843, "train_speed(iter/s)": 0.096321 }, { "epoch": 0.11067869173601039, "grad_norm": 5.290581226348877, "learning_rate": 9.899785711325767e-06, "loss": 0.7777493000030518, "memory(GiB)": 29.49, "step": 2380, "token_acc": 0.8072541966426858, "train_speed(iter/s)": 0.096434 }, { "epoch": 0.11091120999595999, "grad_norm": 5.493096828460693, "learning_rate": 9.89901832577483e-06, "loss": 0.8348598480224609, "memory(GiB)": 29.49, "step": 2385, "token_acc": 0.8035310095065641, "train_speed(iter/s)": 0.096549 }, { "epoch": 0.1111437282559096, "grad_norm": 6.49524450302124, "learning_rate": 9.898248043283105e-06, "loss": 0.9553499221801758, "memory(GiB)": 29.49, "step": 2390, "token_acc": 0.7736389684813754, "train_speed(iter/s)": 0.096661 }, { "epoch": 0.1113762465158592, "grad_norm": 4.445330619812012, "learning_rate": 9.897474864306082e-06, "loss": 0.8675954818725586, "memory(GiB)": 29.49, "step": 2395, "token_acc": 0.7960128159487362, "train_speed(iter/s)": 0.096775 }, { "epoch": 0.1116087647758088, "grad_norm": 4.415380477905273, "learning_rate": 9.896698789300963e-06, "loss": 0.7713698387145996, "memory(GiB)": 29.49, "step": 2400, "token_acc": 0.8173153296266878, "train_speed(iter/s)": 0.096889 }, { "epoch": 0.1116087647758088, "eval_loss": 0.7590782642364502, "eval_runtime": 293.8879, "eval_samples_per_second": 11.824, "eval_steps_per_second": 11.824, "step": 2400 }, { "epoch": 0.1118412830357584, "grad_norm": 5.605898857116699, "learning_rate": 9.89591981872667e-06, "loss": 0.8945033073425293, "memory(GiB)": 29.49, "step": 2405, "token_acc": 0.7878308273982239, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.112073801295708, "grad_norm": 4.756052017211914, "learning_rate": 9.895137953043826e-06, "loss": 0.8087597846984863, "memory(GiB)": 29.49, "step": 2410, "token_acc": 0.7922782386726228, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.1123063195556576, "grad_norm": 5.700892925262451, "learning_rate": 9.894353192714779e-06, "loss": 0.8685206413269043, "memory(GiB)": 29.49, "step": 2415, "token_acc": 0.786042944785276, "train_speed(iter/s)": 0.096088 }, { "epoch": 0.11253883781560721, "grad_norm": 4.865542888641357, "learning_rate": 9.893565538203575e-06, "loss": 0.8942924499511719, "memory(GiB)": 29.49, "step": 2420, "token_acc": 0.7881873727087576, "train_speed(iter/s)": 0.096202 }, { "epoch": 0.11277135607555681, "grad_norm": 4.80360746383667, "learning_rate": 9.892774989975986e-06, "loss": 0.8122398376464843, "memory(GiB)": 29.49, "step": 2425, "token_acc": 0.8057724957555178, "train_speed(iter/s)": 0.096315 }, { "epoch": 0.1130038743355064, "grad_norm": 4.724724292755127, "learning_rate": 9.891981548499483e-06, "loss": 0.7987810611724854, "memory(GiB)": 29.49, "step": 2430, "token_acc": 0.797164667393675, "train_speed(iter/s)": 0.09643 }, { "epoch": 0.11323639259545601, "grad_norm": 4.790870666503906, "learning_rate": 9.891185214243254e-06, "loss": 0.7731681823730469, "memory(GiB)": 29.49, "step": 2435, "token_acc": 0.8126709206927986, "train_speed(iter/s)": 0.096547 }, { "epoch": 0.11346891085540561, "grad_norm": 5.879446506500244, "learning_rate": 9.890385987678192e-06, "loss": 0.9109359741210937, "memory(GiB)": 29.49, "step": 2440, "token_acc": 0.7865546218487395, "train_speed(iter/s)": 0.096661 }, { "epoch": 0.11370142911535522, "grad_norm": 4.92235803604126, "learning_rate": 9.889583869276911e-06, "loss": 0.8439332962036132, "memory(GiB)": 29.49, "step": 2445, "token_acc": 0.7941460276616276, "train_speed(iter/s)": 0.096777 }, { "epoch": 0.11393394737530482, "grad_norm": 5.367386341094971, "learning_rate": 9.888778859513723e-06, "loss": 0.8719941139221191, "memory(GiB)": 29.49, "step": 2450, "token_acc": 0.7949034114262228, "train_speed(iter/s)": 0.096891 }, { "epoch": 0.11393394737530482, "eval_loss": 0.7564442753791809, "eval_runtime": 290.7043, "eval_samples_per_second": 11.954, "eval_steps_per_second": 11.954, "step": 2450 }, { "epoch": 0.11416646563525441, "grad_norm": 4.416979789733887, "learning_rate": 9.88797095886466e-06, "loss": 0.8568400382995606, "memory(GiB)": 29.49, "step": 2455, "token_acc": 0.788566903689721, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.11439898389520402, "grad_norm": 4.938545227050781, "learning_rate": 9.887160167807452e-06, "loss": 0.7519895553588867, "memory(GiB)": 29.49, "step": 2460, "token_acc": 0.8103386809269162, "train_speed(iter/s)": 0.096007 }, { "epoch": 0.11463150215515362, "grad_norm": 4.431781768798828, "learning_rate": 9.88634648682155e-06, "loss": 0.8250004768371582, "memory(GiB)": 29.49, "step": 2465, "token_acc": 0.7998715065852875, "train_speed(iter/s)": 0.096116 }, { "epoch": 0.11486402041510323, "grad_norm": 6.4470086097717285, "learning_rate": 9.885529916388108e-06, "loss": 0.8673094749450684, "memory(GiB)": 29.49, "step": 2470, "token_acc": 0.7979130434782609, "train_speed(iter/s)": 0.096226 }, { "epoch": 0.11509653867505283, "grad_norm": 5.069904804229736, "learning_rate": 9.884710456989987e-06, "loss": 0.8118146896362305, "memory(GiB)": 29.49, "step": 2475, "token_acc": 0.7955861070911722, "train_speed(iter/s)": 0.096335 }, { "epoch": 0.11532905693500242, "grad_norm": 4.7304911613464355, "learning_rate": 9.883888109111763e-06, "loss": 0.8498652458190918, "memory(GiB)": 29.49, "step": 2480, "token_acc": 0.7961630695443646, "train_speed(iter/s)": 0.096445 }, { "epoch": 0.11556157519495203, "grad_norm": 5.705682754516602, "learning_rate": 9.883062873239711e-06, "loss": 0.8524469375610352, "memory(GiB)": 29.49, "step": 2485, "token_acc": 0.7909624008278717, "train_speed(iter/s)": 0.096555 }, { "epoch": 0.11579409345490163, "grad_norm": 5.161575794219971, "learning_rate": 9.88223474986182e-06, "loss": 0.8959260940551758, "memory(GiB)": 29.49, "step": 2490, "token_acc": 0.783974862529458, "train_speed(iter/s)": 0.096666 }, { "epoch": 0.11602661171485124, "grad_norm": 4.681116580963135, "learning_rate": 9.881403739467788e-06, "loss": 0.8818140029907227, "memory(GiB)": 29.49, "step": 2495, "token_acc": 0.7958260869565218, "train_speed(iter/s)": 0.096774 }, { "epoch": 0.11625912997480084, "grad_norm": 4.936962127685547, "learning_rate": 9.88056984254901e-06, "loss": 0.8515759468078613, "memory(GiB)": 29.49, "step": 2500, "token_acc": 0.7895465559016961, "train_speed(iter/s)": 0.096882 }, { "epoch": 0.11625912997480084, "eval_loss": 0.7618740797042847, "eval_runtime": 288.9198, "eval_samples_per_second": 12.028, "eval_steps_per_second": 12.028, "step": 2500 }, { "epoch": 0.11649164823475043, "grad_norm": 5.392852306365967, "learning_rate": 9.879733059598602e-06, "loss": 0.7953616619110108, "memory(GiB)": 29.49, "step": 2505, "token_acc": 0.789321072698728, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.11672416649470004, "grad_norm": 4.226475238800049, "learning_rate": 9.878893391111377e-06, "loss": 1.033120346069336, "memory(GiB)": 29.49, "step": 2510, "token_acc": 0.7546312478154491, "train_speed(iter/s)": 0.096024 }, { "epoch": 0.11695668475464964, "grad_norm": 4.294797897338867, "learning_rate": 9.878050837583857e-06, "loss": 0.8275615692138671, "memory(GiB)": 29.49, "step": 2515, "token_acc": 0.7989915966386555, "train_speed(iter/s)": 0.096131 }, { "epoch": 0.11718920301459924, "grad_norm": 5.027287006378174, "learning_rate": 9.87720539951427e-06, "loss": 0.8524178504943848, "memory(GiB)": 29.49, "step": 2520, "token_acc": 0.809153713298791, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.11742172127454885, "grad_norm": 4.703906536102295, "learning_rate": 9.876357077402548e-06, "loss": 0.9237653732299804, "memory(GiB)": 29.49, "step": 2525, "token_acc": 0.7695299837925446, "train_speed(iter/s)": 0.096347 }, { "epoch": 0.11765423953449844, "grad_norm": 5.149855136871338, "learning_rate": 9.875505871750332e-06, "loss": 0.8868477821350098, "memory(GiB)": 29.49, "step": 2530, "token_acc": 0.7888123226591002, "train_speed(iter/s)": 0.096455 }, { "epoch": 0.11788675779444804, "grad_norm": 4.148962020874023, "learning_rate": 9.874651783060965e-06, "loss": 0.8953413009643555, "memory(GiB)": 29.49, "step": 2535, "token_acc": 0.787868038311458, "train_speed(iter/s)": 0.096564 }, { "epoch": 0.11811927605439765, "grad_norm": 4.259050369262695, "learning_rate": 9.873794811839496e-06, "loss": 0.7834689617156982, "memory(GiB)": 29.49, "step": 2540, "token_acc": 0.8124745209947004, "train_speed(iter/s)": 0.096673 }, { "epoch": 0.11835179431434725, "grad_norm": 5.244549751281738, "learning_rate": 9.872934958592682e-06, "loss": 0.7544142723083496, "memory(GiB)": 29.49, "step": 2545, "token_acc": 0.8242142025611175, "train_speed(iter/s)": 0.096783 }, { "epoch": 0.11858431257429684, "grad_norm": 4.926893711090088, "learning_rate": 9.872072223828976e-06, "loss": 0.8376446723937988, "memory(GiB)": 29.49, "step": 2550, "token_acc": 0.7962895598399418, "train_speed(iter/s)": 0.09689 }, { "epoch": 0.11858431257429684, "eval_loss": 0.7521718144416809, "eval_runtime": 291.5566, "eval_samples_per_second": 11.919, "eval_steps_per_second": 11.919, "step": 2550 }, { "epoch": 0.11881683083424645, "grad_norm": 5.548832893371582, "learning_rate": 9.871206608058542e-06, "loss": 0.851633358001709, "memory(GiB)": 29.49, "step": 2555, "token_acc": 0.789743054942763, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.11904934909419605, "grad_norm": 4.430542469024658, "learning_rate": 9.870338111793245e-06, "loss": 0.8358804702758789, "memory(GiB)": 29.49, "step": 2560, "token_acc": 0.7926465717124875, "train_speed(iter/s)": 0.096038 }, { "epoch": 0.11928186735414566, "grad_norm": 5.821200370788574, "learning_rate": 9.869466735546655e-06, "loss": 0.7679703235626221, "memory(GiB)": 29.49, "step": 2565, "token_acc": 0.8165555945282357, "train_speed(iter/s)": 0.096144 }, { "epoch": 0.11951438561409526, "grad_norm": 4.216818809509277, "learning_rate": 9.86859247983404e-06, "loss": 0.7578719139099122, "memory(GiB)": 29.49, "step": 2570, "token_acc": 0.8305369127516778, "train_speed(iter/s)": 0.09625 }, { "epoch": 0.11974690387404485, "grad_norm": 4.801919937133789, "learning_rate": 9.867715345172378e-06, "loss": 0.8901889801025391, "memory(GiB)": 29.49, "step": 2575, "token_acc": 0.7774348422496571, "train_speed(iter/s)": 0.096356 }, { "epoch": 0.11997942213399446, "grad_norm": 6.391806125640869, "learning_rate": 9.866835332080345e-06, "loss": 0.8922554969787597, "memory(GiB)": 29.49, "step": 2580, "token_acc": 0.7786946736684172, "train_speed(iter/s)": 0.096461 }, { "epoch": 0.12021194039394406, "grad_norm": 4.466254234313965, "learning_rate": 9.86595244107832e-06, "loss": 0.901337718963623, "memory(GiB)": 29.49, "step": 2585, "token_acc": 0.7783475783475784, "train_speed(iter/s)": 0.096565 }, { "epoch": 0.12044445865389367, "grad_norm": 5.171731472015381, "learning_rate": 9.865066672688381e-06, "loss": 0.8802291870117187, "memory(GiB)": 29.49, "step": 2590, "token_acc": 0.8067769261799113, "train_speed(iter/s)": 0.096671 }, { "epoch": 0.12067697691384327, "grad_norm": 4.936208248138428, "learning_rate": 9.864178027434312e-06, "loss": 0.7710800647735596, "memory(GiB)": 29.49, "step": 2595, "token_acc": 0.8116624411445128, "train_speed(iter/s)": 0.096779 }, { "epoch": 0.12090949517379286, "grad_norm": 4.140257835388184, "learning_rate": 9.863286505841599e-06, "loss": 0.902885627746582, "memory(GiB)": 29.49, "step": 2600, "token_acc": 0.7825848849945235, "train_speed(iter/s)": 0.096884 }, { "epoch": 0.12090949517379286, "eval_loss": 0.750512421131134, "eval_runtime": 290.6496, "eval_samples_per_second": 11.956, "eval_steps_per_second": 11.956, "step": 2600 }, { "epoch": 0.12114201343374247, "grad_norm": 5.622610092163086, "learning_rate": 9.862392108437423e-06, "loss": 0.8916511535644531, "memory(GiB)": 29.49, "step": 2605, "token_acc": 0.7897618549904324, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.12137453169369207, "grad_norm": 4.701568126678467, "learning_rate": 9.861494835750669e-06, "loss": 0.9207223892211914, "memory(GiB)": 29.49, "step": 2610, "token_acc": 0.782055262340888, "train_speed(iter/s)": 0.09605 }, { "epoch": 0.12160704995364167, "grad_norm": 4.3934431076049805, "learning_rate": 9.860594688311924e-06, "loss": 0.820650863647461, "memory(GiB)": 29.49, "step": 2615, "token_acc": 0.801851217003771, "train_speed(iter/s)": 0.096152 }, { "epoch": 0.12183956821359128, "grad_norm": 5.942111968994141, "learning_rate": 9.859691666653471e-06, "loss": 0.975086784362793, "memory(GiB)": 29.49, "step": 2620, "token_acc": 0.7700453857791225, "train_speed(iter/s)": 0.096256 }, { "epoch": 0.12207208647354087, "grad_norm": 5.694136142730713, "learning_rate": 9.858785771309296e-06, "loss": 0.8277214050292969, "memory(GiB)": 29.49, "step": 2625, "token_acc": 0.7854190154077415, "train_speed(iter/s)": 0.09636 }, { "epoch": 0.12230460473349047, "grad_norm": 4.6577630043029785, "learning_rate": 9.857877002815081e-06, "loss": 0.8744843482971192, "memory(GiB)": 29.49, "step": 2630, "token_acc": 0.7825768667642753, "train_speed(iter/s)": 0.096464 }, { "epoch": 0.12253712299344008, "grad_norm": 3.98728084564209, "learning_rate": 9.856965361708213e-06, "loss": 0.8487506866455078, "memory(GiB)": 29.49, "step": 2635, "token_acc": 0.7898970398970399, "train_speed(iter/s)": 0.096568 }, { "epoch": 0.12276964125338968, "grad_norm": 4.995297908782959, "learning_rate": 9.856050848527768e-06, "loss": 0.7177443027496337, "memory(GiB)": 29.49, "step": 2640, "token_acc": 0.8281311734492296, "train_speed(iter/s)": 0.096671 }, { "epoch": 0.12300215951333929, "grad_norm": 6.051422595977783, "learning_rate": 9.855133463814529e-06, "loss": 0.9563394546508789, "memory(GiB)": 29.49, "step": 2645, "token_acc": 0.7816429170159263, "train_speed(iter/s)": 0.096774 }, { "epoch": 0.12323467777328888, "grad_norm": 5.628708362579346, "learning_rate": 9.854213208110974e-06, "loss": 0.8455151557922364, "memory(GiB)": 29.49, "step": 2650, "token_acc": 0.7891130567186905, "train_speed(iter/s)": 0.096878 }, { "epoch": 0.12323467777328888, "eval_loss": 0.7496427297592163, "eval_runtime": 289.7636, "eval_samples_per_second": 11.993, "eval_steps_per_second": 11.993, "step": 2650 }, { "epoch": 0.12346719603323848, "grad_norm": 3.999803066253662, "learning_rate": 9.853290081961278e-06, "loss": 0.8988096237182617, "memory(GiB)": 29.49, "step": 2655, "token_acc": 0.7904755841344624, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.12369971429318809, "grad_norm": 4.903356552124023, "learning_rate": 9.852364085911313e-06, "loss": 0.8998661041259766, "memory(GiB)": 29.49, "step": 2660, "token_acc": 0.781635581061693, "train_speed(iter/s)": 0.096066 }, { "epoch": 0.12393223255313769, "grad_norm": 5.046064853668213, "learning_rate": 9.851435220508652e-06, "loss": 0.8154894828796386, "memory(GiB)": 29.49, "step": 2665, "token_acc": 0.814495254529767, "train_speed(iter/s)": 0.096168 }, { "epoch": 0.1241647508130873, "grad_norm": 4.473296165466309, "learning_rate": 9.850503486302559e-06, "loss": 0.8603778839111328, "memory(GiB)": 29.49, "step": 2670, "token_acc": 0.7875688434303698, "train_speed(iter/s)": 0.096271 }, { "epoch": 0.12439726907303689, "grad_norm": 5.46212911605835, "learning_rate": 9.849568883843997e-06, "loss": 0.7482133865356445, "memory(GiB)": 29.49, "step": 2675, "token_acc": 0.8121958202118523, "train_speed(iter/s)": 0.096369 }, { "epoch": 0.12462978733298649, "grad_norm": 5.4718170166015625, "learning_rate": 9.848631413685627e-06, "loss": 0.7595831871032714, "memory(GiB)": 29.49, "step": 2680, "token_acc": 0.8065456902138691, "train_speed(iter/s)": 0.09647 }, { "epoch": 0.1248623055929361, "grad_norm": 4.871596336364746, "learning_rate": 9.847691076381803e-06, "loss": 0.8095902442932129, "memory(GiB)": 29.49, "step": 2685, "token_acc": 0.8086928934010152, "train_speed(iter/s)": 0.096571 }, { "epoch": 0.1250948238528857, "grad_norm": 4.615569591522217, "learning_rate": 9.846747872488578e-06, "loss": 0.7411964416503907, "memory(GiB)": 29.49, "step": 2690, "token_acc": 0.8041069100391134, "train_speed(iter/s)": 0.09667 }, { "epoch": 0.1253273421128353, "grad_norm": 4.434457302093506, "learning_rate": 9.845801802563693e-06, "loss": 0.9427834510803222, "memory(GiB)": 29.49, "step": 2695, "token_acc": 0.7910832719233604, "train_speed(iter/s)": 0.096768 }, { "epoch": 0.1255598603727849, "grad_norm": 5.101200103759766, "learning_rate": 9.844852867166592e-06, "loss": 0.7691882133483887, "memory(GiB)": 29.49, "step": 2700, "token_acc": 0.8168316831683168, "train_speed(iter/s)": 0.096871 }, { "epoch": 0.1255598603727849, "eval_loss": 0.7434446215629578, "eval_runtime": 290.5786, "eval_samples_per_second": 11.959, "eval_steps_per_second": 11.959, "step": 2700 }, { "epoch": 0.12579237863273451, "grad_norm": 5.41193151473999, "learning_rate": 9.843901066858408e-06, "loss": 0.75772123336792, "memory(GiB)": 29.49, "step": 2705, "token_acc": 0.792011822502696, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.1260248968926841, "grad_norm": 5.032259464263916, "learning_rate": 9.842946402201971e-06, "loss": 0.7770239353179932, "memory(GiB)": 29.49, "step": 2710, "token_acc": 0.8157417482771128, "train_speed(iter/s)": 0.09607 }, { "epoch": 0.1262574151526337, "grad_norm": 5.005000591278076, "learning_rate": 9.841988873761804e-06, "loss": 0.9538334846496582, "memory(GiB)": 29.49, "step": 2715, "token_acc": 0.7771830043492807, "train_speed(iter/s)": 0.096169 }, { "epoch": 0.1264899334125833, "grad_norm": 4.789558410644531, "learning_rate": 9.84102848210412e-06, "loss": 0.8359936714172364, "memory(GiB)": 29.49, "step": 2720, "token_acc": 0.7862092862092862, "train_speed(iter/s)": 0.096268 }, { "epoch": 0.1267224516725329, "grad_norm": 3.9694697856903076, "learning_rate": 9.840065227796833e-06, "loss": 0.866541576385498, "memory(GiB)": 29.49, "step": 2725, "token_acc": 0.8002589834898025, "train_speed(iter/s)": 0.096367 }, { "epoch": 0.1269549699324825, "grad_norm": 4.770729064941406, "learning_rate": 9.839099111409543e-06, "loss": 0.7865410804748535, "memory(GiB)": 29.49, "step": 2730, "token_acc": 0.8031796502384738, "train_speed(iter/s)": 0.096462 }, { "epoch": 0.1271874881924321, "grad_norm": 5.078505516052246, "learning_rate": 9.838130133513543e-06, "loss": 0.8374693870544434, "memory(GiB)": 29.49, "step": 2735, "token_acc": 0.7991054789414834, "train_speed(iter/s)": 0.09656 }, { "epoch": 0.12742000645238172, "grad_norm": 5.44411039352417, "learning_rate": 9.83715829468182e-06, "loss": 0.7986952304840088, "memory(GiB)": 29.49, "step": 2740, "token_acc": 0.801953125, "train_speed(iter/s)": 0.096659 }, { "epoch": 0.12765252471233132, "grad_norm": 4.840211391448975, "learning_rate": 9.836183595489054e-06, "loss": 0.8615546226501465, "memory(GiB)": 29.49, "step": 2745, "token_acc": 0.7914564413050791, "train_speed(iter/s)": 0.09676 }, { "epoch": 0.12788504297228093, "grad_norm": 4.512420177459717, "learning_rate": 9.835206036511613e-06, "loss": 0.8429337501525879, "memory(GiB)": 29.49, "step": 2750, "token_acc": 0.7959511568123393, "train_speed(iter/s)": 0.09686 }, { "epoch": 0.12788504297228093, "eval_loss": 0.754675567150116, "eval_runtime": 289.7348, "eval_samples_per_second": 11.994, "eval_steps_per_second": 11.994, "step": 2750 }, { "epoch": 0.12811756123223053, "grad_norm": 5.223292350769043, "learning_rate": 9.834225618327558e-06, "loss": 0.8188864707946777, "memory(GiB)": 29.49, "step": 2755, "token_acc": 0.7912945004465994, "train_speed(iter/s)": 0.095978 }, { "epoch": 0.1283500794921801, "grad_norm": 4.913177967071533, "learning_rate": 9.833242341516643e-06, "loss": 0.8221258163452149, "memory(GiB)": 29.49, "step": 2760, "token_acc": 0.803325079589671, "train_speed(iter/s)": 0.096077 }, { "epoch": 0.1285825977521297, "grad_norm": 6.017866134643555, "learning_rate": 9.832256206660305e-06, "loss": 1.0436551094055175, "memory(GiB)": 29.49, "step": 2765, "token_acc": 0.7476943346508564, "train_speed(iter/s)": 0.096174 }, { "epoch": 0.12881511601207932, "grad_norm": 4.889126777648926, "learning_rate": 9.83126721434168e-06, "loss": 0.8034382820129394, "memory(GiB)": 29.49, "step": 2770, "token_acc": 0.8095238095238095, "train_speed(iter/s)": 0.096272 }, { "epoch": 0.12904763427202892, "grad_norm": 4.911211967468262, "learning_rate": 9.83027536514559e-06, "loss": 0.7517318725585938, "memory(GiB)": 29.49, "step": 2775, "token_acc": 0.8236705317872851, "train_speed(iter/s)": 0.09637 }, { "epoch": 0.12928015253197853, "grad_norm": 5.036534786224365, "learning_rate": 9.829280659658544e-06, "loss": 0.8068610191345215, "memory(GiB)": 29.49, "step": 2780, "token_acc": 0.7897165458141068, "train_speed(iter/s)": 0.096469 }, { "epoch": 0.12951267079192813, "grad_norm": 5.624305725097656, "learning_rate": 9.828283098468741e-06, "loss": 0.8407914161682128, "memory(GiB)": 29.49, "step": 2785, "token_acc": 0.7917938284164123, "train_speed(iter/s)": 0.096567 }, { "epoch": 0.12974518905187774, "grad_norm": 5.084197521209717, "learning_rate": 9.827282682166074e-06, "loss": 0.8023724555969238, "memory(GiB)": 29.49, "step": 2790, "token_acc": 0.7984674329501916, "train_speed(iter/s)": 0.096663 }, { "epoch": 0.12997770731182734, "grad_norm": 5.027306079864502, "learning_rate": 9.826279411342117e-06, "loss": 0.9611904144287109, "memory(GiB)": 29.49, "step": 2795, "token_acc": 0.7777015437392796, "train_speed(iter/s)": 0.096757 }, { "epoch": 0.13021022557177694, "grad_norm": 4.208934307098389, "learning_rate": 9.825273286590133e-06, "loss": 0.8066798210144043, "memory(GiB)": 29.49, "step": 2800, "token_acc": 0.8011676938047356, "train_speed(iter/s)": 0.096853 }, { "epoch": 0.13021022557177694, "eval_loss": 0.7422595620155334, "eval_runtime": 289.8504, "eval_samples_per_second": 11.989, "eval_steps_per_second": 11.989, "step": 2800 }, { "epoch": 0.13044274383172655, "grad_norm": 6.204680919647217, "learning_rate": 9.82426430850508e-06, "loss": 0.8447407722473145, "memory(GiB)": 29.49, "step": 2805, "token_acc": 0.7921122457184209, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.13067526209167613, "grad_norm": 6.124194622039795, "learning_rate": 9.823252477683594e-06, "loss": 0.8883560180664063, "memory(GiB)": 29.49, "step": 2810, "token_acc": 0.7881856540084389, "train_speed(iter/s)": 0.096085 }, { "epoch": 0.13090778035162573, "grad_norm": 5.090604782104492, "learning_rate": 9.822237794724003e-06, "loss": 0.9355738639831543, "memory(GiB)": 29.49, "step": 2815, "token_acc": 0.7807560137457045, "train_speed(iter/s)": 0.09618 }, { "epoch": 0.13114029861157533, "grad_norm": 4.79983377456665, "learning_rate": 9.821220260226319e-06, "loss": 0.9572502136230469, "memory(GiB)": 29.49, "step": 2820, "token_acc": 0.785137861466039, "train_speed(iter/s)": 0.096277 }, { "epoch": 0.13137281687152494, "grad_norm": 4.211757659912109, "learning_rate": 9.820199874792245e-06, "loss": 0.8392532348632813, "memory(GiB)": 29.49, "step": 2825, "token_acc": 0.7980360065466449, "train_speed(iter/s)": 0.096372 }, { "epoch": 0.13160533513147454, "grad_norm": 6.155999660491943, "learning_rate": 9.819176639025162e-06, "loss": 0.8681906700134278, "memory(GiB)": 29.49, "step": 2830, "token_acc": 0.7908745247148289, "train_speed(iter/s)": 0.096468 }, { "epoch": 0.13183785339142415, "grad_norm": 5.698853492736816, "learning_rate": 9.818150553530144e-06, "loss": 0.803080940246582, "memory(GiB)": 29.49, "step": 2835, "token_acc": 0.8136070853462157, "train_speed(iter/s)": 0.096563 }, { "epoch": 0.13207037165137375, "grad_norm": 4.489797115325928, "learning_rate": 9.817121618913948e-06, "loss": 0.8084547042846679, "memory(GiB)": 29.49, "step": 2840, "token_acc": 0.804380664652568, "train_speed(iter/s)": 0.096661 }, { "epoch": 0.13230288991132336, "grad_norm": 5.172839164733887, "learning_rate": 9.81608983578501e-06, "loss": 0.6797237873077393, "memory(GiB)": 29.49, "step": 2845, "token_acc": 0.8252279635258358, "train_speed(iter/s)": 0.096756 }, { "epoch": 0.13253540817127296, "grad_norm": 4.8296942710876465, "learning_rate": 9.815055204753458e-06, "loss": 0.7829336166381836, "memory(GiB)": 29.49, "step": 2850, "token_acc": 0.801798800799467, "train_speed(iter/s)": 0.096851 }, { "epoch": 0.13253540817127296, "eval_loss": 0.7429930567741394, "eval_runtime": 288.7556, "eval_samples_per_second": 12.034, "eval_steps_per_second": 12.034, "step": 2850 }, { "epoch": 0.13276792643122254, "grad_norm": 5.827232360839844, "learning_rate": 9.814017726431105e-06, "loss": 0.8832127571105957, "memory(GiB)": 29.49, "step": 2855, "token_acc": 0.7922617522736006, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.13300044469117214, "grad_norm": 3.6636886596679688, "learning_rate": 9.812977401431437e-06, "loss": 0.8548738479614257, "memory(GiB)": 29.49, "step": 2860, "token_acc": 0.7891721569750788, "train_speed(iter/s)": 0.096096 }, { "epoch": 0.13323296295112175, "grad_norm": 4.969047546386719, "learning_rate": 9.811934230369636e-06, "loss": 0.76055908203125, "memory(GiB)": 29.49, "step": 2865, "token_acc": 0.8173374613003096, "train_speed(iter/s)": 0.096191 }, { "epoch": 0.13346548121107135, "grad_norm": 2.693516492843628, "learning_rate": 9.810888213862556e-06, "loss": 0.8962690353393554, "memory(GiB)": 33.07, "step": 2870, "token_acc": 0.751769587503051, "train_speed(iter/s)": 0.096278 }, { "epoch": 0.13369799947102096, "grad_norm": 5.445571422576904, "learning_rate": 9.809839352528743e-06, "loss": 0.8073612213134765, "memory(GiB)": 33.07, "step": 2875, "token_acc": 0.8058076225045372, "train_speed(iter/s)": 0.096372 }, { "epoch": 0.13393051773097056, "grad_norm": 4.459766387939453, "learning_rate": 9.808787646988422e-06, "loss": 0.8136503219604492, "memory(GiB)": 33.07, "step": 2880, "token_acc": 0.8049738219895288, "train_speed(iter/s)": 0.096466 }, { "epoch": 0.13416303599092017, "grad_norm": 5.122330665588379, "learning_rate": 9.807733097863494e-06, "loss": 0.8207123756408692, "memory(GiB)": 33.07, "step": 2885, "token_acc": 0.8027571580063627, "train_speed(iter/s)": 0.09656 }, { "epoch": 0.13439555425086977, "grad_norm": 2.956819772720337, "learning_rate": 9.80667570577755e-06, "loss": 0.9861713409423828, "memory(GiB)": 33.07, "step": 2890, "token_acc": 0.7597343797162692, "train_speed(iter/s)": 0.09665 }, { "epoch": 0.13462807251081937, "grad_norm": 4.836545467376709, "learning_rate": 9.805615471355859e-06, "loss": 0.7751515865325928, "memory(GiB)": 33.07, "step": 2895, "token_acc": 0.8123145400593472, "train_speed(iter/s)": 0.096744 }, { "epoch": 0.13486059077076898, "grad_norm": 4.383913993835449, "learning_rate": 9.804552395225368e-06, "loss": 0.8893208503723145, "memory(GiB)": 33.07, "step": 2900, "token_acc": 0.7776507573592455, "train_speed(iter/s)": 0.096838 }, { "epoch": 0.13486059077076898, "eval_loss": 0.7408275008201599, "eval_runtime": 290.8079, "eval_samples_per_second": 11.949, "eval_steps_per_second": 11.949, "step": 2900 }, { "epoch": 0.13509310903071856, "grad_norm": 5.118293762207031, "learning_rate": 9.80348647801471e-06, "loss": 0.8451316833496094, "memory(GiB)": 33.07, "step": 2905, "token_acc": 0.7927192454344772, "train_speed(iter/s)": 0.095998 }, { "epoch": 0.13532562729066816, "grad_norm": 4.629569053649902, "learning_rate": 9.802417720354189e-06, "loss": 0.8460866928100585, "memory(GiB)": 33.07, "step": 2910, "token_acc": 0.8077192982456141, "train_speed(iter/s)": 0.096086 }, { "epoch": 0.13555814555061776, "grad_norm": 5.260866641998291, "learning_rate": 9.801346122875801e-06, "loss": 0.75780029296875, "memory(GiB)": 33.07, "step": 2915, "token_acc": 0.8127731092436975, "train_speed(iter/s)": 0.096177 }, { "epoch": 0.13579066381056737, "grad_norm": 4.477482318878174, "learning_rate": 9.800271686213213e-06, "loss": 0.9403352737426758, "memory(GiB)": 33.07, "step": 2920, "token_acc": 0.773038605230386, "train_speed(iter/s)": 0.096264 }, { "epoch": 0.13602318207051697, "grad_norm": 4.261521816253662, "learning_rate": 9.799194411001768e-06, "loss": 0.8434426307678222, "memory(GiB)": 33.07, "step": 2925, "token_acc": 0.7681672025723473, "train_speed(iter/s)": 0.096355 }, { "epoch": 0.13625570033046658, "grad_norm": 3.634920358657837, "learning_rate": 9.798114297878496e-06, "loss": 0.8066039085388184, "memory(GiB)": 33.07, "step": 2930, "token_acc": 0.797752808988764, "train_speed(iter/s)": 0.096447 }, { "epoch": 0.13648821859041618, "grad_norm": 5.318452835083008, "learning_rate": 9.797031347482101e-06, "loss": 0.8009425163269043, "memory(GiB)": 33.07, "step": 2935, "token_acc": 0.8004246284501062, "train_speed(iter/s)": 0.096537 }, { "epoch": 0.1367207368503658, "grad_norm": 6.3095316886901855, "learning_rate": 9.795945560452967e-06, "loss": 0.8986371040344239, "memory(GiB)": 33.07, "step": 2940, "token_acc": 0.7920758550626481, "train_speed(iter/s)": 0.096628 }, { "epoch": 0.1369532551103154, "grad_norm": 3.810253381729126, "learning_rate": 9.794856937433148e-06, "loss": 0.8121430397033691, "memory(GiB)": 33.07, "step": 2945, "token_acc": 0.8095394736842105, "train_speed(iter/s)": 0.096717 }, { "epoch": 0.137185773370265, "grad_norm": 4.707873344421387, "learning_rate": 9.793765479066385e-06, "loss": 0.8329290390014649, "memory(GiB)": 33.07, "step": 2950, "token_acc": 0.7846309403437816, "train_speed(iter/s)": 0.096809 }, { "epoch": 0.137185773370265, "eval_loss": 0.7358382940292358, "eval_runtime": 292.6301, "eval_samples_per_second": 11.875, "eval_steps_per_second": 11.875, "step": 2950 }, { "epoch": 0.13741829163021457, "grad_norm": 5.086911201477051, "learning_rate": 9.79267118599809e-06, "loss": 0.8006107330322265, "memory(GiB)": 33.07, "step": 2955, "token_acc": 0.7935293080510988, "train_speed(iter/s)": 0.09598 }, { "epoch": 0.13765080989016418, "grad_norm": 4.827671527862549, "learning_rate": 9.791574058875351e-06, "loss": 0.777289867401123, "memory(GiB)": 33.07, "step": 2960, "token_acc": 0.8047665687234737, "train_speed(iter/s)": 0.09607 }, { "epoch": 0.13788332815011378, "grad_norm": 5.354944705963135, "learning_rate": 9.790474098346933e-06, "loss": 0.7382638931274415, "memory(GiB)": 33.07, "step": 2965, "token_acc": 0.820254110612855, "train_speed(iter/s)": 0.096161 }, { "epoch": 0.1381158464100634, "grad_norm": 5.039259910583496, "learning_rate": 9.78937130506328e-06, "loss": 0.8652180671691895, "memory(GiB)": 33.07, "step": 2970, "token_acc": 0.7841680129240711, "train_speed(iter/s)": 0.096251 }, { "epoch": 0.138348364670013, "grad_norm": 5.581478118896484, "learning_rate": 9.788265679676503e-06, "loss": 0.7484108448028565, "memory(GiB)": 33.07, "step": 2975, "token_acc": 0.8159857904085257, "train_speed(iter/s)": 0.096344 }, { "epoch": 0.1385808829299626, "grad_norm": 4.969134330749512, "learning_rate": 9.787157222840395e-06, "loss": 0.8082466125488281, "memory(GiB)": 33.07, "step": 2980, "token_acc": 0.8080651415277239, "train_speed(iter/s)": 0.096435 }, { "epoch": 0.1388134011899122, "grad_norm": 4.701774597167969, "learning_rate": 9.786045935210423e-06, "loss": 0.894073486328125, "memory(GiB)": 33.07, "step": 2985, "token_acc": 0.7753647777400746, "train_speed(iter/s)": 0.096526 }, { "epoch": 0.1390459194498618, "grad_norm": 5.563474655151367, "learning_rate": 9.78493181744372e-06, "loss": 0.7509230136871338, "memory(GiB)": 33.07, "step": 2990, "token_acc": 0.819632881085395, "train_speed(iter/s)": 0.096619 }, { "epoch": 0.1392784377098114, "grad_norm": 6.414600849151611, "learning_rate": 9.783814870199101e-06, "loss": 0.8086988449096679, "memory(GiB)": 33.07, "step": 2995, "token_acc": 0.8048202291584354, "train_speed(iter/s)": 0.09671 }, { "epoch": 0.139510955969761, "grad_norm": 4.5219597816467285, "learning_rate": 9.782695094137056e-06, "loss": 0.7585030555725097, "memory(GiB)": 33.07, "step": 3000, "token_acc": 0.8189102564102564, "train_speed(iter/s)": 0.096802 }, { "epoch": 0.139510955969761, "eval_loss": 0.7317752838134766, "eval_runtime": 290.2766, "eval_samples_per_second": 11.971, "eval_steps_per_second": 11.971, "step": 3000 }, { "epoch": 0.1397434742297106, "grad_norm": 4.116754055023193, "learning_rate": 9.781572489919735e-06, "loss": 0.8825703620910644, "memory(GiB)": 33.07, "step": 3005, "token_acc": 0.7937784522003035, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.1399759924896602, "grad_norm": 5.839756488800049, "learning_rate": 9.780447058210973e-06, "loss": 0.8361115455627441, "memory(GiB)": 33.07, "step": 3010, "token_acc": 0.7739352020385876, "train_speed(iter/s)": 0.096081 }, { "epoch": 0.1402085107496098, "grad_norm": 4.884430885314941, "learning_rate": 9.779318799676274e-06, "loss": 0.7522700786590576, "memory(GiB)": 33.07, "step": 3015, "token_acc": 0.8246367239101717, "train_speed(iter/s)": 0.096168 }, { "epoch": 0.1404410290095594, "grad_norm": 4.92686128616333, "learning_rate": 9.778187714982808e-06, "loss": 0.7651845455169678, "memory(GiB)": 33.07, "step": 3020, "token_acc": 0.7955761683910096, "train_speed(iter/s)": 0.096256 }, { "epoch": 0.140673547269509, "grad_norm": 4.128361225128174, "learning_rate": 9.777053804799423e-06, "loss": 0.8679119110107422, "memory(GiB)": 33.07, "step": 3025, "token_acc": 0.78975487115022, "train_speed(iter/s)": 0.096345 }, { "epoch": 0.1409060655294586, "grad_norm": 5.358954429626465, "learning_rate": 9.775917069796635e-06, "loss": 0.8796347618103028, "memory(GiB)": 33.07, "step": 3030, "token_acc": 0.7975945017182131, "train_speed(iter/s)": 0.096434 }, { "epoch": 0.14113858378940822, "grad_norm": 4.029304027557373, "learning_rate": 9.77477751064663e-06, "loss": 0.7879987239837647, "memory(GiB)": 33.07, "step": 3035, "token_acc": 0.8097361575622445, "train_speed(iter/s)": 0.096523 }, { "epoch": 0.14137110204935782, "grad_norm": 5.047082424163818, "learning_rate": 9.773635128023263e-06, "loss": 0.7463294506072998, "memory(GiB)": 33.07, "step": 3040, "token_acc": 0.815464587394412, "train_speed(iter/s)": 0.09661 }, { "epoch": 0.14160362030930743, "grad_norm": 4.665285587310791, "learning_rate": 9.772489922602064e-06, "loss": 0.7370441913604736, "memory(GiB)": 33.07, "step": 3045, "token_acc": 0.8179551122194514, "train_speed(iter/s)": 0.096699 }, { "epoch": 0.141836138569257, "grad_norm": 3.9613771438598633, "learning_rate": 9.771341895060223e-06, "loss": 0.8843655586242676, "memory(GiB)": 33.07, "step": 3050, "token_acc": 0.77269800386349, "train_speed(iter/s)": 0.096789 }, { "epoch": 0.141836138569257, "eval_loss": 0.7323087453842163, "eval_runtime": 291.6112, "eval_samples_per_second": 11.917, "eval_steps_per_second": 11.917, "step": 3050 }, { "epoch": 0.1420686568292066, "grad_norm": 4.098989963531494, "learning_rate": 9.770191046076609e-06, "loss": 0.8098397254943848, "memory(GiB)": 33.07, "step": 3055, "token_acc": 0.7947033102312605, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.1423011750891562, "grad_norm": 5.723684787750244, "learning_rate": 9.769037376331752e-06, "loss": 0.7382633686065674, "memory(GiB)": 33.07, "step": 3060, "token_acc": 0.8312332951508209, "train_speed(iter/s)": 0.096076 }, { "epoch": 0.14253369334910582, "grad_norm": 4.521707057952881, "learning_rate": 9.767880886507853e-06, "loss": 0.9055806159973144, "memory(GiB)": 33.07, "step": 3065, "token_acc": 0.7789291882556131, "train_speed(iter/s)": 0.096164 }, { "epoch": 0.14276621160905542, "grad_norm": 5.3605732917785645, "learning_rate": 9.766721577288781e-06, "loss": 0.822453784942627, "memory(GiB)": 33.07, "step": 3070, "token_acc": 0.7985028072364317, "train_speed(iter/s)": 0.096251 }, { "epoch": 0.14299872986900503, "grad_norm": 5.691252708435059, "learning_rate": 9.76555944936007e-06, "loss": 0.9621199607849121, "memory(GiB)": 33.07, "step": 3075, "token_acc": 0.7724851143842055, "train_speed(iter/s)": 0.096336 }, { "epoch": 0.14323124812895463, "grad_norm": 6.443828105926514, "learning_rate": 9.764394503408922e-06, "loss": 0.8782764434814453, "memory(GiB)": 33.07, "step": 3080, "token_acc": 0.8062077198567449, "train_speed(iter/s)": 0.096424 }, { "epoch": 0.14346376638890423, "grad_norm": 5.440097808837891, "learning_rate": 9.763226740124209e-06, "loss": 0.7793534278869629, "memory(GiB)": 33.07, "step": 3085, "token_acc": 0.8125247720967103, "train_speed(iter/s)": 0.096513 }, { "epoch": 0.14369628464885384, "grad_norm": 6.047162055969238, "learning_rate": 9.76205616019646e-06, "loss": 0.7774827480316162, "memory(GiB)": 33.07, "step": 3090, "token_acc": 0.8274209012464045, "train_speed(iter/s)": 0.096601 }, { "epoch": 0.14392880290880344, "grad_norm": 4.667301654815674, "learning_rate": 9.760882764317879e-06, "loss": 0.8556358337402343, "memory(GiB)": 33.07, "step": 3095, "token_acc": 0.7904670505438259, "train_speed(iter/s)": 0.096685 }, { "epoch": 0.14416132116875302, "grad_norm": 4.598082065582275, "learning_rate": 9.75970655318233e-06, "loss": 0.7539079189300537, "memory(GiB)": 33.07, "step": 3100, "token_acc": 0.8225, "train_speed(iter/s)": 0.096774 }, { "epoch": 0.14416132116875302, "eval_loss": 0.7284711003303528, "eval_runtime": 293.8493, "eval_samples_per_second": 11.826, "eval_steps_per_second": 11.826, "step": 3100 }, { "epoch": 0.14439383942870263, "grad_norm": 5.339951992034912, "learning_rate": 9.758527527485342e-06, "loss": 0.7430771350860595, "memory(GiB)": 33.07, "step": 3105, "token_acc": 0.7953026327403449, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.14462635768865223, "grad_norm": 5.519880294799805, "learning_rate": 9.757345687924112e-06, "loss": 0.8814563751220703, "memory(GiB)": 33.07, "step": 3110, "token_acc": 0.801297371116422, "train_speed(iter/s)": 0.096069 }, { "epoch": 0.14485887594860183, "grad_norm": 4.690358638763428, "learning_rate": 9.756161035197495e-06, "loss": 0.8213804244995118, "memory(GiB)": 33.07, "step": 3115, "token_acc": 0.7810784630287375, "train_speed(iter/s)": 0.096154 }, { "epoch": 0.14509139420855144, "grad_norm": 5.900088787078857, "learning_rate": 9.754973570006014e-06, "loss": 0.7102549076080322, "memory(GiB)": 33.07, "step": 3120, "token_acc": 0.8243793997776955, "train_speed(iter/s)": 0.09624 }, { "epoch": 0.14532391246850104, "grad_norm": 5.579226970672607, "learning_rate": 9.753783293051854e-06, "loss": 0.6997756481170654, "memory(GiB)": 33.07, "step": 3125, "token_acc": 0.818785578747628, "train_speed(iter/s)": 0.096327 }, { "epoch": 0.14555643072845065, "grad_norm": 4.501831531524658, "learning_rate": 9.752590205038863e-06, "loss": 0.9300002098083496, "memory(GiB)": 33.07, "step": 3130, "token_acc": 0.7713385826771654, "train_speed(iter/s)": 0.096412 }, { "epoch": 0.14578894898840025, "grad_norm": 5.4228034019470215, "learning_rate": 9.75139430667255e-06, "loss": 0.7476221561431885, "memory(GiB)": 33.07, "step": 3135, "token_acc": 0.8206948076204706, "train_speed(iter/s)": 0.096499 }, { "epoch": 0.14602146724834986, "grad_norm": 5.298704624176025, "learning_rate": 9.750195598660088e-06, "loss": 0.7928246021270752, "memory(GiB)": 33.07, "step": 3140, "token_acc": 0.8, "train_speed(iter/s)": 0.096581 }, { "epoch": 0.14625398550829946, "grad_norm": 5.123478889465332, "learning_rate": 9.748994081710308e-06, "loss": 0.9278718948364257, "memory(GiB)": 33.07, "step": 3145, "token_acc": 0.7674144037780402, "train_speed(iter/s)": 0.096666 }, { "epoch": 0.14648650376824904, "grad_norm": 6.589613437652588, "learning_rate": 9.747789756533706e-06, "loss": 0.8111718177795411, "memory(GiB)": 33.07, "step": 3150, "token_acc": 0.7971469329529244, "train_speed(iter/s)": 0.096752 }, { "epoch": 0.14648650376824904, "eval_loss": 0.7266234159469604, "eval_runtime": 294.8453, "eval_samples_per_second": 11.786, "eval_steps_per_second": 11.786, "step": 3150 }, { "epoch": 0.14671902202819864, "grad_norm": 5.1463942527771, "learning_rate": 9.746582623842434e-06, "loss": 0.7378671169281006, "memory(GiB)": 33.07, "step": 3155, "token_acc": 0.7954281596541094, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.14695154028814825, "grad_norm": 4.170019626617432, "learning_rate": 9.745372684350309e-06, "loss": 0.8750859260559082, "memory(GiB)": 33.07, "step": 3160, "token_acc": 0.7874066861408634, "train_speed(iter/s)": 0.096054 }, { "epoch": 0.14718405854809785, "grad_norm": 5.746112823486328, "learning_rate": 9.744159938772807e-06, "loss": 0.7857075691223144, "memory(GiB)": 33.07, "step": 3165, "token_acc": 0.8218029350104822, "train_speed(iter/s)": 0.096138 }, { "epoch": 0.14741657680804746, "grad_norm": 5.502606391906738, "learning_rate": 9.742944387827059e-06, "loss": 0.8238849639892578, "memory(GiB)": 33.07, "step": 3170, "token_acc": 0.7904761904761904, "train_speed(iter/s)": 0.096224 }, { "epoch": 0.14764909506799706, "grad_norm": 5.353701114654541, "learning_rate": 9.741726032231858e-06, "loss": 0.832034683227539, "memory(GiB)": 33.07, "step": 3175, "token_acc": 0.8003101977510663, "train_speed(iter/s)": 0.096309 }, { "epoch": 0.14788161332794666, "grad_norm": 4.378777980804443, "learning_rate": 9.740504872707656e-06, "loss": 0.9383623123168945, "memory(GiB)": 33.07, "step": 3180, "token_acc": 0.7763684913217623, "train_speed(iter/s)": 0.096394 }, { "epoch": 0.14811413158789627, "grad_norm": 6.112171173095703, "learning_rate": 9.739280909976566e-06, "loss": 0.8390913963317871, "memory(GiB)": 33.07, "step": 3185, "token_acc": 0.8102409638554217, "train_speed(iter/s)": 0.09648 }, { "epoch": 0.14834664984784587, "grad_norm": 7.882232666015625, "learning_rate": 9.738054144762347e-06, "loss": 0.7900448322296143, "memory(GiB)": 33.07, "step": 3190, "token_acc": 0.807822489657766, "train_speed(iter/s)": 0.096562 }, { "epoch": 0.14857916810779548, "grad_norm": 5.705371856689453, "learning_rate": 9.736824577790428e-06, "loss": 0.8507783889770508, "memory(GiB)": 33.07, "step": 3195, "token_acc": 0.7648448043184886, "train_speed(iter/s)": 0.096644 }, { "epoch": 0.14881168636774506, "grad_norm": 4.271676063537598, "learning_rate": 9.735592209787893e-06, "loss": 0.77920503616333, "memory(GiB)": 33.07, "step": 3200, "token_acc": 0.8135593220338984, "train_speed(iter/s)": 0.096727 }, { "epoch": 0.14881168636774506, "eval_loss": 0.7255586981773376, "eval_runtime": 291.2062, "eval_samples_per_second": 11.933, "eval_steps_per_second": 11.933, "step": 3200 }, { "epoch": 0.14904420462769466, "grad_norm": 5.626006603240967, "learning_rate": 9.734357041483473e-06, "loss": 0.8095316886901855, "memory(GiB)": 33.07, "step": 3205, "token_acc": 0.7956126741414885, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.14927672288764426, "grad_norm": 5.581325531005859, "learning_rate": 9.733119073607563e-06, "loss": 0.9211545944213867, "memory(GiB)": 33.07, "step": 3210, "token_acc": 0.7900207900207901, "train_speed(iter/s)": 0.096046 }, { "epoch": 0.14950924114759387, "grad_norm": 4.774792671203613, "learning_rate": 9.731878306892213e-06, "loss": 0.7863685607910156, "memory(GiB)": 33.07, "step": 3215, "token_acc": 0.8199121522693997, "train_speed(iter/s)": 0.09613 }, { "epoch": 0.14974175940754347, "grad_norm": 6.042647838592529, "learning_rate": 9.730634742071128e-06, "loss": 0.7859435081481934, "memory(GiB)": 33.07, "step": 3220, "token_acc": 0.8172798677139314, "train_speed(iter/s)": 0.096213 }, { "epoch": 0.14997427766749308, "grad_norm": 6.267547607421875, "learning_rate": 9.729388379879663e-06, "loss": 0.8451736450195313, "memory(GiB)": 33.07, "step": 3225, "token_acc": 0.7858958068614994, "train_speed(iter/s)": 0.096294 }, { "epoch": 0.15020679592744268, "grad_norm": 5.5586347579956055, "learning_rate": 9.728139221054833e-06, "loss": 0.8673666000366211, "memory(GiB)": 33.07, "step": 3230, "token_acc": 0.8081487341772152, "train_speed(iter/s)": 0.096378 }, { "epoch": 0.1504393141873923, "grad_norm": 4.9623823165893555, "learning_rate": 9.726887266335302e-06, "loss": 0.7912930011749267, "memory(GiB)": 33.07, "step": 3235, "token_acc": 0.812361049355269, "train_speed(iter/s)": 0.096461 }, { "epoch": 0.1506718324473419, "grad_norm": 3.856882095336914, "learning_rate": 9.72563251646139e-06, "loss": 0.7687274932861328, "memory(GiB)": 33.07, "step": 3240, "token_acc": 0.8265987909640471, "train_speed(iter/s)": 0.096543 }, { "epoch": 0.1509043507072915, "grad_norm": 6.322175025939941, "learning_rate": 9.72437497217507e-06, "loss": 0.8785791397094727, "memory(GiB)": 33.07, "step": 3245, "token_acc": 0.7953172205438066, "train_speed(iter/s)": 0.096625 }, { "epoch": 0.15113686896724107, "grad_norm": 5.029903411865234, "learning_rate": 9.723114634219968e-06, "loss": 0.7664390563964844, "memory(GiB)": 33.07, "step": 3250, "token_acc": 0.8135489777235276, "train_speed(iter/s)": 0.096708 }, { "epoch": 0.15113686896724107, "eval_loss": 0.7299229502677917, "eval_runtime": 294.3976, "eval_samples_per_second": 11.804, "eval_steps_per_second": 11.804, "step": 3250 }, { "epoch": 0.15136938722719068, "grad_norm": 4.417317867279053, "learning_rate": 9.721851503341357e-06, "loss": 0.8709222793579101, "memory(GiB)": 33.07, "step": 3255, "token_acc": 0.7953129374125175, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.15160190548714028, "grad_norm": 5.620341777801514, "learning_rate": 9.72058558028617e-06, "loss": 0.7596760272979737, "memory(GiB)": 33.07, "step": 3260, "token_acc": 0.8017127799736495, "train_speed(iter/s)": 0.096033 }, { "epoch": 0.15183442374708989, "grad_norm": 4.787825107574463, "learning_rate": 9.719316865802983e-06, "loss": 0.7746444702148437, "memory(GiB)": 33.07, "step": 3265, "token_acc": 0.8201466615206484, "train_speed(iter/s)": 0.096116 }, { "epoch": 0.1520669420070395, "grad_norm": 4.002923965454102, "learning_rate": 9.718045360642028e-06, "loss": 0.8768243789672852, "memory(GiB)": 33.07, "step": 3270, "token_acc": 0.778902677988243, "train_speed(iter/s)": 0.096199 }, { "epoch": 0.1522994602669891, "grad_norm": 6.7627272605896, "learning_rate": 9.716771065555184e-06, "loss": 0.7489274978637696, "memory(GiB)": 33.07, "step": 3275, "token_acc": 0.8207650273224044, "train_speed(iter/s)": 0.096282 }, { "epoch": 0.1525319785269387, "grad_norm": 4.953536033630371, "learning_rate": 9.71549398129598e-06, "loss": 0.8561309814453125, "memory(GiB)": 33.07, "step": 3280, "token_acc": 0.7854435178165277, "train_speed(iter/s)": 0.096362 }, { "epoch": 0.1527644967868883, "grad_norm": 5.244431018829346, "learning_rate": 9.7142141086196e-06, "loss": 0.8115564346313476, "memory(GiB)": 33.07, "step": 3285, "token_acc": 0.7993816557883888, "train_speed(iter/s)": 0.096444 }, { "epoch": 0.1529970150468379, "grad_norm": 5.603392601013184, "learning_rate": 9.712931448282864e-06, "loss": 0.9105894088745117, "memory(GiB)": 33.07, "step": 3290, "token_acc": 0.76864, "train_speed(iter/s)": 0.096527 }, { "epoch": 0.15322953330678749, "grad_norm": 5.889342784881592, "learning_rate": 9.711646001044259e-06, "loss": 0.7476118087768555, "memory(GiB)": 33.07, "step": 3295, "token_acc": 0.8253218884120171, "train_speed(iter/s)": 0.09661 }, { "epoch": 0.1534620515667371, "grad_norm": 5.446425437927246, "learning_rate": 9.7103577676639e-06, "loss": 0.8373539924621582, "memory(GiB)": 33.07, "step": 3300, "token_acc": 0.8043965903992822, "train_speed(iter/s)": 0.096693 }, { "epoch": 0.1534620515667371, "eval_loss": 0.7242446541786194, "eval_runtime": 290.5903, "eval_samples_per_second": 11.958, "eval_steps_per_second": 11.958, "step": 3300 }, { "epoch": 0.1536945698266867, "grad_norm": 6.903223991394043, "learning_rate": 9.709066748903566e-06, "loss": 0.784368896484375, "memory(GiB)": 33.07, "step": 3305, "token_acc": 0.7966411332970097, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.1539270880866363, "grad_norm": 4.271733283996582, "learning_rate": 9.707772945526672e-06, "loss": 0.8733412742614746, "memory(GiB)": 33.07, "step": 3310, "token_acc": 0.7872619829284307, "train_speed(iter/s)": 0.096037 }, { "epoch": 0.1541596063465859, "grad_norm": 5.4446940422058105, "learning_rate": 9.706476358298286e-06, "loss": 0.8001057624816894, "memory(GiB)": 33.07, "step": 3315, "token_acc": 0.8016831320892792, "train_speed(iter/s)": 0.09612 }, { "epoch": 0.1543921246065355, "grad_norm": 5.717871189117432, "learning_rate": 9.70517698798512e-06, "loss": 0.7779604911804199, "memory(GiB)": 33.07, "step": 3320, "token_acc": 0.8100538599640934, "train_speed(iter/s)": 0.096202 }, { "epoch": 0.1546246428664851, "grad_norm": 4.912775039672852, "learning_rate": 9.703874835355533e-06, "loss": 0.77957763671875, "memory(GiB)": 33.07, "step": 3325, "token_acc": 0.800769442154438, "train_speed(iter/s)": 0.096284 }, { "epoch": 0.15485716112643472, "grad_norm": 5.523812294006348, "learning_rate": 9.702569901179524e-06, "loss": 0.8424034118652344, "memory(GiB)": 33.07, "step": 3330, "token_acc": 0.7964601769911505, "train_speed(iter/s)": 0.096364 }, { "epoch": 0.15508967938638432, "grad_norm": 5.215954303741455, "learning_rate": 9.701262186228744e-06, "loss": 0.7897037029266357, "memory(GiB)": 33.07, "step": 3335, "token_acc": 0.8032228075612023, "train_speed(iter/s)": 0.096438 }, { "epoch": 0.15532219764633393, "grad_norm": 4.996734142303467, "learning_rate": 9.699951691276486e-06, "loss": 0.7758293151855469, "memory(GiB)": 33.07, "step": 3340, "token_acc": 0.8208223311957752, "train_speed(iter/s)": 0.09652 }, { "epoch": 0.1555547159062835, "grad_norm": 5.724847793579102, "learning_rate": 9.698638417097683e-06, "loss": 0.7793337821960449, "memory(GiB)": 33.07, "step": 3345, "token_acc": 0.8164094232331438, "train_speed(iter/s)": 0.096602 }, { "epoch": 0.1557872341662331, "grad_norm": 6.377450466156006, "learning_rate": 9.697322364468917e-06, "loss": 0.8371264457702636, "memory(GiB)": 33.07, "step": 3350, "token_acc": 0.7927991528415108, "train_speed(iter/s)": 0.096683 }, { "epoch": 0.1557872341662331, "eval_loss": 0.7258186340332031, "eval_runtime": 292.1213, "eval_samples_per_second": 11.896, "eval_steps_per_second": 11.896, "step": 3350 }, { "epoch": 0.1560197524261827, "grad_norm": 6.011385440826416, "learning_rate": 9.69600353416841e-06, "loss": 0.798521089553833, "memory(GiB)": 33.07, "step": 3355, "token_acc": 0.7971320658088471, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.15625227068613232, "grad_norm": 6.298033237457275, "learning_rate": 9.694681926976025e-06, "loss": 0.7486701488494873, "memory(GiB)": 33.07, "step": 3360, "token_acc": 0.8195459792227779, "train_speed(iter/s)": 0.096035 }, { "epoch": 0.15648478894608192, "grad_norm": 6.298468112945557, "learning_rate": 9.693357543673274e-06, "loss": 0.8666628837585449, "memory(GiB)": 33.07, "step": 3365, "token_acc": 0.7910060536177573, "train_speed(iter/s)": 0.096112 }, { "epoch": 0.15671730720603153, "grad_norm": 5.438319206237793, "learning_rate": 9.6920303850433e-06, "loss": 0.6734135627746582, "memory(GiB)": 33.07, "step": 3370, "token_acc": 0.8503155996393147, "train_speed(iter/s)": 0.096192 }, { "epoch": 0.15694982546598113, "grad_norm": 5.32026481628418, "learning_rate": 9.690700451870898e-06, "loss": 0.8130708694458008, "memory(GiB)": 33.07, "step": 3375, "token_acc": 0.8077761627906976, "train_speed(iter/s)": 0.096272 }, { "epoch": 0.15718234372593073, "grad_norm": 5.873499870300293, "learning_rate": 9.689367744942494e-06, "loss": 0.7929094314575196, "memory(GiB)": 33.07, "step": 3380, "token_acc": 0.8097868981846882, "train_speed(iter/s)": 0.096354 }, { "epoch": 0.15741486198588034, "grad_norm": 6.561089992523193, "learning_rate": 9.688032265046162e-06, "loss": 0.6968857765197753, "memory(GiB)": 33.07, "step": 3385, "token_acc": 0.851013672795851, "train_speed(iter/s)": 0.096434 }, { "epoch": 0.15764738024582994, "grad_norm": 6.442429065704346, "learning_rate": 9.686694012971612e-06, "loss": 0.8454565048217774, "memory(GiB)": 33.07, "step": 3390, "token_acc": 0.7944564434845213, "train_speed(iter/s)": 0.096514 }, { "epoch": 0.15787989850577952, "grad_norm": 5.067663669586182, "learning_rate": 9.685352989510193e-06, "loss": 0.7534542083740234, "memory(GiB)": 33.07, "step": 3395, "token_acc": 0.8213627992633518, "train_speed(iter/s)": 0.096595 }, { "epoch": 0.15811241676572912, "grad_norm": 3.8264877796173096, "learning_rate": 9.684009195454893e-06, "loss": 0.9291213989257813, "memory(GiB)": 33.07, "step": 3400, "token_acc": 0.7741023466214306, "train_speed(iter/s)": 0.096675 }, { "epoch": 0.15811241676572912, "eval_loss": 0.7275504469871521, "eval_runtime": 294.5423, "eval_samples_per_second": 11.798, "eval_steps_per_second": 11.798, "step": 3400 }, { "epoch": 0.15834493502567873, "grad_norm": 6.075207710266113, "learning_rate": 9.68266263160034e-06, "loss": 0.7632218360900879, "memory(GiB)": 33.07, "step": 3405, "token_acc": 0.7977882080627532, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.15857745328562833, "grad_norm": 5.3615336418151855, "learning_rate": 9.681313298742798e-06, "loss": 0.8492207527160645, "memory(GiB)": 33.07, "step": 3410, "token_acc": 0.7734967892586107, "train_speed(iter/s)": 0.09603 }, { "epoch": 0.15880997154557794, "grad_norm": 4.396504878997803, "learning_rate": 9.67996119768017e-06, "loss": 0.7675450325012207, "memory(GiB)": 33.07, "step": 3415, "token_acc": 0.8188755020080322, "train_speed(iter/s)": 0.096108 }, { "epoch": 0.15904248980552754, "grad_norm": 5.461509704589844, "learning_rate": 9.678606329211995e-06, "loss": 0.8365023612976075, "memory(GiB)": 33.07, "step": 3420, "token_acc": 0.7900235928547354, "train_speed(iter/s)": 0.096186 }, { "epoch": 0.15927500806547715, "grad_norm": 6.789258003234863, "learning_rate": 9.677248694139447e-06, "loss": 0.86660737991333, "memory(GiB)": 33.07, "step": 3425, "token_acc": 0.7959479015918958, "train_speed(iter/s)": 0.096262 }, { "epoch": 0.15950752632542675, "grad_norm": 5.055928707122803, "learning_rate": 9.675888293265341e-06, "loss": 0.729840087890625, "memory(GiB)": 33.07, "step": 3430, "token_acc": 0.8189922480620155, "train_speed(iter/s)": 0.096339 }, { "epoch": 0.15974004458537636, "grad_norm": 5.088936805725098, "learning_rate": 9.674525127394122e-06, "loss": 0.9462801933288574, "memory(GiB)": 33.07, "step": 3435, "token_acc": 0.7601904195180006, "train_speed(iter/s)": 0.096415 }, { "epoch": 0.15997256284532596, "grad_norm": 4.846744060516357, "learning_rate": 9.67315919733187e-06, "loss": 0.8493914604187012, "memory(GiB)": 33.07, "step": 3440, "token_acc": 0.8003169572107766, "train_speed(iter/s)": 0.096494 }, { "epoch": 0.16020508110527554, "grad_norm": 8.210926055908203, "learning_rate": 9.671790503886304e-06, "loss": 0.7502418518066406, "memory(GiB)": 33.07, "step": 3445, "token_acc": 0.8145896656534954, "train_speed(iter/s)": 0.096573 }, { "epoch": 0.16043759936522514, "grad_norm": 5.043862342834473, "learning_rate": 9.670419047866776e-06, "loss": 0.7558164119720459, "memory(GiB)": 33.07, "step": 3450, "token_acc": 0.8108108108108109, "train_speed(iter/s)": 0.09665 }, { "epoch": 0.16043759936522514, "eval_loss": 0.7215369939804077, "eval_runtime": 292.7346, "eval_samples_per_second": 11.871, "eval_steps_per_second": 11.871, "step": 3450 }, { "epoch": 0.16067011762517475, "grad_norm": 5.069505214691162, "learning_rate": 9.669044830084266e-06, "loss": 0.9493141174316406, "memory(GiB)": 33.07, "step": 3455, "token_acc": 0.7972598793194513, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.16090263588512435, "grad_norm": 5.921719074249268, "learning_rate": 9.667667851351394e-06, "loss": 0.8479342460632324, "memory(GiB)": 33.07, "step": 3460, "token_acc": 0.7883467883467884, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.16113515414507396, "grad_norm": 5.095674991607666, "learning_rate": 9.666288112482411e-06, "loss": 0.7614772319793701, "memory(GiB)": 33.07, "step": 3465, "token_acc": 0.8230712166172107, "train_speed(iter/s)": 0.096097 }, { "epoch": 0.16136767240502356, "grad_norm": 5.0426530838012695, "learning_rate": 9.664905614293198e-06, "loss": 0.7405894279479981, "memory(GiB)": 33.07, "step": 3470, "token_acc": 0.8121739130434783, "train_speed(iter/s)": 0.096174 }, { "epoch": 0.16160019066497316, "grad_norm": 4.290180206298828, "learning_rate": 9.66352035760127e-06, "loss": 0.8251392364501953, "memory(GiB)": 33.07, "step": 3475, "token_acc": 0.7998174627319744, "train_speed(iter/s)": 0.09625 }, { "epoch": 0.16183270892492277, "grad_norm": 6.105207443237305, "learning_rate": 9.66213234322577e-06, "loss": 0.9235754013061523, "memory(GiB)": 33.07, "step": 3480, "token_acc": 0.771029555050341, "train_speed(iter/s)": 0.096327 }, { "epoch": 0.16206522718487237, "grad_norm": 5.007493019104004, "learning_rate": 9.660741571987476e-06, "loss": 0.7650205135345459, "memory(GiB)": 33.07, "step": 3485, "token_acc": 0.8162409454822722, "train_speed(iter/s)": 0.096405 }, { "epoch": 0.16229774544482195, "grad_norm": 3.939481496810913, "learning_rate": 9.659348044708791e-06, "loss": 0.8588067054748535, "memory(GiB)": 33.07, "step": 3490, "token_acc": 0.7917938284164123, "train_speed(iter/s)": 0.096482 }, { "epoch": 0.16253026370477155, "grad_norm": 5.938999176025391, "learning_rate": 9.657951762213754e-06, "loss": 0.6494212627410889, "memory(GiB)": 33.07, "step": 3495, "token_acc": 0.836876691148048, "train_speed(iter/s)": 0.096559 }, { "epoch": 0.16276278196472116, "grad_norm": 5.342368125915527, "learning_rate": 9.656552725328028e-06, "loss": 0.8250043869018555, "memory(GiB)": 33.07, "step": 3500, "token_acc": 0.7960215778826703, "train_speed(iter/s)": 0.096637 }, { "epoch": 0.16276278196472116, "eval_loss": 0.7176188826560974, "eval_runtime": 296.5613, "eval_samples_per_second": 11.718, "eval_steps_per_second": 11.718, "step": 3500 }, { "epoch": 0.16299530022467076, "grad_norm": 5.9379353523254395, "learning_rate": 9.655150934878907e-06, "loss": 0.9059648513793945, "memory(GiB)": 33.07, "step": 3505, "token_acc": 0.7976547804858526, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.16322781848462037, "grad_norm": 4.100049018859863, "learning_rate": 9.653746391695314e-06, "loss": 0.8847969055175782, "memory(GiB)": 33.07, "step": 3510, "token_acc": 0.7722834645669291, "train_speed(iter/s)": 0.096005 }, { "epoch": 0.16346033674456997, "grad_norm": 3.792137861251831, "learning_rate": 9.652339096607796e-06, "loss": 0.796082878112793, "memory(GiB)": 33.07, "step": 3515, "token_acc": 0.804725959960617, "train_speed(iter/s)": 0.096082 }, { "epoch": 0.16369285500451958, "grad_norm": 6.276231288909912, "learning_rate": 9.650929050448534e-06, "loss": 0.7875662803649902, "memory(GiB)": 33.07, "step": 3520, "token_acc": 0.7962891379976808, "train_speed(iter/s)": 0.096161 }, { "epoch": 0.16392537326446918, "grad_norm": 5.953547477722168, "learning_rate": 9.649516254051327e-06, "loss": 0.7139126300811768, "memory(GiB)": 33.07, "step": 3525, "token_acc": 0.8333924140375754, "train_speed(iter/s)": 0.096242 }, { "epoch": 0.1641578915244188, "grad_norm": 6.996596336364746, "learning_rate": 9.648100708251612e-06, "loss": 0.8537234306335449, "memory(GiB)": 33.07, "step": 3530, "token_acc": 0.7948618139353835, "train_speed(iter/s)": 0.09632 }, { "epoch": 0.1643904097843684, "grad_norm": 5.0464396476745605, "learning_rate": 9.646682413886437e-06, "loss": 0.7961117267608643, "memory(GiB)": 33.07, "step": 3535, "token_acc": 0.8100498930862438, "train_speed(iter/s)": 0.096397 }, { "epoch": 0.16462292804431797, "grad_norm": 6.0903167724609375, "learning_rate": 9.64526137179449e-06, "loss": 0.906730842590332, "memory(GiB)": 33.07, "step": 3540, "token_acc": 0.7886123423116264, "train_speed(iter/s)": 0.096476 }, { "epoch": 0.16485544630426757, "grad_norm": 7.942210674285889, "learning_rate": 9.643837582816071e-06, "loss": 0.8420848846435547, "memory(GiB)": 33.07, "step": 3545, "token_acc": 0.7900228236061297, "train_speed(iter/s)": 0.096552 }, { "epoch": 0.16508796456421718, "grad_norm": 5.138957500457764, "learning_rate": 9.642411047793115e-06, "loss": 0.7483475685119629, "memory(GiB)": 33.07, "step": 3550, "token_acc": 0.8143631436314364, "train_speed(iter/s)": 0.096631 }, { "epoch": 0.16508796456421718, "eval_loss": 0.7186556458473206, "eval_runtime": 294.7293, "eval_samples_per_second": 11.79, "eval_steps_per_second": 11.79, "step": 3550 }, { "epoch": 0.16532048282416678, "grad_norm": 6.104626655578613, "learning_rate": 9.640981767569176e-06, "loss": 0.8763669967651367, "memory(GiB)": 33.07, "step": 3555, "token_acc": 0.7971994052115554, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.16555300108411639, "grad_norm": 6.483398914337158, "learning_rate": 9.63954974298943e-06, "loss": 0.9383742332458496, "memory(GiB)": 33.07, "step": 3560, "token_acc": 0.7736309731426578, "train_speed(iter/s)": 0.096013 }, { "epoch": 0.165785519344066, "grad_norm": 5.247052192687988, "learning_rate": 9.638114974900675e-06, "loss": 0.801731014251709, "memory(GiB)": 33.07, "step": 3565, "token_acc": 0.802016129032258, "train_speed(iter/s)": 0.096088 }, { "epoch": 0.1660180376040156, "grad_norm": 4.652188301086426, "learning_rate": 9.636677464151339e-06, "loss": 0.7853640079498291, "memory(GiB)": 33.07, "step": 3570, "token_acc": 0.813795702977761, "train_speed(iter/s)": 0.096165 }, { "epoch": 0.1662505558639652, "grad_norm": 4.874006748199463, "learning_rate": 9.635237211591461e-06, "loss": 0.7610373497009277, "memory(GiB)": 33.07, "step": 3575, "token_acc": 0.8124118476727785, "train_speed(iter/s)": 0.09624 }, { "epoch": 0.1664830741239148, "grad_norm": 6.398994445800781, "learning_rate": 9.633794218072711e-06, "loss": 0.7691407203674316, "memory(GiB)": 33.07, "step": 3580, "token_acc": 0.803083391730904, "train_speed(iter/s)": 0.096314 }, { "epoch": 0.1667155923838644, "grad_norm": 5.58176851272583, "learning_rate": 9.632348484448375e-06, "loss": 0.7621356964111328, "memory(GiB)": 33.07, "step": 3585, "token_acc": 0.7978406552494416, "train_speed(iter/s)": 0.096391 }, { "epoch": 0.16694811064381399, "grad_norm": 4.73793888092041, "learning_rate": 9.630900011573358e-06, "loss": 0.8498669624328613, "memory(GiB)": 33.07, "step": 3590, "token_acc": 0.7809806835066865, "train_speed(iter/s)": 0.096467 }, { "epoch": 0.1671806289037636, "grad_norm": 5.0031352043151855, "learning_rate": 9.629448800304189e-06, "loss": 0.8297422409057618, "memory(GiB)": 33.07, "step": 3595, "token_acc": 0.8004722550177096, "train_speed(iter/s)": 0.096543 }, { "epoch": 0.1674131471637132, "grad_norm": 4.638136386871338, "learning_rate": 9.627994851499012e-06, "loss": 0.8547920227050781, "memory(GiB)": 33.07, "step": 3600, "token_acc": 0.8039288361749444, "train_speed(iter/s)": 0.096619 }, { "epoch": 0.1674131471637132, "eval_loss": 0.7175089716911316, "eval_runtime": 293.2233, "eval_samples_per_second": 11.851, "eval_steps_per_second": 11.851, "step": 3600 }, { "epoch": 0.1676456654236628, "grad_norm": 5.295803070068359, "learning_rate": 9.626538166017594e-06, "loss": 0.6819862842559814, "memory(GiB)": 33.07, "step": 3605, "token_acc": 0.7997306548454071, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.1678781836836124, "grad_norm": 6.202095031738281, "learning_rate": 9.625078744721315e-06, "loss": 0.9101020812988281, "memory(GiB)": 33.07, "step": 3610, "token_acc": 0.7704337562702862, "train_speed(iter/s)": 0.096011 }, { "epoch": 0.168110701943562, "grad_norm": 6.182511329650879, "learning_rate": 9.623616588473178e-06, "loss": 0.7675065517425537, "memory(GiB)": 33.07, "step": 3615, "token_acc": 0.8091299019607843, "train_speed(iter/s)": 0.096086 }, { "epoch": 0.1683432202035116, "grad_norm": 5.590625762939453, "learning_rate": 9.622151698137801e-06, "loss": 0.8312381744384766, "memory(GiB)": 33.07, "step": 3620, "token_acc": 0.8003894839337877, "train_speed(iter/s)": 0.09616 }, { "epoch": 0.16857573846346122, "grad_norm": 7.963487148284912, "learning_rate": 9.62068407458142e-06, "loss": 0.6981609344482422, "memory(GiB)": 33.07, "step": 3625, "token_acc": 0.8294966835739368, "train_speed(iter/s)": 0.096236 }, { "epoch": 0.16880825672341082, "grad_norm": 7.476184368133545, "learning_rate": 9.619213718671882e-06, "loss": 0.7124053478240967, "memory(GiB)": 33.07, "step": 3630, "token_acc": 0.8472818311874106, "train_speed(iter/s)": 0.096309 }, { "epoch": 0.16904077498336043, "grad_norm": 5.105494499206543, "learning_rate": 9.617740631278659e-06, "loss": 0.7903679847717285, "memory(GiB)": 33.07, "step": 3635, "token_acc": 0.8045439358503174, "train_speed(iter/s)": 0.096384 }, { "epoch": 0.16927329324331, "grad_norm": 5.298689365386963, "learning_rate": 9.61626481327283e-06, "loss": 0.8809099197387695, "memory(GiB)": 33.07, "step": 3640, "token_acc": 0.8095663835493965, "train_speed(iter/s)": 0.096459 }, { "epoch": 0.1695058115032596, "grad_norm": 5.424408912658691, "learning_rate": 9.614786265527092e-06, "loss": 0.8258963584899902, "memory(GiB)": 33.07, "step": 3645, "token_acc": 0.7723616865708018, "train_speed(iter/s)": 0.096531 }, { "epoch": 0.1697383297632092, "grad_norm": 6.607914924621582, "learning_rate": 9.613304988915754e-06, "loss": 0.8794160842895508, "memory(GiB)": 33.07, "step": 3650, "token_acc": 0.7938718662952646, "train_speed(iter/s)": 0.096605 }, { "epoch": 0.1697383297632092, "eval_loss": 0.7148188948631287, "eval_runtime": 293.6307, "eval_samples_per_second": 11.835, "eval_steps_per_second": 11.835, "step": 3650 }, { "epoch": 0.16997084802315882, "grad_norm": 5.146786212921143, "learning_rate": 9.611820984314746e-06, "loss": 0.7711294651031494, "memory(GiB)": 33.07, "step": 3655, "token_acc": 0.7995883755992884, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.17020336628310842, "grad_norm": 7.579797744750977, "learning_rate": 9.610334252601603e-06, "loss": 0.6962503910064697, "memory(GiB)": 33.07, "step": 3660, "token_acc": 0.8216216216216217, "train_speed(iter/s)": 0.096005 }, { "epoch": 0.17043588454305802, "grad_norm": 3.413147211074829, "learning_rate": 9.608844794655475e-06, "loss": 0.8060155868530273, "memory(GiB)": 33.07, "step": 3665, "token_acc": 0.7950065703022339, "train_speed(iter/s)": 0.096078 }, { "epoch": 0.17066840280300763, "grad_norm": 7.161190986633301, "learning_rate": 9.607352611357123e-06, "loss": 0.6839473247528076, "memory(GiB)": 33.07, "step": 3670, "token_acc": 0.8454003407155025, "train_speed(iter/s)": 0.096151 }, { "epoch": 0.17090092106295723, "grad_norm": 5.271682262420654, "learning_rate": 9.605857703588924e-06, "loss": 0.8989489555358887, "memory(GiB)": 33.07, "step": 3675, "token_acc": 0.798501872659176, "train_speed(iter/s)": 0.096226 }, { "epoch": 0.17113343932290684, "grad_norm": 5.448130130767822, "learning_rate": 9.604360072234861e-06, "loss": 0.7826570987701416, "memory(GiB)": 33.07, "step": 3680, "token_acc": 0.8154887854422345, "train_speed(iter/s)": 0.0963 }, { "epoch": 0.17136595758285642, "grad_norm": 4.372930526733398, "learning_rate": 9.60285971818053e-06, "loss": 0.7660940170288086, "memory(GiB)": 33.07, "step": 3685, "token_acc": 0.8209019947961839, "train_speed(iter/s)": 0.096375 }, { "epoch": 0.17159847584280602, "grad_norm": 4.681722164154053, "learning_rate": 9.601356642313138e-06, "loss": 0.8155523300170898, "memory(GiB)": 33.07, "step": 3690, "token_acc": 0.7983606557377049, "train_speed(iter/s)": 0.096448 }, { "epoch": 0.17183099410275562, "grad_norm": 5.451058864593506, "learning_rate": 9.5998508455215e-06, "loss": 0.737110185623169, "memory(GiB)": 33.07, "step": 3695, "token_acc": 0.8147727272727273, "train_speed(iter/s)": 0.096521 }, { "epoch": 0.17206351236270523, "grad_norm": 6.284342288970947, "learning_rate": 9.598342328696035e-06, "loss": 0.8151021003723145, "memory(GiB)": 33.07, "step": 3700, "token_acc": 0.8129205921938089, "train_speed(iter/s)": 0.096595 }, { "epoch": 0.17206351236270523, "eval_loss": 0.7119737863540649, "eval_runtime": 291.2382, "eval_samples_per_second": 11.932, "eval_steps_per_second": 11.932, "step": 3700 }, { "epoch": 0.17229603062265483, "grad_norm": 5.116153240203857, "learning_rate": 9.596831092728784e-06, "loss": 0.7929253101348877, "memory(GiB)": 33.07, "step": 3705, "token_acc": 0.7995435069875465, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.17252854888260444, "grad_norm": 7.7103590965271, "learning_rate": 9.595317138513383e-06, "loss": 0.7780908584594727, "memory(GiB)": 33.07, "step": 3710, "token_acc": 0.8088341781317886, "train_speed(iter/s)": 0.096008 }, { "epoch": 0.17276106714255404, "grad_norm": 6.119678497314453, "learning_rate": 9.593800466945077e-06, "loss": 0.8345657348632812, "memory(GiB)": 33.07, "step": 3715, "token_acc": 0.803952321204517, "train_speed(iter/s)": 0.096082 }, { "epoch": 0.17299358540250365, "grad_norm": 5.879092693328857, "learning_rate": 9.592281078920729e-06, "loss": 0.8818518638610839, "memory(GiB)": 33.07, "step": 3720, "token_acc": 0.7890543817111189, "train_speed(iter/s)": 0.096154 }, { "epoch": 0.17322610366245325, "grad_norm": 6.872639179229736, "learning_rate": 9.590758975338793e-06, "loss": 0.8579018592834473, "memory(GiB)": 33.07, "step": 3725, "token_acc": 0.7804423128164135, "train_speed(iter/s)": 0.096225 }, { "epoch": 0.17345862192240286, "grad_norm": 6.415402889251709, "learning_rate": 9.589234157099336e-06, "loss": 0.8382321357727051, "memory(GiB)": 33.07, "step": 3730, "token_acc": 0.8034565916398714, "train_speed(iter/s)": 0.096299 }, { "epoch": 0.17369114018235243, "grad_norm": 6.094632625579834, "learning_rate": 9.587706625104035e-06, "loss": 0.7249147415161132, "memory(GiB)": 33.07, "step": 3735, "token_acc": 0.8148740503798481, "train_speed(iter/s)": 0.096372 }, { "epoch": 0.17392365844230204, "grad_norm": 5.106109142303467, "learning_rate": 9.58617638025616e-06, "loss": 0.858515739440918, "memory(GiB)": 33.07, "step": 3740, "token_acc": 0.7806090739589807, "train_speed(iter/s)": 0.096444 }, { "epoch": 0.17415617670225164, "grad_norm": 7.242406845092773, "learning_rate": 9.584643423460599e-06, "loss": 0.7636741638183594, "memory(GiB)": 33.07, "step": 3745, "token_acc": 0.8018543956043956, "train_speed(iter/s)": 0.096516 }, { "epoch": 0.17438869496220125, "grad_norm": 7.00205135345459, "learning_rate": 9.583107755623832e-06, "loss": 0.7448306560516358, "memory(GiB)": 33.07, "step": 3750, "token_acc": 0.8144083969465649, "train_speed(iter/s)": 0.096589 }, { "epoch": 0.17438869496220125, "eval_loss": 0.7071701884269714, "eval_runtime": 290.4999, "eval_samples_per_second": 11.962, "eval_steps_per_second": 11.962, "step": 3750 }, { "epoch": 0.17462121322215085, "grad_norm": 5.753495216369629, "learning_rate": 9.58156937765395e-06, "loss": 0.6926516532897949, "memory(GiB)": 33.07, "step": 3755, "token_acc": 0.8004340981729714, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.17485373148210046, "grad_norm": 4.843967437744141, "learning_rate": 9.58002829046064e-06, "loss": 0.7503583431243896, "memory(GiB)": 33.07, "step": 3760, "token_acc": 0.8114439324116743, "train_speed(iter/s)": 0.09601 }, { "epoch": 0.17508624974205006, "grad_norm": 6.034976959228516, "learning_rate": 9.578484494955196e-06, "loss": 0.7742047786712647, "memory(GiB)": 33.07, "step": 3765, "token_acc": 0.8205445544554455, "train_speed(iter/s)": 0.096081 }, { "epoch": 0.17531876800199966, "grad_norm": 5.2408061027526855, "learning_rate": 9.576937992050515e-06, "loss": 0.7521644115447998, "memory(GiB)": 33.07, "step": 3770, "token_acc": 0.8303755674783326, "train_speed(iter/s)": 0.096153 }, { "epoch": 0.17555128626194927, "grad_norm": 5.320487976074219, "learning_rate": 9.575388782661086e-06, "loss": 0.7956186294555664, "memory(GiB)": 33.07, "step": 3775, "token_acc": 0.8172245204964272, "train_speed(iter/s)": 0.096223 }, { "epoch": 0.17578380452189887, "grad_norm": 5.391150951385498, "learning_rate": 9.573836867703007e-06, "loss": 0.8982381820678711, "memory(GiB)": 33.07, "step": 3780, "token_acc": 0.7747972551466001, "train_speed(iter/s)": 0.096291 }, { "epoch": 0.17601632278184845, "grad_norm": 4.874892234802246, "learning_rate": 9.572282248093976e-06, "loss": 0.9775179862976074, "memory(GiB)": 33.07, "step": 3785, "token_acc": 0.7860677578987438, "train_speed(iter/s)": 0.096362 }, { "epoch": 0.17624884104179805, "grad_norm": 5.059951305389404, "learning_rate": 9.570724924753284e-06, "loss": 0.7578266143798829, "memory(GiB)": 33.07, "step": 3790, "token_acc": 0.805045871559633, "train_speed(iter/s)": 0.096433 }, { "epoch": 0.17648135930174766, "grad_norm": 4.954115867614746, "learning_rate": 9.569164898601826e-06, "loss": 0.8646341323852539, "memory(GiB)": 33.07, "step": 3795, "token_acc": 0.7873303167420814, "train_speed(iter/s)": 0.096505 }, { "epoch": 0.17671387756169726, "grad_norm": 5.7353057861328125, "learning_rate": 9.567602170562092e-06, "loss": 0.8507672309875488, "memory(GiB)": 33.07, "step": 3800, "token_acc": 0.7837078651685393, "train_speed(iter/s)": 0.096577 }, { "epoch": 0.17671387756169726, "eval_loss": 0.7128520011901855, "eval_runtime": 290.7916, "eval_samples_per_second": 11.95, "eval_steps_per_second": 11.95, "step": 3800 }, { "epoch": 0.17694639582164687, "grad_norm": 5.397838592529297, "learning_rate": 9.566036741558173e-06, "loss": 0.8797189712524414, "memory(GiB)": 33.07, "step": 3805, "token_acc": 0.7995074755331697, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.17717891408159647, "grad_norm": 4.222261905670166, "learning_rate": 9.564468612515756e-06, "loss": 0.8522190093994141, "memory(GiB)": 33.07, "step": 3810, "token_acc": 0.796123226029768, "train_speed(iter/s)": 0.09601 }, { "epoch": 0.17741143234154608, "grad_norm": 4.521677017211914, "learning_rate": 9.562897784362121e-06, "loss": 0.7106293678283692, "memory(GiB)": 33.07, "step": 3815, "token_acc": 0.8250936329588014, "train_speed(iter/s)": 0.096079 }, { "epoch": 0.17764395060149568, "grad_norm": 5.949578285217285, "learning_rate": 9.561324258026151e-06, "loss": 0.8217846870422363, "memory(GiB)": 33.07, "step": 3820, "token_acc": 0.8033730974907446, "train_speed(iter/s)": 0.09615 }, { "epoch": 0.17787646886144529, "grad_norm": 5.23591947555542, "learning_rate": 9.559748034438319e-06, "loss": 0.7070714950561523, "memory(GiB)": 33.07, "step": 3825, "token_acc": 0.8170782754290498, "train_speed(iter/s)": 0.09622 }, { "epoch": 0.1781089871213949, "grad_norm": 7.460520267486572, "learning_rate": 9.558169114530694e-06, "loss": 0.6729435920715332, "memory(GiB)": 33.07, "step": 3830, "token_acc": 0.8424681144432954, "train_speed(iter/s)": 0.096292 }, { "epoch": 0.17834150538134447, "grad_norm": 5.60178279876709, "learning_rate": 9.556587499236942e-06, "loss": 0.7683042049407959, "memory(GiB)": 33.07, "step": 3835, "token_acc": 0.8206605222734255, "train_speed(iter/s)": 0.096362 }, { "epoch": 0.17857402364129407, "grad_norm": 6.029583930969238, "learning_rate": 9.555003189492318e-06, "loss": 0.7823711395263672, "memory(GiB)": 33.07, "step": 3840, "token_acc": 0.8051906440243511, "train_speed(iter/s)": 0.096428 }, { "epoch": 0.17880654190124368, "grad_norm": 4.636234283447266, "learning_rate": 9.553416186233674e-06, "loss": 0.7817121982574463, "memory(GiB)": 33.07, "step": 3845, "token_acc": 0.8130484547882487, "train_speed(iter/s)": 0.096497 }, { "epoch": 0.17903906016119328, "grad_norm": 5.539736747741699, "learning_rate": 9.551826490399459e-06, "loss": 0.8406240463256835, "memory(GiB)": 33.07, "step": 3850, "token_acc": 0.8003020007550019, "train_speed(iter/s)": 0.096566 }, { "epoch": 0.17903906016119328, "eval_loss": 0.7051939368247986, "eval_runtime": 289.0798, "eval_samples_per_second": 12.021, "eval_steps_per_second": 12.021, "step": 3850 }, { "epoch": 0.17927157842114289, "grad_norm": 4.831907749176025, "learning_rate": 9.550234102929702e-06, "loss": 0.7266098022460937, "memory(GiB)": 33.07, "step": 3855, "token_acc": 0.8010063495866778, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.1795040966810925, "grad_norm": 3.985592842102051, "learning_rate": 9.548639024766036e-06, "loss": 0.8843966484069824, "memory(GiB)": 33.07, "step": 3860, "token_acc": 0.8019549511262218, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.1797366149410421, "grad_norm": 5.885317802429199, "learning_rate": 9.547041256851676e-06, "loss": 0.9130638122558594, "memory(GiB)": 33.07, "step": 3865, "token_acc": 0.7749216300940439, "train_speed(iter/s)": 0.096077 }, { "epoch": 0.1799691332009917, "grad_norm": 5.011005401611328, "learning_rate": 9.545440800131437e-06, "loss": 0.7087615013122559, "memory(GiB)": 33.07, "step": 3870, "token_acc": 0.8194444444444444, "train_speed(iter/s)": 0.096146 }, { "epoch": 0.1802016514609413, "grad_norm": 5.256019592285156, "learning_rate": 9.543837655551711e-06, "loss": 0.8234603881835938, "memory(GiB)": 33.07, "step": 3875, "token_acc": 0.7919126328217237, "train_speed(iter/s)": 0.096216 }, { "epoch": 0.18043416972089088, "grad_norm": 5.069699287414551, "learning_rate": 9.542231824060494e-06, "loss": 0.6185711860656739, "memory(GiB)": 33.07, "step": 3880, "token_acc": 0.8525206922498119, "train_speed(iter/s)": 0.096286 }, { "epoch": 0.18066668798084048, "grad_norm": 4.84377384185791, "learning_rate": 9.54062330660736e-06, "loss": 0.8174562454223633, "memory(GiB)": 33.07, "step": 3885, "token_acc": 0.7946681792399319, "train_speed(iter/s)": 0.096354 }, { "epoch": 0.1808992062407901, "grad_norm": 6.560987949371338, "learning_rate": 9.539012104143474e-06, "loss": 0.7553558349609375, "memory(GiB)": 33.07, "step": 3890, "token_acc": 0.8088002532446977, "train_speed(iter/s)": 0.096422 }, { "epoch": 0.1811317245007397, "grad_norm": 5.4929938316345215, "learning_rate": 9.537398217621593e-06, "loss": 0.7413972377777099, "memory(GiB)": 33.07, "step": 3895, "token_acc": 0.8199160625715376, "train_speed(iter/s)": 0.096493 }, { "epoch": 0.1813642427606893, "grad_norm": 5.737190246582031, "learning_rate": 9.535781647996057e-06, "loss": 0.7559893131256104, "memory(GiB)": 33.07, "step": 3900, "token_acc": 0.8125879043600562, "train_speed(iter/s)": 0.096561 }, { "epoch": 0.1813642427606893, "eval_loss": 0.7063232660293579, "eval_runtime": 293.3208, "eval_samples_per_second": 11.847, "eval_steps_per_second": 11.847, "step": 3900 }, { "epoch": 0.1815967610206389, "grad_norm": 5.9086761474609375, "learning_rate": 9.53416239622279e-06, "loss": 0.7419236660003662, "memory(GiB)": 33.07, "step": 3905, "token_acc": 0.8012286055247203, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.1818292792805885, "grad_norm": 4.9416184425354, "learning_rate": 9.53254046325931e-06, "loss": 0.653481912612915, "memory(GiB)": 33.07, "step": 3910, "token_acc": 0.8408619975134687, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.1820617975405381, "grad_norm": 4.461050033569336, "learning_rate": 9.530915850064715e-06, "loss": 0.702120590209961, "memory(GiB)": 33.07, "step": 3915, "token_acc": 0.8253035644339992, "train_speed(iter/s)": 0.09607 }, { "epoch": 0.18229431580048772, "grad_norm": 4.964265823364258, "learning_rate": 9.52928855759969e-06, "loss": 0.7623435974121093, "memory(GiB)": 33.07, "step": 3920, "token_acc": 0.8236765318882868, "train_speed(iter/s)": 0.096139 }, { "epoch": 0.18252683406043732, "grad_norm": 5.344741344451904, "learning_rate": 9.5276585868265e-06, "loss": 0.6239171028137207, "memory(GiB)": 33.07, "step": 3925, "token_acc": 0.8420608108108109, "train_speed(iter/s)": 0.096208 }, { "epoch": 0.1827593523203869, "grad_norm": 5.035322189331055, "learning_rate": 9.526025938708999e-06, "loss": 0.713650131225586, "memory(GiB)": 33.07, "step": 3930, "token_acc": 0.8268041237113402, "train_speed(iter/s)": 0.096277 }, { "epoch": 0.1829918705803365, "grad_norm": 5.159491062164307, "learning_rate": 9.524390614212622e-06, "loss": 0.6837416172027588, "memory(GiB)": 33.07, "step": 3935, "token_acc": 0.8171238570241064, "train_speed(iter/s)": 0.096347 }, { "epoch": 0.1832243888402861, "grad_norm": 6.687197208404541, "learning_rate": 9.522752614304387e-06, "loss": 0.724323844909668, "memory(GiB)": 33.07, "step": 3940, "token_acc": 0.8087412587412588, "train_speed(iter/s)": 0.096415 }, { "epoch": 0.1834569071002357, "grad_norm": 4.916758060455322, "learning_rate": 9.521111939952895e-06, "loss": 0.6912760734558105, "memory(GiB)": 33.07, "step": 3945, "token_acc": 0.830335934848999, "train_speed(iter/s)": 0.096482 }, { "epoch": 0.18368942536018532, "grad_norm": 5.0996527671813965, "learning_rate": 9.519468592128324e-06, "loss": 0.7524893283843994, "memory(GiB)": 33.07, "step": 3950, "token_acc": 0.806738715829625, "train_speed(iter/s)": 0.09655 }, { "epoch": 0.18368942536018532, "eval_loss": 0.7087674736976624, "eval_runtime": 291.2471, "eval_samples_per_second": 11.931, "eval_steps_per_second": 11.931, "step": 3950 }, { "epoch": 0.18392194362013492, "grad_norm": 6.056982517242432, "learning_rate": 9.51782257180244e-06, "loss": 0.8986503601074218, "memory(GiB)": 33.07, "step": 3955, "token_acc": 0.8012853224468968, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.18415446188008452, "grad_norm": 5.4020280838012695, "learning_rate": 9.516173879948583e-06, "loss": 0.7656956672668457, "memory(GiB)": 33.07, "step": 3960, "token_acc": 0.8064238600516203, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.18438698014003413, "grad_norm": 7.4822869300842285, "learning_rate": 9.514522517541678e-06, "loss": 0.7231870651245117, "memory(GiB)": 33.07, "step": 3965, "token_acc": 0.8162162162162162, "train_speed(iter/s)": 0.09607 }, { "epoch": 0.18461949839998373, "grad_norm": 4.300986289978027, "learning_rate": 9.512868485558223e-06, "loss": 0.846955394744873, "memory(GiB)": 33.07, "step": 3970, "token_acc": 0.7907311456534254, "train_speed(iter/s)": 0.096136 }, { "epoch": 0.18485201665993334, "grad_norm": 3.9788742065429688, "learning_rate": 9.511211784976301e-06, "loss": 0.8556766510009766, "memory(GiB)": 33.07, "step": 3975, "token_acc": 0.7815327301756253, "train_speed(iter/s)": 0.096204 }, { "epoch": 0.18508453491988291, "grad_norm": 6.10697078704834, "learning_rate": 9.509552416775572e-06, "loss": 0.73006272315979, "memory(GiB)": 33.07, "step": 3980, "token_acc": 0.8176037483266398, "train_speed(iter/s)": 0.096273 }, { "epoch": 0.18531705317983252, "grad_norm": 6.847419738769531, "learning_rate": 9.507890381937266e-06, "loss": 0.7203133583068848, "memory(GiB)": 33.07, "step": 3985, "token_acc": 0.8189987163029525, "train_speed(iter/s)": 0.09634 }, { "epoch": 0.18554957143978212, "grad_norm": 7.144951343536377, "learning_rate": 9.506225681444202e-06, "loss": 0.7901617527008057, "memory(GiB)": 33.07, "step": 3990, "token_acc": 0.8129742033383915, "train_speed(iter/s)": 0.096409 }, { "epoch": 0.18578208969973173, "grad_norm": 4.346937656402588, "learning_rate": 9.504558316280761e-06, "loss": 0.7825525760650635, "memory(GiB)": 33.07, "step": 3995, "token_acc": 0.8113915416098226, "train_speed(iter/s)": 0.096475 }, { "epoch": 0.18601460795968133, "grad_norm": 4.778166770935059, "learning_rate": 9.502888287432915e-06, "loss": 0.7969249248504638, "memory(GiB)": 33.07, "step": 4000, "token_acc": 0.822502030869212, "train_speed(iter/s)": 0.096543 }, { "epoch": 0.18601460795968133, "eval_loss": 0.699872612953186, "eval_runtime": 292.1449, "eval_samples_per_second": 11.895, "eval_steps_per_second": 11.895, "step": 4000 }, { "epoch": 0.18624712621963094, "grad_norm": 6.564324378967285, "learning_rate": 9.501215595888201e-06, "loss": 0.739466381072998, "memory(GiB)": 33.07, "step": 4005, "token_acc": 0.8025343189017952, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.18647964447958054, "grad_norm": 4.453283786773682, "learning_rate": 9.499540242635732e-06, "loss": 1.0081160545349122, "memory(GiB)": 33.07, "step": 4010, "token_acc": 0.7659355723098012, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.18671216273953015, "grad_norm": 6.239867687225342, "learning_rate": 9.497862228666196e-06, "loss": 0.8795578956604004, "memory(GiB)": 33.07, "step": 4015, "token_acc": 0.7943651664837176, "train_speed(iter/s)": 0.096065 }, { "epoch": 0.18694468099947975, "grad_norm": 7.037559509277344, "learning_rate": 9.496181554971856e-06, "loss": 0.8672590255737305, "memory(GiB)": 33.07, "step": 4020, "token_acc": 0.7891297891297891, "train_speed(iter/s)": 0.096132 }, { "epoch": 0.18717719925942936, "grad_norm": 6.250262260437012, "learning_rate": 9.494498222546545e-06, "loss": 0.6804422378540039, "memory(GiB)": 33.07, "step": 4025, "token_acc": 0.8410415856976292, "train_speed(iter/s)": 0.096199 }, { "epoch": 0.18740971751937893, "grad_norm": 5.407467365264893, "learning_rate": 9.49281223238567e-06, "loss": 0.7347963809967041, "memory(GiB)": 33.07, "step": 4030, "token_acc": 0.8263201320132013, "train_speed(iter/s)": 0.096267 }, { "epoch": 0.18764223577932854, "grad_norm": 5.787410736083984, "learning_rate": 9.491123585486211e-06, "loss": 0.7938172817230225, "memory(GiB)": 33.07, "step": 4035, "token_acc": 0.8069164265129684, "train_speed(iter/s)": 0.096332 }, { "epoch": 0.18787475403927814, "grad_norm": 6.597839832305908, "learning_rate": 9.489432282846714e-06, "loss": 0.7364625930786133, "memory(GiB)": 33.07, "step": 4040, "token_acc": 0.8278404163052906, "train_speed(iter/s)": 0.0964 }, { "epoch": 0.18810727229922775, "grad_norm": 6.383011817932129, "learning_rate": 9.487738325467299e-06, "loss": 0.6238168716430664, "memory(GiB)": 33.07, "step": 4045, "token_acc": 0.8554265118141771, "train_speed(iter/s)": 0.096467 }, { "epoch": 0.18833979055917735, "grad_norm": 4.523214817047119, "learning_rate": 9.486041714349655e-06, "loss": 0.7638760089874268, "memory(GiB)": 33.07, "step": 4050, "token_acc": 0.816711590296496, "train_speed(iter/s)": 0.096536 }, { "epoch": 0.18833979055917735, "eval_loss": 0.7085168957710266, "eval_runtime": 293.1328, "eval_samples_per_second": 11.855, "eval_steps_per_second": 11.855, "step": 4050 }, { "epoch": 0.18857230881912695, "grad_norm": 4.8166584968566895, "learning_rate": 9.484342450497043e-06, "loss": 0.6987978458404541, "memory(GiB)": 33.07, "step": 4055, "token_acc": 0.802425636969392, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.18880482707907656, "grad_norm": 6.945908546447754, "learning_rate": 9.482640534914289e-06, "loss": 0.7164095401763916, "memory(GiB)": 33.07, "step": 4060, "token_acc": 0.8259895444361464, "train_speed(iter/s)": 0.095995 }, { "epoch": 0.18903734533902616, "grad_norm": 6.610608100891113, "learning_rate": 9.480935968607784e-06, "loss": 0.630191707611084, "memory(GiB)": 33.07, "step": 4065, "token_acc": 0.844571975131516, "train_speed(iter/s)": 0.096062 }, { "epoch": 0.18926986359897577, "grad_norm": 5.781813621520996, "learning_rate": 9.479228752585498e-06, "loss": 0.79894118309021, "memory(GiB)": 33.07, "step": 4070, "token_acc": 0.80873330927463, "train_speed(iter/s)": 0.096127 }, { "epoch": 0.18950238185892535, "grad_norm": 8.563652992248535, "learning_rate": 9.477518887856958e-06, "loss": 1.0313690185546875, "memory(GiB)": 33.07, "step": 4075, "token_acc": 0.7563822027716995, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.18973490011887495, "grad_norm": 3.825904130935669, "learning_rate": 9.475806375433256e-06, "loss": 0.7562622547149658, "memory(GiB)": 33.07, "step": 4080, "token_acc": 0.8031704095112285, "train_speed(iter/s)": 0.09626 }, { "epoch": 0.18996741837882455, "grad_norm": 5.252847671508789, "learning_rate": 9.474091216327058e-06, "loss": 0.806545352935791, "memory(GiB)": 33.07, "step": 4085, "token_acc": 0.8209828393135725, "train_speed(iter/s)": 0.096327 }, { "epoch": 0.19019993663877416, "grad_norm": 4.933413982391357, "learning_rate": 9.47237341155259e-06, "loss": 0.8696954727172852, "memory(GiB)": 33.07, "step": 4090, "token_acc": 0.7778093883357041, "train_speed(iter/s)": 0.096392 }, { "epoch": 0.19043245489872376, "grad_norm": 5.949667930603027, "learning_rate": 9.470652962125639e-06, "loss": 0.7424722194671631, "memory(GiB)": 33.07, "step": 4095, "token_acc": 0.8045484508899143, "train_speed(iter/s)": 0.096456 }, { "epoch": 0.19066497315867337, "grad_norm": 4.2154998779296875, "learning_rate": 9.468929869063564e-06, "loss": 0.8286898612976075, "memory(GiB)": 33.07, "step": 4100, "token_acc": 0.7962100031065549, "train_speed(iter/s)": 0.096522 }, { "epoch": 0.19066497315867337, "eval_loss": 0.6997935771942139, "eval_runtime": 295.0785, "eval_samples_per_second": 11.777, "eval_steps_per_second": 11.777, "step": 4100 }, { "epoch": 0.19089749141862297, "grad_norm": 5.990864276885986, "learning_rate": 9.46720413338528e-06, "loss": 1.0568864822387696, "memory(GiB)": 33.07, "step": 4105, "token_acc": 0.8014819783132915, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.19113000967857258, "grad_norm": 5.360263824462891, "learning_rate": 9.465475756111271e-06, "loss": 0.7643206596374512, "memory(GiB)": 33.07, "step": 4110, "token_acc": 0.8110726643598616, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.19136252793852218, "grad_norm": 5.552672863006592, "learning_rate": 9.46374473826358e-06, "loss": 0.9097006797790528, "memory(GiB)": 33.07, "step": 4115, "token_acc": 0.7756370416407706, "train_speed(iter/s)": 0.096049 }, { "epoch": 0.19159504619847179, "grad_norm": 6.712371349334717, "learning_rate": 9.462011080865809e-06, "loss": 0.7000391960144043, "memory(GiB)": 33.07, "step": 4120, "token_acc": 0.8365650969529086, "train_speed(iter/s)": 0.096115 }, { "epoch": 0.19182756445842136, "grad_norm": 6.444034099578857, "learning_rate": 9.460274784943122e-06, "loss": 0.6805448055267334, "memory(GiB)": 33.07, "step": 4125, "token_acc": 0.827490454703228, "train_speed(iter/s)": 0.09618 }, { "epoch": 0.19206008271837097, "grad_norm": 7.107831954956055, "learning_rate": 9.458535851522247e-06, "loss": 0.7243359565734864, "memory(GiB)": 33.07, "step": 4130, "token_acc": 0.8241419205553413, "train_speed(iter/s)": 0.096244 }, { "epoch": 0.19229260097832057, "grad_norm": 7.023250102996826, "learning_rate": 9.45679428163147e-06, "loss": 0.786740779876709, "memory(GiB)": 33.07, "step": 4135, "token_acc": 0.8085681204168275, "train_speed(iter/s)": 0.096305 }, { "epoch": 0.19252511923827018, "grad_norm": 5.895570278167725, "learning_rate": 9.455050076300633e-06, "loss": 0.6871311664581299, "memory(GiB)": 33.07, "step": 4140, "token_acc": 0.8327688399661304, "train_speed(iter/s)": 0.096369 }, { "epoch": 0.19275763749821978, "grad_norm": 7.156490325927734, "learning_rate": 9.453303236561138e-06, "loss": 0.7330933570861816, "memory(GiB)": 33.07, "step": 4145, "token_acc": 0.8441851712457659, "train_speed(iter/s)": 0.096435 }, { "epoch": 0.19299015575816938, "grad_norm": 5.157409191131592, "learning_rate": 9.451553763445946e-06, "loss": 0.7420677185058594, "memory(GiB)": 33.07, "step": 4150, "token_acc": 0.8215586307356154, "train_speed(iter/s)": 0.096499 }, { "epoch": 0.19299015575816938, "eval_loss": 0.7018148899078369, "eval_runtime": 295.0708, "eval_samples_per_second": 11.777, "eval_steps_per_second": 11.777, "step": 4150 }, { "epoch": 0.193222674018119, "grad_norm": 4.892160892486572, "learning_rate": 9.449801657989574e-06, "loss": 0.7619297504425049, "memory(GiB)": 33.07, "step": 4155, "token_acc": 0.8021788578953669, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.1934551922780686, "grad_norm": 5.372450351715088, "learning_rate": 9.448046921228098e-06, "loss": 0.9046992301940918, "memory(GiB)": 33.07, "step": 4160, "token_acc": 0.7911676646706587, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.1936877105380182, "grad_norm": 5.386012077331543, "learning_rate": 9.446289554199146e-06, "loss": 0.7827619552612305, "memory(GiB)": 33.07, "step": 4165, "token_acc": 0.8042328042328042, "train_speed(iter/s)": 0.096033 }, { "epoch": 0.1939202287979678, "grad_norm": 5.557136535644531, "learning_rate": 9.444529557941904e-06, "loss": 0.8229413032531738, "memory(GiB)": 33.07, "step": 4170, "token_acc": 0.7890600440113172, "train_speed(iter/s)": 0.096098 }, { "epoch": 0.19415274705791738, "grad_norm": 4.749749183654785, "learning_rate": 9.442766933497112e-06, "loss": 0.7320784568786621, "memory(GiB)": 33.07, "step": 4175, "token_acc": 0.8313518273888155, "train_speed(iter/s)": 0.096163 }, { "epoch": 0.19438526531786698, "grad_norm": 5.676370620727539, "learning_rate": 9.441001681907065e-06, "loss": 0.8156270980834961, "memory(GiB)": 33.07, "step": 4180, "token_acc": 0.8094858509366282, "train_speed(iter/s)": 0.096227 }, { "epoch": 0.1946177835778166, "grad_norm": 4.829561233520508, "learning_rate": 9.43923380421561e-06, "loss": 0.8413228034973145, "memory(GiB)": 33.07, "step": 4185, "token_acc": 0.7903622933520928, "train_speed(iter/s)": 0.096292 }, { "epoch": 0.1948503018377662, "grad_norm": 5.902974605560303, "learning_rate": 9.437463301468146e-06, "loss": 0.7763947010040283, "memory(GiB)": 33.07, "step": 4190, "token_acc": 0.8305927342256214, "train_speed(iter/s)": 0.096355 }, { "epoch": 0.1950828200977158, "grad_norm": 4.831693172454834, "learning_rate": 9.435690174711629e-06, "loss": 0.6980354309082031, "memory(GiB)": 33.07, "step": 4195, "token_acc": 0.8259067357512954, "train_speed(iter/s)": 0.096419 }, { "epoch": 0.1953153383576654, "grad_norm": 6.849177837371826, "learning_rate": 9.433914424994564e-06, "loss": 0.8142841339111329, "memory(GiB)": 33.07, "step": 4200, "token_acc": 0.8020400453343408, "train_speed(iter/s)": 0.096482 }, { "epoch": 0.1953153383576654, "eval_loss": 0.6981073021888733, "eval_runtime": 294.9067, "eval_samples_per_second": 11.783, "eval_steps_per_second": 11.783, "step": 4200 }, { "epoch": 0.195547856617615, "grad_norm": 5.831681728363037, "learning_rate": 9.432136053367003e-06, "loss": 0.8038248062133789, "memory(GiB)": 33.07, "step": 4205, "token_acc": 0.8027805933102947, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.1957803748775646, "grad_norm": 4.811705112457275, "learning_rate": 9.430355060880555e-06, "loss": 0.7309530258178711, "memory(GiB)": 33.07, "step": 4210, "token_acc": 0.8190003104625893, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.19601289313751422, "grad_norm": 5.662635326385498, "learning_rate": 9.428571448588373e-06, "loss": 0.9009736061096192, "memory(GiB)": 33.07, "step": 4215, "token_acc": 0.7850264057160609, "train_speed(iter/s)": 0.096022 }, { "epoch": 0.19624541139746382, "grad_norm": 4.918367385864258, "learning_rate": 9.426785217545166e-06, "loss": 0.7335701942443847, "memory(GiB)": 33.07, "step": 4220, "token_acc": 0.8282166264229045, "train_speed(iter/s)": 0.096087 }, { "epoch": 0.1964779296574134, "grad_norm": 5.138798236846924, "learning_rate": 9.424996368807184e-06, "loss": 0.6669015407562255, "memory(GiB)": 33.07, "step": 4225, "token_acc": 0.8363567649281936, "train_speed(iter/s)": 0.09615 }, { "epoch": 0.196710447917363, "grad_norm": 7.676051616668701, "learning_rate": 9.423204903432232e-06, "loss": 0.6710890769958496, "memory(GiB)": 33.07, "step": 4230, "token_acc": 0.8273730684326711, "train_speed(iter/s)": 0.096213 }, { "epoch": 0.1969429661773126, "grad_norm": 4.74376106262207, "learning_rate": 9.421410822479656e-06, "loss": 0.7752367496490479, "memory(GiB)": 33.07, "step": 4235, "token_acc": 0.8122767132185774, "train_speed(iter/s)": 0.096277 }, { "epoch": 0.1971754844372622, "grad_norm": 6.3608222007751465, "learning_rate": 9.41961412701035e-06, "loss": 0.8766127586364746, "memory(GiB)": 33.07, "step": 4240, "token_acc": 0.782608695652174, "train_speed(iter/s)": 0.096339 }, { "epoch": 0.19740800269721182, "grad_norm": 6.932170867919922, "learning_rate": 9.417814818086758e-06, "loss": 0.7886018753051758, "memory(GiB)": 33.07, "step": 4245, "token_acc": 0.8145620022753128, "train_speed(iter/s)": 0.096402 }, { "epoch": 0.19764052095716142, "grad_norm": 7.067446231842041, "learning_rate": 9.41601289677287e-06, "loss": 0.7627155780792236, "memory(GiB)": 33.07, "step": 4250, "token_acc": 0.8150163220892275, "train_speed(iter/s)": 0.096467 }, { "epoch": 0.19764052095716142, "eval_loss": 0.6957509517669678, "eval_runtime": 294.9761, "eval_samples_per_second": 11.781, "eval_steps_per_second": 11.781, "step": 4250 }, { "epoch": 0.19787303921711102, "grad_norm": 5.023551940917969, "learning_rate": 9.414208364134211e-06, "loss": 0.7709908962249756, "memory(GiB)": 33.07, "step": 4255, "token_acc": 0.8032265285239192, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.19810555747706063, "grad_norm": 6.044614791870117, "learning_rate": 9.412401221237863e-06, "loss": 0.8650611877441406, "memory(GiB)": 33.07, "step": 4260, "token_acc": 0.7993119266055045, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.19833807573701023, "grad_norm": 6.721257209777832, "learning_rate": 9.410591469152442e-06, "loss": 0.8071677207946777, "memory(GiB)": 33.07, "step": 4265, "token_acc": 0.8033012379642366, "train_speed(iter/s)": 0.096014 }, { "epoch": 0.1985705939969598, "grad_norm": 6.265713214874268, "learning_rate": 9.408779108948108e-06, "loss": 0.7521049499511718, "memory(GiB)": 33.07, "step": 4270, "token_acc": 0.8141247833622184, "train_speed(iter/s)": 0.096078 }, { "epoch": 0.19880311225690941, "grad_norm": 5.308954238891602, "learning_rate": 9.40696414169657e-06, "loss": 0.7319036006927491, "memory(GiB)": 33.07, "step": 4275, "token_acc": 0.8273524720893142, "train_speed(iter/s)": 0.096142 }, { "epoch": 0.19903563051685902, "grad_norm": 7.264330863952637, "learning_rate": 9.405146568471073e-06, "loss": 0.807645320892334, "memory(GiB)": 33.07, "step": 4280, "token_acc": 0.7988970588235295, "train_speed(iter/s)": 0.096204 }, { "epoch": 0.19926814877680862, "grad_norm": 6.443247318267822, "learning_rate": 9.403326390346404e-06, "loss": 0.8453804016113281, "memory(GiB)": 33.07, "step": 4285, "token_acc": 0.8029490616621984, "train_speed(iter/s)": 0.096268 }, { "epoch": 0.19950066703675823, "grad_norm": 4.0575056076049805, "learning_rate": 9.40150360839889e-06, "loss": 0.9449618339538575, "memory(GiB)": 33.07, "step": 4290, "token_acc": 0.773838630806846, "train_speed(iter/s)": 0.096329 }, { "epoch": 0.19973318529670783, "grad_norm": 4.946542739868164, "learning_rate": 9.3996782237064e-06, "loss": 0.7974777221679688, "memory(GiB)": 33.07, "step": 4295, "token_acc": 0.8088344469190795, "train_speed(iter/s)": 0.096388 }, { "epoch": 0.19996570355665744, "grad_norm": 5.26956844329834, "learning_rate": 9.397850237348336e-06, "loss": 0.8243688583374024, "memory(GiB)": 33.07, "step": 4300, "token_acc": 0.7927695287282117, "train_speed(iter/s)": 0.09645 }, { "epoch": 0.19996570355665744, "eval_loss": 0.6982521414756775, "eval_runtime": 294.9008, "eval_samples_per_second": 11.784, "eval_steps_per_second": 11.784, "step": 4300 }, { "epoch": 0.20019822181660704, "grad_norm": 6.252737522125244, "learning_rate": 9.396019650405646e-06, "loss": 0.7586172580718994, "memory(GiB)": 33.07, "step": 4305, "token_acc": 0.803494358900446, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.20043074007655665, "grad_norm": 5.019197940826416, "learning_rate": 9.394186463960814e-06, "loss": 0.8452945709228515, "memory(GiB)": 33.07, "step": 4310, "token_acc": 0.7955331865366467, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.20066325833650625, "grad_norm": 6.704598903656006, "learning_rate": 9.392350679097857e-06, "loss": 0.9175315856933594, "memory(GiB)": 33.07, "step": 4315, "token_acc": 0.7831031681559708, "train_speed(iter/s)": 0.096002 }, { "epoch": 0.20089577659645583, "grad_norm": 6.991080284118652, "learning_rate": 9.390512296902331e-06, "loss": 0.7498832225799561, "memory(GiB)": 33.07, "step": 4320, "token_acc": 0.8220264317180617, "train_speed(iter/s)": 0.096066 }, { "epoch": 0.20112829485640543, "grad_norm": 4.867962837219238, "learning_rate": 9.388671318461331e-06, "loss": 0.739860725402832, "memory(GiB)": 33.07, "step": 4325, "token_acc": 0.8180700676090965, "train_speed(iter/s)": 0.096128 }, { "epoch": 0.20136081311635504, "grad_norm": 5.916605472564697, "learning_rate": 9.386827744863483e-06, "loss": 1.0181119918823243, "memory(GiB)": 33.07, "step": 4330, "token_acc": 0.754506128334535, "train_speed(iter/s)": 0.096191 }, { "epoch": 0.20159333137630464, "grad_norm": 7.321384906768799, "learning_rate": 9.384981577198946e-06, "loss": 0.7450331211090088, "memory(GiB)": 33.07, "step": 4335, "token_acc": 0.8184615384615385, "train_speed(iter/s)": 0.096254 }, { "epoch": 0.20182584963625425, "grad_norm": 4.210700035095215, "learning_rate": 9.383132816559422e-06, "loss": 0.7693531513214111, "memory(GiB)": 33.07, "step": 4340, "token_acc": 0.8010770784247728, "train_speed(iter/s)": 0.096315 }, { "epoch": 0.20205836789620385, "grad_norm": 6.521642684936523, "learning_rate": 9.381281464038134e-06, "loss": 0.7599985122680664, "memory(GiB)": 33.07, "step": 4345, "token_acc": 0.8084516799445791, "train_speed(iter/s)": 0.096376 }, { "epoch": 0.20229088615615345, "grad_norm": 5.679013252258301, "learning_rate": 9.37942752072985e-06, "loss": 0.7193542003631592, "memory(GiB)": 33.07, "step": 4350, "token_acc": 0.81234499862221, "train_speed(iter/s)": 0.096437 }, { "epoch": 0.20229088615615345, "eval_loss": 0.6899635791778564, "eval_runtime": 296.4949, "eval_samples_per_second": 11.72, "eval_steps_per_second": 11.72, "step": 4350 }, { "epoch": 0.20252340441610306, "grad_norm": 5.6605939865112305, "learning_rate": 9.377570987730857e-06, "loss": 0.7166150093078614, "memory(GiB)": 33.07, "step": 4355, "token_acc": 0.8045957678568568, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.20275592267605266, "grad_norm": 6.247797012329102, "learning_rate": 9.375711866138986e-06, "loss": 0.8742569923400879, "memory(GiB)": 33.07, "step": 4360, "token_acc": 0.7985636114911081, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.20298844093600227, "grad_norm": 4.9714813232421875, "learning_rate": 9.373850157053591e-06, "loss": 0.7221141338348389, "memory(GiB)": 33.07, "step": 4365, "token_acc": 0.8208573256557902, "train_speed(iter/s)": 0.095991 }, { "epoch": 0.20322095919595184, "grad_norm": 6.083737850189209, "learning_rate": 9.37198586157556e-06, "loss": 0.7091527938842773, "memory(GiB)": 33.07, "step": 4370, "token_acc": 0.8148148148148148, "train_speed(iter/s)": 0.096052 }, { "epoch": 0.20345347745590145, "grad_norm": 5.963134765625, "learning_rate": 9.370118980807303e-06, "loss": 0.7980108261108398, "memory(GiB)": 33.07, "step": 4375, "token_acc": 0.8181049069373942, "train_speed(iter/s)": 0.096113 }, { "epoch": 0.20368599571585105, "grad_norm": 6.6309285163879395, "learning_rate": 9.36824951585277e-06, "loss": 0.7637380123138428, "memory(GiB)": 33.07, "step": 4380, "token_acc": 0.803343949044586, "train_speed(iter/s)": 0.096175 }, { "epoch": 0.20391851397580066, "grad_norm": 7.040733814239502, "learning_rate": 9.36637746781743e-06, "loss": 0.7601099967956543, "memory(GiB)": 33.07, "step": 4385, "token_acc": 0.825043630017452, "train_speed(iter/s)": 0.096237 }, { "epoch": 0.20415103223575026, "grad_norm": 7.548309803009033, "learning_rate": 9.364502837808284e-06, "loss": 0.7506435871124267, "memory(GiB)": 33.07, "step": 4390, "token_acc": 0.8206075533661741, "train_speed(iter/s)": 0.096298 }, { "epoch": 0.20438355049569987, "grad_norm": 6.83359956741333, "learning_rate": 9.36262562693386e-06, "loss": 0.8372472763061524, "memory(GiB)": 33.07, "step": 4395, "token_acc": 0.7879834254143646, "train_speed(iter/s)": 0.096359 }, { "epoch": 0.20461606875564947, "grad_norm": 4.644917964935303, "learning_rate": 9.360745836304207e-06, "loss": 0.8313392639160156, "memory(GiB)": 33.07, "step": 4400, "token_acc": 0.8097439544807966, "train_speed(iter/s)": 0.09642 }, { "epoch": 0.20461606875564947, "eval_loss": 0.692659318447113, "eval_runtime": 294.9376, "eval_samples_per_second": 11.782, "eval_steps_per_second": 11.782, "step": 4400 }, { "epoch": 0.20484858701559908, "grad_norm": 6.488748550415039, "learning_rate": 9.358863467030907e-06, "loss": 0.7236376762390136, "memory(GiB)": 33.07, "step": 4405, "token_acc": 0.8043207610413241, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.20508110527554868, "grad_norm": 4.683987140655518, "learning_rate": 9.356978520227062e-06, "loss": 0.7017735958099365, "memory(GiB)": 33.07, "step": 4410, "token_acc": 0.8319974350753446, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.20531362353549829, "grad_norm": 4.9379658699035645, "learning_rate": 9.355090997007297e-06, "loss": 0.7702983856201172, "memory(GiB)": 33.07, "step": 4415, "token_acc": 0.8051425030978935, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.20554614179544786, "grad_norm": 5.977777481079102, "learning_rate": 9.353200898487767e-06, "loss": 0.6408030986785889, "memory(GiB)": 33.07, "step": 4420, "token_acc": 0.8345717637856863, "train_speed(iter/s)": 0.096044 }, { "epoch": 0.20577866005539747, "grad_norm": 6.222667694091797, "learning_rate": 9.35130822578614e-06, "loss": 0.77198486328125, "memory(GiB)": 33.07, "step": 4425, "token_acc": 0.8058188950637464, "train_speed(iter/s)": 0.096106 }, { "epoch": 0.20601117831534707, "grad_norm": 4.507582187652588, "learning_rate": 9.349412980021618e-06, "loss": 0.7829318523406983, "memory(GiB)": 33.07, "step": 4430, "token_acc": 0.8062455642299503, "train_speed(iter/s)": 0.096167 }, { "epoch": 0.20624369657529668, "grad_norm": 8.059425354003906, "learning_rate": 9.347515162314914e-06, "loss": 0.7704340934753418, "memory(GiB)": 33.07, "step": 4435, "token_acc": 0.7938834023574387, "train_speed(iter/s)": 0.096226 }, { "epoch": 0.20647621483524628, "grad_norm": 6.580410957336426, "learning_rate": 9.345614773788268e-06, "loss": 0.6889129638671875, "memory(GiB)": 33.07, "step": 4440, "token_acc": 0.8137973137973138, "train_speed(iter/s)": 0.096287 }, { "epoch": 0.20670873309519588, "grad_norm": 3.9273412227630615, "learning_rate": 9.343711815565438e-06, "loss": 0.7775119304656982, "memory(GiB)": 33.07, "step": 4445, "token_acc": 0.8012727798669367, "train_speed(iter/s)": 0.096348 }, { "epoch": 0.2069412513551455, "grad_norm": 7.893680572509766, "learning_rate": 9.3418062887717e-06, "loss": 0.7683738708496094, "memory(GiB)": 33.07, "step": 4450, "token_acc": 0.8174030658250676, "train_speed(iter/s)": 0.096407 }, { "epoch": 0.2069412513551455, "eval_loss": 0.6881858706474304, "eval_runtime": 292.9637, "eval_samples_per_second": 11.862, "eval_steps_per_second": 11.862, "step": 4450 }, { "epoch": 0.2071737696150951, "grad_norm": 5.971506595611572, "learning_rate": 9.339898194533854e-06, "loss": 0.780084228515625, "memory(GiB)": 33.07, "step": 4455, "token_acc": 0.803921568627451, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.2074062878750447, "grad_norm": 5.8725128173828125, "learning_rate": 9.337987533980214e-06, "loss": 0.9864715576171875, "memory(GiB)": 33.07, "step": 4460, "token_acc": 0.7490438364224772, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.2076388061349943, "grad_norm": 6.447871208190918, "learning_rate": 9.336074308240613e-06, "loss": 0.7951199531555175, "memory(GiB)": 33.07, "step": 4465, "token_acc": 0.791958495460441, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.20787132439494388, "grad_norm": 6.752201080322266, "learning_rate": 9.334158518446398e-06, "loss": 0.7589597225189209, "memory(GiB)": 33.07, "step": 4470, "token_acc": 0.8137150936726758, "train_speed(iter/s)": 0.096037 }, { "epoch": 0.20810384265489348, "grad_norm": 5.577692985534668, "learning_rate": 9.332240165730439e-06, "loss": 0.8426610946655273, "memory(GiB)": 33.07, "step": 4475, "token_acc": 0.8084077380952381, "train_speed(iter/s)": 0.096097 }, { "epoch": 0.2083363609148431, "grad_norm": 5.145914077758789, "learning_rate": 9.330319251227114e-06, "loss": 0.7314593315124511, "memory(GiB)": 33.07, "step": 4480, "token_acc": 0.8349742147210502, "train_speed(iter/s)": 0.096157 }, { "epoch": 0.2085688791747927, "grad_norm": 4.913346767425537, "learning_rate": 9.328395776072318e-06, "loss": 0.6858441829681396, "memory(GiB)": 33.07, "step": 4485, "token_acc": 0.819038642789821, "train_speed(iter/s)": 0.096217 }, { "epoch": 0.2088013974347423, "grad_norm": 6.451091289520264, "learning_rate": 9.326469741403463e-06, "loss": 0.680500841140747, "memory(GiB)": 33.07, "step": 4490, "token_acc": 0.8409187579753297, "train_speed(iter/s)": 0.096277 }, { "epoch": 0.2090339156946919, "grad_norm": 6.490147590637207, "learning_rate": 9.324541148359473e-06, "loss": 0.7878528594970703, "memory(GiB)": 33.07, "step": 4495, "token_acc": 0.8057866184448463, "train_speed(iter/s)": 0.096338 }, { "epoch": 0.2092664339546415, "grad_norm": 4.894039630889893, "learning_rate": 9.322609998080784e-06, "loss": 0.6724793434143066, "memory(GiB)": 33.07, "step": 4500, "token_acc": 0.8291692692067458, "train_speed(iter/s)": 0.096397 }, { "epoch": 0.2092664339546415, "eval_loss": 0.686882734298706, "eval_runtime": 291.9685, "eval_samples_per_second": 11.902, "eval_steps_per_second": 11.902, "step": 4500 }, { "epoch": 0.2094989522145911, "grad_norm": 6.31425142288208, "learning_rate": 9.320676291709348e-06, "loss": 0.6531912326812744, "memory(GiB)": 33.07, "step": 4505, "token_acc": 0.8051886489265973, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.20973147047454072, "grad_norm": 7.33244514465332, "learning_rate": 9.31874003038862e-06, "loss": 0.7912643909454345, "memory(GiB)": 33.07, "step": 4510, "token_acc": 0.8016850291639663, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.2099639887344903, "grad_norm": 5.7589335441589355, "learning_rate": 9.316801215263574e-06, "loss": 0.7930665016174316, "memory(GiB)": 33.07, "step": 4515, "token_acc": 0.8037995414346545, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.2101965069944399, "grad_norm": 5.287975788116455, "learning_rate": 9.31485984748069e-06, "loss": 0.8597976684570312, "memory(GiB)": 33.07, "step": 4520, "token_acc": 0.8099725166862976, "train_speed(iter/s)": 0.096036 }, { "epoch": 0.2104290252543895, "grad_norm": 5.207564353942871, "learning_rate": 9.31291592818796e-06, "loss": 0.7041160583496093, "memory(GiB)": 33.07, "step": 4525, "token_acc": 0.8289205702647657, "train_speed(iter/s)": 0.096095 }, { "epoch": 0.2106615435143391, "grad_norm": 5.495445251464844, "learning_rate": 9.310969458534882e-06, "loss": 0.7163251876831055, "memory(GiB)": 33.07, "step": 4530, "token_acc": 0.8275154004106776, "train_speed(iter/s)": 0.096154 }, { "epoch": 0.2108940617742887, "grad_norm": 5.854438781738281, "learning_rate": 9.309020439672465e-06, "loss": 0.7081255435943603, "memory(GiB)": 33.07, "step": 4535, "token_acc": 0.8392466053438458, "train_speed(iter/s)": 0.096215 }, { "epoch": 0.21112658003423831, "grad_norm": 5.526129722595215, "learning_rate": 9.307068872753223e-06, "loss": 0.7211766719818116, "memory(GiB)": 33.07, "step": 4540, "token_acc": 0.8206380208333334, "train_speed(iter/s)": 0.096275 }, { "epoch": 0.21135909829418792, "grad_norm": 7.228954315185547, "learning_rate": 9.30511475893118e-06, "loss": 0.7566389083862305, "memory(GiB)": 33.07, "step": 4545, "token_acc": 0.8173076923076923, "train_speed(iter/s)": 0.096336 }, { "epoch": 0.21159161655413752, "grad_norm": 6.528788089752197, "learning_rate": 9.30315809936186e-06, "loss": 0.8317952156066895, "memory(GiB)": 33.07, "step": 4550, "token_acc": 0.7946549391069012, "train_speed(iter/s)": 0.096396 }, { "epoch": 0.21159161655413752, "eval_loss": 0.6860916018486023, "eval_runtime": 292.6928, "eval_samples_per_second": 11.873, "eval_steps_per_second": 11.873, "step": 4550 }, { "epoch": 0.21182413481408713, "grad_norm": 5.846338272094727, "learning_rate": 9.3011988952023e-06, "loss": 0.8814240455627441, "memory(GiB)": 33.07, "step": 4555, "token_acc": 0.8044239350750274, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.21205665307403673, "grad_norm": 5.912703514099121, "learning_rate": 9.299237147611036e-06, "loss": 0.8261652946472168, "memory(GiB)": 33.07, "step": 4560, "token_acc": 0.7952633219071362, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.2122891713339863, "grad_norm": 6.0356526374816895, "learning_rate": 9.29727285774811e-06, "loss": 0.8617947578430176, "memory(GiB)": 33.07, "step": 4565, "token_acc": 0.7860512129380054, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.21252168959393591, "grad_norm": 5.160603046417236, "learning_rate": 9.295306026775066e-06, "loss": 0.7188240051269531, "memory(GiB)": 33.07, "step": 4570, "token_acc": 0.8221818181818182, "train_speed(iter/s)": 0.096038 }, { "epoch": 0.21275420785388552, "grad_norm": 6.349583625793457, "learning_rate": 9.293336655854955e-06, "loss": 0.6778467178344727, "memory(GiB)": 33.07, "step": 4575, "token_acc": 0.839390386869871, "train_speed(iter/s)": 0.096098 }, { "epoch": 0.21298672611383512, "grad_norm": 5.2883100509643555, "learning_rate": 9.291364746152325e-06, "loss": 0.9108637809753418, "memory(GiB)": 33.07, "step": 4580, "token_acc": 0.7718334297281666, "train_speed(iter/s)": 0.096157 }, { "epoch": 0.21321924437378473, "grad_norm": 6.238943099975586, "learning_rate": 9.289390298833226e-06, "loss": 0.7632899284362793, "memory(GiB)": 33.07, "step": 4585, "token_acc": 0.8032896945283652, "train_speed(iter/s)": 0.096216 }, { "epoch": 0.21345176263373433, "grad_norm": 5.001307487487793, "learning_rate": 9.287413315065212e-06, "loss": 0.8181890487670899, "memory(GiB)": 33.07, "step": 4590, "token_acc": 0.8045580110497238, "train_speed(iter/s)": 0.096275 }, { "epoch": 0.21368428089368394, "grad_norm": 6.3453755378723145, "learning_rate": 9.285433796017333e-06, "loss": 0.7735485076904297, "memory(GiB)": 33.07, "step": 4595, "token_acc": 0.7959317585301837, "train_speed(iter/s)": 0.096334 }, { "epoch": 0.21391679915363354, "grad_norm": 5.127955436706543, "learning_rate": 9.28345174286014e-06, "loss": 0.6808011531829834, "memory(GiB)": 33.07, "step": 4600, "token_acc": 0.8435283687943262, "train_speed(iter/s)": 0.096394 }, { "epoch": 0.21391679915363354, "eval_loss": 0.6844122409820557, "eval_runtime": 290.6503, "eval_samples_per_second": 11.956, "eval_steps_per_second": 11.956, "step": 4600 }, { "epoch": 0.21414931741358315, "grad_norm": 6.679483413696289, "learning_rate": 9.281467156765684e-06, "loss": 0.7091818809509277, "memory(GiB)": 33.07, "step": 4605, "token_acc": 0.8054765092063391, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.21438183567353275, "grad_norm": 7.544936656951904, "learning_rate": 9.279480038907508e-06, "loss": 0.7468667984008789, "memory(GiB)": 33.07, "step": 4610, "token_acc": 0.8214285714285714, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.21461435393348233, "grad_norm": 4.884216785430908, "learning_rate": 9.27749039046066e-06, "loss": 0.7384253025054932, "memory(GiB)": 33.07, "step": 4615, "token_acc": 0.8045409674234946, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.21484687219343193, "grad_norm": 6.339810848236084, "learning_rate": 9.275498212601679e-06, "loss": 0.8337714195251464, "memory(GiB)": 33.07, "step": 4620, "token_acc": 0.8185096153846154, "train_speed(iter/s)": 0.096043 }, { "epoch": 0.21507939045338154, "grad_norm": 5.720736980438232, "learning_rate": 9.273503506508601e-06, "loss": 0.6793076038360596, "memory(GiB)": 33.07, "step": 4625, "token_acc": 0.8336227856894756, "train_speed(iter/s)": 0.096102 }, { "epoch": 0.21531190871333114, "grad_norm": 5.042727470397949, "learning_rate": 9.27150627336096e-06, "loss": 0.727689790725708, "memory(GiB)": 33.07, "step": 4630, "token_acc": 0.8223185265438786, "train_speed(iter/s)": 0.096159 }, { "epoch": 0.21554442697328074, "grad_norm": 6.311602592468262, "learning_rate": 9.269506514339776e-06, "loss": 0.834522819519043, "memory(GiB)": 33.07, "step": 4635, "token_acc": 0.7994505494505495, "train_speed(iter/s)": 0.096218 }, { "epoch": 0.21577694523323035, "grad_norm": 6.29012393951416, "learning_rate": 9.267504230627573e-06, "loss": 0.6527014255523682, "memory(GiB)": 33.07, "step": 4640, "token_acc": 0.8425959125134457, "train_speed(iter/s)": 0.096274 }, { "epoch": 0.21600946349317995, "grad_norm": 4.891956806182861, "learning_rate": 9.26549942340836e-06, "loss": 0.7832115173339844, "memory(GiB)": 33.07, "step": 4645, "token_acc": 0.8165349143610013, "train_speed(iter/s)": 0.096332 }, { "epoch": 0.21624198175312956, "grad_norm": 5.229694843292236, "learning_rate": 9.263492093867646e-06, "loss": 0.7881430149078369, "memory(GiB)": 33.07, "step": 4650, "token_acc": 0.8165953243332236, "train_speed(iter/s)": 0.09639 }, { "epoch": 0.21624198175312956, "eval_loss": 0.6870741844177246, "eval_runtime": 288.3894, "eval_samples_per_second": 12.05, "eval_steps_per_second": 12.05, "step": 4650 }, { "epoch": 0.21647450001307916, "grad_norm": 6.321075439453125, "learning_rate": 9.261482243192422e-06, "loss": 0.7271718978881836, "memory(GiB)": 33.07, "step": 4655, "token_acc": 0.80499289693371, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.21670701827302877, "grad_norm": 6.790338039398193, "learning_rate": 9.259469872571179e-06, "loss": 0.8134382247924805, "memory(GiB)": 33.07, "step": 4660, "token_acc": 0.7886208138278719, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.21693953653297834, "grad_norm": 6.273148536682129, "learning_rate": 9.257454983193888e-06, "loss": 0.8073574066162109, "memory(GiB)": 33.07, "step": 4665, "token_acc": 0.8144235186035829, "train_speed(iter/s)": 0.095991 }, { "epoch": 0.21717205479292795, "grad_norm": 5.982663631439209, "learning_rate": 9.255437576252022e-06, "loss": 0.785725736618042, "memory(GiB)": 33.07, "step": 4670, "token_acc": 0.7970684039087947, "train_speed(iter/s)": 0.096049 }, { "epoch": 0.21740457305287755, "grad_norm": 6.031878471374512, "learning_rate": 9.253417652938532e-06, "loss": 0.8083737373352051, "memory(GiB)": 33.07, "step": 4675, "token_acc": 0.7987055016181229, "train_speed(iter/s)": 0.096107 }, { "epoch": 0.21763709131282716, "grad_norm": 7.036018371582031, "learning_rate": 9.251395214447862e-06, "loss": 0.787720775604248, "memory(GiB)": 33.07, "step": 4680, "token_acc": 0.8159931212381771, "train_speed(iter/s)": 0.096166 }, { "epoch": 0.21786960957277676, "grad_norm": 6.4246931076049805, "learning_rate": 9.249370261975943e-06, "loss": 0.6878273010253906, "memory(GiB)": 33.07, "step": 4685, "token_acc": 0.8285606631499624, "train_speed(iter/s)": 0.096224 }, { "epoch": 0.21810212783272637, "grad_norm": 3.8756697177886963, "learning_rate": 9.247342796720192e-06, "loss": 0.7691972732543946, "memory(GiB)": 33.07, "step": 4690, "token_acc": 0.8043310131477185, "train_speed(iter/s)": 0.09628 }, { "epoch": 0.21833464609267597, "grad_norm": 6.382996082305908, "learning_rate": 9.245312819879508e-06, "loss": 0.7722404956817627, "memory(GiB)": 33.07, "step": 4695, "token_acc": 0.8183962264150944, "train_speed(iter/s)": 0.096337 }, { "epoch": 0.21856716435262558, "grad_norm": 5.762166976928711, "learning_rate": 9.243280332654286e-06, "loss": 0.6639322757720947, "memory(GiB)": 33.07, "step": 4700, "token_acc": 0.8204067562909342, "train_speed(iter/s)": 0.096394 }, { "epoch": 0.21856716435262558, "eval_loss": 0.6851846575737, "eval_runtime": 292.3554, "eval_samples_per_second": 11.886, "eval_steps_per_second": 11.886, "step": 4700 }, { "epoch": 0.21879968261257518, "grad_norm": 5.3950324058532715, "learning_rate": 9.241245336246392e-06, "loss": 0.7640002250671387, "memory(GiB)": 33.07, "step": 4705, "token_acc": 0.8051870060613825, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.21903220087252476, "grad_norm": 6.712535381317139, "learning_rate": 9.239207831859184e-06, "loss": 0.7891267776489258, "memory(GiB)": 33.07, "step": 4710, "token_acc": 0.8250728862973761, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.21926471913247436, "grad_norm": 6.155306816101074, "learning_rate": 9.237167820697504e-06, "loss": 0.813847827911377, "memory(GiB)": 33.07, "step": 4715, "token_acc": 0.7971908187735526, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.21949723739242397, "grad_norm": 4.275435447692871, "learning_rate": 9.23512530396767e-06, "loss": 0.6743597030639649, "memory(GiB)": 33.07, "step": 4720, "token_acc": 0.8179708222811671, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.21972975565237357, "grad_norm": 6.898558616638184, "learning_rate": 9.233080282877486e-06, "loss": 0.7777416229248046, "memory(GiB)": 33.07, "step": 4725, "token_acc": 0.8053097345132744, "train_speed(iter/s)": 0.096102 }, { "epoch": 0.21996227391232318, "grad_norm": 4.393642902374268, "learning_rate": 9.231032758636241e-06, "loss": 0.8509530067443848, "memory(GiB)": 33.07, "step": 4730, "token_acc": 0.8003134796238245, "train_speed(iter/s)": 0.096158 }, { "epoch": 0.22019479217227278, "grad_norm": 5.334609508514404, "learning_rate": 9.22898273245469e-06, "loss": 0.7725913047790527, "memory(GiB)": 33.07, "step": 4735, "token_acc": 0.8223370429252782, "train_speed(iter/s)": 0.096216 }, { "epoch": 0.22042731043222238, "grad_norm": 6.812933444976807, "learning_rate": 9.226930205545086e-06, "loss": 0.6820529460906982, "memory(GiB)": 33.07, "step": 4740, "token_acc": 0.8412623645784267, "train_speed(iter/s)": 0.096271 }, { "epoch": 0.220659828692172, "grad_norm": 5.598426342010498, "learning_rate": 9.224875179121145e-06, "loss": 0.7706835746765137, "memory(GiB)": 33.07, "step": 4745, "token_acc": 0.806554756195044, "train_speed(iter/s)": 0.096328 }, { "epoch": 0.2208923469521216, "grad_norm": 6.014193058013916, "learning_rate": 9.22281765439807e-06, "loss": 0.8362375259399414, "memory(GiB)": 33.07, "step": 4750, "token_acc": 0.7828804347826087, "train_speed(iter/s)": 0.096385 }, { "epoch": 0.2208923469521216, "eval_loss": 0.6855367422103882, "eval_runtime": 293.1276, "eval_samples_per_second": 11.855, "eval_steps_per_second": 11.855, "step": 4750 }, { "epoch": 0.2211248652120712, "grad_norm": 3.9696576595306396, "learning_rate": 9.22075763259254e-06, "loss": 0.7803917407989502, "memory(GiB)": 33.07, "step": 4755, "token_acc": 0.8054952423526407, "train_speed(iter/s)": 0.095869 }, { "epoch": 0.22135738347202077, "grad_norm": 4.370530605316162, "learning_rate": 9.21869511492271e-06, "loss": 0.8625608444213867, "memory(GiB)": 33.07, "step": 4760, "token_acc": 0.7932675960557634, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.22158990173197038, "grad_norm": 5.993688583374023, "learning_rate": 9.216630102608205e-06, "loss": 0.8630349159240722, "memory(GiB)": 33.07, "step": 4765, "token_acc": 0.8071367884451996, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.22182241999191998, "grad_norm": 6.219290256500244, "learning_rate": 9.214562596870138e-06, "loss": 0.7270864963531494, "memory(GiB)": 33.07, "step": 4770, "token_acc": 0.8224020442930153, "train_speed(iter/s)": 0.096036 }, { "epoch": 0.2220549382518696, "grad_norm": 5.150698184967041, "learning_rate": 9.212492598931081e-06, "loss": 0.8439167976379395, "memory(GiB)": 33.07, "step": 4775, "token_acc": 0.7991869918699187, "train_speed(iter/s)": 0.096088 }, { "epoch": 0.2222874565118192, "grad_norm": 5.613473415374756, "learning_rate": 9.210420110015098e-06, "loss": 0.7923439979553223, "memory(GiB)": 33.07, "step": 4780, "token_acc": 0.7985475213135459, "train_speed(iter/s)": 0.096145 }, { "epoch": 0.2225199747717688, "grad_norm": 5.041796684265137, "learning_rate": 9.208345131347704e-06, "loss": 0.8898165702819825, "memory(GiB)": 33.07, "step": 4785, "token_acc": 0.7721759809750297, "train_speed(iter/s)": 0.096199 }, { "epoch": 0.2227524930317184, "grad_norm": 7.280837535858154, "learning_rate": 9.206267664155906e-06, "loss": 0.8083833694458008, "memory(GiB)": 33.07, "step": 4790, "token_acc": 0.801067615658363, "train_speed(iter/s)": 0.096254 }, { "epoch": 0.222985011291668, "grad_norm": 7.027002334594727, "learning_rate": 9.204187709668173e-06, "loss": 0.7799443244934082, "memory(GiB)": 33.07, "step": 4795, "token_acc": 0.8110749185667753, "train_speed(iter/s)": 0.09631 }, { "epoch": 0.2232175295516176, "grad_norm": 6.834471225738525, "learning_rate": 9.202105269114444e-06, "loss": 0.7543346405029296, "memory(GiB)": 33.07, "step": 4800, "token_acc": 0.8283281039892425, "train_speed(iter/s)": 0.096367 }, { "epoch": 0.2232175295516176, "eval_loss": 0.6796162724494934, "eval_runtime": 294.619, "eval_samples_per_second": 11.795, "eval_steps_per_second": 11.795, "step": 4800 }, { "epoch": 0.22345004781156722, "grad_norm": 5.5002031326293945, "learning_rate": 9.200020343726132e-06, "loss": 0.724638032913208, "memory(GiB)": 33.07, "step": 4805, "token_acc": 0.8070286010610714, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.2236825660715168, "grad_norm": 7.37080192565918, "learning_rate": 9.197932934736117e-06, "loss": 0.6563894271850585, "memory(GiB)": 33.07, "step": 4810, "token_acc": 0.8334597875569044, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.2239150843314664, "grad_norm": 7.196676731109619, "learning_rate": 9.195843043378751e-06, "loss": 0.8106472015380859, "memory(GiB)": 33.07, "step": 4815, "token_acc": 0.7981157469717362, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.224147602591416, "grad_norm": 5.868455410003662, "learning_rate": 9.193750670889849e-06, "loss": 0.9501402854919434, "memory(GiB)": 33.07, "step": 4820, "token_acc": 0.7806595035198222, "train_speed(iter/s)": 0.096023 }, { "epoch": 0.2243801208513656, "grad_norm": 4.000607490539551, "learning_rate": 9.191655818506694e-06, "loss": 0.8487631797790527, "memory(GiB)": 33.07, "step": 4825, "token_acc": 0.8006230529595015, "train_speed(iter/s)": 0.096079 }, { "epoch": 0.2246126391113152, "grad_norm": 5.362325668334961, "learning_rate": 9.18955848746804e-06, "loss": 0.8604891777038575, "memory(GiB)": 33.07, "step": 4830, "token_acc": 0.7964179104477612, "train_speed(iter/s)": 0.096132 }, { "epoch": 0.22484515737126481, "grad_norm": 6.270984172821045, "learning_rate": 9.1874586790141e-06, "loss": 0.713783073425293, "memory(GiB)": 33.07, "step": 4835, "token_acc": 0.8275732531930879, "train_speed(iter/s)": 0.096187 }, { "epoch": 0.22507767563121442, "grad_norm": 6.523890495300293, "learning_rate": 9.18535639438656e-06, "loss": 0.8093732833862305, "memory(GiB)": 33.07, "step": 4840, "token_acc": 0.811935610522183, "train_speed(iter/s)": 0.096243 }, { "epoch": 0.22531019389116402, "grad_norm": 6.198504447937012, "learning_rate": 9.183251634828563e-06, "loss": 0.678268575668335, "memory(GiB)": 33.07, "step": 4845, "token_acc": 0.8220858895705522, "train_speed(iter/s)": 0.096299 }, { "epoch": 0.22554271215111363, "grad_norm": 5.541542053222656, "learning_rate": 9.181144401584718e-06, "loss": 0.7477645874023438, "memory(GiB)": 33.07, "step": 4850, "token_acc": 0.8140192198982475, "train_speed(iter/s)": 0.096353 }, { "epoch": 0.22554271215111363, "eval_loss": 0.6794469356536865, "eval_runtime": 289.7339, "eval_samples_per_second": 11.994, "eval_steps_per_second": 11.994, "step": 4850 }, { "epoch": 0.22577523041106323, "grad_norm": 7.808995246887207, "learning_rate": 9.1790346959011e-06, "loss": 0.8346663475036621, "memory(GiB)": 33.07, "step": 4855, "token_acc": 0.8062872047622096, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.2260077486710128, "grad_norm": 5.512000560760498, "learning_rate": 9.17692251902524e-06, "loss": 0.7062819004058838, "memory(GiB)": 33.07, "step": 4860, "token_acc": 0.833842627960275, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.2262402669309624, "grad_norm": 6.988223552703857, "learning_rate": 9.174807872206134e-06, "loss": 0.8444395065307617, "memory(GiB)": 33.07, "step": 4865, "token_acc": 0.7886693999254566, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.22647278519091202, "grad_norm": 5.697296142578125, "learning_rate": 9.172690756694238e-06, "loss": 0.823847484588623, "memory(GiB)": 33.07, "step": 4870, "token_acc": 0.7910394265232975, "train_speed(iter/s)": 0.096024 }, { "epoch": 0.22670530345086162, "grad_norm": 5.049820899963379, "learning_rate": 9.17057117374147e-06, "loss": 0.8216519355773926, "memory(GiB)": 33.07, "step": 4875, "token_acc": 0.7899603698811096, "train_speed(iter/s)": 0.096077 }, { "epoch": 0.22693782171081123, "grad_norm": 7.270157337188721, "learning_rate": 9.168449124601202e-06, "loss": 0.7646757125854492, "memory(GiB)": 33.07, "step": 4880, "token_acc": 0.7954699121027722, "train_speed(iter/s)": 0.096133 }, { "epoch": 0.22717033997076083, "grad_norm": 5.253354072570801, "learning_rate": 9.166324610528268e-06, "loss": 0.8869535446166992, "memory(GiB)": 33.07, "step": 4885, "token_acc": 0.7827964131517768, "train_speed(iter/s)": 0.096187 }, { "epoch": 0.22740285823071044, "grad_norm": 6.635804176330566, "learning_rate": 9.164197632778958e-06, "loss": 0.7283540725708008, "memory(GiB)": 33.07, "step": 4890, "token_acc": 0.8065134099616859, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.22763537649066004, "grad_norm": 6.679150104522705, "learning_rate": 9.162068192611022e-06, "loss": 0.6464852809906005, "memory(GiB)": 33.07, "step": 4895, "token_acc": 0.8569292123629113, "train_speed(iter/s)": 0.096297 }, { "epoch": 0.22786789475060965, "grad_norm": 7.047290325164795, "learning_rate": 9.159936291283662e-06, "loss": 0.8367726325988769, "memory(GiB)": 33.07, "step": 4900, "token_acc": 0.8022113022113022, "train_speed(iter/s)": 0.096351 }, { "epoch": 0.22786789475060965, "eval_loss": 0.6772682070732117, "eval_runtime": 291.6576, "eval_samples_per_second": 11.915, "eval_steps_per_second": 11.915, "step": 4900 }, { "epoch": 0.22810041301055922, "grad_norm": 7.302158832550049, "learning_rate": 9.157801930057538e-06, "loss": 0.6890687465667724, "memory(GiB)": 33.07, "step": 4905, "token_acc": 0.8061630616706574, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.22833293127050883, "grad_norm": 6.893573760986328, "learning_rate": 9.15566511019476e-06, "loss": 0.712225866317749, "memory(GiB)": 33.07, "step": 4910, "token_acc": 0.8429596073990185, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.22856544953045843, "grad_norm": 5.463949680328369, "learning_rate": 9.153525832958903e-06, "loss": 0.7492496967315674, "memory(GiB)": 33.07, "step": 4915, "token_acc": 0.8176162409954159, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.22879796779040804, "grad_norm": 5.487346172332764, "learning_rate": 9.151384099614979e-06, "loss": 0.8953396797180175, "memory(GiB)": 33.07, "step": 4920, "token_acc": 0.7843079635615633, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.22903048605035764, "grad_norm": 7.811435222625732, "learning_rate": 9.149239911429468e-06, "loss": 0.7093976020812989, "memory(GiB)": 33.07, "step": 4925, "token_acc": 0.8245260185558693, "train_speed(iter/s)": 0.096075 }, { "epoch": 0.22926300431030724, "grad_norm": 6.169620037078857, "learning_rate": 9.147093269670291e-06, "loss": 0.78594970703125, "memory(GiB)": 33.07, "step": 4930, "token_acc": 0.819672131147541, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.22949552257025685, "grad_norm": 5.188272476196289, "learning_rate": 9.144944175606826e-06, "loss": 0.7385846138000488, "memory(GiB)": 33.07, "step": 4935, "token_acc": 0.812938177182919, "train_speed(iter/s)": 0.096185 }, { "epoch": 0.22972804083020645, "grad_norm": 5.541247367858887, "learning_rate": 9.142792630509896e-06, "loss": 0.7875532627105712, "memory(GiB)": 33.07, "step": 4940, "token_acc": 0.8065062690613352, "train_speed(iter/s)": 0.096238 }, { "epoch": 0.22996055909015606, "grad_norm": 6.210489749908447, "learning_rate": 9.140638635651778e-06, "loss": 0.838779354095459, "memory(GiB)": 33.07, "step": 4945, "token_acc": 0.7984066505022515, "train_speed(iter/s)": 0.096291 }, { "epoch": 0.23019307735010566, "grad_norm": 7.857576370239258, "learning_rate": 9.138482192306194e-06, "loss": 0.7631664752960206, "memory(GiB)": 33.07, "step": 4950, "token_acc": 0.8173150266971777, "train_speed(iter/s)": 0.096345 }, { "epoch": 0.23019307735010566, "eval_loss": 0.6792007684707642, "eval_runtime": 288.8644, "eval_samples_per_second": 12.03, "eval_steps_per_second": 12.03, "step": 4950 }, { "epoch": 0.23042559561005524, "grad_norm": 6.835683822631836, "learning_rate": 9.136323301748317e-06, "loss": 0.7649744510650635, "memory(GiB)": 33.07, "step": 4955, "token_acc": 0.8066777775997179, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.23065811387000484, "grad_norm": 5.466605186462402, "learning_rate": 9.134161965254767e-06, "loss": 0.7909045696258545, "memory(GiB)": 33.07, "step": 4960, "token_acc": 0.7993619283941864, "train_speed(iter/s)": 0.095914 }, { "epoch": 0.23089063212995445, "grad_norm": 5.472935199737549, "learning_rate": 9.131998184103603e-06, "loss": 0.8002223014831543, "memory(GiB)": 33.07, "step": 4965, "token_acc": 0.7913832199546486, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.23112315038990405, "grad_norm": 5.254642963409424, "learning_rate": 9.129831959574342e-06, "loss": 0.7709039688110352, "memory(GiB)": 33.07, "step": 4970, "token_acc": 0.7976470588235294, "train_speed(iter/s)": 0.09602 }, { "epoch": 0.23135566864985366, "grad_norm": 5.667941093444824, "learning_rate": 9.127663292947937e-06, "loss": 0.813076400756836, "memory(GiB)": 33.07, "step": 4975, "token_acc": 0.8018323719036308, "train_speed(iter/s)": 0.096075 }, { "epoch": 0.23158818690980326, "grad_norm": 5.399019241333008, "learning_rate": 9.125492185506787e-06, "loss": 0.7386826515197754, "memory(GiB)": 33.07, "step": 4980, "token_acc": 0.8237602568676418, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.23182070516975287, "grad_norm": 7.110016822814941, "learning_rate": 9.123318638534737e-06, "loss": 0.7625212669372559, "memory(GiB)": 33.07, "step": 4985, "token_acc": 0.8365465213746857, "train_speed(iter/s)": 0.096184 }, { "epoch": 0.23205322342970247, "grad_norm": 5.49600887298584, "learning_rate": 9.121142653317071e-06, "loss": 0.7430119991302491, "memory(GiB)": 33.07, "step": 4990, "token_acc": 0.8249013275923932, "train_speed(iter/s)": 0.096238 }, { "epoch": 0.23228574168965208, "grad_norm": 5.538654804229736, "learning_rate": 9.118964231140516e-06, "loss": 0.6083118915557861, "memory(GiB)": 33.07, "step": 4995, "token_acc": 0.8432861580945806, "train_speed(iter/s)": 0.096291 }, { "epoch": 0.23251825994960168, "grad_norm": 5.317222595214844, "learning_rate": 9.116783373293238e-06, "loss": 0.7773871898651123, "memory(GiB)": 33.07, "step": 5000, "token_acc": 0.8092269326683291, "train_speed(iter/s)": 0.096343 }, { "epoch": 0.23251825994960168, "eval_loss": 0.6743544340133667, "eval_runtime": 290.6524, "eval_samples_per_second": 11.956, "eval_steps_per_second": 11.956, "step": 5000 }, { "epoch": 0.23275077820955126, "grad_norm": 6.647306442260742, "learning_rate": 9.114600081064852e-06, "loss": 0.7474261283874511, "memory(GiB)": 33.07, "step": 5005, "token_acc": 0.8067668450225882, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.23298329646950086, "grad_norm": 6.466226100921631, "learning_rate": 9.1124143557464e-06, "loss": 0.7601866722106934, "memory(GiB)": 33.07, "step": 5010, "token_acc": 0.8133603238866397, "train_speed(iter/s)": 0.095914 }, { "epoch": 0.23321581472945047, "grad_norm": 5.549792766571045, "learning_rate": 9.110226198630372e-06, "loss": 0.6700472354888916, "memory(GiB)": 33.07, "step": 5015, "token_acc": 0.8329427519250083, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.23344833298940007, "grad_norm": 5.168575286865234, "learning_rate": 9.10803561101069e-06, "loss": 0.7552637100219727, "memory(GiB)": 33.07, "step": 5020, "token_acc": 0.8173913043478261, "train_speed(iter/s)": 0.096023 }, { "epoch": 0.23368085124934967, "grad_norm": 4.69772481918335, "learning_rate": 9.10584259418272e-06, "loss": 0.7492507457733154, "memory(GiB)": 33.07, "step": 5025, "token_acc": 0.8177269478198713, "train_speed(iter/s)": 0.096076 }, { "epoch": 0.23391336950929928, "grad_norm": 5.720139503479004, "learning_rate": 9.103647149443258e-06, "loss": 0.8174025535583496, "memory(GiB)": 33.07, "step": 5030, "token_acc": 0.7844343407328163, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.23414588776924888, "grad_norm": 4.886946201324463, "learning_rate": 9.101449278090539e-06, "loss": 0.7711383819580078, "memory(GiB)": 33.07, "step": 5035, "token_acc": 0.8156508653122648, "train_speed(iter/s)": 0.096183 }, { "epoch": 0.2343784060291985, "grad_norm": 5.7471184730529785, "learning_rate": 9.099248981424232e-06, "loss": 0.8346371650695801, "memory(GiB)": 33.07, "step": 5040, "token_acc": 0.7889851485148515, "train_speed(iter/s)": 0.096236 }, { "epoch": 0.2346109242891481, "grad_norm": 4.732773780822754, "learning_rate": 9.097046260745439e-06, "loss": 0.7209797382354737, "memory(GiB)": 33.07, "step": 5045, "token_acc": 0.8060527172144484, "train_speed(iter/s)": 0.096289 }, { "epoch": 0.2348434425490977, "grad_norm": 6.407209396362305, "learning_rate": 9.094841117356698e-06, "loss": 0.7606247425079345, "memory(GiB)": 33.07, "step": 5050, "token_acc": 0.8119364534134822, "train_speed(iter/s)": 0.096342 }, { "epoch": 0.2348434425490977, "eval_loss": 0.6779821515083313, "eval_runtime": 290.5023, "eval_samples_per_second": 11.962, "eval_steps_per_second": 11.962, "step": 5050 }, { "epoch": 0.23507596080904727, "grad_norm": 7.122600555419922, "learning_rate": 9.09263355256198e-06, "loss": 0.7734743118286133, "memory(GiB)": 33.07, "step": 5055, "token_acc": 0.8066206125464201, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.23530847906899688, "grad_norm": 6.685431480407715, "learning_rate": 9.090423567666683e-06, "loss": 0.687248420715332, "memory(GiB)": 33.07, "step": 5060, "token_acc": 0.8142581888246628, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.23554099732894648, "grad_norm": 5.379944801330566, "learning_rate": 9.088211163977644e-06, "loss": 0.7669831275939941, "memory(GiB)": 33.07, "step": 5065, "token_acc": 0.7954483136824787, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.2357735155888961, "grad_norm": 6.775158882141113, "learning_rate": 9.08599634280312e-06, "loss": 0.637472677230835, "memory(GiB)": 33.07, "step": 5070, "token_acc": 0.8509949097639982, "train_speed(iter/s)": 0.096022 }, { "epoch": 0.2360060338488457, "grad_norm": 7.46367883682251, "learning_rate": 9.083779105452809e-06, "loss": 0.8237061500549316, "memory(GiB)": 33.07, "step": 5075, "token_acc": 0.8067835365853658, "train_speed(iter/s)": 0.096075 }, { "epoch": 0.2362385521087953, "grad_norm": 5.912139892578125, "learning_rate": 9.081559453237825e-06, "loss": 0.816553783416748, "memory(GiB)": 33.07, "step": 5080, "token_acc": 0.8075, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.2364710703687449, "grad_norm": 6.141449451446533, "learning_rate": 9.079337387470721e-06, "loss": 0.7349120140075683, "memory(GiB)": 33.07, "step": 5085, "token_acc": 0.8159602901870943, "train_speed(iter/s)": 0.096182 }, { "epoch": 0.2367035886286945, "grad_norm": 5.841860294342041, "learning_rate": 9.077112909465473e-06, "loss": 0.7999568939208984, "memory(GiB)": 33.07, "step": 5090, "token_acc": 0.8011173184357542, "train_speed(iter/s)": 0.096236 }, { "epoch": 0.2369361068886441, "grad_norm": 7.302816867828369, "learning_rate": 9.074886020537486e-06, "loss": 0.7663827896118164, "memory(GiB)": 33.07, "step": 5095, "token_acc": 0.8211050724637681, "train_speed(iter/s)": 0.096289 }, { "epoch": 0.2371686251485937, "grad_norm": 7.1966094970703125, "learning_rate": 9.072656722003581e-06, "loss": 0.7074132919311523, "memory(GiB)": 33.07, "step": 5100, "token_acc": 0.831053901850362, "train_speed(iter/s)": 0.096341 }, { "epoch": 0.2371686251485937, "eval_loss": 0.6764135360717773, "eval_runtime": 290.8232, "eval_samples_per_second": 11.949, "eval_steps_per_second": 11.949, "step": 5100 }, { "epoch": 0.2374011434085433, "grad_norm": 7.087984561920166, "learning_rate": 9.070425015182019e-06, "loss": 0.8261497497558594, "memory(GiB)": 33.07, "step": 5105, "token_acc": 0.8059494266814551, "train_speed(iter/s)": 0.095867 }, { "epoch": 0.2376336616684929, "grad_norm": 6.17009973526001, "learning_rate": 9.06819090139247e-06, "loss": 0.8527179718017578, "memory(GiB)": 33.07, "step": 5110, "token_acc": 0.8023850085178875, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.2378661799284425, "grad_norm": 7.8616180419921875, "learning_rate": 9.065954381956042e-06, "loss": 0.7692588806152344, "memory(GiB)": 33.07, "step": 5115, "token_acc": 0.8109507954125046, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.2380986981883921, "grad_norm": 8.128244400024414, "learning_rate": 9.06371545819525e-06, "loss": 0.6775061607360839, "memory(GiB)": 33.07, "step": 5120, "token_acc": 0.8120456905503635, "train_speed(iter/s)": 0.096024 }, { "epoch": 0.2383312164483417, "grad_norm": 5.31683874130249, "learning_rate": 9.061474131434045e-06, "loss": 0.7091841697692871, "memory(GiB)": 33.07, "step": 5125, "token_acc": 0.8258521768477894, "train_speed(iter/s)": 0.096077 }, { "epoch": 0.23856373470829131, "grad_norm": 5.563629627227783, "learning_rate": 9.05923040299779e-06, "loss": 0.7681272983551025, "memory(GiB)": 33.07, "step": 5130, "token_acc": 0.8094393186657204, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.23879625296824092, "grad_norm": 5.285060405731201, "learning_rate": 9.056984274213272e-06, "loss": 0.7310568809509277, "memory(GiB)": 33.07, "step": 5135, "token_acc": 0.8217200251098556, "train_speed(iter/s)": 0.096181 }, { "epoch": 0.23902877122819052, "grad_norm": 6.377146244049072, "learning_rate": 9.054735746408695e-06, "loss": 0.7459100246429443, "memory(GiB)": 33.07, "step": 5140, "token_acc": 0.8177159590043924, "train_speed(iter/s)": 0.096234 }, { "epoch": 0.23926128948814013, "grad_norm": 5.891545295715332, "learning_rate": 9.052484820913683e-06, "loss": 0.7460856437683105, "memory(GiB)": 33.07, "step": 5145, "token_acc": 0.8140324405884571, "train_speed(iter/s)": 0.096286 }, { "epoch": 0.2394938077480897, "grad_norm": 5.500942707061768, "learning_rate": 9.050231499059278e-06, "loss": 0.7375868320465088, "memory(GiB)": 33.07, "step": 5150, "token_acc": 0.8267284991568297, "train_speed(iter/s)": 0.096338 }, { "epoch": 0.2394938077480897, "eval_loss": 0.6725084185600281, "eval_runtime": 289.1326, "eval_samples_per_second": 12.019, "eval_steps_per_second": 12.019, "step": 5150 }, { "epoch": 0.2397263260080393, "grad_norm": 6.5969319343566895, "learning_rate": 9.04797578217794e-06, "loss": 0.7631452560424805, "memory(GiB)": 33.07, "step": 5155, "token_acc": 0.8082897828067316, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.2399588442679889, "grad_norm": 6.98110294342041, "learning_rate": 9.045717671603544e-06, "loss": 0.7008907794952393, "memory(GiB)": 33.07, "step": 5160, "token_acc": 0.8269662921348314, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.24019136252793852, "grad_norm": 7.913124084472656, "learning_rate": 9.043457168671378e-06, "loss": 0.7300667285919189, "memory(GiB)": 33.07, "step": 5165, "token_acc": 0.8395061728395061, "train_speed(iter/s)": 0.095975 }, { "epoch": 0.24042388078788812, "grad_norm": 6.288471698760986, "learning_rate": 9.041194274718151e-06, "loss": 0.7818746566772461, "memory(GiB)": 33.07, "step": 5170, "token_acc": 0.8066581306017926, "train_speed(iter/s)": 0.096026 }, { "epoch": 0.24065639904783773, "grad_norm": 5.895214080810547, "learning_rate": 9.038928991081976e-06, "loss": 0.7084590911865234, "memory(GiB)": 33.07, "step": 5175, "token_acc": 0.817533129459735, "train_speed(iter/s)": 0.096077 }, { "epoch": 0.24088891730778733, "grad_norm": 4.603113174438477, "learning_rate": 9.036661319102393e-06, "loss": 0.764235258102417, "memory(GiB)": 33.07, "step": 5180, "token_acc": 0.8124600638977636, "train_speed(iter/s)": 0.096127 }, { "epoch": 0.24112143556773694, "grad_norm": 6.578195095062256, "learning_rate": 9.034391260120342e-06, "loss": 0.732538890838623, "memory(GiB)": 33.07, "step": 5185, "token_acc": 0.8214139762975071, "train_speed(iter/s)": 0.09618 }, { "epoch": 0.24135395382768654, "grad_norm": 7.548125267028809, "learning_rate": 9.032118815478177e-06, "loss": 0.7742302894592286, "memory(GiB)": 33.07, "step": 5190, "token_acc": 0.812288993923025, "train_speed(iter/s)": 0.096233 }, { "epoch": 0.24158647208763614, "grad_norm": 6.740896701812744, "learning_rate": 9.029843986519667e-06, "loss": 0.7679877758026123, "memory(GiB)": 33.07, "step": 5195, "token_acc": 0.8307624504186867, "train_speed(iter/s)": 0.096285 }, { "epoch": 0.24181899034758572, "grad_norm": 5.745598316192627, "learning_rate": 9.02756677458999e-06, "loss": 0.7190179824829102, "memory(GiB)": 33.07, "step": 5200, "token_acc": 0.8271690286899412, "train_speed(iter/s)": 0.096336 }, { "epoch": 0.24181899034758572, "eval_loss": 0.6732439994812012, "eval_runtime": 293.5982, "eval_samples_per_second": 11.836, "eval_steps_per_second": 11.836, "step": 5200 }, { "epoch": 0.24205150860753533, "grad_norm": 6.52875280380249, "learning_rate": 9.025287181035731e-06, "loss": 0.7002001285552979, "memory(GiB)": 33.07, "step": 5205, "token_acc": 0.808760162764112, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.24228402686748493, "grad_norm": 4.988480091094971, "learning_rate": 9.023005207204883e-06, "loss": 0.7857324600219726, "memory(GiB)": 33.07, "step": 5210, "token_acc": 0.7975120939875605, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.24251654512743454, "grad_norm": 6.880677700042725, "learning_rate": 9.020720854446847e-06, "loss": 0.7423035621643066, "memory(GiB)": 33.07, "step": 5215, "token_acc": 0.8371703641691084, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.24274906338738414, "grad_norm": 6.216911315917969, "learning_rate": 9.018434124112434e-06, "loss": 0.7673687934875488, "memory(GiB)": 33.07, "step": 5220, "token_acc": 0.80249753532698, "train_speed(iter/s)": 0.09602 }, { "epoch": 0.24298158164733374, "grad_norm": 5.229708194732666, "learning_rate": 9.016145017553856e-06, "loss": 0.8729522705078125, "memory(GiB)": 33.07, "step": 5225, "token_acc": 0.7923865300146413, "train_speed(iter/s)": 0.096072 }, { "epoch": 0.24321409990728335, "grad_norm": 5.493806838989258, "learning_rate": 9.013853536124732e-06, "loss": 0.806683349609375, "memory(GiB)": 33.07, "step": 5230, "token_acc": 0.806822262118492, "train_speed(iter/s)": 0.096125 }, { "epoch": 0.24344661816723295, "grad_norm": 6.720024108886719, "learning_rate": 9.011559681180088e-06, "loss": 0.7355064392089844, "memory(GiB)": 33.07, "step": 5235, "token_acc": 0.8318548387096775, "train_speed(iter/s)": 0.096177 }, { "epoch": 0.24367913642718256, "grad_norm": 7.687710285186768, "learning_rate": 9.009263454076349e-06, "loss": 0.7195593833923339, "memory(GiB)": 33.07, "step": 5240, "token_acc": 0.8145440554059253, "train_speed(iter/s)": 0.096229 }, { "epoch": 0.24391165468713216, "grad_norm": 7.271475315093994, "learning_rate": 9.006964856171347e-06, "loss": 0.6926477432250977, "memory(GiB)": 33.07, "step": 5245, "token_acc": 0.8299108872530027, "train_speed(iter/s)": 0.09628 }, { "epoch": 0.24414417294708174, "grad_norm": 6.054494857788086, "learning_rate": 9.004663888824312e-06, "loss": 0.7744291305541993, "memory(GiB)": 33.07, "step": 5250, "token_acc": 0.7921225382932167, "train_speed(iter/s)": 0.096332 }, { "epoch": 0.24414417294708174, "eval_loss": 0.6698423624038696, "eval_runtime": 289.3444, "eval_samples_per_second": 12.01, "eval_steps_per_second": 12.01, "step": 5250 }, { "epoch": 0.24437669120703134, "grad_norm": 5.884529113769531, "learning_rate": 9.002360553395877e-06, "loss": 0.6975594997406006, "memory(GiB)": 33.07, "step": 5255, "token_acc": 0.808309229555614, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.24460920946698095, "grad_norm": 5.387625217437744, "learning_rate": 9.000054851248078e-06, "loss": 0.721724557876587, "memory(GiB)": 33.07, "step": 5260, "token_acc": 0.820926243567753, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.24484172772693055, "grad_norm": 5.464982986450195, "learning_rate": 8.997746783744346e-06, "loss": 0.8935551643371582, "memory(GiB)": 33.07, "step": 5265, "token_acc": 0.7821548821548822, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.24507424598688016, "grad_norm": 7.145682334899902, "learning_rate": 8.995436352249512e-06, "loss": 0.6936135768890381, "memory(GiB)": 33.07, "step": 5270, "token_acc": 0.8183799491463858, "train_speed(iter/s)": 0.096028 }, { "epoch": 0.24530676424682976, "grad_norm": 6.86348819732666, "learning_rate": 8.993123558129806e-06, "loss": 0.7699307918548584, "memory(GiB)": 33.07, "step": 5275, "token_acc": 0.8124552612741589, "train_speed(iter/s)": 0.09608 }, { "epoch": 0.24553928250677937, "grad_norm": 6.941252708435059, "learning_rate": 8.990808402752856e-06, "loss": 0.7396425247192383, "memory(GiB)": 33.07, "step": 5280, "token_acc": 0.8251801289343952, "train_speed(iter/s)": 0.096132 }, { "epoch": 0.24577180076672897, "grad_norm": 5.640344619750977, "learning_rate": 8.988490887487683e-06, "loss": 0.7938966751098633, "memory(GiB)": 33.07, "step": 5285, "token_acc": 0.8080515297906602, "train_speed(iter/s)": 0.096184 }, { "epoch": 0.24600431902667858, "grad_norm": 5.452683448791504, "learning_rate": 8.986171013704703e-06, "loss": 0.7618279933929444, "memory(GiB)": 33.07, "step": 5290, "token_acc": 0.8115112756647593, "train_speed(iter/s)": 0.096235 }, { "epoch": 0.24623683728662815, "grad_norm": 5.495558738708496, "learning_rate": 8.983848782775735e-06, "loss": 0.8416355133056641, "memory(GiB)": 33.07, "step": 5295, "token_acc": 0.7899057464274856, "train_speed(iter/s)": 0.096287 }, { "epoch": 0.24646935554657776, "grad_norm": 6.56035041809082, "learning_rate": 8.981524196073981e-06, "loss": 0.9013174057006836, "memory(GiB)": 33.07, "step": 5300, "token_acc": 0.8080099091659785, "train_speed(iter/s)": 0.096338 }, { "epoch": 0.24646935554657776, "eval_loss": 0.6700074672698975, "eval_runtime": 288.0838, "eval_samples_per_second": 12.062, "eval_steps_per_second": 12.062, "step": 5300 }, { "epoch": 0.24670187380652736, "grad_norm": 6.2724833488464355, "learning_rate": 8.979197254974045e-06, "loss": 0.7496028900146484, "memory(GiB)": 33.07, "step": 5305, "token_acc": 0.8083637677087931, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.24693439206647697, "grad_norm": 4.500880241394043, "learning_rate": 8.976867960851915e-06, "loss": 0.7409276008605957, "memory(GiB)": 33.07, "step": 5310, "token_acc": 0.8254813600983204, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.24716691032642657, "grad_norm": 6.141285419464111, "learning_rate": 8.974536315084976e-06, "loss": 0.8386311531066895, "memory(GiB)": 33.07, "step": 5315, "token_acc": 0.7797459893048129, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.24739942858637617, "grad_norm": 5.962406635284424, "learning_rate": 8.972202319052004e-06, "loss": 0.6776054859161377, "memory(GiB)": 33.07, "step": 5320, "token_acc": 0.8359511343804538, "train_speed(iter/s)": 0.09604 }, { "epoch": 0.24763194684632578, "grad_norm": 6.162694931030273, "learning_rate": 8.969865974133161e-06, "loss": 0.7959749221801757, "memory(GiB)": 33.07, "step": 5325, "token_acc": 0.7978366281238344, "train_speed(iter/s)": 0.096091 }, { "epoch": 0.24786446510627538, "grad_norm": 6.722538948059082, "learning_rate": 8.96752728171e-06, "loss": 0.7133076190948486, "memory(GiB)": 33.07, "step": 5330, "token_acc": 0.8228179143510951, "train_speed(iter/s)": 0.096139 }, { "epoch": 0.248096983366225, "grad_norm": 5.973226070404053, "learning_rate": 8.965186243165461e-06, "loss": 0.7359925270080566, "memory(GiB)": 33.07, "step": 5335, "token_acc": 0.8213429256594724, "train_speed(iter/s)": 0.096191 }, { "epoch": 0.2483295016261746, "grad_norm": 7.109038829803467, "learning_rate": 8.962842859883875e-06, "loss": 0.714454174041748, "memory(GiB)": 33.07, "step": 5340, "token_acc": 0.8213627992633518, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.24856201988612417, "grad_norm": 5.056357383728027, "learning_rate": 8.960497133250954e-06, "loss": 0.7473381519317627, "memory(GiB)": 33.07, "step": 5345, "token_acc": 0.8127871362940275, "train_speed(iter/s)": 0.096292 }, { "epoch": 0.24879453814607377, "grad_norm": 6.350715637207031, "learning_rate": 8.958149064653802e-06, "loss": 0.7907238483428956, "memory(GiB)": 33.07, "step": 5350, "token_acc": 0.8073970690858339, "train_speed(iter/s)": 0.096342 }, { "epoch": 0.24879453814607377, "eval_loss": 0.6689327359199524, "eval_runtime": 290.5795, "eval_samples_per_second": 11.959, "eval_steps_per_second": 11.959, "step": 5350 }, { "epoch": 0.24902705640602338, "grad_norm": 5.9941534996032715, "learning_rate": 8.955798655480901e-06, "loss": 0.7376707077026368, "memory(GiB)": 33.07, "step": 5355, "token_acc": 0.8085616190529794, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.24925957466597298, "grad_norm": 5.566898345947266, "learning_rate": 8.953445907122123e-06, "loss": 0.854979419708252, "memory(GiB)": 33.07, "step": 5360, "token_acc": 0.7967625899280576, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.2494920929259226, "grad_norm": 5.000674724578857, "learning_rate": 8.95109082096872e-06, "loss": 0.7167798042297363, "memory(GiB)": 33.07, "step": 5365, "token_acc": 0.8103186646433991, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.2497246111858722, "grad_norm": 7.17212438583374, "learning_rate": 8.948733398413326e-06, "loss": 0.9033098220825195, "memory(GiB)": 33.07, "step": 5370, "token_acc": 0.7684180468303826, "train_speed(iter/s)": 0.096042 }, { "epoch": 0.2499571294458218, "grad_norm": 6.104959487915039, "learning_rate": 8.946373640849958e-06, "loss": 0.8636404991149902, "memory(GiB)": 33.07, "step": 5375, "token_acc": 0.8045325779036827, "train_speed(iter/s)": 0.096091 }, { "epoch": 0.2501896477057714, "grad_norm": 5.245123386383057, "learning_rate": 8.944011549674016e-06, "loss": 0.5826745986938476, "memory(GiB)": 33.07, "step": 5380, "token_acc": 0.8652193577566711, "train_speed(iter/s)": 0.096141 }, { "epoch": 0.250422165965721, "grad_norm": 6.003298282623291, "learning_rate": 8.941647126282275e-06, "loss": 0.7276782989501953, "memory(GiB)": 33.07, "step": 5385, "token_acc": 0.8281549673954737, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.2506546842256706, "grad_norm": 5.025068759918213, "learning_rate": 8.939280372072891e-06, "loss": 0.7485162734985351, "memory(GiB)": 33.07, "step": 5390, "token_acc": 0.8045441304981066, "train_speed(iter/s)": 0.096243 }, { "epoch": 0.2508872024856202, "grad_norm": 6.471608638763428, "learning_rate": 8.9369112884454e-06, "loss": 0.8305785179138183, "memory(GiB)": 33.07, "step": 5395, "token_acc": 0.7878504672897196, "train_speed(iter/s)": 0.096294 }, { "epoch": 0.2511197207455698, "grad_norm": 7.496315002441406, "learning_rate": 8.934539876800716e-06, "loss": 0.8001940727233887, "memory(GiB)": 33.07, "step": 5400, "token_acc": 0.8088871411718442, "train_speed(iter/s)": 0.096344 }, { "epoch": 0.2511197207455698, "eval_loss": 0.6697515845298767, "eval_runtime": 291.0629, "eval_samples_per_second": 11.939, "eval_steps_per_second": 11.939, "step": 5400 }, { "epoch": 0.2513522390055194, "grad_norm": 6.354772567749023, "learning_rate": 8.932166138541127e-06, "loss": 0.765593433380127, "memory(GiB)": 33.07, "step": 5405, "token_acc": 0.8085344593189706, "train_speed(iter/s)": 0.095895 }, { "epoch": 0.25158475726546903, "grad_norm": 7.054717063903809, "learning_rate": 8.929790075070295e-06, "loss": 0.7476856708526611, "memory(GiB)": 33.07, "step": 5410, "token_acc": 0.8313807531380754, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.2518172755254186, "grad_norm": 5.243678569793701, "learning_rate": 8.92741168779326e-06, "loss": 0.7400431632995605, "memory(GiB)": 33.07, "step": 5415, "token_acc": 0.8249431633647288, "train_speed(iter/s)": 0.095994 }, { "epoch": 0.2520497937853682, "grad_norm": 7.673439979553223, "learning_rate": 8.925030978116441e-06, "loss": 0.6828903675079345, "memory(GiB)": 33.07, "step": 5420, "token_acc": 0.8329528158295282, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.2522823120453178, "grad_norm": 5.734298229217529, "learning_rate": 8.92264794744762e-06, "loss": 0.7182761192321777, "memory(GiB)": 33.07, "step": 5425, "token_acc": 0.8171109733415995, "train_speed(iter/s)": 0.096095 }, { "epoch": 0.2525148303052674, "grad_norm": 6.604122161865234, "learning_rate": 8.920262597195959e-06, "loss": 0.7213967800140381, "memory(GiB)": 33.07, "step": 5430, "token_acc": 0.8148869836321122, "train_speed(iter/s)": 0.096146 }, { "epoch": 0.252747348565217, "grad_norm": 5.710094451904297, "learning_rate": 8.917874928771988e-06, "loss": 0.7809642314910888, "memory(GiB)": 33.07, "step": 5435, "token_acc": 0.7974232456140351, "train_speed(iter/s)": 0.096195 }, { "epoch": 0.2529798668251666, "grad_norm": 6.227021217346191, "learning_rate": 8.91548494358761e-06, "loss": 0.9448799133300781, "memory(GiB)": 33.07, "step": 5440, "token_acc": 0.7778573987817986, "train_speed(iter/s)": 0.096245 }, { "epoch": 0.25321238508511623, "grad_norm": 6.427996635437012, "learning_rate": 8.913092643056095e-06, "loss": 0.7775098323822022, "memory(GiB)": 33.07, "step": 5445, "token_acc": 0.8085705348764467, "train_speed(iter/s)": 0.096293 }, { "epoch": 0.2534449033450658, "grad_norm": 5.984553337097168, "learning_rate": 8.910698028592087e-06, "loss": 0.7316121578216552, "memory(GiB)": 33.07, "step": 5450, "token_acc": 0.8200064745872451, "train_speed(iter/s)": 0.096344 }, { "epoch": 0.2534449033450658, "eval_loss": 0.6686436533927917, "eval_runtime": 291.087, "eval_samples_per_second": 11.938, "eval_steps_per_second": 11.938, "step": 5450 }, { "epoch": 0.25367742160501544, "grad_norm": 5.354024410247803, "learning_rate": 8.908301101611594e-06, "loss": 0.7186790466308594, "memory(GiB)": 33.07, "step": 5455, "token_acc": 0.8094466720128308, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.253909939864965, "grad_norm": 7.332178592681885, "learning_rate": 8.905901863531992e-06, "loss": 0.9101710319519043, "memory(GiB)": 33.07, "step": 5460, "token_acc": 0.7733913584084551, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.25414245812491465, "grad_norm": 5.764885902404785, "learning_rate": 8.903500315772026e-06, "loss": 0.7292781352996827, "memory(GiB)": 33.07, "step": 5465, "token_acc": 0.8157294832826748, "train_speed(iter/s)": 0.095999 }, { "epoch": 0.2543749763848642, "grad_norm": 5.920174598693848, "learning_rate": 8.901096459751805e-06, "loss": 0.7806157112121582, "memory(GiB)": 33.07, "step": 5470, "token_acc": 0.7944377267230955, "train_speed(iter/s)": 0.096048 }, { "epoch": 0.2546074946448138, "grad_norm": 6.420112609863281, "learning_rate": 8.898690296892804e-06, "loss": 0.7653073310852051, "memory(GiB)": 33.07, "step": 5475, "token_acc": 0.8201388888888889, "train_speed(iter/s)": 0.096097 }, { "epoch": 0.25484001290476344, "grad_norm": 6.058072090148926, "learning_rate": 8.896281828617861e-06, "loss": 0.944705867767334, "memory(GiB)": 33.07, "step": 5480, "token_acc": 0.7741743528711693, "train_speed(iter/s)": 0.096147 }, { "epoch": 0.255072531164713, "grad_norm": 6.223658561706543, "learning_rate": 8.893871056351178e-06, "loss": 0.7866355895996093, "memory(GiB)": 33.07, "step": 5485, "token_acc": 0.8078734858681023, "train_speed(iter/s)": 0.096195 }, { "epoch": 0.25530504942466264, "grad_norm": 4.456324100494385, "learning_rate": 8.891457981518317e-06, "loss": 0.7789161682128907, "memory(GiB)": 33.07, "step": 5490, "token_acc": 0.8106844741235393, "train_speed(iter/s)": 0.096244 }, { "epoch": 0.2555375676846122, "grad_norm": 6.5606160163879395, "learning_rate": 8.889042605546206e-06, "loss": 0.8363648414611816, "memory(GiB)": 33.07, "step": 5495, "token_acc": 0.803680981595092, "train_speed(iter/s)": 0.096292 }, { "epoch": 0.25577008594456185, "grad_norm": 5.641964435577393, "learning_rate": 8.886624929863128e-06, "loss": 0.8033831596374512, "memory(GiB)": 33.07, "step": 5500, "token_acc": 0.8071581196581197, "train_speed(iter/s)": 0.096341 }, { "epoch": 0.25577008594456185, "eval_loss": 0.6669542193412781, "eval_runtime": 290.9922, "eval_samples_per_second": 11.942, "eval_steps_per_second": 11.942, "step": 5500 }, { "epoch": 0.25600260420451143, "grad_norm": 5.638400554656982, "learning_rate": 8.884204955898734e-06, "loss": 0.8716331481933594, "memory(GiB)": 33.07, "step": 5505, "token_acc": 0.8083292041622046, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.25623512246446106, "grad_norm": 6.0181989669799805, "learning_rate": 8.881782685084027e-06, "loss": 0.6829388618469239, "memory(GiB)": 33.07, "step": 5510, "token_acc": 0.8382509776039815, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.25646764072441064, "grad_norm": 6.096235752105713, "learning_rate": 8.879358118851369e-06, "loss": 0.7325149536132812, "memory(GiB)": 33.07, "step": 5515, "token_acc": 0.8188976377952756, "train_speed(iter/s)": 0.095999 }, { "epoch": 0.2567001589843602, "grad_norm": 7.176323413848877, "learning_rate": 8.876931258634483e-06, "loss": 0.7665040969848633, "memory(GiB)": 33.07, "step": 5520, "token_acc": 0.8297587131367292, "train_speed(iter/s)": 0.096048 }, { "epoch": 0.25693267724430985, "grad_norm": 7.309201240539551, "learning_rate": 8.874502105868447e-06, "loss": 0.7806261539459228, "memory(GiB)": 33.07, "step": 5525, "token_acc": 0.8165064102564102, "train_speed(iter/s)": 0.096097 }, { "epoch": 0.2571651955042594, "grad_norm": 6.094222545623779, "learning_rate": 8.872070661989691e-06, "loss": 0.7837971210479736, "memory(GiB)": 33.07, "step": 5530, "token_acc": 0.8048245614035088, "train_speed(iter/s)": 0.096145 }, { "epoch": 0.25739771376420906, "grad_norm": 5.346552848815918, "learning_rate": 8.869636928436006e-06, "loss": 0.919887924194336, "memory(GiB)": 33.07, "step": 5535, "token_acc": 0.7646528403967539, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.25763023202415863, "grad_norm": 5.813202381134033, "learning_rate": 8.867200906646532e-06, "loss": 0.6144495487213135, "memory(GiB)": 33.07, "step": 5540, "token_acc": 0.8432231962238705, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.25786275028410827, "grad_norm": 5.423628330230713, "learning_rate": 8.864762598061764e-06, "loss": 0.7319612026214599, "memory(GiB)": 33.07, "step": 5545, "token_acc": 0.8209011737978038, "train_speed(iter/s)": 0.096288 }, { "epoch": 0.25809526854405784, "grad_norm": 6.213625907897949, "learning_rate": 8.86232200412355e-06, "loss": 0.7926487445831298, "memory(GiB)": 33.07, "step": 5550, "token_acc": 0.8025708635464733, "train_speed(iter/s)": 0.096337 }, { "epoch": 0.25809526854405784, "eval_loss": 0.6657958626747131, "eval_runtime": 290.1857, "eval_samples_per_second": 11.975, "eval_steps_per_second": 11.975, "step": 5550 }, { "epoch": 0.2583277868040075, "grad_norm": 6.262226104736328, "learning_rate": 8.859879126275088e-06, "loss": 0.9141180038452148, "memory(GiB)": 33.07, "step": 5555, "token_acc": 0.8079669387201519, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.25856030506395705, "grad_norm": 8.025341033935547, "learning_rate": 8.857433965960926e-06, "loss": 0.799907112121582, "memory(GiB)": 33.07, "step": 5560, "token_acc": 0.8073394495412844, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.25879282332390663, "grad_norm": 5.492082118988037, "learning_rate": 8.854986524626965e-06, "loss": 0.9118124008178711, "memory(GiB)": 33.07, "step": 5565, "token_acc": 0.7928526249209361, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.25902534158385626, "grad_norm": 5.745024681091309, "learning_rate": 8.852536803720452e-06, "loss": 0.8177533149719238, "memory(GiB)": 33.07, "step": 5570, "token_acc": 0.8142504118616145, "train_speed(iter/s)": 0.096046 }, { "epoch": 0.25925785984380584, "grad_norm": 4.650488376617432, "learning_rate": 8.850084804689982e-06, "loss": 0.8530696868896485, "memory(GiB)": 33.07, "step": 5575, "token_acc": 0.7954545454545454, "train_speed(iter/s)": 0.096095 }, { "epoch": 0.25949037810375547, "grad_norm": 7.37318229675293, "learning_rate": 8.847630528985496e-06, "loss": 0.7755809783935547, "memory(GiB)": 33.07, "step": 5580, "token_acc": 0.7971880492091389, "train_speed(iter/s)": 0.096143 }, { "epoch": 0.25972289636370505, "grad_norm": 6.992980003356934, "learning_rate": 8.845173978058288e-06, "loss": 0.7328850269317627, "memory(GiB)": 33.07, "step": 5585, "token_acc": 0.8208566108007449, "train_speed(iter/s)": 0.096191 }, { "epoch": 0.2599554146236547, "grad_norm": 4.85237979888916, "learning_rate": 8.842715153360987e-06, "loss": 0.6974979400634765, "memory(GiB)": 33.07, "step": 5590, "token_acc": 0.8187894541403639, "train_speed(iter/s)": 0.09624 }, { "epoch": 0.26018793288360426, "grad_norm": 6.679464817047119, "learning_rate": 8.840254056347574e-06, "loss": 0.815558910369873, "memory(GiB)": 33.07, "step": 5595, "token_acc": 0.8056239015817224, "train_speed(iter/s)": 0.096289 }, { "epoch": 0.2604204511435539, "grad_norm": 5.637943744659424, "learning_rate": 8.837790688473373e-06, "loss": 0.9436439514160156, "memory(GiB)": 33.07, "step": 5600, "token_acc": 0.781738907412157, "train_speed(iter/s)": 0.096337 }, { "epoch": 0.2604204511435539, "eval_loss": 0.6708033680915833, "eval_runtime": 291.4602, "eval_samples_per_second": 11.923, "eval_steps_per_second": 11.923, "step": 5600 }, { "epoch": 0.26065296940350346, "grad_norm": 6.944599628448486, "learning_rate": 8.835325051195047e-06, "loss": 0.6876493930816651, "memory(GiB)": 33.07, "step": 5605, "token_acc": 0.8092388568871987, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.2608854876634531, "grad_norm": 6.316150188446045, "learning_rate": 8.832857145970606e-06, "loss": 0.8054632186889649, "memory(GiB)": 33.07, "step": 5610, "token_acc": 0.8057960105382009, "train_speed(iter/s)": 0.095953 }, { "epoch": 0.2611180059234027, "grad_norm": 6.6088175773620605, "learning_rate": 8.830386974259398e-06, "loss": 0.784023666381836, "memory(GiB)": 33.07, "step": 5615, "token_acc": 0.8076171875, "train_speed(iter/s)": 0.096 }, { "epoch": 0.26135052418335225, "grad_norm": 5.055639743804932, "learning_rate": 8.827914537522111e-06, "loss": 0.7353429794311523, "memory(GiB)": 33.07, "step": 5620, "token_acc": 0.8256467941507312, "train_speed(iter/s)": 0.096049 }, { "epoch": 0.2615830424433019, "grad_norm": 6.476213455200195, "learning_rate": 8.825439837220772e-06, "loss": 0.7795384883880615, "memory(GiB)": 33.07, "step": 5625, "token_acc": 0.8153323262839879, "train_speed(iter/s)": 0.096097 }, { "epoch": 0.26181556070325146, "grad_norm": 5.233215808868408, "learning_rate": 8.82296287481875e-06, "loss": 0.8795851707458496, "memory(GiB)": 33.07, "step": 5630, "token_acc": 0.7775357809583074, "train_speed(iter/s)": 0.096145 }, { "epoch": 0.2620480789632011, "grad_norm": 5.791959285736084, "learning_rate": 8.820483651780746e-06, "loss": 0.7681045532226562, "memory(GiB)": 33.07, "step": 5635, "token_acc": 0.8212083847102343, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.26228059722315067, "grad_norm": 7.508996963500977, "learning_rate": 8.818002169572808e-06, "loss": 0.8352363586425782, "memory(GiB)": 33.07, "step": 5640, "token_acc": 0.8109121199500208, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.2625131154831003, "grad_norm": 5.6559906005859375, "learning_rate": 8.815518429662304e-06, "loss": 0.7900479316711426, "memory(GiB)": 33.07, "step": 5645, "token_acc": 0.8133054684778823, "train_speed(iter/s)": 0.09629 }, { "epoch": 0.2627456337430499, "grad_norm": 6.982070446014404, "learning_rate": 8.813032433517953e-06, "loss": 0.7146542549133301, "memory(GiB)": 33.07, "step": 5650, "token_acc": 0.8333946303788158, "train_speed(iter/s)": 0.096338 }, { "epoch": 0.2627456337430499, "eval_loss": 0.6662178039550781, "eval_runtime": 295.0199, "eval_samples_per_second": 11.779, "eval_steps_per_second": 11.779, "step": 5650 }, { "epoch": 0.2629781520029995, "grad_norm": 6.79498815536499, "learning_rate": 8.810544182609799e-06, "loss": 0.7456812381744384, "memory(GiB)": 33.07, "step": 5655, "token_acc": 0.8098742446251013, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.2632106702629491, "grad_norm": 4.711836814880371, "learning_rate": 8.80805367840922e-06, "loss": 0.88272705078125, "memory(GiB)": 33.07, "step": 5660, "token_acc": 0.7980769230769231, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.26344318852289866, "grad_norm": 6.486434459686279, "learning_rate": 8.805560922388932e-06, "loss": 0.7468509674072266, "memory(GiB)": 33.07, "step": 5665, "token_acc": 0.8165413533834587, "train_speed(iter/s)": 0.095996 }, { "epoch": 0.2636757067828483, "grad_norm": 5.890286922454834, "learning_rate": 8.803065916022974e-06, "loss": 0.7029210567474365, "memory(GiB)": 33.07, "step": 5670, "token_acc": 0.8349483204134367, "train_speed(iter/s)": 0.096044 }, { "epoch": 0.2639082250427979, "grad_norm": 5.925194263458252, "learning_rate": 8.800568660786724e-06, "loss": 0.8480375289916993, "memory(GiB)": 33.07, "step": 5675, "token_acc": 0.7957658779576587, "train_speed(iter/s)": 0.096092 }, { "epoch": 0.2641407433027475, "grad_norm": 5.214552402496338, "learning_rate": 8.798069158156884e-06, "loss": 0.7835229396820068, "memory(GiB)": 33.07, "step": 5680, "token_acc": 0.8250564334085779, "train_speed(iter/s)": 0.096138 }, { "epoch": 0.2643732615626971, "grad_norm": 4.562428951263428, "learning_rate": 8.795567409611487e-06, "loss": 0.6990029335021972, "memory(GiB)": 33.07, "step": 5685, "token_acc": 0.81728, "train_speed(iter/s)": 0.096185 }, { "epoch": 0.2646057798226467, "grad_norm": 4.071775913238525, "learning_rate": 8.793063416629895e-06, "loss": 0.6751185417175293, "memory(GiB)": 33.07, "step": 5690, "token_acc": 0.8259620907524411, "train_speed(iter/s)": 0.096232 }, { "epoch": 0.2648382980825963, "grad_norm": 5.290688991546631, "learning_rate": 8.790557180692796e-06, "loss": 0.6121927738189697, "memory(GiB)": 33.07, "step": 5695, "token_acc": 0.845578231292517, "train_speed(iter/s)": 0.09628 }, { "epoch": 0.2650708163425459, "grad_norm": 7.041702747344971, "learning_rate": 8.788048703282204e-06, "loss": 0.7391871929168701, "memory(GiB)": 33.07, "step": 5700, "token_acc": 0.8141878274889157, "train_speed(iter/s)": 0.096327 }, { "epoch": 0.2650708163425459, "eval_loss": 0.6662322878837585, "eval_runtime": 292.6362, "eval_samples_per_second": 11.875, "eval_steps_per_second": 11.875, "step": 5700 }, { "epoch": 0.2653033346024955, "grad_norm": 6.969703674316406, "learning_rate": 8.785537985881463e-06, "loss": 0.7058303356170654, "memory(GiB)": 33.07, "step": 5705, "token_acc": 0.8105295109073336, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.2655358528624451, "grad_norm": 6.892284393310547, "learning_rate": 8.783025029975231e-06, "loss": 0.7768474102020264, "memory(GiB)": 33.07, "step": 5710, "token_acc": 0.8084707646176912, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.2657683711223947, "grad_norm": 5.746127128601074, "learning_rate": 8.780509837049501e-06, "loss": 0.7630928516387939, "memory(GiB)": 33.07, "step": 5715, "token_acc": 0.810706787963611, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.2660008893823443, "grad_norm": 5.716222763061523, "learning_rate": 8.777992408591587e-06, "loss": 0.7920127391815186, "memory(GiB)": 33.07, "step": 5720, "token_acc": 0.8055358410220014, "train_speed(iter/s)": 0.096044 }, { "epoch": 0.2662334076422939, "grad_norm": 6.537604331970215, "learning_rate": 8.775472746090114e-06, "loss": 0.6463952541351319, "memory(GiB)": 33.07, "step": 5725, "token_acc": 0.8377331925378388, "train_speed(iter/s)": 0.096092 }, { "epoch": 0.2664659259022435, "grad_norm": 5.381245136260986, "learning_rate": 8.772950851035043e-06, "loss": 0.7079122543334961, "memory(GiB)": 33.07, "step": 5730, "token_acc": 0.8135184067592034, "train_speed(iter/s)": 0.096139 }, { "epoch": 0.2666984441621931, "grad_norm": 4.338988304138184, "learning_rate": 8.770426724917645e-06, "loss": 0.7638363838195801, "memory(GiB)": 33.07, "step": 5735, "token_acc": 0.8122582564712884, "train_speed(iter/s)": 0.096185 }, { "epoch": 0.2669309624221427, "grad_norm": 5.95316743850708, "learning_rate": 8.767900369230516e-06, "loss": 0.7845072269439697, "memory(GiB)": 33.07, "step": 5740, "token_acc": 0.8254620123203286, "train_speed(iter/s)": 0.096232 }, { "epoch": 0.26716348068209234, "grad_norm": 5.003815650939941, "learning_rate": 8.765371785467565e-06, "loss": 0.7775311946868897, "memory(GiB)": 33.07, "step": 5745, "token_acc": 0.8154538634658665, "train_speed(iter/s)": 0.09628 }, { "epoch": 0.2673959989420419, "grad_norm": 7.775989532470703, "learning_rate": 8.762840975124025e-06, "loss": 0.7087774276733398, "memory(GiB)": 33.07, "step": 5750, "token_acc": 0.8237026131762973, "train_speed(iter/s)": 0.096326 }, { "epoch": 0.2673959989420419, "eval_loss": 0.6630222797393799, "eval_runtime": 294.8049, "eval_samples_per_second": 11.787, "eval_steps_per_second": 11.787, "step": 5750 }, { "epoch": 0.26762851720199154, "grad_norm": 5.743710041046143, "learning_rate": 8.760307939696441e-06, "loss": 0.7595709323883056, "memory(GiB)": 33.07, "step": 5755, "token_acc": 0.810019710644546, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.2678610354619411, "grad_norm": 6.3062615394592285, "learning_rate": 8.757772680682674e-06, "loss": 0.6787972450256348, "memory(GiB)": 33.07, "step": 5760, "token_acc": 0.8221516474791584, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.2680935537218907, "grad_norm": 9.692729949951172, "learning_rate": 8.755235199581902e-06, "loss": 0.9254583358764649, "memory(GiB)": 33.07, "step": 5765, "token_acc": 0.7801642451042324, "train_speed(iter/s)": 0.095991 }, { "epoch": 0.26832607198184033, "grad_norm": 7.099086284637451, "learning_rate": 8.752695497894616e-06, "loss": 0.8400090217590332, "memory(GiB)": 33.07, "step": 5770, "token_acc": 0.7888177837655777, "train_speed(iter/s)": 0.096038 }, { "epoch": 0.2685585902417899, "grad_norm": 5.777800559997559, "learning_rate": 8.750153577122622e-06, "loss": 0.6479124546051025, "memory(GiB)": 33.07, "step": 5775, "token_acc": 0.843238930993219, "train_speed(iter/s)": 0.096085 }, { "epoch": 0.26879110850173954, "grad_norm": 6.415759086608887, "learning_rate": 8.747609438769032e-06, "loss": 0.8482369422912598, "memory(GiB)": 33.07, "step": 5780, "token_acc": 0.7988942639944713, "train_speed(iter/s)": 0.09613 }, { "epoch": 0.2690236267616891, "grad_norm": 6.494517803192139, "learning_rate": 8.745063084338275e-06, "loss": 0.7210347652435303, "memory(GiB)": 33.07, "step": 5785, "token_acc": 0.8194831013916501, "train_speed(iter/s)": 0.096177 }, { "epoch": 0.26925614502163875, "grad_norm": 6.0870137214660645, "learning_rate": 8.742514515336092e-06, "loss": 0.8033206939697266, "memory(GiB)": 33.07, "step": 5790, "token_acc": 0.8005087209302325, "train_speed(iter/s)": 0.096223 }, { "epoch": 0.2694886632815883, "grad_norm": 4.545366287231445, "learning_rate": 8.739963733269526e-06, "loss": 0.8455084800720215, "memory(GiB)": 33.07, "step": 5795, "token_acc": 0.7944677871148459, "train_speed(iter/s)": 0.096269 }, { "epoch": 0.26972118154153796, "grad_norm": 5.711040496826172, "learning_rate": 8.737410739646935e-06, "loss": 0.7394067287445069, "memory(GiB)": 33.07, "step": 5800, "token_acc": 0.8074335148990708, "train_speed(iter/s)": 0.096315 }, { "epoch": 0.26972118154153796, "eval_loss": 0.659167468547821, "eval_runtime": 294.1469, "eval_samples_per_second": 11.814, "eval_steps_per_second": 11.814, "step": 5800 }, { "epoch": 0.26995369980148753, "grad_norm": 7.04903507232666, "learning_rate": 8.734855535977984e-06, "loss": 0.7887364864349365, "memory(GiB)": 33.07, "step": 5805, "token_acc": 0.8100934818594013, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.2701862180614371, "grad_norm": 5.3122334480285645, "learning_rate": 8.732298123773644e-06, "loss": 0.7698288917541504, "memory(GiB)": 33.07, "step": 5810, "token_acc": 0.8110236220472441, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.27041873632138674, "grad_norm": 3.649322032928467, "learning_rate": 8.72973850454619e-06, "loss": 0.808299446105957, "memory(GiB)": 33.07, "step": 5815, "token_acc": 0.7795212765957447, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.2706512545813363, "grad_norm": 7.034269332885742, "learning_rate": 8.727176679809203e-06, "loss": 0.9111138343811035, "memory(GiB)": 33.07, "step": 5820, "token_acc": 0.7846249610955494, "train_speed(iter/s)": 0.096029 }, { "epoch": 0.27088377284128595, "grad_norm": 5.195382118225098, "learning_rate": 8.724612651077573e-06, "loss": 0.7312620162963868, "memory(GiB)": 33.07, "step": 5825, "token_acc": 0.8101784147411524, "train_speed(iter/s)": 0.096074 }, { "epoch": 0.27111629110123553, "grad_norm": 5.934696197509766, "learning_rate": 8.722046419867488e-06, "loss": 0.8077304840087891, "memory(GiB)": 33.07, "step": 5830, "token_acc": 0.8055733504163998, "train_speed(iter/s)": 0.096119 }, { "epoch": 0.27134880936118516, "grad_norm": 5.654599189758301, "learning_rate": 8.719477987696436e-06, "loss": 0.7460138320922851, "memory(GiB)": 33.07, "step": 5835, "token_acc": 0.8146775389177169, "train_speed(iter/s)": 0.096165 }, { "epoch": 0.27158132762113474, "grad_norm": 4.112966537475586, "learning_rate": 8.716907356083217e-06, "loss": 0.8577005386352539, "memory(GiB)": 33.07, "step": 5840, "token_acc": 0.7860661505981703, "train_speed(iter/s)": 0.09621 }, { "epoch": 0.27181384588108437, "grad_norm": 7.279501438140869, "learning_rate": 8.714334526547918e-06, "loss": 0.7479821681976319, "memory(GiB)": 33.07, "step": 5845, "token_acc": 0.8225538971807629, "train_speed(iter/s)": 0.096257 }, { "epoch": 0.27204636414103395, "grad_norm": 6.161126613616943, "learning_rate": 8.711759500611937e-06, "loss": 0.7519172191619873, "memory(GiB)": 33.07, "step": 5850, "token_acc": 0.8014393195943735, "train_speed(iter/s)": 0.096302 }, { "epoch": 0.27204636414103395, "eval_loss": 0.6617825627326965, "eval_runtime": 295.4158, "eval_samples_per_second": 11.763, "eval_steps_per_second": 11.763, "step": 5850 }, { "epoch": 0.2722788824009836, "grad_norm": 7.260519027709961, "learning_rate": 8.709182279797963e-06, "loss": 0.7220078945159912, "memory(GiB)": 33.07, "step": 5855, "token_acc": 0.8102235397804664, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.27251140066093316, "grad_norm": 6.423417568206787, "learning_rate": 8.706602865629989e-06, "loss": 0.6432157039642334, "memory(GiB)": 33.07, "step": 5860, "token_acc": 0.8444360333080999, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.27274391892088273, "grad_norm": 5.766536712646484, "learning_rate": 8.704021259633302e-06, "loss": 0.7632864475250244, "memory(GiB)": 33.07, "step": 5865, "token_acc": 0.8184250764525994, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.27297643718083237, "grad_norm": 5.202002048492432, "learning_rate": 8.701437463334485e-06, "loss": 0.7291605472564697, "memory(GiB)": 33.07, "step": 5870, "token_acc": 0.8104372355430184, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.27320895544078194, "grad_norm": 7.422544956207275, "learning_rate": 8.698851478261416e-06, "loss": 0.8415180206298828, "memory(GiB)": 33.07, "step": 5875, "token_acc": 0.7878695519658641, "train_speed(iter/s)": 0.096065 }, { "epoch": 0.2734414737007316, "grad_norm": 4.930156707763672, "learning_rate": 8.696263305943268e-06, "loss": 0.7762112140655517, "memory(GiB)": 33.07, "step": 5880, "token_acc": 0.8077803203661327, "train_speed(iter/s)": 0.096111 }, { "epoch": 0.27367399196068115, "grad_norm": 5.615900993347168, "learning_rate": 8.693672947910507e-06, "loss": 0.8112217903137207, "memory(GiB)": 33.07, "step": 5885, "token_acc": 0.7994978479196556, "train_speed(iter/s)": 0.096156 }, { "epoch": 0.2739065102206308, "grad_norm": 8.410451889038086, "learning_rate": 8.691080405694891e-06, "loss": 0.7312962532043457, "memory(GiB)": 33.07, "step": 5890, "token_acc": 0.8337928318235526, "train_speed(iter/s)": 0.096202 }, { "epoch": 0.27413902848058036, "grad_norm": 6.97584867477417, "learning_rate": 8.688485680829473e-06, "loss": 0.7963497161865234, "memory(GiB)": 33.07, "step": 5895, "token_acc": 0.7905405405405406, "train_speed(iter/s)": 0.096248 }, { "epoch": 0.27437154674053, "grad_norm": 7.579192161560059, "learning_rate": 8.685888774848591e-06, "loss": 0.8456910133361817, "memory(GiB)": 33.07, "step": 5900, "token_acc": 0.7953077184631078, "train_speed(iter/s)": 0.096293 }, { "epoch": 0.27437154674053, "eval_loss": 0.6617845892906189, "eval_runtime": 294.0898, "eval_samples_per_second": 11.816, "eval_steps_per_second": 11.816, "step": 5900 }, { "epoch": 0.27460406500047957, "grad_norm": 5.993030071258545, "learning_rate": 8.683289689287877e-06, "loss": 0.6374862670898438, "memory(GiB)": 33.07, "step": 5905, "token_acc": 0.811827697650861, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.27483658326042915, "grad_norm": 7.107336044311523, "learning_rate": 8.68068842568425e-06, "loss": 0.724134111404419, "memory(GiB)": 33.07, "step": 5910, "token_acc": 0.832606108687029, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.2750691015203788, "grad_norm": 6.270671367645264, "learning_rate": 8.678084985575918e-06, "loss": 0.7967855930328369, "memory(GiB)": 33.07, "step": 5915, "token_acc": 0.8057302585604472, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.27530161978032835, "grad_norm": 6.208034038543701, "learning_rate": 8.675479370502375e-06, "loss": 0.7351973533630372, "memory(GiB)": 33.07, "step": 5920, "token_acc": 0.8105228105228105, "train_speed(iter/s)": 0.096016 }, { "epoch": 0.275534138040278, "grad_norm": 5.249854564666748, "learning_rate": 8.672871582004404e-06, "loss": 0.6611810684204101, "memory(GiB)": 33.07, "step": 5925, "token_acc": 0.8356846473029046, "train_speed(iter/s)": 0.096062 }, { "epoch": 0.27576665630022756, "grad_norm": 8.400338172912598, "learning_rate": 8.670261621624065e-06, "loss": 0.6317246913909912, "memory(GiB)": 33.07, "step": 5930, "token_acc": 0.8427358961557664, "train_speed(iter/s)": 0.096108 }, { "epoch": 0.2759991745601772, "grad_norm": 8.131949424743652, "learning_rate": 8.667649490904715e-06, "loss": 0.7783225059509278, "memory(GiB)": 33.07, "step": 5935, "token_acc": 0.8031975637609441, "train_speed(iter/s)": 0.096155 }, { "epoch": 0.2762316928201268, "grad_norm": 6.932249069213867, "learning_rate": 8.665035191390982e-06, "loss": 0.7265284061431885, "memory(GiB)": 33.07, "step": 5940, "token_acc": 0.8292762090230444, "train_speed(iter/s)": 0.096199 }, { "epoch": 0.2764642110800764, "grad_norm": 8.016256332397461, "learning_rate": 8.662418724628786e-06, "loss": 0.7612427711486817, "memory(GiB)": 33.07, "step": 5945, "token_acc": 0.8056741915802319, "train_speed(iter/s)": 0.096244 }, { "epoch": 0.276696729340026, "grad_norm": 6.207606315612793, "learning_rate": 8.659800092165324e-06, "loss": 0.6891643524169921, "memory(GiB)": 33.07, "step": 5950, "token_acc": 0.8315262718932444, "train_speed(iter/s)": 0.09629 }, { "epoch": 0.276696729340026, "eval_loss": 0.6595419049263, "eval_runtime": 290.2901, "eval_samples_per_second": 11.971, "eval_steps_per_second": 11.971, "step": 5950 }, { "epoch": 0.27692924759997556, "grad_norm": 5.775126934051514, "learning_rate": 8.657179295549072e-06, "loss": 0.6428290367126465, "memory(GiB)": 33.07, "step": 5955, "token_acc": 0.8115010354450015, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.2771617658599252, "grad_norm": 6.465238094329834, "learning_rate": 8.65455633632979e-06, "loss": 0.7096127510070801, "memory(GiB)": 33.07, "step": 5960, "token_acc": 0.8188382015096817, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.27739428411987477, "grad_norm": 7.701021194458008, "learning_rate": 8.651931216058514e-06, "loss": 0.7280645370483398, "memory(GiB)": 33.07, "step": 5965, "token_acc": 0.8177358490566038, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.2776268023798244, "grad_norm": 6.742284297943115, "learning_rate": 8.649303936287557e-06, "loss": 0.6810751914978027, "memory(GiB)": 33.07, "step": 5970, "token_acc": 0.8265813788201848, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.277859320639774, "grad_norm": 7.681085586547852, "learning_rate": 8.646674498570515e-06, "loss": 0.7115217208862304, "memory(GiB)": 33.07, "step": 5975, "token_acc": 0.8235088943146146, "train_speed(iter/s)": 0.096064 }, { "epoch": 0.2780918388997236, "grad_norm": 6.155812740325928, "learning_rate": 8.64404290446225e-06, "loss": 0.8212972640991211, "memory(GiB)": 33.07, "step": 5980, "token_acc": 0.7921840759678598, "train_speed(iter/s)": 0.096109 }, { "epoch": 0.2783243571596732, "grad_norm": 6.859022617340088, "learning_rate": 8.641409155518911e-06, "loss": 0.8141220092773438, "memory(GiB)": 33.07, "step": 5985, "token_acc": 0.7958908723475918, "train_speed(iter/s)": 0.096155 }, { "epoch": 0.2785568754196228, "grad_norm": 6.1459808349609375, "learning_rate": 8.63877325329791e-06, "loss": 0.7060758590698242, "memory(GiB)": 33.07, "step": 5990, "token_acc": 0.8293323330832708, "train_speed(iter/s)": 0.096201 }, { "epoch": 0.2787893936795724, "grad_norm": 4.698131084442139, "learning_rate": 8.63613519935794e-06, "loss": 0.8317166328430176, "memory(GiB)": 33.07, "step": 5995, "token_acc": 0.7865638058502089, "train_speed(iter/s)": 0.096246 }, { "epoch": 0.279021911939522, "grad_norm": 7.586791515350342, "learning_rate": 8.633494995258963e-06, "loss": 0.7706262588500976, "memory(GiB)": 33.07, "step": 6000, "token_acc": 0.811706629055007, "train_speed(iter/s)": 0.096292 }, { "epoch": 0.279021911939522, "eval_loss": 0.6601762175559998, "eval_runtime": 290.388, "eval_samples_per_second": 11.967, "eval_steps_per_second": 11.967, "step": 6000 }, { "epoch": 0.2792544301994716, "grad_norm": 6.393497943878174, "learning_rate": 8.630852642562214e-06, "loss": 0.7850899219512939, "memory(GiB)": 33.07, "step": 6005, "token_acc": 0.810933776269228, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.2794869484594212, "grad_norm": 7.869515419006348, "learning_rate": 8.628208142830196e-06, "loss": 0.735156774520874, "memory(GiB)": 33.07, "step": 6010, "token_acc": 0.8219298245614035, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.2797194667193708, "grad_norm": 6.392009258270264, "learning_rate": 8.625561497626684e-06, "loss": 0.6847464084625244, "memory(GiB)": 33.07, "step": 6015, "token_acc": 0.8271791767554479, "train_speed(iter/s)": 0.095985 }, { "epoch": 0.2799519849793204, "grad_norm": 7.729519844055176, "learning_rate": 8.622912708516722e-06, "loss": 0.7716471195220947, "memory(GiB)": 33.07, "step": 6020, "token_acc": 0.8147534189805222, "train_speed(iter/s)": 0.096032 }, { "epoch": 0.28018450323927, "grad_norm": 6.673350811004639, "learning_rate": 8.620261777066621e-06, "loss": 0.751627779006958, "memory(GiB)": 33.07, "step": 6025, "token_acc": 0.8256952604778691, "train_speed(iter/s)": 0.096079 }, { "epoch": 0.2804170214992196, "grad_norm": 5.185956954956055, "learning_rate": 8.617608704843956e-06, "loss": 0.7098074436187745, "memory(GiB)": 33.07, "step": 6030, "token_acc": 0.8378985181859003, "train_speed(iter/s)": 0.096126 }, { "epoch": 0.28064953975916923, "grad_norm": 6.692188739776611, "learning_rate": 8.614953493417572e-06, "loss": 0.7026958465576172, "memory(GiB)": 33.07, "step": 6035, "token_acc": 0.825254104769351, "train_speed(iter/s)": 0.096171 }, { "epoch": 0.2808820580191188, "grad_norm": 5.576934814453125, "learning_rate": 8.612296144357578e-06, "loss": 0.6882329463958741, "memory(GiB)": 33.07, "step": 6040, "token_acc": 0.8241112828438949, "train_speed(iter/s)": 0.096215 }, { "epoch": 0.28111457627906844, "grad_norm": 3.9254825115203857, "learning_rate": 8.60963665923535e-06, "loss": 0.8447055816650391, "memory(GiB)": 33.07, "step": 6045, "token_acc": 0.8103691180614258, "train_speed(iter/s)": 0.096259 }, { "epoch": 0.281347094539018, "grad_norm": 5.394123077392578, "learning_rate": 8.606975039623516e-06, "loss": 0.8298517227172851, "memory(GiB)": 33.07, "step": 6050, "token_acc": 0.7894736842105263, "train_speed(iter/s)": 0.096304 }, { "epoch": 0.281347094539018, "eval_loss": 0.659683346748352, "eval_runtime": 295.3634, "eval_samples_per_second": 11.765, "eval_steps_per_second": 11.765, "step": 6050 }, { "epoch": 0.2815796127989676, "grad_norm": 5.169157028198242, "learning_rate": 8.604311287095978e-06, "loss": 0.619074821472168, "memory(GiB)": 33.07, "step": 6055, "token_acc": 0.8119939190270443, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.2818121310589172, "grad_norm": 5.410036087036133, "learning_rate": 8.601645403227897e-06, "loss": 0.6804319858551026, "memory(GiB)": 33.07, "step": 6060, "token_acc": 0.8311770274534553, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.2820446493188668, "grad_norm": 7.332240581512451, "learning_rate": 8.59897738959569e-06, "loss": 0.7271266937255859, "memory(GiB)": 33.07, "step": 6065, "token_acc": 0.8204196933010492, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.28227716757881643, "grad_norm": 7.151330471038818, "learning_rate": 8.596307247777036e-06, "loss": 0.7178050518035889, "memory(GiB)": 33.07, "step": 6070, "token_acc": 0.8232123607617678, "train_speed(iter/s)": 0.096035 }, { "epoch": 0.282509685838766, "grad_norm": 6.48547887802124, "learning_rate": 8.59363497935087e-06, "loss": 0.7122317314147949, "memory(GiB)": 33.07, "step": 6075, "token_acc": 0.8342954159592529, "train_speed(iter/s)": 0.09608 }, { "epoch": 0.28274220409871564, "grad_norm": 6.208950042724609, "learning_rate": 8.590960585897393e-06, "loss": 0.5908382892608642, "memory(GiB)": 33.07, "step": 6080, "token_acc": 0.8553921568627451, "train_speed(iter/s)": 0.096126 }, { "epoch": 0.2829747223586652, "grad_norm": 9.506049156188965, "learning_rate": 8.58828406899805e-06, "loss": 0.8396751403808593, "memory(GiB)": 33.07, "step": 6085, "token_acc": 0.8013124316441852, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.28320724061861485, "grad_norm": 7.106131076812744, "learning_rate": 8.585605430235552e-06, "loss": 0.6826900959014892, "memory(GiB)": 33.07, "step": 6090, "token_acc": 0.8440840398083302, "train_speed(iter/s)": 0.096217 }, { "epoch": 0.28343975887856443, "grad_norm": 6.968870639801025, "learning_rate": 8.58292467119386e-06, "loss": 0.760336971282959, "memory(GiB)": 33.07, "step": 6095, "token_acc": 0.813989239046887, "train_speed(iter/s)": 0.096263 }, { "epoch": 0.283672277138514, "grad_norm": 7.073690891265869, "learning_rate": 8.580241793458188e-06, "loss": 0.6863061428070069, "memory(GiB)": 33.07, "step": 6100, "token_acc": 0.8284789644012945, "train_speed(iter/s)": 0.096308 }, { "epoch": 0.283672277138514, "eval_loss": 0.6553303599357605, "eval_runtime": 290.703, "eval_samples_per_second": 11.954, "eval_steps_per_second": 11.954, "step": 6100 }, { "epoch": 0.28390479539846364, "grad_norm": 4.766737937927246, "learning_rate": 8.577556798615008e-06, "loss": 0.7520906925201416, "memory(GiB)": 33.07, "step": 6105, "token_acc": 0.811454285782675, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.2841373136584132, "grad_norm": 7.36306095123291, "learning_rate": 8.574869688252036e-06, "loss": 0.8124327659606934, "memory(GiB)": 33.07, "step": 6110, "token_acc": 0.814280140460398, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.28436983191836285, "grad_norm": 6.3582329750061035, "learning_rate": 8.572180463958246e-06, "loss": 0.8805639266967773, "memory(GiB)": 33.07, "step": 6115, "token_acc": 0.7744294909303686, "train_speed(iter/s)": 0.096 }, { "epoch": 0.2846023501783124, "grad_norm": 6.230129718780518, "learning_rate": 8.569489127323858e-06, "loss": 0.7166120529174804, "memory(GiB)": 33.07, "step": 6120, "token_acc": 0.8236749116607773, "train_speed(iter/s)": 0.096044 }, { "epoch": 0.28483486843826206, "grad_norm": 7.509944915771484, "learning_rate": 8.566795679940342e-06, "loss": 0.7509881973266601, "memory(GiB)": 33.07, "step": 6125, "token_acc": 0.8205128205128205, "train_speed(iter/s)": 0.096088 }, { "epoch": 0.28506738669821163, "grad_norm": 6.930967330932617, "learning_rate": 8.56410012340042e-06, "loss": 0.7765787601470947, "memory(GiB)": 33.07, "step": 6130, "token_acc": 0.820565342072921, "train_speed(iter/s)": 0.096132 }, { "epoch": 0.28529990495816127, "grad_norm": 6.196775913238525, "learning_rate": 8.561402459298055e-06, "loss": 0.7661912918090821, "memory(GiB)": 33.07, "step": 6135, "token_acc": 0.8045511221945137, "train_speed(iter/s)": 0.096176 }, { "epoch": 0.28553242321811084, "grad_norm": 5.132917881011963, "learning_rate": 8.55870268922846e-06, "loss": 0.8116618156433105, "memory(GiB)": 33.07, "step": 6140, "token_acc": 0.8019323671497585, "train_speed(iter/s)": 0.096221 }, { "epoch": 0.2857649414780605, "grad_norm": 5.738801002502441, "learning_rate": 8.556000814788091e-06, "loss": 0.6142177581787109, "memory(GiB)": 33.07, "step": 6145, "token_acc": 0.8393782383419689, "train_speed(iter/s)": 0.096264 }, { "epoch": 0.28599745973801005, "grad_norm": 6.459722995758057, "learning_rate": 8.553296837574651e-06, "loss": 0.6736063957214355, "memory(GiB)": 33.07, "step": 6150, "token_acc": 0.8397048960429242, "train_speed(iter/s)": 0.096307 }, { "epoch": 0.28599745973801005, "eval_loss": 0.6559529304504395, "eval_runtime": 290.8894, "eval_samples_per_second": 11.946, "eval_steps_per_second": 11.946, "step": 6150 }, { "epoch": 0.28622997799795963, "grad_norm": 8.525583267211914, "learning_rate": 8.550590759187086e-06, "loss": 0.7800402164459228, "memory(GiB)": 33.07, "step": 6155, "token_acc": 0.8106261163562133, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.28646249625790926, "grad_norm": 5.509976863861084, "learning_rate": 8.547882581225581e-06, "loss": 0.7267116069793701, "memory(GiB)": 33.07, "step": 6160, "token_acc": 0.81893528849219, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.28669501451785884, "grad_norm": 6.860876560211182, "learning_rate": 8.545172305291566e-06, "loss": 0.6962712287902832, "memory(GiB)": 33.07, "step": 6165, "token_acc": 0.8284686125549781, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.28692753277780847, "grad_norm": 7.085202217102051, "learning_rate": 8.542459932987714e-06, "loss": 0.7279558181762695, "memory(GiB)": 33.07, "step": 6170, "token_acc": 0.8318947801068639, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.28716005103775805, "grad_norm": 7.539945602416992, "learning_rate": 8.539745465917932e-06, "loss": 0.8185580253601075, "memory(GiB)": 33.07, "step": 6175, "token_acc": 0.8042194092827004, "train_speed(iter/s)": 0.096088 }, { "epoch": 0.2873925692977077, "grad_norm": 6.323469638824463, "learning_rate": 8.537028905687368e-06, "loss": 0.8452632904052735, "memory(GiB)": 33.07, "step": 6180, "token_acc": 0.8044806517311609, "train_speed(iter/s)": 0.096132 }, { "epoch": 0.28762508755765726, "grad_norm": 6.691417217254639, "learning_rate": 8.53431025390241e-06, "loss": 0.7386738300323487, "memory(GiB)": 33.07, "step": 6185, "token_acc": 0.8143226282761167, "train_speed(iter/s)": 0.096174 }, { "epoch": 0.2878576058176069, "grad_norm": 6.233213424682617, "learning_rate": 8.531589512170675e-06, "loss": 0.6917527675628662, "memory(GiB)": 33.07, "step": 6190, "token_acc": 0.8180118416952321, "train_speed(iter/s)": 0.096217 }, { "epoch": 0.28809012407755646, "grad_norm": 5.885256290435791, "learning_rate": 8.528866682101029e-06, "loss": 0.7740827560424804, "memory(GiB)": 33.07, "step": 6195, "token_acc": 0.8050786838340487, "train_speed(iter/s)": 0.096261 }, { "epoch": 0.28832264233750604, "grad_norm": 4.884213447570801, "learning_rate": 8.526141765303562e-06, "loss": 0.7537154674530029, "memory(GiB)": 33.07, "step": 6200, "token_acc": 0.8030990173847317, "train_speed(iter/s)": 0.096305 }, { "epoch": 0.28832264233750604, "eval_loss": 0.656577467918396, "eval_runtime": 290.9454, "eval_samples_per_second": 11.944, "eval_steps_per_second": 11.944, "step": 6200 }, { "epoch": 0.2885551605974557, "grad_norm": 6.681056022644043, "learning_rate": 8.523414763389601e-06, "loss": 0.786424970626831, "memory(GiB)": 33.07, "step": 6205, "token_acc": 0.8106969465955142, "train_speed(iter/s)": 0.095914 }, { "epoch": 0.28878767885740525, "grad_norm": 6.169203758239746, "learning_rate": 8.520685677971707e-06, "loss": 0.6843218326568603, "memory(GiB)": 33.07, "step": 6210, "token_acc": 0.8432510885341074, "train_speed(iter/s)": 0.095958 }, { "epoch": 0.2890201971173549, "grad_norm": 5.191234588623047, "learning_rate": 8.517954510663673e-06, "loss": 0.7208163261413574, "memory(GiB)": 33.07, "step": 6215, "token_acc": 0.8300854700854701, "train_speed(iter/s)": 0.096002 }, { "epoch": 0.28925271537730446, "grad_norm": 6.596996307373047, "learning_rate": 8.515221263080522e-06, "loss": 0.7454941272735596, "memory(GiB)": 33.07, "step": 6220, "token_acc": 0.8168147641831852, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.2894852336372541, "grad_norm": 6.094503879547119, "learning_rate": 8.512485936838507e-06, "loss": 0.7535664081573487, "memory(GiB)": 33.07, "step": 6225, "token_acc": 0.8183839881393625, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.28971775189720367, "grad_norm": 7.548673152923584, "learning_rate": 8.50974853355511e-06, "loss": 0.7302347183227539, "memory(GiB)": 33.07, "step": 6230, "token_acc": 0.819743935309973, "train_speed(iter/s)": 0.096132 }, { "epoch": 0.2899502701571533, "grad_norm": 6.916913032531738, "learning_rate": 8.507009054849047e-06, "loss": 0.7550792217254638, "memory(GiB)": 33.07, "step": 6235, "token_acc": 0.8215641609719059, "train_speed(iter/s)": 0.096176 }, { "epoch": 0.2901827884171029, "grad_norm": 8.061797142028809, "learning_rate": 8.504267502340252e-06, "loss": 0.764448595046997, "memory(GiB)": 33.07, "step": 6240, "token_acc": 0.8140432098765432, "train_speed(iter/s)": 0.09622 }, { "epoch": 0.2904153066770525, "grad_norm": 6.484617233276367, "learning_rate": 8.501523877649891e-06, "loss": 0.8045131683349609, "memory(GiB)": 33.07, "step": 6245, "token_acc": 0.7974641307974641, "train_speed(iter/s)": 0.096263 }, { "epoch": 0.2906478249370021, "grad_norm": 6.302259922027588, "learning_rate": 8.498778182400353e-06, "loss": 0.6931636333465576, "memory(GiB)": 33.07, "step": 6250, "token_acc": 0.8204667863554758, "train_speed(iter/s)": 0.096306 }, { "epoch": 0.2906478249370021, "eval_loss": 0.6549901366233826, "eval_runtime": 289.9607, "eval_samples_per_second": 11.984, "eval_steps_per_second": 11.984, "step": 6250 }, { "epoch": 0.29088034319695166, "grad_norm": 6.187180519104004, "learning_rate": 8.496030418215254e-06, "loss": 0.6815443992614746, "memory(GiB)": 33.07, "step": 6255, "token_acc": 0.8120918964881039, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.2911128614569013, "grad_norm": 5.392935276031494, "learning_rate": 8.493280586719428e-06, "loss": 0.6975905895233154, "memory(GiB)": 33.07, "step": 6260, "token_acc": 0.8076083567196757, "train_speed(iter/s)": 0.095963 }, { "epoch": 0.29134537971685087, "grad_norm": 5.913050174713135, "learning_rate": 8.490528689538939e-06, "loss": 0.7572990894317627, "memory(GiB)": 33.07, "step": 6265, "token_acc": 0.8055087127599775, "train_speed(iter/s)": 0.096006 }, { "epoch": 0.2915778979768005, "grad_norm": 7.170483589172363, "learning_rate": 8.48777472830107e-06, "loss": 0.7077459812164306, "memory(GiB)": 33.07, "step": 6270, "token_acc": 0.835335141418055, "train_speed(iter/s)": 0.096049 }, { "epoch": 0.2918104162367501, "grad_norm": 6.242068767547607, "learning_rate": 8.48501870463432e-06, "loss": 0.5808324337005615, "memory(GiB)": 33.07, "step": 6275, "token_acc": 0.8642131979695431, "train_speed(iter/s)": 0.096092 }, { "epoch": 0.2920429344966997, "grad_norm": 6.824711322784424, "learning_rate": 8.48226062016841e-06, "loss": 0.6758532524108887, "memory(GiB)": 33.07, "step": 6280, "token_acc": 0.8284789644012945, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.2922754527566493, "grad_norm": 5.987837791442871, "learning_rate": 8.479500476534286e-06, "loss": 0.7241264343261719, "memory(GiB)": 33.07, "step": 6285, "token_acc": 0.8183223811364515, "train_speed(iter/s)": 0.096178 }, { "epoch": 0.2925079710165989, "grad_norm": 7.447147369384766, "learning_rate": 8.476738275364101e-06, "loss": 0.8527143478393555, "memory(GiB)": 33.07, "step": 6290, "token_acc": 0.7953020134228188, "train_speed(iter/s)": 0.096222 }, { "epoch": 0.2927404892765485, "grad_norm": 7.889321804046631, "learning_rate": 8.47397401829123e-06, "loss": 0.6823805332183838, "memory(GiB)": 33.07, "step": 6295, "token_acc": 0.8269107257546564, "train_speed(iter/s)": 0.096265 }, { "epoch": 0.2929730075364981, "grad_norm": 5.670337677001953, "learning_rate": 8.471207706950268e-06, "loss": 0.6378509044647217, "memory(GiB)": 33.07, "step": 6300, "token_acc": 0.8265379113018598, "train_speed(iter/s)": 0.096308 }, { "epoch": 0.2929730075364981, "eval_loss": 0.6550729870796204, "eval_runtime": 290.9136, "eval_samples_per_second": 11.945, "eval_steps_per_second": 11.945, "step": 6300 }, { "epoch": 0.2932055257964477, "grad_norm": 6.653439521789551, "learning_rate": 8.468439342977017e-06, "loss": 0.6482341766357422, "memory(GiB)": 33.07, "step": 6305, "token_acc": 0.8118095907030203, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.2934380440563973, "grad_norm": 5.770527362823486, "learning_rate": 8.465668928008494e-06, "loss": 0.8743701934814453, "memory(GiB)": 33.07, "step": 6310, "token_acc": 0.7887878787878788, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.2936705623163469, "grad_norm": 7.127387523651123, "learning_rate": 8.462896463682934e-06, "loss": 0.6957249641418457, "memory(GiB)": 33.07, "step": 6315, "token_acc": 0.8284452853989278, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.2939030805762965, "grad_norm": 6.6421308517456055, "learning_rate": 8.460121951639783e-06, "loss": 0.7410873889923095, "memory(GiB)": 33.07, "step": 6320, "token_acc": 0.82201203783319, "train_speed(iter/s)": 0.096052 }, { "epoch": 0.2941355988362461, "grad_norm": 6.1326775550842285, "learning_rate": 8.457345393519689e-06, "loss": 0.68385009765625, "memory(GiB)": 33.07, "step": 6325, "token_acc": 0.8306617344841759, "train_speed(iter/s)": 0.096095 }, { "epoch": 0.2943681170961957, "grad_norm": 7.342061519622803, "learning_rate": 8.454566790964522e-06, "loss": 0.6733174800872803, "memory(GiB)": 33.07, "step": 6330, "token_acc": 0.8364063023801542, "train_speed(iter/s)": 0.096138 }, { "epoch": 0.29460063535614534, "grad_norm": 5.662764549255371, "learning_rate": 8.451786145617355e-06, "loss": 0.794578742980957, "memory(GiB)": 33.07, "step": 6335, "token_acc": 0.8155339805825242, "train_speed(iter/s)": 0.09618 }, { "epoch": 0.2948331536160949, "grad_norm": 4.566216468811035, "learning_rate": 8.449003459122467e-06, "loss": 0.7572064399719238, "memory(GiB)": 33.07, "step": 6340, "token_acc": 0.8042925727195708, "train_speed(iter/s)": 0.096223 }, { "epoch": 0.2950656718760445, "grad_norm": 6.372825622558594, "learning_rate": 8.446218733125347e-06, "loss": 0.9462939262390136, "memory(GiB)": 33.07, "step": 6345, "token_acc": 0.7759522031366691, "train_speed(iter/s)": 0.096266 }, { "epoch": 0.2952981901359941, "grad_norm": 5.942713737487793, "learning_rate": 8.443431969272691e-06, "loss": 0.6915206909179688, "memory(GiB)": 33.07, "step": 6350, "token_acc": 0.8340192043895748, "train_speed(iter/s)": 0.09631 }, { "epoch": 0.2952981901359941, "eval_loss": 0.6510042548179626, "eval_runtime": 292.3379, "eval_samples_per_second": 11.887, "eval_steps_per_second": 11.887, "step": 6350 }, { "epoch": 0.2955307083959437, "grad_norm": 6.594305992126465, "learning_rate": 8.440643169212396e-06, "loss": 0.702507734298706, "memory(GiB)": 33.07, "step": 6355, "token_acc": 0.8125821972734563, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.29576322665589333, "grad_norm": 5.589081764221191, "learning_rate": 8.43785233459357e-06, "loss": 0.7530613899230957, "memory(GiB)": 33.07, "step": 6360, "token_acc": 0.8185123966942148, "train_speed(iter/s)": 0.09597 }, { "epoch": 0.2959957449158429, "grad_norm": 7.626203536987305, "learning_rate": 8.435059467066516e-06, "loss": 0.8047209739685058, "memory(GiB)": 33.07, "step": 6365, "token_acc": 0.7979341510652034, "train_speed(iter/s)": 0.096011 }, { "epoch": 0.29622826317579254, "grad_norm": 7.446657657623291, "learning_rate": 8.432264568282741e-06, "loss": 0.8075847625732422, "memory(GiB)": 33.07, "step": 6370, "token_acc": 0.7980014275517487, "train_speed(iter/s)": 0.096052 }, { "epoch": 0.2964607814357421, "grad_norm": 6.0461859703063965, "learning_rate": 8.429467639894961e-06, "loss": 0.7339020729064941, "memory(GiB)": 33.07, "step": 6375, "token_acc": 0.821873557914167, "train_speed(iter/s)": 0.096095 }, { "epoch": 0.29669329969569175, "grad_norm": 4.4445600509643555, "learning_rate": 8.426668683557082e-06, "loss": 0.6500592231750488, "memory(GiB)": 33.07, "step": 6380, "token_acc": 0.8398605150214592, "train_speed(iter/s)": 0.096137 }, { "epoch": 0.2969258179556413, "grad_norm": 5.21086311340332, "learning_rate": 8.423867700924213e-06, "loss": 0.7790399074554444, "memory(GiB)": 33.07, "step": 6385, "token_acc": 0.7943536404160475, "train_speed(iter/s)": 0.096179 }, { "epoch": 0.29715833621559096, "grad_norm": 6.420889854431152, "learning_rate": 8.421064693652663e-06, "loss": 0.7149899482727051, "memory(GiB)": 33.07, "step": 6390, "token_acc": 0.8253719655442443, "train_speed(iter/s)": 0.096222 }, { "epoch": 0.29739085447554053, "grad_norm": 6.900496959686279, "learning_rate": 8.418259663399936e-06, "loss": 0.6499155998229981, "memory(GiB)": 33.07, "step": 6395, "token_acc": 0.8465002046663938, "train_speed(iter/s)": 0.096264 }, { "epoch": 0.2976233727354901, "grad_norm": 7.934140205383301, "learning_rate": 8.415452611824733e-06, "loss": 0.6458067893981934, "memory(GiB)": 33.07, "step": 6400, "token_acc": 0.8421300659754948, "train_speed(iter/s)": 0.096307 }, { "epoch": 0.2976233727354901, "eval_loss": 0.6483955979347229, "eval_runtime": 296.3225, "eval_samples_per_second": 11.727, "eval_steps_per_second": 11.727, "step": 6400 }, { "epoch": 0.29785589099543974, "grad_norm": 5.832469940185547, "learning_rate": 8.412643540586951e-06, "loss": 0.8407029151916504, "memory(GiB)": 33.07, "step": 6405, "token_acc": 0.8117392933149475, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.2980884092553893, "grad_norm": 6.6476054191589355, "learning_rate": 8.409832451347682e-06, "loss": 0.6710747241973877, "memory(GiB)": 33.07, "step": 6410, "token_acc": 0.8312541037426132, "train_speed(iter/s)": 0.095963 }, { "epoch": 0.29832092751533895, "grad_norm": 4.817314147949219, "learning_rate": 8.407019345769205e-06, "loss": 0.7402913093566894, "memory(GiB)": 33.07, "step": 6415, "token_acc": 0.8139926945771284, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.29855344577528853, "grad_norm": 6.434061050415039, "learning_rate": 8.404204225515e-06, "loss": 0.577388858795166, "memory(GiB)": 33.07, "step": 6420, "token_acc": 0.855781723689909, "train_speed(iter/s)": 0.096047 }, { "epoch": 0.29878596403523816, "grad_norm": 6.586878776550293, "learning_rate": 8.401387092249733e-06, "loss": 0.763947868347168, "memory(GiB)": 33.07, "step": 6425, "token_acc": 0.8243938280675973, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.29901848229518774, "grad_norm": 6.808994293212891, "learning_rate": 8.398567947639264e-06, "loss": 0.8441635131835937, "memory(GiB)": 33.07, "step": 6430, "token_acc": 0.7888157894736842, "train_speed(iter/s)": 0.09613 }, { "epoch": 0.29925100055513737, "grad_norm": 7.461640357971191, "learning_rate": 8.39574679335064e-06, "loss": 0.9145607948303223, "memory(GiB)": 33.07, "step": 6435, "token_acc": 0.7673830594184576, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.29948351881508695, "grad_norm": 6.615748405456543, "learning_rate": 8.392923631052092e-06, "loss": 0.7803031444549561, "memory(GiB)": 33.07, "step": 6440, "token_acc": 0.8063650306748467, "train_speed(iter/s)": 0.096213 }, { "epoch": 0.2997160370750365, "grad_norm": 6.24949312210083, "learning_rate": 8.390098462413047e-06, "loss": 0.715507173538208, "memory(GiB)": 33.07, "step": 6445, "token_acc": 0.8268319358366751, "train_speed(iter/s)": 0.096255 }, { "epoch": 0.29994855533498616, "grad_norm": 5.609653949737549, "learning_rate": 8.387271289104116e-06, "loss": 0.648672342300415, "memory(GiB)": 33.07, "step": 6450, "token_acc": 0.8555355535553555, "train_speed(iter/s)": 0.096297 }, { "epoch": 0.29994855533498616, "eval_loss": 0.6544153094291687, "eval_runtime": 294.7661, "eval_samples_per_second": 11.789, "eval_steps_per_second": 11.789, "step": 6450 }, { "epoch": 0.30018107359493573, "grad_norm": 7.218892574310303, "learning_rate": 8.38444211279709e-06, "loss": 0.7374346256256104, "memory(GiB)": 33.07, "step": 6455, "token_acc": 0.8123859087269816, "train_speed(iter/s)": 0.095915 }, { "epoch": 0.30041359185488536, "grad_norm": 7.429409027099609, "learning_rate": 8.38161093516495e-06, "loss": 0.6752567291259766, "memory(GiB)": 33.07, "step": 6460, "token_acc": 0.8336781133636739, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.30064611011483494, "grad_norm": 5.649204254150391, "learning_rate": 8.37877775788186e-06, "loss": 0.8494151115417481, "memory(GiB)": 33.07, "step": 6465, "token_acc": 0.7666419203167533, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.3008786283747846, "grad_norm": 5.821555137634277, "learning_rate": 8.375942582623162e-06, "loss": 0.7295107364654541, "memory(GiB)": 33.07, "step": 6470, "token_acc": 0.8188311688311688, "train_speed(iter/s)": 0.096039 }, { "epoch": 0.30111114663473415, "grad_norm": 6.589723110198975, "learning_rate": 8.373105411065386e-06, "loss": 0.7120474815368653, "memory(GiB)": 33.07, "step": 6475, "token_acc": 0.8293269230769231, "train_speed(iter/s)": 0.096081 }, { "epoch": 0.3013436648946838, "grad_norm": 5.835727691650391, "learning_rate": 8.370266244886238e-06, "loss": 0.7486866474151611, "memory(GiB)": 33.07, "step": 6480, "token_acc": 0.8098676293622142, "train_speed(iter/s)": 0.096123 }, { "epoch": 0.30157618315463336, "grad_norm": 8.453908920288086, "learning_rate": 8.367425085764604e-06, "loss": 0.697899341583252, "memory(GiB)": 33.07, "step": 6485, "token_acc": 0.8223976319684262, "train_speed(iter/s)": 0.096165 }, { "epoch": 0.301808701414583, "grad_norm": 4.8690032958984375, "learning_rate": 8.36458193538055e-06, "loss": 0.8523747444152832, "memory(GiB)": 33.07, "step": 6490, "token_acc": 0.7894736842105263, "train_speed(iter/s)": 0.096205 }, { "epoch": 0.30204121967453257, "grad_norm": 6.731339454650879, "learning_rate": 8.361736795415317e-06, "loss": 0.6203603267669677, "memory(GiB)": 33.07, "step": 6495, "token_acc": 0.8547215496368039, "train_speed(iter/s)": 0.096247 }, { "epoch": 0.30227373793448215, "grad_norm": 6.212488174438477, "learning_rate": 8.358889667551327e-06, "loss": 0.6619673728942871, "memory(GiB)": 33.07, "step": 6500, "token_acc": 0.8305249513933896, "train_speed(iter/s)": 0.096287 }, { "epoch": 0.30227373793448215, "eval_loss": 0.6488327383995056, "eval_runtime": 297.4827, "eval_samples_per_second": 11.681, "eval_steps_per_second": 11.681, "step": 6500 }, { "epoch": 0.3025062561944318, "grad_norm": 4.924098014831543, "learning_rate": 8.356040553472172e-06, "loss": 0.6729198455810547, "memory(GiB)": 33.07, "step": 6505, "token_acc": 0.8140951740480994, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.30273877445438135, "grad_norm": 8.07417106628418, "learning_rate": 8.35318945486262e-06, "loss": 0.786518907546997, "memory(GiB)": 33.07, "step": 6510, "token_acc": 0.8138771683075481, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.302971292714331, "grad_norm": 6.638355255126953, "learning_rate": 8.350336373408617e-06, "loss": 0.7544785976409912, "memory(GiB)": 33.07, "step": 6515, "token_acc": 0.8314682943370634, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.30320381097428056, "grad_norm": 6.213714122772217, "learning_rate": 8.347481310797277e-06, "loss": 0.7576951503753662, "memory(GiB)": 33.07, "step": 6520, "token_acc": 0.8079490291262136, "train_speed(iter/s)": 0.09603 }, { "epoch": 0.3034363292342302, "grad_norm": 5.810522079467773, "learning_rate": 8.344624268716888e-06, "loss": 0.8516165733337402, "memory(GiB)": 33.07, "step": 6525, "token_acc": 0.7934306569343066, "train_speed(iter/s)": 0.096072 }, { "epoch": 0.30366884749417977, "grad_norm": 7.226070880889893, "learning_rate": 8.341765248856904e-06, "loss": 0.689451026916504, "memory(GiB)": 33.07, "step": 6530, "token_acc": 0.8396111786148238, "train_speed(iter/s)": 0.096113 }, { "epoch": 0.3039013657541294, "grad_norm": 5.169941425323486, "learning_rate": 8.338904252907953e-06, "loss": 0.7344254970550537, "memory(GiB)": 33.07, "step": 6535, "token_acc": 0.8086441268734751, "train_speed(iter/s)": 0.096155 }, { "epoch": 0.304133884014079, "grad_norm": 5.565005779266357, "learning_rate": 8.33604128256183e-06, "loss": 0.7079814910888672, "memory(GiB)": 33.07, "step": 6540, "token_acc": 0.8164042661249366, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.30436640227402856, "grad_norm": 6.246211051940918, "learning_rate": 8.3331763395115e-06, "loss": 0.7657256126403809, "memory(GiB)": 33.07, "step": 6545, "token_acc": 0.8109756097560976, "train_speed(iter/s)": 0.096233 }, { "epoch": 0.3045989205339782, "grad_norm": 5.423332214355469, "learning_rate": 8.330309425451089e-06, "loss": 0.7353767395019531, "memory(GiB)": 33.07, "step": 6550, "token_acc": 0.8268970189701897, "train_speed(iter/s)": 0.096274 }, { "epoch": 0.3045989205339782, "eval_loss": 0.6473827958106995, "eval_runtime": 291.9143, "eval_samples_per_second": 11.904, "eval_steps_per_second": 11.904, "step": 6550 }, { "epoch": 0.30483143879392777, "grad_norm": 6.537137508392334, "learning_rate": 8.327440542075892e-06, "loss": 0.7706812858581543, "memory(GiB)": 33.07, "step": 6555, "token_acc": 0.8133992338267872, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.3050639570538774, "grad_norm": 2.7098896503448486, "learning_rate": 8.324569691082374e-06, "loss": 0.8289295196533203, "memory(GiB)": 33.07, "step": 6560, "token_acc": 0.7824712643678161, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.305296475313827, "grad_norm": 5.2406816482543945, "learning_rate": 8.32169687416815e-06, "loss": 0.6724458694458008, "memory(GiB)": 33.07, "step": 6565, "token_acc": 0.8261464750171116, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.3055289935737766, "grad_norm": 6.757907390594482, "learning_rate": 8.318822093032011e-06, "loss": 0.6793931484222412, "memory(GiB)": 33.07, "step": 6570, "token_acc": 0.8337605272793849, "train_speed(iter/s)": 0.096026 }, { "epoch": 0.3057615118337262, "grad_norm": 5.912170886993408, "learning_rate": 8.3159453493739e-06, "loss": 0.7322561740875244, "memory(GiB)": 33.07, "step": 6575, "token_acc": 0.8194933145672062, "train_speed(iter/s)": 0.096067 }, { "epoch": 0.3059940300936758, "grad_norm": 5.746609687805176, "learning_rate": 8.313066644894927e-06, "loss": 0.9141531944274902, "memory(GiB)": 33.07, "step": 6580, "token_acc": 0.778856526429342, "train_speed(iter/s)": 0.096107 }, { "epoch": 0.3062265483536254, "grad_norm": 6.435399055480957, "learning_rate": 8.310185981297359e-06, "loss": 0.7600241661071777, "memory(GiB)": 33.07, "step": 6585, "token_acc": 0.80836820083682, "train_speed(iter/s)": 0.096149 }, { "epoch": 0.30645906661357497, "grad_norm": 7.434659957885742, "learning_rate": 8.307303360284618e-06, "loss": 0.7691242218017578, "memory(GiB)": 33.07, "step": 6590, "token_acc": 0.8111338797814208, "train_speed(iter/s)": 0.09619 }, { "epoch": 0.3066915848735246, "grad_norm": 8.359783172607422, "learning_rate": 8.30441878356129e-06, "loss": 0.7094342708587646, "memory(GiB)": 33.07, "step": 6595, "token_acc": 0.8230152949745084, "train_speed(iter/s)": 0.096232 }, { "epoch": 0.3069241031334742, "grad_norm": 7.274518966674805, "learning_rate": 8.301532252833112e-06, "loss": 0.6358844757080078, "memory(GiB)": 33.07, "step": 6600, "token_acc": 0.8455056179775281, "train_speed(iter/s)": 0.096272 }, { "epoch": 0.3069241031334742, "eval_loss": 0.6463093161582947, "eval_runtime": 289.4779, "eval_samples_per_second": 12.004, "eval_steps_per_second": 12.004, "step": 6600 }, { "epoch": 0.3071566213934238, "grad_norm": 6.91964054107666, "learning_rate": 8.298643769806981e-06, "loss": 0.7548566818237304, "memory(GiB)": 33.07, "step": 6605, "token_acc": 0.8135487582702081, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.3073891396533734, "grad_norm": 5.320629119873047, "learning_rate": 8.295753336190945e-06, "loss": 0.6493088722229003, "memory(GiB)": 33.07, "step": 6610, "token_acc": 0.8388537402307406, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.307621657913323, "grad_norm": 6.0627217292785645, "learning_rate": 8.292860953694208e-06, "loss": 0.8688552856445313, "memory(GiB)": 33.07, "step": 6615, "token_acc": 0.7757100881488737, "train_speed(iter/s)": 0.09599 }, { "epoch": 0.3078541761732726, "grad_norm": 6.99749231338501, "learning_rate": 8.289966624027123e-06, "loss": 0.6862592220306396, "memory(GiB)": 33.07, "step": 6620, "token_acc": 0.8159357628165534, "train_speed(iter/s)": 0.096031 }, { "epoch": 0.30808669443322223, "grad_norm": 6.315097332000732, "learning_rate": 8.287070348901198e-06, "loss": 0.7650423526763916, "memory(GiB)": 33.07, "step": 6625, "token_acc": 0.8128571428571428, "train_speed(iter/s)": 0.096072 }, { "epoch": 0.3083192126931718, "grad_norm": 6.0371527671813965, "learning_rate": 8.28417213002909e-06, "loss": 0.6402733325958252, "memory(GiB)": 33.07, "step": 6630, "token_acc": 0.8501052631578947, "train_speed(iter/s)": 0.096113 }, { "epoch": 0.30855173095312144, "grad_norm": 5.829378604888916, "learning_rate": 8.281271969124602e-06, "loss": 0.8457640647888184, "memory(GiB)": 33.07, "step": 6635, "token_acc": 0.7996213316503629, "train_speed(iter/s)": 0.096153 }, { "epoch": 0.308784249213071, "grad_norm": 6.845560073852539, "learning_rate": 8.278369867902693e-06, "loss": 0.678613805770874, "memory(GiB)": 33.07, "step": 6640, "token_acc": 0.8426270136307311, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.3090167674730206, "grad_norm": 5.743708610534668, "learning_rate": 8.275465828079463e-06, "loss": 0.8091531753540039, "memory(GiB)": 33.07, "step": 6645, "token_acc": 0.811193309745899, "train_speed(iter/s)": 0.096235 }, { "epoch": 0.3092492857329702, "grad_norm": 5.360169410705566, "learning_rate": 8.27255985137216e-06, "loss": 0.633982801437378, "memory(GiB)": 33.07, "step": 6650, "token_acc": 0.843441466854725, "train_speed(iter/s)": 0.096275 }, { "epoch": 0.3092492857329702, "eval_loss": 0.6490738987922668, "eval_runtime": 290.6445, "eval_samples_per_second": 11.956, "eval_steps_per_second": 11.956, "step": 6650 }, { "epoch": 0.3094818039929198, "grad_norm": 8.406295776367188, "learning_rate": 8.26965193949918e-06, "loss": 0.7386780738830566, "memory(GiB)": 33.07, "step": 6655, "token_acc": 0.8133779961053635, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.30971432225286943, "grad_norm": 6.63882303237915, "learning_rate": 8.266742094180058e-06, "loss": 0.7321940422058105, "memory(GiB)": 33.07, "step": 6660, "token_acc": 0.8038100653966449, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.309946840512819, "grad_norm": 5.776708602905273, "learning_rate": 8.263830317135479e-06, "loss": 0.6394780158996582, "memory(GiB)": 33.07, "step": 6665, "token_acc": 0.850541215653622, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.31017935877276864, "grad_norm": 6.080323219299316, "learning_rate": 8.260916610087264e-06, "loss": 0.903103256225586, "memory(GiB)": 33.07, "step": 6670, "token_acc": 0.7794170564951421, "train_speed(iter/s)": 0.096032 }, { "epoch": 0.3104118770327182, "grad_norm": 5.895566463470459, "learning_rate": 8.258000974758378e-06, "loss": 0.6670703887939453, "memory(GiB)": 33.07, "step": 6675, "token_acc": 0.8356873822975518, "train_speed(iter/s)": 0.096072 }, { "epoch": 0.31064439529266785, "grad_norm": 5.250027179718018, "learning_rate": 8.255083412872927e-06, "loss": 0.6507026672363281, "memory(GiB)": 33.07, "step": 6680, "token_acc": 0.8399592252803262, "train_speed(iter/s)": 0.096113 }, { "epoch": 0.31087691355261743, "grad_norm": 6.803175449371338, "learning_rate": 8.252163926156154e-06, "loss": 0.7843762397766113, "memory(GiB)": 33.07, "step": 6685, "token_acc": 0.7998108448928121, "train_speed(iter/s)": 0.096154 }, { "epoch": 0.311109431812567, "grad_norm": 6.703561305999756, "learning_rate": 8.249242516334444e-06, "loss": 0.6578123569488525, "memory(GiB)": 33.07, "step": 6690, "token_acc": 0.8411325206449076, "train_speed(iter/s)": 0.096195 }, { "epoch": 0.31134195007251664, "grad_norm": 5.961348533630371, "learning_rate": 8.246319185135317e-06, "loss": 0.7796625137329102, "memory(GiB)": 33.07, "step": 6695, "token_acc": 0.80397127165188, "train_speed(iter/s)": 0.096235 }, { "epoch": 0.3115744683324662, "grad_norm": 7.202671527862549, "learning_rate": 8.243393934287424e-06, "loss": 0.6795454502105713, "memory(GiB)": 33.07, "step": 6700, "token_acc": 0.8431786216596343, "train_speed(iter/s)": 0.096276 }, { "epoch": 0.3115744683324662, "eval_loss": 0.6488170027732849, "eval_runtime": 293.4475, "eval_samples_per_second": 11.842, "eval_steps_per_second": 11.842, "step": 6700 }, { "epoch": 0.31180698659241585, "grad_norm": 6.131471157073975, "learning_rate": 8.240466765520563e-06, "loss": 0.7111649513244629, "memory(GiB)": 33.07, "step": 6705, "token_acc": 0.8138903053094525, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.3120395048523654, "grad_norm": 7.5404815673828125, "learning_rate": 8.237537680565655e-06, "loss": 0.6519620895385743, "memory(GiB)": 33.07, "step": 6710, "token_acc": 0.8393972804116133, "train_speed(iter/s)": 0.095952 }, { "epoch": 0.31227202311231506, "grad_norm": 5.742206573486328, "learning_rate": 8.23460668115476e-06, "loss": 0.7203374862670898, "memory(GiB)": 33.07, "step": 6715, "token_acc": 0.8262962962962963, "train_speed(iter/s)": 0.095993 }, { "epoch": 0.31250454137226463, "grad_norm": 5.321285247802734, "learning_rate": 8.231673769021066e-06, "loss": 0.893209171295166, "memory(GiB)": 33.07, "step": 6720, "token_acc": 0.797979797979798, "train_speed(iter/s)": 0.096034 }, { "epoch": 0.31273705963221426, "grad_norm": 7.4848952293396, "learning_rate": 8.228738945898897e-06, "loss": 0.6799060344696045, "memory(GiB)": 33.07, "step": 6725, "token_acc": 0.827831025914093, "train_speed(iter/s)": 0.096074 }, { "epoch": 0.31296957789216384, "grad_norm": 5.708653450012207, "learning_rate": 8.225802213523705e-06, "loss": 0.6490331649780273, "memory(GiB)": 33.07, "step": 6730, "token_acc": 0.8281505728314239, "train_speed(iter/s)": 0.096114 }, { "epoch": 0.3132020961521134, "grad_norm": 4.684449195861816, "learning_rate": 8.222863573632068e-06, "loss": 0.7676737785339356, "memory(GiB)": 33.07, "step": 6735, "token_acc": 0.8076923076923077, "train_speed(iter/s)": 0.096153 }, { "epoch": 0.31343461441206305, "grad_norm": 5.874281406402588, "learning_rate": 8.219923027961696e-06, "loss": 0.6278162002563477, "memory(GiB)": 33.07, "step": 6740, "token_acc": 0.8484974958263773, "train_speed(iter/s)": 0.096194 }, { "epoch": 0.3136671326720126, "grad_norm": 6.561712265014648, "learning_rate": 8.216980578251426e-06, "loss": 0.6256554603576661, "memory(GiB)": 33.07, "step": 6745, "token_acc": 0.8286576168929111, "train_speed(iter/s)": 0.096233 }, { "epoch": 0.31389965093196226, "grad_norm": 7.314164161682129, "learning_rate": 8.214036226241216e-06, "loss": 0.6697664737701416, "memory(GiB)": 33.07, "step": 6750, "token_acc": 0.8280954184021204, "train_speed(iter/s)": 0.096272 }, { "epoch": 0.31389965093196226, "eval_loss": 0.6476150155067444, "eval_runtime": 294.4404, "eval_samples_per_second": 11.802, "eval_steps_per_second": 11.802, "step": 6750 }, { "epoch": 0.31413216919191184, "grad_norm": 6.524766445159912, "learning_rate": 8.211089973672155e-06, "loss": 0.8700243949890136, "memory(GiB)": 33.07, "step": 6755, "token_acc": 0.8140937559915639, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.31436468745186147, "grad_norm": 5.901501655578613, "learning_rate": 8.208141822286452e-06, "loss": 0.7633928298950196, "memory(GiB)": 33.07, "step": 6760, "token_acc": 0.8067516362383741, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.31459720571181105, "grad_norm": 8.813726425170898, "learning_rate": 8.20519177382744e-06, "loss": 0.8027013778686524, "memory(GiB)": 36.53, "step": 6765, "token_acc": 0.7644291091593476, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.3148297239717607, "grad_norm": 7.789068222045898, "learning_rate": 8.202239830039572e-06, "loss": 0.6651137828826904, "memory(GiB)": 36.53, "step": 6770, "token_acc": 0.8395303326810176, "train_speed(iter/s)": 0.096025 }, { "epoch": 0.31506224223171025, "grad_norm": 5.441076755523682, "learning_rate": 8.199285992668426e-06, "loss": 0.7530568599700928, "memory(GiB)": 36.53, "step": 6775, "token_acc": 0.8276004973062577, "train_speed(iter/s)": 0.096065 }, { "epoch": 0.3152947604916599, "grad_norm": 5.6672234535217285, "learning_rate": 8.196330263460698e-06, "loss": 0.8605976104736328, "memory(GiB)": 36.53, "step": 6780, "token_acc": 0.7902680197762165, "train_speed(iter/s)": 0.096105 }, { "epoch": 0.31552727875160946, "grad_norm": 6.022310256958008, "learning_rate": 8.1933726441642e-06, "loss": 0.7086090087890625, "memory(GiB)": 36.53, "step": 6785, "token_acc": 0.8303532490187527, "train_speed(iter/s)": 0.096145 }, { "epoch": 0.31575979701155904, "grad_norm": 5.349603176116943, "learning_rate": 8.190413136527861e-06, "loss": 0.8489409446716308, "memory(GiB)": 36.53, "step": 6790, "token_acc": 0.7902423865755127, "train_speed(iter/s)": 0.096184 }, { "epoch": 0.3159923152715087, "grad_norm": 7.828374862670898, "learning_rate": 8.187451742301735e-06, "loss": 0.7629014492034912, "memory(GiB)": 36.53, "step": 6795, "token_acc": 0.8210290827740492, "train_speed(iter/s)": 0.096224 }, { "epoch": 0.31622483353145825, "grad_norm": 9.842150688171387, "learning_rate": 8.184488463236984e-06, "loss": 0.6981842041015625, "memory(GiB)": 36.53, "step": 6800, "token_acc": 0.8441385435168739, "train_speed(iter/s)": 0.096264 }, { "epoch": 0.31622483353145825, "eval_loss": 0.6452161073684692, "eval_runtime": 293.1209, "eval_samples_per_second": 11.855, "eval_steps_per_second": 11.855, "step": 6800 }, { "epoch": 0.3164573517914079, "grad_norm": 5.845304489135742, "learning_rate": 8.181523301085883e-06, "loss": 0.8383314132690429, "memory(GiB)": 36.53, "step": 6805, "token_acc": 0.813117224573554, "train_speed(iter/s)": 0.095906 }, { "epoch": 0.31668987005135746, "grad_norm": 7.104671478271484, "learning_rate": 8.178556257601828e-06, "loss": 0.7857285499572754, "memory(GiB)": 36.53, "step": 6810, "token_acc": 0.8020100502512563, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.3169223883113071, "grad_norm": 5.765302658081055, "learning_rate": 8.175587334539321e-06, "loss": 0.7419705867767334, "memory(GiB)": 36.53, "step": 6815, "token_acc": 0.8118503118503119, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.31715490657125667, "grad_norm": 7.719875335693359, "learning_rate": 8.172616533653978e-06, "loss": 0.7236505508422851, "memory(GiB)": 36.53, "step": 6820, "token_acc": 0.8233870967741935, "train_speed(iter/s)": 0.096025 }, { "epoch": 0.3173874248312063, "grad_norm": 5.352121829986572, "learning_rate": 8.169643856702528e-06, "loss": 0.8514439582824707, "memory(GiB)": 36.53, "step": 6825, "token_acc": 0.7984257357973991, "train_speed(iter/s)": 0.096064 }, { "epoch": 0.3176199430911559, "grad_norm": 5.204432964324951, "learning_rate": 8.166669305442803e-06, "loss": 0.7329915523529053, "memory(GiB)": 36.53, "step": 6830, "token_acc": 0.8289398280802293, "train_speed(iter/s)": 0.096103 }, { "epoch": 0.31785246135110545, "grad_norm": 8.581599235534668, "learning_rate": 8.16369288163375e-06, "loss": 0.7899059295654297, "memory(GiB)": 36.53, "step": 6835, "token_acc": 0.8126003210272873, "train_speed(iter/s)": 0.096141 }, { "epoch": 0.3180849796110551, "grad_norm": 8.947052955627441, "learning_rate": 8.160714587035418e-06, "loss": 0.6274521827697754, "memory(GiB)": 36.53, "step": 6840, "token_acc": 0.8581187598528639, "train_speed(iter/s)": 0.096181 }, { "epoch": 0.31831749787100466, "grad_norm": 5.990546226501465, "learning_rate": 8.157734423408964e-06, "loss": 0.6243311405181885, "memory(GiB)": 36.53, "step": 6845, "token_acc": 0.8362896190753126, "train_speed(iter/s)": 0.09622 }, { "epoch": 0.3185500161309543, "grad_norm": 6.6962690353393555, "learning_rate": 8.154752392516654e-06, "loss": 0.6459828853607178, "memory(GiB)": 36.53, "step": 6850, "token_acc": 0.8609100310237849, "train_speed(iter/s)": 0.096259 }, { "epoch": 0.3185500161309543, "eval_loss": 0.6414746046066284, "eval_runtime": 296.31, "eval_samples_per_second": 11.728, "eval_steps_per_second": 11.728, "step": 6850 }, { "epoch": 0.31878253439090387, "grad_norm": 5.734102249145508, "learning_rate": 8.151768496121852e-06, "loss": 0.7357370853424072, "memory(GiB)": 36.53, "step": 6855, "token_acc": 0.8145698542054879, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.3190150526508535, "grad_norm": 7.676290035247803, "learning_rate": 8.148782735989032e-06, "loss": 0.7362208366394043, "memory(GiB)": 36.53, "step": 6860, "token_acc": 0.8165252906063463, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.3192475709108031, "grad_norm": 4.947035789489746, "learning_rate": 8.145795113883762e-06, "loss": 0.8011846542358398, "memory(GiB)": 36.53, "step": 6865, "token_acc": 0.8070175438596491, "train_speed(iter/s)": 0.095977 }, { "epoch": 0.3194800891707527, "grad_norm": 6.373569488525391, "learning_rate": 8.142805631572714e-06, "loss": 0.6174682140350342, "memory(GiB)": 36.53, "step": 6870, "token_acc": 0.8545526212061489, "train_speed(iter/s)": 0.096015 }, { "epoch": 0.3197126074307023, "grad_norm": 8.267929077148438, "learning_rate": 8.139814290823666e-06, "loss": 0.6648574829101562, "memory(GiB)": 36.53, "step": 6875, "token_acc": 0.8253856942496494, "train_speed(iter/s)": 0.096054 }, { "epoch": 0.3199451256906519, "grad_norm": 5.163671016693115, "learning_rate": 8.13682109340549e-06, "loss": 0.906800651550293, "memory(GiB)": 36.53, "step": 6880, "token_acc": 0.7814548860443334, "train_speed(iter/s)": 0.096093 }, { "epoch": 0.3201776439506015, "grad_norm": 6.802454471588135, "learning_rate": 8.133826041088151e-06, "loss": 0.7294719696044922, "memory(GiB)": 36.53, "step": 6885, "token_acc": 0.8328280640970117, "train_speed(iter/s)": 0.096133 }, { "epoch": 0.3204101622105511, "grad_norm": 7.575679779052734, "learning_rate": 8.130829135642719e-06, "loss": 0.8211429595947266, "memory(GiB)": 36.53, "step": 6890, "token_acc": 0.7984913793103449, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.3206426804705007, "grad_norm": 8.01412582397461, "learning_rate": 8.127830378841356e-06, "loss": 0.8153658866882324, "memory(GiB)": 36.53, "step": 6895, "token_acc": 0.8100911002102312, "train_speed(iter/s)": 0.096212 }, { "epoch": 0.3208751987304503, "grad_norm": 6.638772010803223, "learning_rate": 8.124829772457324e-06, "loss": 0.8706229209899903, "memory(GiB)": 36.53, "step": 6900, "token_acc": 0.8031319910514542, "train_speed(iter/s)": 0.096251 }, { "epoch": 0.3208751987304503, "eval_loss": 0.6478042006492615, "eval_runtime": 294.3248, "eval_samples_per_second": 11.807, "eval_steps_per_second": 11.807, "step": 6900 }, { "epoch": 0.3211077169903999, "grad_norm": 7.012734413146973, "learning_rate": 8.121827318264966e-06, "loss": 0.7513750076293946, "memory(GiB)": 36.53, "step": 6905, "token_acc": 0.8138943484015884, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.3213402352503495, "grad_norm": 4.67357063293457, "learning_rate": 8.118823018039732e-06, "loss": 0.7380107402801513, "memory(GiB)": 36.53, "step": 6910, "token_acc": 0.814316974054392, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.3215727535102991, "grad_norm": 5.693792343139648, "learning_rate": 8.115816873558155e-06, "loss": 0.7206833839416504, "memory(GiB)": 36.53, "step": 6915, "token_acc": 0.8147345612134345, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.3218052717702487, "grad_norm": 7.146193027496338, "learning_rate": 8.112808886597863e-06, "loss": 0.8876776695251465, "memory(GiB)": 36.53, "step": 6920, "token_acc": 0.7880449684672334, "train_speed(iter/s)": 0.096012 }, { "epoch": 0.32203779003019833, "grad_norm": 6.618865013122559, "learning_rate": 8.109799058937568e-06, "loss": 0.7404951095581055, "memory(GiB)": 36.53, "step": 6925, "token_acc": 0.805001689760054, "train_speed(iter/s)": 0.09605 }, { "epoch": 0.3222703082901479, "grad_norm": 6.573246955871582, "learning_rate": 8.106787392357077e-06, "loss": 0.7026708602905274, "memory(GiB)": 36.53, "step": 6930, "token_acc": 0.8272071453108897, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.3225028265500975, "grad_norm": 7.522039890289307, "learning_rate": 8.103773888637281e-06, "loss": 0.712070894241333, "memory(GiB)": 36.53, "step": 6935, "token_acc": 0.8365795724465558, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.3227353448100471, "grad_norm": 5.250293731689453, "learning_rate": 8.100758549560157e-06, "loss": 0.6830814838409424, "memory(GiB)": 36.53, "step": 6940, "token_acc": 0.8091185410334346, "train_speed(iter/s)": 0.096168 }, { "epoch": 0.3229678630699967, "grad_norm": 9.969382286071777, "learning_rate": 8.09774137690877e-06, "loss": 0.7145745754241943, "memory(GiB)": 36.53, "step": 6945, "token_acc": 0.8318619582664526, "train_speed(iter/s)": 0.096207 }, { "epoch": 0.32320038132994633, "grad_norm": 6.600118637084961, "learning_rate": 8.094722372467264e-06, "loss": 0.6322240352630615, "memory(GiB)": 36.53, "step": 6950, "token_acc": 0.8467902051621443, "train_speed(iter/s)": 0.096246 }, { "epoch": 0.32320038132994633, "eval_loss": 0.6434064507484436, "eval_runtime": 292.2564, "eval_samples_per_second": 11.89, "eval_steps_per_second": 11.89, "step": 6950 }, { "epoch": 0.3234328995898959, "grad_norm": 7.271444320678711, "learning_rate": 8.091701538020871e-06, "loss": 0.5597502708435058, "memory(GiB)": 36.53, "step": 6955, "token_acc": 0.8157582938388626, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.32366541784984554, "grad_norm": 6.547903537750244, "learning_rate": 8.088678875355907e-06, "loss": 0.7079691410064697, "memory(GiB)": 36.53, "step": 6960, "token_acc": 0.8249725375320396, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.3238979361097951, "grad_norm": 6.421803951263428, "learning_rate": 8.08565438625976e-06, "loss": 0.7322468757629395, "memory(GiB)": 36.53, "step": 6965, "token_acc": 0.822429906542056, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.32413045436974475, "grad_norm": 5.57395076751709, "learning_rate": 8.082628072520909e-06, "loss": 0.7521162509918213, "memory(GiB)": 36.53, "step": 6970, "token_acc": 0.817231548938866, "train_speed(iter/s)": 0.096015 }, { "epoch": 0.3243629726296943, "grad_norm": 6.311516761779785, "learning_rate": 8.079599935928903e-06, "loss": 0.6746647357940674, "memory(GiB)": 36.53, "step": 6975, "token_acc": 0.8303362001563722, "train_speed(iter/s)": 0.096054 }, { "epoch": 0.3245954908896439, "grad_norm": 6.750150203704834, "learning_rate": 8.076569978274373e-06, "loss": 0.6810788154602051, "memory(GiB)": 36.53, "step": 6980, "token_acc": 0.8330940416367552, "train_speed(iter/s)": 0.096092 }, { "epoch": 0.32482800914959353, "grad_norm": 6.657168388366699, "learning_rate": 8.073538201349027e-06, "loss": 0.7422618865966797, "memory(GiB)": 36.53, "step": 6985, "token_acc": 0.8146167557932263, "train_speed(iter/s)": 0.09613 }, { "epoch": 0.3250605274095431, "grad_norm": 7.414977550506592, "learning_rate": 8.070504606945652e-06, "loss": 0.744996976852417, "memory(GiB)": 36.53, "step": 6990, "token_acc": 0.8114323258869908, "train_speed(iter/s)": 0.096168 }, { "epoch": 0.32529304566949274, "grad_norm": 5.731770992279053, "learning_rate": 8.067469196858101e-06, "loss": 0.714255428314209, "memory(GiB)": 36.53, "step": 6995, "token_acc": 0.8291497975708502, "train_speed(iter/s)": 0.096207 }, { "epoch": 0.3255255639294423, "grad_norm": 6.356823921203613, "learning_rate": 8.064431972881308e-06, "loss": 0.8558525085449219, "memory(GiB)": 36.53, "step": 7000, "token_acc": 0.784629981024668, "train_speed(iter/s)": 0.096245 }, { "epoch": 0.3255255639294423, "eval_loss": 0.6428462266921997, "eval_runtime": 289.8023, "eval_samples_per_second": 11.991, "eval_steps_per_second": 11.991, "step": 7000 }, { "epoch": 0.32575808218939195, "grad_norm": 7.4635443687438965, "learning_rate": 8.061392936811276e-06, "loss": 0.6529666900634765, "memory(GiB)": 36.53, "step": 7005, "token_acc": 0.8152108566527458, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.3259906004493415, "grad_norm": 5.513030052185059, "learning_rate": 8.058352090445085e-06, "loss": 0.7095055103302002, "memory(GiB)": 36.53, "step": 7010, "token_acc": 0.8111888111888111, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.32622311870929116, "grad_norm": 6.568864345550537, "learning_rate": 8.055309435580874e-06, "loss": 0.796638298034668, "memory(GiB)": 36.53, "step": 7015, "token_acc": 0.8049580751002552, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.32645563696924074, "grad_norm": 5.537734508514404, "learning_rate": 8.052264974017864e-06, "loss": 0.8192606925964355, "memory(GiB)": 36.53, "step": 7020, "token_acc": 0.8001395186606208, "train_speed(iter/s)": 0.096018 }, { "epoch": 0.32668815522919037, "grad_norm": 5.668457984924316, "learning_rate": 8.049218707556338e-06, "loss": 0.7112496852874756, "memory(GiB)": 36.53, "step": 7025, "token_acc": 0.8256777108433735, "train_speed(iter/s)": 0.096057 }, { "epoch": 0.32692067348913995, "grad_norm": 7.687334060668945, "learning_rate": 8.046170637997651e-06, "loss": 0.7744904041290284, "memory(GiB)": 36.53, "step": 7030, "token_acc": 0.8145842596709649, "train_speed(iter/s)": 0.096096 }, { "epoch": 0.3271531917490895, "grad_norm": 6.078497886657715, "learning_rate": 8.043120767144212e-06, "loss": 0.6508955955505371, "memory(GiB)": 36.53, "step": 7035, "token_acc": 0.834321590512731, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.32738571000903915, "grad_norm": 5.643200397491455, "learning_rate": 8.040069096799511e-06, "loss": 0.8292275428771972, "memory(GiB)": 36.53, "step": 7040, "token_acc": 0.7918106886466365, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.32761822826898873, "grad_norm": 6.846371173858643, "learning_rate": 8.037015628768092e-06, "loss": 0.7285429477691651, "memory(GiB)": 36.53, "step": 7045, "token_acc": 0.8193423597678917, "train_speed(iter/s)": 0.096211 }, { "epoch": 0.32785074652893836, "grad_norm": 7.871469974517822, "learning_rate": 8.033960364855566e-06, "loss": 0.7744301795959473, "memory(GiB)": 36.53, "step": 7050, "token_acc": 0.818105616093881, "train_speed(iter/s)": 0.09625 }, { "epoch": 0.32785074652893836, "eval_loss": 0.6424740552902222, "eval_runtime": 295.611, "eval_samples_per_second": 11.755, "eval_steps_per_second": 11.755, "step": 7050 }, { "epoch": 0.32808326478888794, "grad_norm": 6.070770740509033, "learning_rate": 8.030903306868605e-06, "loss": 0.7583102226257324, "memory(GiB)": 36.53, "step": 7055, "token_acc": 0.8147746967071057, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.3283157830488376, "grad_norm": 7.5017523765563965, "learning_rate": 8.027844456614942e-06, "loss": 0.8310544967651368, "memory(GiB)": 36.53, "step": 7060, "token_acc": 0.7947409733124019, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.32854830130878715, "grad_norm": 8.23862361907959, "learning_rate": 8.024783815903367e-06, "loss": 0.7447206974029541, "memory(GiB)": 36.53, "step": 7065, "token_acc": 0.8162358642972536, "train_speed(iter/s)": 0.095977 }, { "epoch": 0.3287808195687368, "grad_norm": 7.500193119049072, "learning_rate": 8.021721386543733e-06, "loss": 0.8137165069580078, "memory(GiB)": 36.53, "step": 7070, "token_acc": 0.7975917431192661, "train_speed(iter/s)": 0.096015 }, { "epoch": 0.32901333782868636, "grad_norm": 7.275796413421631, "learning_rate": 8.018657170346951e-06, "loss": 0.7593709468841553, "memory(GiB)": 36.53, "step": 7075, "token_acc": 0.8170637970791699, "train_speed(iter/s)": 0.096051 }, { "epoch": 0.32924585608863594, "grad_norm": 7.272811412811279, "learning_rate": 8.015591169124984e-06, "loss": 0.7790214538574218, "memory(GiB)": 36.53, "step": 7080, "token_acc": 0.8054070112893642, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.32947837434858557, "grad_norm": 8.291638374328613, "learning_rate": 8.012523384690853e-06, "loss": 0.7552329063415527, "memory(GiB)": 36.53, "step": 7085, "token_acc": 0.8216233557908245, "train_speed(iter/s)": 0.096127 }, { "epoch": 0.32971089260853514, "grad_norm": 7.049375057220459, "learning_rate": 8.009453818858637e-06, "loss": 0.7487932682037354, "memory(GiB)": 36.53, "step": 7090, "token_acc": 0.8164435946462715, "train_speed(iter/s)": 0.096163 }, { "epoch": 0.3299434108684848, "grad_norm": 6.85177755355835, "learning_rate": 8.006382473443461e-06, "loss": 0.6691460132598877, "memory(GiB)": 36.53, "step": 7095, "token_acc": 0.8258766626360339, "train_speed(iter/s)": 0.096201 }, { "epoch": 0.33017592912843435, "grad_norm": 6.34906530380249, "learning_rate": 8.00330935026151e-06, "loss": 0.6504391193389892, "memory(GiB)": 36.53, "step": 7100, "token_acc": 0.8318752377329783, "train_speed(iter/s)": 0.096239 }, { "epoch": 0.33017592912843435, "eval_loss": 0.6397922039031982, "eval_runtime": 293.0806, "eval_samples_per_second": 11.857, "eval_steps_per_second": 11.857, "step": 7100 }, { "epoch": 0.330408447388384, "grad_norm": 6.709843158721924, "learning_rate": 8.000234451130013e-06, "loss": 0.797484302520752, "memory(GiB)": 36.53, "step": 7105, "token_acc": 0.8149428407203985, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.33064096564833356, "grad_norm": 7.30489444732666, "learning_rate": 7.997157777867255e-06, "loss": 0.7768725395202637, "memory(GiB)": 36.53, "step": 7110, "token_acc": 0.8156225218080888, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.3308734839082832, "grad_norm": 7.3210577964782715, "learning_rate": 7.994079332292566e-06, "loss": 0.7170722007751464, "memory(GiB)": 36.53, "step": 7115, "token_acc": 0.8237371953373366, "train_speed(iter/s)": 0.095972 }, { "epoch": 0.33110600216823277, "grad_norm": 5.7141337394714355, "learning_rate": 7.99099911622633e-06, "loss": 0.6883097171783448, "memory(GiB)": 36.53, "step": 7120, "token_acc": 0.8426040379068809, "train_speed(iter/s)": 0.09601 }, { "epoch": 0.33133852042818235, "grad_norm": 6.0469970703125, "learning_rate": 7.987917131489971e-06, "loss": 0.8080909729003907, "memory(GiB)": 36.53, "step": 7125, "token_acc": 0.8011449231696294, "train_speed(iter/s)": 0.096048 }, { "epoch": 0.331571038688132, "grad_norm": 5.462447643280029, "learning_rate": 7.984833379905961e-06, "loss": 0.7844725608825683, "memory(GiB)": 36.53, "step": 7130, "token_acc": 0.7944785276073619, "train_speed(iter/s)": 0.096086 }, { "epoch": 0.33180355694808156, "grad_norm": 7.006442070007324, "learning_rate": 7.981747863297817e-06, "loss": 0.6901938438415527, "memory(GiB)": 36.53, "step": 7135, "token_acc": 0.8397600685518424, "train_speed(iter/s)": 0.096124 }, { "epoch": 0.3320360752080312, "grad_norm": 5.262815475463867, "learning_rate": 7.978660583490104e-06, "loss": 0.6862985134124756, "memory(GiB)": 36.53, "step": 7140, "token_acc": 0.8212121212121212, "train_speed(iter/s)": 0.096162 }, { "epoch": 0.33226859346798077, "grad_norm": 6.187898635864258, "learning_rate": 7.975571542308422e-06, "loss": 0.691615104675293, "memory(GiB)": 36.53, "step": 7145, "token_acc": 0.835931700074239, "train_speed(iter/s)": 0.096199 }, { "epoch": 0.3325011117279304, "grad_norm": 7.044938564300537, "learning_rate": 7.97248074157942e-06, "loss": 0.6047670364379882, "memory(GiB)": 36.53, "step": 7150, "token_acc": 0.8514950166112957, "train_speed(iter/s)": 0.096237 }, { "epoch": 0.3325011117279304, "eval_loss": 0.640709638595581, "eval_runtime": 289.414, "eval_samples_per_second": 12.007, "eval_steps_per_second": 12.007, "step": 7150 }, { "epoch": 0.33273362998788, "grad_norm": 7.655607223510742, "learning_rate": 7.969388183130779e-06, "loss": 0.7802319526672363, "memory(GiB)": 36.53, "step": 7155, "token_acc": 0.8151366458130888, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.3329661482478296, "grad_norm": 6.0845947265625, "learning_rate": 7.966293868791231e-06, "loss": 0.6793017387390137, "memory(GiB)": 36.53, "step": 7160, "token_acc": 0.832723279648609, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.3331986665077792, "grad_norm": 5.9088850021362305, "learning_rate": 7.963197800390533e-06, "loss": 0.7765905857086182, "memory(GiB)": 36.53, "step": 7165, "token_acc": 0.8096875895672112, "train_speed(iter/s)": 0.095975 }, { "epoch": 0.3334311847677288, "grad_norm": 5.6625895500183105, "learning_rate": 7.96009997975949e-06, "loss": 0.6958408832550049, "memory(GiB)": 36.53, "step": 7170, "token_acc": 0.8290849673202615, "train_speed(iter/s)": 0.096012 }, { "epoch": 0.3336637030276784, "grad_norm": 6.877390384674072, "learning_rate": 7.957000408729937e-06, "loss": 0.7387121200561524, "memory(GiB)": 36.53, "step": 7175, "token_acc": 0.8228388473852721, "train_speed(iter/s)": 0.096049 }, { "epoch": 0.33389622128762797, "grad_norm": 8.14645004272461, "learning_rate": 7.953899089134747e-06, "loss": 0.7548263072967529, "memory(GiB)": 36.53, "step": 7180, "token_acc": 0.8222559817698443, "train_speed(iter/s)": 0.096086 }, { "epoch": 0.3341287395475776, "grad_norm": 6.597789764404297, "learning_rate": 7.950796022807825e-06, "loss": 0.8279875755310059, "memory(GiB)": 36.53, "step": 7185, "token_acc": 0.8059187887130076, "train_speed(iter/s)": 0.096124 }, { "epoch": 0.3343612578075272, "grad_norm": 7.290713787078857, "learning_rate": 7.947691211584111e-06, "loss": 0.7254250049591064, "memory(GiB)": 36.53, "step": 7190, "token_acc": 0.8295350957155879, "train_speed(iter/s)": 0.096162 }, { "epoch": 0.3345937760674768, "grad_norm": 6.3813629150390625, "learning_rate": 7.944584657299574e-06, "loss": 0.8218119621276856, "memory(GiB)": 36.53, "step": 7195, "token_acc": 0.7869318181818182, "train_speed(iter/s)": 0.096199 }, { "epoch": 0.3348262943274264, "grad_norm": 7.908492565155029, "learning_rate": 7.941476361791219e-06, "loss": 0.7454845428466796, "memory(GiB)": 36.53, "step": 7200, "token_acc": 0.8088725817211474, "train_speed(iter/s)": 0.096237 }, { "epoch": 0.3348262943274264, "eval_loss": 0.6419472694396973, "eval_runtime": 292.8292, "eval_samples_per_second": 11.867, "eval_steps_per_second": 11.867, "step": 7200 }, { "epoch": 0.335058812587376, "grad_norm": 8.641154289245605, "learning_rate": 7.938366326897074e-06, "loss": 0.7571213245391846, "memory(GiB)": 36.53, "step": 7205, "token_acc": 0.8146778119395337, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.3352913308473256, "grad_norm": 5.167827129364014, "learning_rate": 7.9352545544562e-06, "loss": 0.9509881019592286, "memory(GiB)": 36.53, "step": 7210, "token_acc": 0.7677793904208998, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.33552384910727523, "grad_norm": 7.85144567489624, "learning_rate": 7.932141046308684e-06, "loss": 0.7327893733978271, "memory(GiB)": 36.53, "step": 7215, "token_acc": 0.8123628383321141, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.3357563673672248, "grad_norm": 7.0988287925720215, "learning_rate": 7.929025804295639e-06, "loss": 0.7216827392578125, "memory(GiB)": 36.53, "step": 7220, "token_acc": 0.8254152182309772, "train_speed(iter/s)": 0.096008 }, { "epoch": 0.3359888856271744, "grad_norm": 7.263143062591553, "learning_rate": 7.925908830259201e-06, "loss": 0.8273670196533203, "memory(GiB)": 36.53, "step": 7225, "token_acc": 0.7873343151693667, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.336221403887124, "grad_norm": 7.280778408050537, "learning_rate": 7.922790126042539e-06, "loss": 0.6277379989624023, "memory(GiB)": 36.53, "step": 7230, "token_acc": 0.8471760797342193, "train_speed(iter/s)": 0.096083 }, { "epoch": 0.3364539221470736, "grad_norm": 7.691588878631592, "learning_rate": 7.919669693489835e-06, "loss": 0.7610618591308593, "memory(GiB)": 36.53, "step": 7235, "token_acc": 0.8135218736190897, "train_speed(iter/s)": 0.096121 }, { "epoch": 0.3366864404070232, "grad_norm": 6.554846286773682, "learning_rate": 7.9165475344463e-06, "loss": 0.7569685935974121, "memory(GiB)": 36.53, "step": 7240, "token_acc": 0.8254284575528099, "train_speed(iter/s)": 0.096159 }, { "epoch": 0.3369189586669728, "grad_norm": 6.927182674407959, "learning_rate": 7.913423650758158e-06, "loss": 0.7405023097991943, "memory(GiB)": 36.53, "step": 7245, "token_acc": 0.8135833038556773, "train_speed(iter/s)": 0.096195 }, { "epoch": 0.33715147692692243, "grad_norm": 6.637570381164551, "learning_rate": 7.910298044272661e-06, "loss": 0.663820457458496, "memory(GiB)": 36.53, "step": 7250, "token_acc": 0.8394245723172629, "train_speed(iter/s)": 0.096233 }, { "epoch": 0.33715147692692243, "eval_loss": 0.6380437612533569, "eval_runtime": 292.3654, "eval_samples_per_second": 11.886, "eval_steps_per_second": 11.886, "step": 7250 }, { "epoch": 0.337383995186872, "grad_norm": 7.666382312774658, "learning_rate": 7.90717071683808e-06, "loss": 0.7239179611206055, "memory(GiB)": 36.53, "step": 7255, "token_acc": 0.8155583638603171, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.33761651344682164, "grad_norm": 5.590011119842529, "learning_rate": 7.904041670303695e-06, "loss": 0.7178312778472901, "memory(GiB)": 36.53, "step": 7260, "token_acc": 0.8258992805755395, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.3378490317067712, "grad_norm": 6.321069717407227, "learning_rate": 7.90091090651981e-06, "loss": 0.7359897136688233, "memory(GiB)": 36.53, "step": 7265, "token_acc": 0.8224368499257058, "train_speed(iter/s)": 0.095972 }, { "epoch": 0.33808154996672085, "grad_norm": 7.234593868255615, "learning_rate": 7.897778427337741e-06, "loss": 0.6815497398376464, "memory(GiB)": 36.53, "step": 7270, "token_acc": 0.8278571428571428, "train_speed(iter/s)": 0.09601 }, { "epoch": 0.33831406822667043, "grad_norm": 7.684406280517578, "learning_rate": 7.894644234609823e-06, "loss": 0.8643548965454102, "memory(GiB)": 36.53, "step": 7275, "token_acc": 0.8054187192118226, "train_speed(iter/s)": 0.096048 }, { "epoch": 0.33854658648662, "grad_norm": 8.585091590881348, "learning_rate": 7.891508330189398e-06, "loss": 0.6693760395050049, "memory(GiB)": 36.53, "step": 7280, "token_acc": 0.8482220294882914, "train_speed(iter/s)": 0.096086 }, { "epoch": 0.33877910474656964, "grad_norm": 8.701008796691895, "learning_rate": 7.888370715930823e-06, "loss": 0.7502879619598388, "memory(GiB)": 36.53, "step": 7285, "token_acc": 0.806949806949807, "train_speed(iter/s)": 0.096123 }, { "epoch": 0.3390116230065192, "grad_norm": 7.461516857147217, "learning_rate": 7.885231393689467e-06, "loss": 0.7151779651641845, "memory(GiB)": 36.53, "step": 7290, "token_acc": 0.829172610556348, "train_speed(iter/s)": 0.096161 }, { "epoch": 0.33924414126646885, "grad_norm": 6.577462196350098, "learning_rate": 7.882090365321708e-06, "loss": 0.8359928131103516, "memory(GiB)": 36.53, "step": 7295, "token_acc": 0.8137787056367433, "train_speed(iter/s)": 0.096198 }, { "epoch": 0.3394766595264184, "grad_norm": 7.914944171905518, "learning_rate": 7.878947632684933e-06, "loss": 0.7542798519134521, "memory(GiB)": 36.53, "step": 7300, "token_acc": 0.8035133376707873, "train_speed(iter/s)": 0.096236 }, { "epoch": 0.3394766595264184, "eval_loss": 0.6390102505683899, "eval_runtime": 290.9563, "eval_samples_per_second": 11.943, "eval_steps_per_second": 11.943, "step": 7300 }, { "epoch": 0.33970917778636806, "grad_norm": 7.265176296234131, "learning_rate": 7.875803197637539e-06, "loss": 0.6686437606811524, "memory(GiB)": 36.53, "step": 7305, "token_acc": 0.8155821698475727, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.33994169604631763, "grad_norm": 10.155379295349121, "learning_rate": 7.872657062038921e-06, "loss": 0.6359403133392334, "memory(GiB)": 36.53, "step": 7310, "token_acc": 0.8454882571075402, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.34017421430626726, "grad_norm": 6.692675590515137, "learning_rate": 7.869509227749495e-06, "loss": 0.7660014629364014, "memory(GiB)": 36.53, "step": 7315, "token_acc": 0.8152403991533111, "train_speed(iter/s)": 0.095978 }, { "epoch": 0.34040673256621684, "grad_norm": 8.65769100189209, "learning_rate": 7.866359696630666e-06, "loss": 0.7088188648223877, "memory(GiB)": 36.53, "step": 7320, "token_acc": 0.8209606986899564, "train_speed(iter/s)": 0.096017 }, { "epoch": 0.3406392508261664, "grad_norm": 5.40556001663208, "learning_rate": 7.863208470544852e-06, "loss": 0.6675843715667724, "memory(GiB)": 36.53, "step": 7325, "token_acc": 0.8324375592791653, "train_speed(iter/s)": 0.096054 }, { "epoch": 0.34087176908611605, "grad_norm": 5.316320419311523, "learning_rate": 7.86005555135547e-06, "loss": 0.7577543735504151, "memory(GiB)": 36.53, "step": 7330, "token_acc": 0.8117934616559731, "train_speed(iter/s)": 0.096091 }, { "epoch": 0.3411042873460656, "grad_norm": 7.521231651306152, "learning_rate": 7.856900940926937e-06, "loss": 0.614345932006836, "memory(GiB)": 36.53, "step": 7335, "token_acc": 0.840962904498816, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.34133680560601526, "grad_norm": 7.031819820404053, "learning_rate": 7.853744641124672e-06, "loss": 0.6725636482238769, "memory(GiB)": 36.53, "step": 7340, "token_acc": 0.8362771739130435, "train_speed(iter/s)": 0.096166 }, { "epoch": 0.34156932386596484, "grad_norm": 5.72618293762207, "learning_rate": 7.850586653815093e-06, "loss": 0.6473200798034668, "memory(GiB)": 36.53, "step": 7345, "token_acc": 0.836783988957902, "train_speed(iter/s)": 0.096204 }, { "epoch": 0.34180184212591447, "grad_norm": 7.18748664855957, "learning_rate": 7.847426980865618e-06, "loss": 0.8898324012756348, "memory(GiB)": 36.53, "step": 7350, "token_acc": 0.7603327965646807, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.34180184212591447, "eval_loss": 0.6362443566322327, "eval_runtime": 291.2812, "eval_samples_per_second": 11.93, "eval_steps_per_second": 11.93, "step": 7350 }, { "epoch": 0.34203436038586404, "grad_norm": 8.468127250671387, "learning_rate": 7.844265624144653e-06, "loss": 0.736899995803833, "memory(GiB)": 36.53, "step": 7355, "token_acc": 0.8153188554564701, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.3422668786458137, "grad_norm": 6.774470806121826, "learning_rate": 7.841102585521612e-06, "loss": 0.8117254257202149, "memory(GiB)": 36.53, "step": 7360, "token_acc": 0.8039726473461413, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.34249939690576325, "grad_norm": 7.67664098739624, "learning_rate": 7.837937866866894e-06, "loss": 0.6635471820831299, "memory(GiB)": 36.53, "step": 7365, "token_acc": 0.8254976704786108, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.34273191516571283, "grad_norm": 6.061575412750244, "learning_rate": 7.834771470051895e-06, "loss": 0.7754477977752685, "memory(GiB)": 36.53, "step": 7370, "token_acc": 0.7987326493663247, "train_speed(iter/s)": 0.096023 }, { "epoch": 0.34296443342566246, "grad_norm": 7.205853462219238, "learning_rate": 7.831603396949005e-06, "loss": 0.7205926895141601, "memory(GiB)": 36.53, "step": 7375, "token_acc": 0.8245721271393643, "train_speed(iter/s)": 0.09606 }, { "epoch": 0.34319695168561204, "grad_norm": 7.258941173553467, "learning_rate": 7.8284336494316e-06, "loss": 0.7685527324676513, "memory(GiB)": 36.53, "step": 7380, "token_acc": 0.8014018691588785, "train_speed(iter/s)": 0.096096 }, { "epoch": 0.34342946994556167, "grad_norm": 7.458856582641602, "learning_rate": 7.825262229374054e-06, "loss": 0.7406332492828369, "memory(GiB)": 36.53, "step": 7385, "token_acc": 0.8115746971736204, "train_speed(iter/s)": 0.096133 }, { "epoch": 0.34366198820551125, "grad_norm": 6.199806213378906, "learning_rate": 7.822089138651723e-06, "loss": 0.6603247165679932, "memory(GiB)": 36.53, "step": 7390, "token_acc": 0.8321905449296283, "train_speed(iter/s)": 0.096169 }, { "epoch": 0.3438945064654609, "grad_norm": 6.644196033477783, "learning_rate": 7.818914379140953e-06, "loss": 0.7804720401763916, "memory(GiB)": 36.53, "step": 7395, "token_acc": 0.7989261744966443, "train_speed(iter/s)": 0.096206 }, { "epoch": 0.34412702472541046, "grad_norm": 5.659149169921875, "learning_rate": 7.815737952719081e-06, "loss": 0.6324204444885254, "memory(GiB)": 36.53, "step": 7400, "token_acc": 0.8367801463569837, "train_speed(iter/s)": 0.096243 }, { "epoch": 0.34412702472541046, "eval_loss": 0.6397080421447754, "eval_runtime": 293.3095, "eval_samples_per_second": 11.848, "eval_steps_per_second": 11.848, "step": 7400 }, { "epoch": 0.3443595429853601, "grad_norm": 5.597126483917236, "learning_rate": 7.81255986126442e-06, "loss": 0.8179545402526855, "memory(GiB)": 36.53, "step": 7405, "token_acc": 0.8155359950203098, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.34459206124530967, "grad_norm": 6.572484493255615, "learning_rate": 7.809380106656278e-06, "loss": 0.7602914333343506, "memory(GiB)": 36.53, "step": 7410, "token_acc": 0.8123157549950868, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.3448245795052593, "grad_norm": 6.53602933883667, "learning_rate": 7.806198690774943e-06, "loss": 0.6647308826446533, "memory(GiB)": 36.53, "step": 7415, "token_acc": 0.8425531914893617, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.3450570977652089, "grad_norm": 7.273094654083252, "learning_rate": 7.803015615501679e-06, "loss": 0.7383760452270508, "memory(GiB)": 36.53, "step": 7420, "token_acc": 0.8075300227198962, "train_speed(iter/s)": 0.096024 }, { "epoch": 0.34528961602515845, "grad_norm": 6.734947681427002, "learning_rate": 7.799830882718743e-06, "loss": 0.7719890594482421, "memory(GiB)": 36.53, "step": 7425, "token_acc": 0.8220779220779221, "train_speed(iter/s)": 0.096062 }, { "epoch": 0.3455221342851081, "grad_norm": 6.639370918273926, "learning_rate": 7.796644494309361e-06, "loss": 0.794908094406128, "memory(GiB)": 36.53, "step": 7430, "token_acc": 0.8007067137809187, "train_speed(iter/s)": 0.096099 }, { "epoch": 0.34575465254505766, "grad_norm": 4.576010704040527, "learning_rate": 7.793456452157746e-06, "loss": 0.7384835243225097, "memory(GiB)": 36.53, "step": 7435, "token_acc": 0.8103640830913558, "train_speed(iter/s)": 0.096134 }, { "epoch": 0.3459871708050073, "grad_norm": 7.08085298538208, "learning_rate": 7.790266758149083e-06, "loss": 0.7475857257843017, "memory(GiB)": 36.53, "step": 7440, "token_acc": 0.8180333224436459, "train_speed(iter/s)": 0.096171 }, { "epoch": 0.34621968906495687, "grad_norm": 6.292354583740234, "learning_rate": 7.78707541416954e-06, "loss": 0.702857780456543, "memory(GiB)": 36.53, "step": 7445, "token_acc": 0.821754165356806, "train_speed(iter/s)": 0.096206 }, { "epoch": 0.3464522073249065, "grad_norm": 6.977808952331543, "learning_rate": 7.783882422106254e-06, "loss": 0.6732274055480957, "memory(GiB)": 36.53, "step": 7450, "token_acc": 0.8363211223694466, "train_speed(iter/s)": 0.096243 }, { "epoch": 0.3464522073249065, "eval_loss": 0.6331688165664673, "eval_runtime": 290.2598, "eval_samples_per_second": 11.972, "eval_steps_per_second": 11.972, "step": 7450 }, { "epoch": 0.3466847255848561, "grad_norm": 7.257906913757324, "learning_rate": 7.780687783847341e-06, "loss": 0.7075543880462647, "memory(GiB)": 36.53, "step": 7455, "token_acc": 0.8163660155905044, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.3469172438448057, "grad_norm": 6.582991123199463, "learning_rate": 7.777491501281891e-06, "loss": 0.7077393531799316, "memory(GiB)": 36.53, "step": 7460, "token_acc": 0.8191687344913151, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.3471497621047553, "grad_norm": 7.522470951080322, "learning_rate": 7.77429357629996e-06, "loss": 0.692666482925415, "memory(GiB)": 36.53, "step": 7465, "token_acc": 0.8261022927689594, "train_speed(iter/s)": 0.095991 }, { "epoch": 0.34738228036470487, "grad_norm": 7.295367240905762, "learning_rate": 7.771094010792585e-06, "loss": 0.7090956687927246, "memory(GiB)": 36.53, "step": 7470, "token_acc": 0.8261661807580175, "train_speed(iter/s)": 0.096027 }, { "epoch": 0.3476147986246545, "grad_norm": 5.637494087219238, "learning_rate": 7.767892806651765e-06, "loss": 0.7600067138671875, "memory(GiB)": 36.53, "step": 7475, "token_acc": 0.8067147787888296, "train_speed(iter/s)": 0.096063 }, { "epoch": 0.3478473168846041, "grad_norm": 6.288785457611084, "learning_rate": 7.764689965770472e-06, "loss": 0.8045848846435547, "memory(GiB)": 36.53, "step": 7480, "token_acc": 0.7907473309608541, "train_speed(iter/s)": 0.096099 }, { "epoch": 0.3480798351445537, "grad_norm": 7.792519569396973, "learning_rate": 7.761485490042642e-06, "loss": 0.6550180912017822, "memory(GiB)": 36.53, "step": 7485, "token_acc": 0.8389721627408994, "train_speed(iter/s)": 0.096136 }, { "epoch": 0.3483123534045033, "grad_norm": 4.879187107086182, "learning_rate": 7.758279381363184e-06, "loss": 0.7051380157470704, "memory(GiB)": 36.53, "step": 7490, "token_acc": 0.8250084947332654, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.3485448716644529, "grad_norm": 7.7562055587768555, "learning_rate": 7.755071641627968e-06, "loss": 0.6621471881866455, "memory(GiB)": 36.53, "step": 7495, "token_acc": 0.8392456219128873, "train_speed(iter/s)": 0.096209 }, { "epoch": 0.3487773899244025, "grad_norm": 6.802863121032715, "learning_rate": 7.751862272733825e-06, "loss": 0.6991421222686768, "memory(GiB)": 36.53, "step": 7500, "token_acc": 0.8280751506557958, "train_speed(iter/s)": 0.096245 }, { "epoch": 0.3487773899244025, "eval_loss": 0.6374966502189636, "eval_runtime": 292.9641, "eval_samples_per_second": 11.862, "eval_steps_per_second": 11.862, "step": 7500 }, { "epoch": 0.3490099081843521, "grad_norm": 7.602797031402588, "learning_rate": 7.748651276578563e-06, "loss": 0.6773967266082763, "memory(GiB)": 36.53, "step": 7505, "token_acc": 0.8170272850529896, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.3492424264443017, "grad_norm": 5.853288173675537, "learning_rate": 7.745438655060935e-06, "loss": 0.8045696258544922, "memory(GiB)": 36.53, "step": 7510, "token_acc": 0.7974137931034483, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.3494749447042513, "grad_norm": 7.790389060974121, "learning_rate": 7.742224410080668e-06, "loss": 0.5909008026123047, "memory(GiB)": 36.53, "step": 7515, "token_acc": 0.8477551020408163, "train_speed(iter/s)": 0.095993 }, { "epoch": 0.3497074629642009, "grad_norm": 11.229941368103027, "learning_rate": 7.739008543538442e-06, "loss": 0.6343198299407959, "memory(GiB)": 36.53, "step": 7520, "token_acc": 0.8413852073535699, "train_speed(iter/s)": 0.096029 }, { "epoch": 0.3499399812241505, "grad_norm": 7.57761812210083, "learning_rate": 7.735791057335899e-06, "loss": 0.626180362701416, "memory(GiB)": 36.53, "step": 7525, "token_acc": 0.8515372168284789, "train_speed(iter/s)": 0.096064 }, { "epoch": 0.3501724994841001, "grad_norm": 5.744942665100098, "learning_rate": 7.732571953375638e-06, "loss": 0.7779204845428467, "memory(GiB)": 36.53, "step": 7530, "token_acc": 0.810126582278481, "train_speed(iter/s)": 0.096099 }, { "epoch": 0.3504050177440497, "grad_norm": 6.306331634521484, "learning_rate": 7.729351233561216e-06, "loss": 0.6912620067596436, "memory(GiB)": 36.53, "step": 7535, "token_acc": 0.8287752675386445, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.35063753600399933, "grad_norm": 7.440969467163086, "learning_rate": 7.72612889979714e-06, "loss": 0.7543702125549316, "memory(GiB)": 36.53, "step": 7540, "token_acc": 0.8168260038240918, "train_speed(iter/s)": 0.096171 }, { "epoch": 0.3508700542639489, "grad_norm": 6.490276336669922, "learning_rate": 7.72290495398888e-06, "loss": 0.7501804828643799, "memory(GiB)": 36.53, "step": 7545, "token_acc": 0.8132794068082237, "train_speed(iter/s)": 0.096206 }, { "epoch": 0.35110257252389854, "grad_norm": 6.402077674865723, "learning_rate": 7.719679398042851e-06, "loss": 0.756907320022583, "memory(GiB)": 36.53, "step": 7550, "token_acc": 0.8172623061362104, "train_speed(iter/s)": 0.096243 }, { "epoch": 0.35110257252389854, "eval_loss": 0.6371598839759827, "eval_runtime": 293.6264, "eval_samples_per_second": 11.835, "eval_steps_per_second": 11.835, "step": 7550 }, { "epoch": 0.3513350907838481, "grad_norm": 6.457944393157959, "learning_rate": 7.716452233866427e-06, "loss": 0.6614902496337891, "memory(GiB)": 36.53, "step": 7555, "token_acc": 0.8163759379208619, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.35156760904379775, "grad_norm": 9.463583946228027, "learning_rate": 7.713223463367928e-06, "loss": 0.831269359588623, "memory(GiB)": 36.53, "step": 7560, "token_acc": 0.7870534135125575, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.3518001273037473, "grad_norm": 7.320459365844727, "learning_rate": 7.709993088456625e-06, "loss": 0.7018909454345703, "memory(GiB)": 36.53, "step": 7565, "token_acc": 0.8254747871643746, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.3520326455636969, "grad_norm": 6.774960041046143, "learning_rate": 7.706761111042738e-06, "loss": 0.7598164558410645, "memory(GiB)": 36.53, "step": 7570, "token_acc": 0.8191214470284238, "train_speed(iter/s)": 0.096028 }, { "epoch": 0.35226516382364653, "grad_norm": 5.613641738891602, "learning_rate": 7.703527533037438e-06, "loss": 0.7213669776916504, "memory(GiB)": 36.53, "step": 7575, "token_acc": 0.8275735294117647, "train_speed(iter/s)": 0.096064 }, { "epoch": 0.3524976820835961, "grad_norm": 8.267657279968262, "learning_rate": 7.700292356352839e-06, "loss": 0.6571903705596924, "memory(GiB)": 36.53, "step": 7580, "token_acc": 0.8469924812030075, "train_speed(iter/s)": 0.096099 }, { "epoch": 0.35273020034354574, "grad_norm": 6.900925159454346, "learning_rate": 7.697055582901997e-06, "loss": 0.7545526504516602, "memory(GiB)": 36.53, "step": 7585, "token_acc": 0.8162962962962963, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.3529627186034953, "grad_norm": 6.630009174346924, "learning_rate": 7.693817214598922e-06, "loss": 0.7359566688537598, "memory(GiB)": 36.53, "step": 7590, "token_acc": 0.815540113708149, "train_speed(iter/s)": 0.096171 }, { "epoch": 0.35319523686344495, "grad_norm": 6.38250207901001, "learning_rate": 7.690577253358555e-06, "loss": 0.898930549621582, "memory(GiB)": 36.53, "step": 7595, "token_acc": 0.7799352750809061, "train_speed(iter/s)": 0.096206 }, { "epoch": 0.3534277551233945, "grad_norm": 7.791952610015869, "learning_rate": 7.68733570109679e-06, "loss": 0.6741089820861816, "memory(GiB)": 36.53, "step": 7600, "token_acc": 0.8259507829977628, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.3534277551233945, "eval_loss": 0.6381203532218933, "eval_runtime": 291.6684, "eval_samples_per_second": 11.914, "eval_steps_per_second": 11.914, "step": 7600 }, { "epoch": 0.35366027338334416, "grad_norm": 6.15386962890625, "learning_rate": 7.684092559730454e-06, "loss": 0.7473597049713134, "memory(GiB)": 36.53, "step": 7605, "token_acc": 0.8166645391881542, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.35389279164329374, "grad_norm": 6.232975959777832, "learning_rate": 7.680847831177318e-06, "loss": 0.6105194091796875, "memory(GiB)": 36.53, "step": 7610, "token_acc": 0.8553259141494436, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.3541253099032433, "grad_norm": 7.638478755950928, "learning_rate": 7.67760151735609e-06, "loss": 0.8064892768859864, "memory(GiB)": 36.53, "step": 7615, "token_acc": 0.7994109947643979, "train_speed(iter/s)": 0.095993 }, { "epoch": 0.35435782816319294, "grad_norm": 6.6157941818237305, "learning_rate": 7.674353620186416e-06, "loss": 0.6375001907348633, "memory(GiB)": 36.53, "step": 7620, "token_acc": 0.8336236933797909, "train_speed(iter/s)": 0.096028 }, { "epoch": 0.3545903464231425, "grad_norm": 6.55482292175293, "learning_rate": 7.671104141588877e-06, "loss": 0.7433343887329101, "memory(GiB)": 36.53, "step": 7625, "token_acc": 0.8180338541666666, "train_speed(iter/s)": 0.096064 }, { "epoch": 0.35482286468309215, "grad_norm": 7.421290874481201, "learning_rate": 7.66785308348499e-06, "loss": 0.7123981952667237, "memory(GiB)": 36.53, "step": 7630, "token_acc": 0.8225122349102774, "train_speed(iter/s)": 0.0961 }, { "epoch": 0.35505538294304173, "grad_norm": 6.6890082359313965, "learning_rate": 7.664600447797206e-06, "loss": 0.6621024131774902, "memory(GiB)": 36.53, "step": 7635, "token_acc": 0.8417508417508418, "train_speed(iter/s)": 0.096136 }, { "epoch": 0.35528790120299136, "grad_norm": 8.536678314208984, "learning_rate": 7.661346236448908e-06, "loss": 0.8194070816040039, "memory(GiB)": 36.53, "step": 7640, "token_acc": 0.7854671280276817, "train_speed(iter/s)": 0.096171 }, { "epoch": 0.35552041946294094, "grad_norm": 7.1367597579956055, "learning_rate": 7.658090451364415e-06, "loss": 0.735087776184082, "memory(GiB)": 36.53, "step": 7645, "token_acc": 0.8191523778712391, "train_speed(iter/s)": 0.096206 }, { "epoch": 0.35575293772289057, "grad_norm": 6.231055736541748, "learning_rate": 7.65483309446897e-06, "loss": 0.7065731048583984, "memory(GiB)": 36.53, "step": 7650, "token_acc": 0.8235985887887103, "train_speed(iter/s)": 0.096241 }, { "epoch": 0.35575293772289057, "eval_loss": 0.6341753602027893, "eval_runtime": 292.1601, "eval_samples_per_second": 11.894, "eval_steps_per_second": 11.894, "step": 7650 }, { "epoch": 0.35598545598284015, "grad_norm": 8.451674461364746, "learning_rate": 7.65157416768875e-06, "loss": 0.7524304389953613, "memory(GiB)": 36.53, "step": 7655, "token_acc": 0.8164709654149802, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.3562179742427898, "grad_norm": 7.157677173614502, "learning_rate": 7.64831367295086e-06, "loss": 0.6308634281158447, "memory(GiB)": 36.53, "step": 7660, "token_acc": 0.8431555971312753, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.35645049250273936, "grad_norm": 6.980820655822754, "learning_rate": 7.645051612183329e-06, "loss": 0.7008957862854004, "memory(GiB)": 36.53, "step": 7665, "token_acc": 0.8320689655172414, "train_speed(iter/s)": 0.095995 }, { "epoch": 0.35668301076268893, "grad_norm": 6.382811546325684, "learning_rate": 7.641787987315115e-06, "loss": 0.6737799644470215, "memory(GiB)": 36.53, "step": 7670, "token_acc": 0.8232436472346786, "train_speed(iter/s)": 0.09603 }, { "epoch": 0.35691552902263857, "grad_norm": 7.144225597381592, "learning_rate": 7.6385228002761e-06, "loss": 0.6494386196136475, "memory(GiB)": 36.53, "step": 7675, "token_acc": 0.8335089567966281, "train_speed(iter/s)": 0.096065 }, { "epoch": 0.35714804728258814, "grad_norm": 7.304563522338867, "learning_rate": 7.63525605299709e-06, "loss": 0.6193370819091797, "memory(GiB)": 36.53, "step": 7680, "token_acc": 0.8410814375206067, "train_speed(iter/s)": 0.096098 }, { "epoch": 0.3573805655425378, "grad_norm": 5.732212543487549, "learning_rate": 7.631987747409816e-06, "loss": 0.7993177890777587, "memory(GiB)": 36.53, "step": 7685, "token_acc": 0.8033457249070632, "train_speed(iter/s)": 0.096134 }, { "epoch": 0.35761308380248735, "grad_norm": 8.190177917480469, "learning_rate": 7.628717885446926e-06, "loss": 0.7521889686584473, "memory(GiB)": 36.53, "step": 7690, "token_acc": 0.8173354735152488, "train_speed(iter/s)": 0.096169 }, { "epoch": 0.357845602062437, "grad_norm": 5.477593421936035, "learning_rate": 7.625446469041988e-06, "loss": 0.7286582469940186, "memory(GiB)": 36.53, "step": 7695, "token_acc": 0.815913688469319, "train_speed(iter/s)": 0.096205 }, { "epoch": 0.35807812032238656, "grad_norm": 7.711561679840088, "learning_rate": 7.622173500129495e-06, "loss": 0.6683283805847168, "memory(GiB)": 36.53, "step": 7700, "token_acc": 0.8445332364693062, "train_speed(iter/s)": 0.096239 }, { "epoch": 0.35807812032238656, "eval_loss": 0.6329870223999023, "eval_runtime": 291.3377, "eval_samples_per_second": 11.928, "eval_steps_per_second": 11.928, "step": 7700 }, { "epoch": 0.3583106385823362, "grad_norm": 7.231578826904297, "learning_rate": 7.618898980644854e-06, "loss": 0.7106448173522949, "memory(GiB)": 36.53, "step": 7705, "token_acc": 0.8174452583963813, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.35854315684228577, "grad_norm": 6.559253215789795, "learning_rate": 7.6156229125243884e-06, "loss": 0.6210072040557861, "memory(GiB)": 36.53, "step": 7710, "token_acc": 0.8506024096385543, "train_speed(iter/s)": 0.09596 }, { "epoch": 0.35877567510223535, "grad_norm": 7.540843963623047, "learning_rate": 7.612345297705337e-06, "loss": 0.6631568908691406, "memory(GiB)": 36.53, "step": 7715, "token_acc": 0.8360078277886497, "train_speed(iter/s)": 0.095995 }, { "epoch": 0.359008193362185, "grad_norm": 6.490909576416016, "learning_rate": 7.6090661381258576e-06, "loss": 0.7047979831695557, "memory(GiB)": 36.53, "step": 7720, "token_acc": 0.8158925573587017, "train_speed(iter/s)": 0.096031 }, { "epoch": 0.35924071162213456, "grad_norm": 4.649106502532959, "learning_rate": 7.6057854357250194e-06, "loss": 0.7783477783203125, "memory(GiB)": 36.53, "step": 7725, "token_acc": 0.8007202881152461, "train_speed(iter/s)": 0.096066 }, { "epoch": 0.3594732298820842, "grad_norm": 9.474201202392578, "learning_rate": 7.6025031924427985e-06, "loss": 0.6698055267333984, "memory(GiB)": 36.53, "step": 7730, "token_acc": 0.8268434134217068, "train_speed(iter/s)": 0.096101 }, { "epoch": 0.35970574814203377, "grad_norm": 6.788260459899902, "learning_rate": 7.599219410220089e-06, "loss": 0.649644422531128, "memory(GiB)": 36.53, "step": 7735, "token_acc": 0.8305889803673211, "train_speed(iter/s)": 0.096136 }, { "epoch": 0.3599382664019834, "grad_norm": 6.228346824645996, "learning_rate": 7.5959340909986935e-06, "loss": 0.675438404083252, "memory(GiB)": 36.53, "step": 7740, "token_acc": 0.8261859582542694, "train_speed(iter/s)": 0.09617 }, { "epoch": 0.360170784661933, "grad_norm": 4.204248905181885, "learning_rate": 7.592647236721324e-06, "loss": 0.7380726337432861, "memory(GiB)": 36.53, "step": 7745, "token_acc": 0.8304862023653088, "train_speed(iter/s)": 0.096205 }, { "epoch": 0.3604033029218826, "grad_norm": 6.923390865325928, "learning_rate": 7.589358849331594e-06, "loss": 0.7568727493286133, "memory(GiB)": 36.53, "step": 7750, "token_acc": 0.829938570966699, "train_speed(iter/s)": 0.096239 }, { "epoch": 0.3604033029218826, "eval_loss": 0.634183406829834, "eval_runtime": 296.2328, "eval_samples_per_second": 11.731, "eval_steps_per_second": 11.731, "step": 7750 }, { "epoch": 0.3606358211818322, "grad_norm": 6.226519584655762, "learning_rate": 7.586068930774032e-06, "loss": 0.6907257556915283, "memory(GiB)": 36.53, "step": 7755, "token_acc": 0.8169730623249694, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.36086833944178176, "grad_norm": 6.021866798400879, "learning_rate": 7.5827774829940685e-06, "loss": 0.7334243774414062, "memory(GiB)": 36.53, "step": 7760, "token_acc": 0.8235482836060315, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.3611008577017314, "grad_norm": 6.340511798858643, "learning_rate": 7.579484507938037e-06, "loss": 0.7543991088867188, "memory(GiB)": 36.53, "step": 7765, "token_acc": 0.8080444735120994, "train_speed(iter/s)": 0.09599 }, { "epoch": 0.36133337596168097, "grad_norm": 7.009573936462402, "learning_rate": 7.576190007553177e-06, "loss": 0.7209124565124512, "memory(GiB)": 36.53, "step": 7770, "token_acc": 0.8328840970350404, "train_speed(iter/s)": 0.096024 }, { "epoch": 0.3615658942216306, "grad_norm": 6.050121307373047, "learning_rate": 7.572893983787626e-06, "loss": 0.6631560802459717, "memory(GiB)": 36.53, "step": 7775, "token_acc": 0.8289615522817104, "train_speed(iter/s)": 0.096058 }, { "epoch": 0.3617984124815802, "grad_norm": 7.535763263702393, "learning_rate": 7.5695964385904255e-06, "loss": 0.6829229354858398, "memory(GiB)": 36.53, "step": 7780, "token_acc": 0.8269992082343627, "train_speed(iter/s)": 0.096093 }, { "epoch": 0.3620309307415298, "grad_norm": 7.163476943969727, "learning_rate": 7.566297373911517e-06, "loss": 0.8164946556091308, "memory(GiB)": 36.53, "step": 7785, "token_acc": 0.8013176144244105, "train_speed(iter/s)": 0.096128 }, { "epoch": 0.3622634490014794, "grad_norm": 7.023055553436279, "learning_rate": 7.562996791701739e-06, "loss": 0.6314420223236084, "memory(GiB)": 36.53, "step": 7790, "token_acc": 0.8464187327823691, "train_speed(iter/s)": 0.096163 }, { "epoch": 0.362495967261429, "grad_norm": 6.3728928565979, "learning_rate": 7.559694693912827e-06, "loss": 0.7461518764495849, "memory(GiB)": 36.53, "step": 7795, "token_acc": 0.8165374677002584, "train_speed(iter/s)": 0.096198 }, { "epoch": 0.3627284855213786, "grad_norm": 5.9811201095581055, "learning_rate": 7.5563910824974114e-06, "loss": 0.7891818046569824, "memory(GiB)": 36.53, "step": 7800, "token_acc": 0.8100607111882047, "train_speed(iter/s)": 0.096233 }, { "epoch": 0.3627284855213786, "eval_loss": 0.6325167417526245, "eval_runtime": 294.4563, "eval_samples_per_second": 11.801, "eval_steps_per_second": 11.801, "step": 7800 }, { "epoch": 0.36296100378132823, "grad_norm": 6.407812118530273, "learning_rate": 7.553085959409023e-06, "loss": 0.526038122177124, "memory(GiB)": 36.53, "step": 7805, "token_acc": 0.8190200902988461, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.3631935220412778, "grad_norm": 7.23723840713501, "learning_rate": 7.549779326602083e-06, "loss": 0.7444475650787353, "memory(GiB)": 36.53, "step": 7810, "token_acc": 0.823447313328681, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.3634260403012274, "grad_norm": 5.919946193695068, "learning_rate": 7.546471186031903e-06, "loss": 0.8032725334167481, "memory(GiB)": 36.53, "step": 7815, "token_acc": 0.7879730430274754, "train_speed(iter/s)": 0.095985 }, { "epoch": 0.363658558561177, "grad_norm": 7.535763263702393, "learning_rate": 7.543161539654688e-06, "loss": 0.6441785812377929, "memory(GiB)": 36.53, "step": 7820, "token_acc": 0.8508461235733963, "train_speed(iter/s)": 0.09602 }, { "epoch": 0.3638910768211266, "grad_norm": 8.509560585021973, "learning_rate": 7.539850389427539e-06, "loss": 0.6608382225036621, "memory(GiB)": 36.53, "step": 7825, "token_acc": 0.8400160384923817, "train_speed(iter/s)": 0.096055 }, { "epoch": 0.3641235950810762, "grad_norm": 6.910362243652344, "learning_rate": 7.536537737308437e-06, "loss": 0.669133996963501, "memory(GiB)": 36.53, "step": 7830, "token_acc": 0.8291802094618996, "train_speed(iter/s)": 0.09609 }, { "epoch": 0.3643561133410258, "grad_norm": 6.624199390411377, "learning_rate": 7.533223585256255e-06, "loss": 0.7464718341827392, "memory(GiB)": 36.53, "step": 7835, "token_acc": 0.8115384615384615, "train_speed(iter/s)": 0.096124 }, { "epoch": 0.36458863160097543, "grad_norm": 6.117915630340576, "learning_rate": 7.529907935230758e-06, "loss": 0.7008463859558105, "memory(GiB)": 36.53, "step": 7840, "token_acc": 0.8128292531763247, "train_speed(iter/s)": 0.096159 }, { "epoch": 0.364821149860925, "grad_norm": 6.617120742797852, "learning_rate": 7.5265907891925895e-06, "loss": 0.6590275287628173, "memory(GiB)": 36.53, "step": 7845, "token_acc": 0.8379475821336286, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.36505366812087464, "grad_norm": 6.689937591552734, "learning_rate": 7.52327214910328e-06, "loss": 0.7318082809448242, "memory(GiB)": 36.53, "step": 7850, "token_acc": 0.8155797101449276, "train_speed(iter/s)": 0.096227 }, { "epoch": 0.36505366812087464, "eval_loss": 0.6311256289482117, "eval_runtime": 292.5386, "eval_samples_per_second": 11.879, "eval_steps_per_second": 11.879, "step": 7850 }, { "epoch": 0.3652861863808242, "grad_norm": 10.915748596191406, "learning_rate": 7.5199520169252425e-06, "loss": 0.702227783203125, "memory(GiB)": 36.53, "step": 7855, "token_acc": 0.8179930906788287, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.3655187046407738, "grad_norm": 6.974138259887695, "learning_rate": 7.5166303946217765e-06, "loss": 0.7374119281768798, "memory(GiB)": 36.53, "step": 7860, "token_acc": 0.8254620123203286, "train_speed(iter/s)": 0.095953 }, { "epoch": 0.3657512229007234, "grad_norm": 6.728903293609619, "learning_rate": 7.513307284157059e-06, "loss": 0.8151761054992676, "memory(GiB)": 36.53, "step": 7865, "token_acc": 0.803594351732991, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.365983741160673, "grad_norm": 6.212688446044922, "learning_rate": 7.509982687496147e-06, "loss": 0.6229990482330322, "memory(GiB)": 36.53, "step": 7870, "token_acc": 0.8375451263537906, "train_speed(iter/s)": 0.096022 }, { "epoch": 0.36621625942062264, "grad_norm": 6.141170501708984, "learning_rate": 7.506656606604977e-06, "loss": 0.6170244216918945, "memory(GiB)": 36.53, "step": 7875, "token_acc": 0.8408247422680413, "train_speed(iter/s)": 0.096056 }, { "epoch": 0.3664487776805722, "grad_norm": 11.874704360961914, "learning_rate": 7.503329043450365e-06, "loss": 0.7883286476135254, "memory(GiB)": 36.53, "step": 7880, "token_acc": 0.8000757288905718, "train_speed(iter/s)": 0.096091 }, { "epoch": 0.36668129594052185, "grad_norm": 6.312220096588135, "learning_rate": 7.500000000000001e-06, "loss": 0.7715856075286865, "memory(GiB)": 36.53, "step": 7885, "token_acc": 0.8031995170540296, "train_speed(iter/s)": 0.096125 }, { "epoch": 0.3669138142004714, "grad_norm": 6.496086597442627, "learning_rate": 7.496669478222451e-06, "loss": 0.7942769527435303, "memory(GiB)": 36.53, "step": 7890, "token_acc": 0.8106926698049765, "train_speed(iter/s)": 0.096159 }, { "epoch": 0.36714633246042105, "grad_norm": 7.234003067016602, "learning_rate": 7.493337480087154e-06, "loss": 0.6148253917694092, "memory(GiB)": 36.53, "step": 7895, "token_acc": 0.8405434393993565, "train_speed(iter/s)": 0.096194 }, { "epoch": 0.36737885072037063, "grad_norm": 4.3299784660339355, "learning_rate": 7.490004007564426e-06, "loss": 0.6379117965698242, "memory(GiB)": 36.53, "step": 7900, "token_acc": 0.8427672955974843, "train_speed(iter/s)": 0.096228 }, { "epoch": 0.36737885072037063, "eval_loss": 0.631164014339447, "eval_runtime": 292.1888, "eval_samples_per_second": 11.893, "eval_steps_per_second": 11.893, "step": 7900 }, { "epoch": 0.36761136898032026, "grad_norm": 7.93583345413208, "learning_rate": 7.4866690626254504e-06, "loss": 0.7415062427520752, "memory(GiB)": 36.53, "step": 7905, "token_acc": 0.8179509025615709, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.36784388724026984, "grad_norm": 6.743467330932617, "learning_rate": 7.483332647242283e-06, "loss": 0.6835087299346924, "memory(GiB)": 36.53, "step": 7910, "token_acc": 0.8301818181818181, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.3680764055002194, "grad_norm": 5.77875280380249, "learning_rate": 7.47999476338785e-06, "loss": 0.7879226207733154, "memory(GiB)": 36.53, "step": 7915, "token_acc": 0.8026151930261519, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.36830892376016905, "grad_norm": 6.8071112632751465, "learning_rate": 7.4766554130359446e-06, "loss": 0.7319873809814453, "memory(GiB)": 36.53, "step": 7920, "token_acc": 0.815610454708199, "train_speed(iter/s)": 0.096022 }, { "epoch": 0.3685414420201186, "grad_norm": 7.000846862792969, "learning_rate": 7.47331459816123e-06, "loss": 0.6864665508270263, "memory(GiB)": 36.53, "step": 7925, "token_acc": 0.8173270651443922, "train_speed(iter/s)": 0.096055 }, { "epoch": 0.36877396028006826, "grad_norm": 3.548530101776123, "learning_rate": 7.46997232073923e-06, "loss": 0.8144044876098633, "memory(GiB)": 36.53, "step": 7930, "token_acc": 0.7679455445544554, "train_speed(iter/s)": 0.096088 }, { "epoch": 0.36900647854001783, "grad_norm": 6.005768775939941, "learning_rate": 7.466628582746339e-06, "loss": 0.699169111251831, "memory(GiB)": 36.53, "step": 7935, "token_acc": 0.823170731707317, "train_speed(iter/s)": 0.096121 }, { "epoch": 0.36923899679996747, "grad_norm": 7.976009368896484, "learning_rate": 7.4632833861598096e-06, "loss": 0.6825541496276856, "memory(GiB)": 36.53, "step": 7940, "token_acc": 0.81710615280595, "train_speed(iter/s)": 0.096155 }, { "epoch": 0.36947151505991704, "grad_norm": 9.505805969238281, "learning_rate": 7.459936732957762e-06, "loss": 0.7014679908752441, "memory(GiB)": 36.53, "step": 7945, "token_acc": 0.8235664776307499, "train_speed(iter/s)": 0.096189 }, { "epoch": 0.3697040333198667, "grad_norm": 6.265750885009766, "learning_rate": 7.456588625119176e-06, "loss": 0.5671255111694335, "memory(GiB)": 36.53, "step": 7950, "token_acc": 0.8687304075235109, "train_speed(iter/s)": 0.096223 }, { "epoch": 0.3697040333198667, "eval_loss": 0.6289050579071045, "eval_runtime": 293.2764, "eval_samples_per_second": 11.849, "eval_steps_per_second": 11.849, "step": 7950 }, { "epoch": 0.36993655157981625, "grad_norm": 7.854147911071777, "learning_rate": 7.453239064623891e-06, "loss": 0.602921724319458, "memory(GiB)": 36.53, "step": 7955, "token_acc": 0.8187371239388522, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.37016906983976583, "grad_norm": 6.244097709655762, "learning_rate": 7.449888053452602e-06, "loss": 0.718879508972168, "memory(GiB)": 36.53, "step": 7960, "token_acc": 0.8064516129032258, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.37040158809971546, "grad_norm": 8.784631729125977, "learning_rate": 7.44653559358687e-06, "loss": 0.8118132591247559, "memory(GiB)": 36.53, "step": 7965, "token_acc": 0.8013925729442971, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.37063410635966504, "grad_norm": 5.966042518615723, "learning_rate": 7.443181687009107e-06, "loss": 0.5720973014831543, "memory(GiB)": 36.53, "step": 7970, "token_acc": 0.8402120408936009, "train_speed(iter/s)": 0.096017 }, { "epoch": 0.37086662461961467, "grad_norm": 4.4571003913879395, "learning_rate": 7.439826335702579e-06, "loss": 0.7941804885864258, "memory(GiB)": 36.53, "step": 7975, "token_acc": 0.8104440789473685, "train_speed(iter/s)": 0.096051 }, { "epoch": 0.37109914287956425, "grad_norm": 6.233658313751221, "learning_rate": 7.43646954165141e-06, "loss": 0.61534104347229, "memory(GiB)": 36.53, "step": 7980, "token_acc": 0.854672704816797, "train_speed(iter/s)": 0.096085 }, { "epoch": 0.3713316611395139, "grad_norm": 5.3049492835998535, "learning_rate": 7.433111306840578e-06, "loss": 0.7509316444396973, "memory(GiB)": 36.53, "step": 7985, "token_acc": 0.8122314885013899, "train_speed(iter/s)": 0.096118 }, { "epoch": 0.37156417939946346, "grad_norm": 6.574211120605469, "learning_rate": 7.429751633255908e-06, "loss": 0.6532687187194824, "memory(GiB)": 36.53, "step": 7990, "token_acc": 0.8419689119170984, "train_speed(iter/s)": 0.096152 }, { "epoch": 0.3717966976594131, "grad_norm": 7.246621131896973, "learning_rate": 7.426390522884081e-06, "loss": 0.6897338390350342, "memory(GiB)": 36.53, "step": 7995, "token_acc": 0.8131868131868132, "train_speed(iter/s)": 0.096187 }, { "epoch": 0.37202921591936267, "grad_norm": 7.290582656860352, "learning_rate": 7.423027977712625e-06, "loss": 0.6523595809936523, "memory(GiB)": 36.53, "step": 8000, "token_acc": 0.8341408870667164, "train_speed(iter/s)": 0.096221 }, { "epoch": 0.37202921591936267, "eval_loss": 0.6307212710380554, "eval_runtime": 292.7948, "eval_samples_per_second": 11.868, "eval_steps_per_second": 11.868, "step": 8000 }, { "epoch": 0.37226173417931224, "grad_norm": 6.008643627166748, "learning_rate": 7.419663999729914e-06, "loss": 0.6222623348236084, "memory(GiB)": 36.53, "step": 8005, "token_acc": 0.8188679547309952, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.3724942524392619, "grad_norm": 6.651423454284668, "learning_rate": 7.4162985909251775e-06, "loss": 0.5432338237762451, "memory(GiB)": 36.53, "step": 8010, "token_acc": 0.8626045400238949, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.37272677069921145, "grad_norm": 5.912210941314697, "learning_rate": 7.412931753288479e-06, "loss": 0.6824829578399658, "memory(GiB)": 36.53, "step": 8015, "token_acc": 0.8359683794466403, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.3729592889591611, "grad_norm": 7.724246025085449, "learning_rate": 7.409563488810739e-06, "loss": 0.6164308071136475, "memory(GiB)": 36.53, "step": 8020, "token_acc": 0.8528896672504378, "train_speed(iter/s)": 0.096017 }, { "epoch": 0.37319180721911066, "grad_norm": 7.594185829162598, "learning_rate": 7.406193799483714e-06, "loss": 0.6749426841735839, "memory(GiB)": 36.53, "step": 8025, "token_acc": 0.8529159519725558, "train_speed(iter/s)": 0.096051 }, { "epoch": 0.3734243254790603, "grad_norm": 6.529979705810547, "learning_rate": 7.402822687300005e-06, "loss": 0.6923796653747558, "memory(GiB)": 36.53, "step": 8030, "token_acc": 0.826313957535677, "train_speed(iter/s)": 0.096085 }, { "epoch": 0.37365684373900987, "grad_norm": 6.124407768249512, "learning_rate": 7.399450154253055e-06, "loss": 0.845398998260498, "memory(GiB)": 36.53, "step": 8035, "token_acc": 0.7821681864235056, "train_speed(iter/s)": 0.096119 }, { "epoch": 0.3738893619989595, "grad_norm": 7.790156364440918, "learning_rate": 7.396076202337148e-06, "loss": 0.7758492469787598, "memory(GiB)": 36.53, "step": 8040, "token_acc": 0.8095768374164811, "train_speed(iter/s)": 0.096153 }, { "epoch": 0.3741218802589091, "grad_norm": 6.442034721374512, "learning_rate": 7.392700833547404e-06, "loss": 0.6660655498504638, "memory(GiB)": 36.53, "step": 8045, "token_acc": 0.8396190476190476, "train_speed(iter/s)": 0.096186 }, { "epoch": 0.3743543985188587, "grad_norm": 5.838046073913574, "learning_rate": 7.389324049879784e-06, "loss": 0.6552052974700928, "memory(GiB)": 36.53, "step": 8050, "token_acc": 0.8436286621955524, "train_speed(iter/s)": 0.096218 }, { "epoch": 0.3743543985188587, "eval_loss": 0.6274383068084717, "eval_runtime": 295.4935, "eval_samples_per_second": 11.76, "eval_steps_per_second": 11.76, "step": 8050 }, { "epoch": 0.3745869167788083, "grad_norm": 7.194561958312988, "learning_rate": 7.385945853331087e-06, "loss": 0.9055326461791993, "memory(GiB)": 36.53, "step": 8055, "token_acc": 0.8169802095273211, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.37481943503875786, "grad_norm": 5.424001693725586, "learning_rate": 7.382566245898939e-06, "loss": 0.6388668060302735, "memory(GiB)": 36.53, "step": 8060, "token_acc": 0.8429508196721311, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.3750519532987075, "grad_norm": 6.293416976928711, "learning_rate": 7.379185229581811e-06, "loss": 0.6606010437011719, "memory(GiB)": 36.53, "step": 8065, "token_acc": 0.8308625336927223, "train_speed(iter/s)": 0.09598 }, { "epoch": 0.3752844715586571, "grad_norm": 6.584990501403809, "learning_rate": 7.375802806379001e-06, "loss": 0.7164745330810547, "memory(GiB)": 36.53, "step": 8070, "token_acc": 0.8291956305858987, "train_speed(iter/s)": 0.096013 }, { "epoch": 0.3755169898186067, "grad_norm": 5.697750091552734, "learning_rate": 7.37241897829064e-06, "loss": 0.6898795604705811, "memory(GiB)": 36.53, "step": 8075, "token_acc": 0.8306896551724138, "train_speed(iter/s)": 0.096046 }, { "epoch": 0.3757495080785563, "grad_norm": 7.506107807159424, "learning_rate": 7.369033747317689e-06, "loss": 0.7770956039428711, "memory(GiB)": 36.53, "step": 8080, "token_acc": 0.8004734528238079, "train_speed(iter/s)": 0.096079 }, { "epoch": 0.3759820263385059, "grad_norm": 8.35728931427002, "learning_rate": 7.3656471154619414e-06, "loss": 0.7625340938568115, "memory(GiB)": 36.53, "step": 8085, "token_acc": 0.801779359430605, "train_speed(iter/s)": 0.096112 }, { "epoch": 0.3762145445984555, "grad_norm": 6.635538101196289, "learning_rate": 7.362259084726016e-06, "loss": 0.6521346092224121, "memory(GiB)": 36.53, "step": 8090, "token_acc": 0.8464827050136028, "train_speed(iter/s)": 0.096144 }, { "epoch": 0.3764470628584051, "grad_norm": 4.508878231048584, "learning_rate": 7.358869657113361e-06, "loss": 0.7203670978546143, "memory(GiB)": 36.53, "step": 8095, "token_acc": 0.8256578947368421, "train_speed(iter/s)": 0.096176 }, { "epoch": 0.3766795811183547, "grad_norm": 5.5791521072387695, "learning_rate": 7.355478834628248e-06, "loss": 0.7203432559967041, "memory(GiB)": 36.53, "step": 8100, "token_acc": 0.8210862619808307, "train_speed(iter/s)": 0.096208 }, { "epoch": 0.3766795811183547, "eval_loss": 0.6275553703308105, "eval_runtime": 296.3077, "eval_samples_per_second": 11.728, "eval_steps_per_second": 11.728, "step": 8100 }, { "epoch": 0.3769120993783043, "grad_norm": 5.211892604827881, "learning_rate": 7.352086619275778e-06, "loss": 0.6877071380615234, "memory(GiB)": 36.53, "step": 8105, "token_acc": 0.8190062240829664, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.3771446176382539, "grad_norm": 8.251355171203613, "learning_rate": 7.348693013061869e-06, "loss": 0.780007791519165, "memory(GiB)": 36.53, "step": 8110, "token_acc": 0.8195804195804196, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.3773771358982035, "grad_norm": 6.018730163574219, "learning_rate": 7.345298017993268e-06, "loss": 0.744899320602417, "memory(GiB)": 36.53, "step": 8115, "token_acc": 0.8276515151515151, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.3776096541581531, "grad_norm": 7.462071895599365, "learning_rate": 7.341901636077538e-06, "loss": 0.6954497337341309, "memory(GiB)": 36.53, "step": 8120, "token_acc": 0.8262008733624454, "train_speed(iter/s)": 0.096003 }, { "epoch": 0.3778421724181027, "grad_norm": 7.142657279968262, "learning_rate": 7.338503869323066e-06, "loss": 0.6354021072387696, "memory(GiB)": 36.53, "step": 8125, "token_acc": 0.8485804416403786, "train_speed(iter/s)": 0.096035 }, { "epoch": 0.3780746906780523, "grad_norm": 6.712642669677734, "learning_rate": 7.335104719739057e-06, "loss": 0.7543253898620605, "memory(GiB)": 36.53, "step": 8130, "token_acc": 0.8300211416490486, "train_speed(iter/s)": 0.096068 }, { "epoch": 0.3783072089380019, "grad_norm": 6.749781131744385, "learning_rate": 7.331704189335532e-06, "loss": 0.6483595848083497, "memory(GiB)": 36.53, "step": 8135, "token_acc": 0.8318812520167796, "train_speed(iter/s)": 0.096102 }, { "epoch": 0.37853972719795154, "grad_norm": 7.32687520980835, "learning_rate": 7.328302280123329e-06, "loss": 0.7506073951721192, "memory(GiB)": 36.53, "step": 8140, "token_acc": 0.8157894736842105, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.3787722454579011, "grad_norm": 8.008889198303223, "learning_rate": 7.324898994114105e-06, "loss": 0.7339870452880859, "memory(GiB)": 36.53, "step": 8145, "token_acc": 0.8172205438066465, "train_speed(iter/s)": 0.096168 }, { "epoch": 0.3790047637178507, "grad_norm": 5.6713972091674805, "learning_rate": 7.321494333320324e-06, "loss": 0.7103267669677734, "memory(GiB)": 36.53, "step": 8150, "token_acc": 0.8242909987669543, "train_speed(iter/s)": 0.096201 }, { "epoch": 0.3790047637178507, "eval_loss": 0.6278688907623291, "eval_runtime": 289.0966, "eval_samples_per_second": 12.02, "eval_steps_per_second": 12.02, "step": 8150 }, { "epoch": 0.3792372819778003, "grad_norm": 6.218601703643799, "learning_rate": 7.318088299755269e-06, "loss": 0.713615608215332, "memory(GiB)": 36.53, "step": 8155, "token_acc": 0.8185400083117548, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.3794698002377499, "grad_norm": 7.3420538902282715, "learning_rate": 7.314680895433033e-06, "loss": 0.6775365829467773, "memory(GiB)": 36.53, "step": 8160, "token_acc": 0.8393574297188755, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.37970231849769953, "grad_norm": 4.985617637634277, "learning_rate": 7.311272122368518e-06, "loss": 0.6389101505279541, "memory(GiB)": 36.53, "step": 8165, "token_acc": 0.851161369193154, "train_speed(iter/s)": 0.095975 }, { "epoch": 0.3799348367576491, "grad_norm": 7.136925220489502, "learning_rate": 7.30786198257744e-06, "loss": 0.7170490264892578, "memory(GiB)": 36.53, "step": 8170, "token_acc": 0.8263403263403264, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.38016735501759874, "grad_norm": 7.366158962249756, "learning_rate": 7.304450478076316e-06, "loss": 0.6841135025024414, "memory(GiB)": 36.53, "step": 8175, "token_acc": 0.8237410071942446, "train_speed(iter/s)": 0.096042 }, { "epoch": 0.3803998732775483, "grad_norm": 8.012834548950195, "learning_rate": 7.301037610882475e-06, "loss": 0.7514279842376709, "memory(GiB)": 36.53, "step": 8180, "token_acc": 0.8040621266427718, "train_speed(iter/s)": 0.096076 }, { "epoch": 0.38063239153749795, "grad_norm": 7.065184116363525, "learning_rate": 7.297623383014054e-06, "loss": 0.7410264015197754, "memory(GiB)": 36.53, "step": 8185, "token_acc": 0.829802513464991, "train_speed(iter/s)": 0.096109 }, { "epoch": 0.3808649097974475, "grad_norm": 6.61446475982666, "learning_rate": 7.2942077964899885e-06, "loss": 0.8626726150512696, "memory(GiB)": 36.53, "step": 8190, "token_acc": 0.7982254354255669, "train_speed(iter/s)": 0.096141 }, { "epoch": 0.38109742805739716, "grad_norm": 9.516047477722168, "learning_rate": 7.29079085333002e-06, "loss": 0.7314748764038086, "memory(GiB)": 36.53, "step": 8195, "token_acc": 0.845380564863571, "train_speed(iter/s)": 0.096174 }, { "epoch": 0.38132994631734674, "grad_norm": 6.8897318840026855, "learning_rate": 7.287372555554692e-06, "loss": 0.6762599468231201, "memory(GiB)": 36.53, "step": 8200, "token_acc": 0.8329145728643216, "train_speed(iter/s)": 0.096207 }, { "epoch": 0.38132994631734674, "eval_loss": 0.6261598467826843, "eval_runtime": 290.8816, "eval_samples_per_second": 11.946, "eval_steps_per_second": 11.946, "step": 8200 }, { "epoch": 0.3815624645772963, "grad_norm": 6.636117458343506, "learning_rate": 7.283952905185352e-06, "loss": 0.7347008228302002, "memory(GiB)": 36.53, "step": 8205, "token_acc": 0.8183973182217256, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.38179498283724594, "grad_norm": 7.701080322265625, "learning_rate": 7.280531904244143e-06, "loss": 0.7064132213592529, "memory(GiB)": 36.53, "step": 8210, "token_acc": 0.8228217280349982, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.3820275010971955, "grad_norm": 6.672091960906982, "learning_rate": 7.277109554754009e-06, "loss": 0.691359281539917, "memory(GiB)": 36.53, "step": 8215, "token_acc": 0.8353398058252427, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.38226001935714515, "grad_norm": 7.158807277679443, "learning_rate": 7.27368585873869e-06, "loss": 0.656017541885376, "memory(GiB)": 36.53, "step": 8220, "token_acc": 0.8263909117390038, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.38249253761709473, "grad_norm": 6.932867527008057, "learning_rate": 7.270260818222725e-06, "loss": 0.6323503971099853, "memory(GiB)": 36.53, "step": 8225, "token_acc": 0.8400509121764955, "train_speed(iter/s)": 0.096042 }, { "epoch": 0.38272505587704436, "grad_norm": 6.20552396774292, "learning_rate": 7.266834435231446e-06, "loss": 0.724024772644043, "memory(GiB)": 36.53, "step": 8230, "token_acc": 0.8196887686062246, "train_speed(iter/s)": 0.096074 }, { "epoch": 0.38295757413699394, "grad_norm": 6.925886154174805, "learning_rate": 7.263406711790978e-06, "loss": 0.6515414237976074, "memory(GiB)": 36.53, "step": 8235, "token_acc": 0.8424753867791842, "train_speed(iter/s)": 0.096108 }, { "epoch": 0.38319009239694357, "grad_norm": 8.428985595703125, "learning_rate": 7.2599776499282385e-06, "loss": 0.8670886039733887, "memory(GiB)": 36.53, "step": 8240, "token_acc": 0.7900763358778626, "train_speed(iter/s)": 0.09614 }, { "epoch": 0.38342261065689315, "grad_norm": 7.204659461975098, "learning_rate": 7.25654725167094e-06, "loss": 0.7875346183776856, "memory(GiB)": 36.53, "step": 8245, "token_acc": 0.8029728020240354, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.3836551289168427, "grad_norm": 8.074034690856934, "learning_rate": 7.253115519047582e-06, "loss": 0.6254090785980224, "memory(GiB)": 36.53, "step": 8250, "token_acc": 0.8534579439252337, "train_speed(iter/s)": 0.096205 }, { "epoch": 0.3836551289168427, "eval_loss": 0.6289153695106506, "eval_runtime": 292.7668, "eval_samples_per_second": 11.87, "eval_steps_per_second": 11.87, "step": 8250 }, { "epoch": 0.38388764717679236, "grad_norm": 6.093391418457031, "learning_rate": 7.249682454087455e-06, "loss": 0.6864508628845215, "memory(GiB)": 36.53, "step": 8255, "token_acc": 0.8191242209155144, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.38412016543674193, "grad_norm": 7.866343975067139, "learning_rate": 7.246248058820633e-06, "loss": 0.7101897716522216, "memory(GiB)": 36.53, "step": 8260, "token_acc": 0.8233898305084746, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.38435268369669157, "grad_norm": 6.162656307220459, "learning_rate": 7.242812335277983e-06, "loss": 0.6610394477844238, "memory(GiB)": 36.53, "step": 8265, "token_acc": 0.835983785469286, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.38458520195664114, "grad_norm": 6.9368367195129395, "learning_rate": 7.239375285491155e-06, "loss": 0.7751856327056885, "memory(GiB)": 36.53, "step": 8270, "token_acc": 0.8140916808149405, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.3848177202165908, "grad_norm": 7.226904392242432, "learning_rate": 7.23593691149258e-06, "loss": 0.6185824394226074, "memory(GiB)": 36.53, "step": 8275, "token_acc": 0.8502024291497976, "train_speed(iter/s)": 0.096041 }, { "epoch": 0.38505023847654035, "grad_norm": 7.314187049865723, "learning_rate": 7.232497215315475e-06, "loss": 0.7929863929748535, "memory(GiB)": 36.53, "step": 8280, "token_acc": 0.8059055118110237, "train_speed(iter/s)": 0.096074 }, { "epoch": 0.38528275673649, "grad_norm": 6.342879772186279, "learning_rate": 7.229056198993841e-06, "loss": 0.641834306716919, "memory(GiB)": 36.53, "step": 8285, "token_acc": 0.8361940298507463, "train_speed(iter/s)": 0.096107 }, { "epoch": 0.38551527499643956, "grad_norm": 8.272603034973145, "learning_rate": 7.225613864562456e-06, "loss": 0.715467882156372, "memory(GiB)": 36.53, "step": 8290, "token_acc": 0.8329686360320934, "train_speed(iter/s)": 0.096139 }, { "epoch": 0.3857477932563892, "grad_norm": 8.183306694030762, "learning_rate": 7.222170214056878e-06, "loss": 0.7441752433776856, "memory(GiB)": 36.53, "step": 8295, "token_acc": 0.8247078464106845, "train_speed(iter/s)": 0.096171 }, { "epoch": 0.38598031151633877, "grad_norm": 6.751532077789307, "learning_rate": 7.218725249513444e-06, "loss": 0.7253667831420898, "memory(GiB)": 36.53, "step": 8300, "token_acc": 0.8165555945282357, "train_speed(iter/s)": 0.096204 }, { "epoch": 0.38598031151633877, "eval_loss": 0.6258677840232849, "eval_runtime": 290.9616, "eval_samples_per_second": 11.943, "eval_steps_per_second": 11.943, "step": 8300 }, { "epoch": 0.38621282977628835, "grad_norm": 6.383945465087891, "learning_rate": 7.215278972969267e-06, "loss": 0.6705090999603271, "memory(GiB)": 36.53, "step": 8305, "token_acc": 0.8193674339710201, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.386445348036238, "grad_norm": 6.654679298400879, "learning_rate": 7.211831386462239e-06, "loss": 0.7232837677001953, "memory(GiB)": 36.53, "step": 8310, "token_acc": 0.8157894736842105, "train_speed(iter/s)": 0.095946 }, { "epoch": 0.38667786629618756, "grad_norm": 7.329780578613281, "learning_rate": 7.208382492031024e-06, "loss": 0.681901216506958, "memory(GiB)": 36.53, "step": 8315, "token_acc": 0.8424460431654677, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.3869103845561372, "grad_norm": 5.073947429656982, "learning_rate": 7.204932291715059e-06, "loss": 0.7335203647613525, "memory(GiB)": 36.53, "step": 8320, "token_acc": 0.8240954580446497, "train_speed(iter/s)": 0.096011 }, { "epoch": 0.38714290281608676, "grad_norm": 8.371402740478516, "learning_rate": 7.201480787554551e-06, "loss": 0.7167182922363281, "memory(GiB)": 36.53, "step": 8325, "token_acc": 0.8266949152542373, "train_speed(iter/s)": 0.096043 }, { "epoch": 0.3873754210760364, "grad_norm": 6.6527180671691895, "learning_rate": 7.198027981590487e-06, "loss": 0.6592539310455322, "memory(GiB)": 36.53, "step": 8330, "token_acc": 0.8371653543307087, "train_speed(iter/s)": 0.096076 }, { "epoch": 0.387607939335986, "grad_norm": 5.508927822113037, "learning_rate": 7.194573875864615e-06, "loss": 0.7026764869689941, "memory(GiB)": 36.53, "step": 8335, "token_acc": 0.8083706238483811, "train_speed(iter/s)": 0.096108 }, { "epoch": 0.3878404575959356, "grad_norm": 7.605576038360596, "learning_rate": 7.1911184724194504e-06, "loss": 0.603968620300293, "memory(GiB)": 36.53, "step": 8340, "token_acc": 0.851528384279476, "train_speed(iter/s)": 0.096141 }, { "epoch": 0.3880729758558852, "grad_norm": 6.188586711883545, "learning_rate": 7.187661773298287e-06, "loss": 0.6946659088134766, "memory(GiB)": 36.53, "step": 8345, "token_acc": 0.834151979196764, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.38830549411583476, "grad_norm": 7.38683557510376, "learning_rate": 7.184203780545173e-06, "loss": 0.6234054565429688, "memory(GiB)": 36.53, "step": 8350, "token_acc": 0.8553169734151329, "train_speed(iter/s)": 0.096205 }, { "epoch": 0.38830549411583476, "eval_loss": 0.6242329478263855, "eval_runtime": 291.5772, "eval_samples_per_second": 11.918, "eval_steps_per_second": 11.918, "step": 8350 }, { "epoch": 0.3885380123757844, "grad_norm": 7.963403224945068, "learning_rate": 7.180744496204928e-06, "loss": 0.8015275955200195, "memory(GiB)": 36.53, "step": 8355, "token_acc": 0.8184321983010544, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.38877053063573397, "grad_norm": 9.448433876037598, "learning_rate": 7.177283922323132e-06, "loss": 0.8290170669555664, "memory(GiB)": 36.53, "step": 8360, "token_acc": 0.7876712328767124, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.3890030488956836, "grad_norm": 6.719177722930908, "learning_rate": 7.173822060946131e-06, "loss": 0.8085485458374023, "memory(GiB)": 36.53, "step": 8365, "token_acc": 0.7998301245753114, "train_speed(iter/s)": 0.095977 }, { "epoch": 0.3892355671556332, "grad_norm": 8.454413414001465, "learning_rate": 7.170358914121031e-06, "loss": 0.6984954833984375, "memory(GiB)": 36.53, "step": 8370, "token_acc": 0.8300073909830007, "train_speed(iter/s)": 0.09601 }, { "epoch": 0.3894680854155828, "grad_norm": 6.720226764678955, "learning_rate": 7.166894483895695e-06, "loss": 0.6794505596160889, "memory(GiB)": 36.53, "step": 8375, "token_acc": 0.8185955786736021, "train_speed(iter/s)": 0.096041 }, { "epoch": 0.3897006036755324, "grad_norm": 5.379835605621338, "learning_rate": 7.163428772318749e-06, "loss": 0.7329598903656006, "memory(GiB)": 36.53, "step": 8380, "token_acc": 0.8256791720569211, "train_speed(iter/s)": 0.096072 }, { "epoch": 0.389933121935482, "grad_norm": 5.505593776702881, "learning_rate": 7.1599617814395764e-06, "loss": 0.6776937007904053, "memory(GiB)": 36.53, "step": 8385, "token_acc": 0.8378680581438688, "train_speed(iter/s)": 0.096103 }, { "epoch": 0.3901656401954316, "grad_norm": 6.174607276916504, "learning_rate": 7.1564935133083146e-06, "loss": 0.6524590492248535, "memory(GiB)": 36.53, "step": 8390, "token_acc": 0.8298722044728435, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.3903981584553812, "grad_norm": 8.53377914428711, "learning_rate": 7.153023969975858e-06, "loss": 0.7605315685272217, "memory(GiB)": 36.53, "step": 8395, "token_acc": 0.8234060402684564, "train_speed(iter/s)": 0.096167 }, { "epoch": 0.3906306767153308, "grad_norm": 5.26388692855835, "learning_rate": 7.149553153493853e-06, "loss": 0.6725038051605224, "memory(GiB)": 36.53, "step": 8400, "token_acc": 0.8365276211950394, "train_speed(iter/s)": 0.096199 }, { "epoch": 0.3906306767153308, "eval_loss": 0.6255432963371277, "eval_runtime": 290.2824, "eval_samples_per_second": 11.971, "eval_steps_per_second": 11.971, "step": 8400 }, { "epoch": 0.3908631949752804, "grad_norm": 7.3965044021606445, "learning_rate": 7.1460810659147036e-06, "loss": 0.6825075626373291, "memory(GiB)": 36.53, "step": 8405, "token_acc": 0.8196063733073893, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.39109571323523, "grad_norm": 5.887383937835693, "learning_rate": 7.142607709291561e-06, "loss": 0.5826900959014892, "memory(GiB)": 36.53, "step": 8410, "token_acc": 0.8551888195063931, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.3913282314951796, "grad_norm": 6.264004707336426, "learning_rate": 7.139133085678329e-06, "loss": 0.6516207218170166, "memory(GiB)": 36.53, "step": 8415, "token_acc": 0.8345011678345011, "train_speed(iter/s)": 0.095975 }, { "epoch": 0.3915607497551292, "grad_norm": 7.808225154876709, "learning_rate": 7.135657197129658e-06, "loss": 0.7851691246032715, "memory(GiB)": 36.53, "step": 8420, "token_acc": 0.7870065789473685, "train_speed(iter/s)": 0.096008 }, { "epoch": 0.3917932680150788, "grad_norm": 7.2982563972473145, "learning_rate": 7.132180045700948e-06, "loss": 0.6950534343719482, "memory(GiB)": 36.53, "step": 8425, "token_acc": 0.8203883495145631, "train_speed(iter/s)": 0.09604 }, { "epoch": 0.39202578627502843, "grad_norm": 6.782866477966309, "learning_rate": 7.128701633448349e-06, "loss": 0.7969116687774658, "memory(GiB)": 36.53, "step": 8430, "token_acc": 0.7997275204359673, "train_speed(iter/s)": 0.096072 }, { "epoch": 0.392258304534978, "grad_norm": 6.884278297424316, "learning_rate": 7.125221962428751e-06, "loss": 0.7231873035430908, "memory(GiB)": 36.53, "step": 8435, "token_acc": 0.8266978922716628, "train_speed(iter/s)": 0.096104 }, { "epoch": 0.39249082279492764, "grad_norm": 6.458530902862549, "learning_rate": 7.121741034699791e-06, "loss": 0.685096549987793, "memory(GiB)": 36.53, "step": 8440, "token_acc": 0.8211323476379373, "train_speed(iter/s)": 0.096136 }, { "epoch": 0.3927233410548772, "grad_norm": 6.654186725616455, "learning_rate": 7.118258852319849e-06, "loss": 0.7162202358245849, "memory(GiB)": 36.53, "step": 8445, "token_acc": 0.8270702853166318, "train_speed(iter/s)": 0.096167 }, { "epoch": 0.3929558593148268, "grad_norm": 6.901993274688721, "learning_rate": 7.11477541734805e-06, "loss": 0.7216857433319092, "memory(GiB)": 36.53, "step": 8450, "token_acc": 0.8303501945525292, "train_speed(iter/s)": 0.0962 }, { "epoch": 0.3929558593148268, "eval_loss": 0.6227236986160278, "eval_runtime": 293.0838, "eval_samples_per_second": 11.857, "eval_steps_per_second": 11.857, "step": 8450 }, { "epoch": 0.3931883775747764, "grad_norm": 5.063840866088867, "learning_rate": 7.1112907318442525e-06, "loss": 0.8187254905700684, "memory(GiB)": 36.53, "step": 8455, "token_acc": 0.8182777610258526, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.393420895834726, "grad_norm": 6.54954195022583, "learning_rate": 7.107804797869061e-06, "loss": 0.6639408111572266, "memory(GiB)": 36.53, "step": 8460, "token_acc": 0.8401349072512647, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.39365341409467564, "grad_norm": 7.679318904876709, "learning_rate": 7.104317617483815e-06, "loss": 0.6447121143341065, "memory(GiB)": 36.53, "step": 8465, "token_acc": 0.8449675324675324, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.3938859323546252, "grad_norm": 6.481260776519775, "learning_rate": 7.100829192750592e-06, "loss": 0.6538030624389648, "memory(GiB)": 36.53, "step": 8470, "token_acc": 0.8317723342939481, "train_speed(iter/s)": 0.096003 }, { "epoch": 0.39411845061457484, "grad_norm": 4.740011692047119, "learning_rate": 7.097339525732207e-06, "loss": 0.6647510528564453, "memory(GiB)": 36.53, "step": 8475, "token_acc": 0.8279646017699115, "train_speed(iter/s)": 0.096034 }, { "epoch": 0.3943509688745244, "grad_norm": 8.062383651733398, "learning_rate": 7.0938486184922055e-06, "loss": 0.7419010162353515, "memory(GiB)": 36.53, "step": 8480, "token_acc": 0.8148614609571788, "train_speed(iter/s)": 0.096066 }, { "epoch": 0.39458348713447405, "grad_norm": 6.529599189758301, "learning_rate": 7.090356473094868e-06, "loss": 0.6296727180480957, "memory(GiB)": 36.53, "step": 8485, "token_acc": 0.8369491525423729, "train_speed(iter/s)": 0.096098 }, { "epoch": 0.39481600539442363, "grad_norm": 6.894472122192383, "learning_rate": 7.086863091605212e-06, "loss": 0.7523578643798828, "memory(GiB)": 36.53, "step": 8490, "token_acc": 0.8076321551454488, "train_speed(iter/s)": 0.09613 }, { "epoch": 0.3950485236543732, "grad_norm": 7.225290298461914, "learning_rate": 7.083368476088978e-06, "loss": 0.658946943283081, "memory(GiB)": 36.53, "step": 8495, "token_acc": 0.8323272971160295, "train_speed(iter/s)": 0.096161 }, { "epoch": 0.39528104191432284, "grad_norm": 5.5117926597595215, "learning_rate": 7.07987262861264e-06, "loss": 0.6642012119293212, "memory(GiB)": 36.53, "step": 8500, "token_acc": 0.8328358208955224, "train_speed(iter/s)": 0.096193 }, { "epoch": 0.39528104191432284, "eval_loss": 0.6217544674873352, "eval_runtime": 291.8932, "eval_samples_per_second": 11.905, "eval_steps_per_second": 11.905, "step": 8500 }, { "epoch": 0.3955135601742724, "grad_norm": 8.180193901062012, "learning_rate": 7.076375551243404e-06, "loss": 0.8467119216918946, "memory(GiB)": 36.53, "step": 8505, "token_acc": 0.8187991678821644, "train_speed(iter/s)": 0.095906 }, { "epoch": 0.39574607843422205, "grad_norm": 9.231237411499023, "learning_rate": 7.072877246049197e-06, "loss": 0.7153133869171142, "memory(GiB)": 36.53, "step": 8510, "token_acc": 0.8269018743109151, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.3959785966941716, "grad_norm": 8.131089210510254, "learning_rate": 7.069377715098675e-06, "loss": 0.5359804630279541, "memory(GiB)": 36.53, "step": 8515, "token_acc": 0.8640939597315436, "train_speed(iter/s)": 0.09597 }, { "epoch": 0.39621111495412126, "grad_norm": 7.11887264251709, "learning_rate": 7.065876960461219e-06, "loss": 0.7117724418640137, "memory(GiB)": 36.53, "step": 8520, "token_acc": 0.8228682170542636, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.39644363321407083, "grad_norm": 5.5428667068481445, "learning_rate": 7.062374984206935e-06, "loss": 0.806981086730957, "memory(GiB)": 36.53, "step": 8525, "token_acc": 0.807551766138855, "train_speed(iter/s)": 0.096033 }, { "epoch": 0.39667615147402047, "grad_norm": 5.641619682312012, "learning_rate": 7.058871788406647e-06, "loss": 0.7675019264221191, "memory(GiB)": 36.53, "step": 8530, "token_acc": 0.8024366150806718, "train_speed(iter/s)": 0.096065 }, { "epoch": 0.39690866973397004, "grad_norm": 6.307408332824707, "learning_rate": 7.055367375131904e-06, "loss": 0.6659773349761963, "memory(GiB)": 36.53, "step": 8535, "token_acc": 0.8338645418326693, "train_speed(iter/s)": 0.096097 }, { "epoch": 0.3971411879939196, "grad_norm": 6.317528247833252, "learning_rate": 7.051861746454973e-06, "loss": 0.6953274250030518, "memory(GiB)": 36.53, "step": 8540, "token_acc": 0.8302925989672978, "train_speed(iter/s)": 0.096129 }, { "epoch": 0.39737370625386925, "grad_norm": 7.031256675720215, "learning_rate": 7.048354904448843e-06, "loss": 0.7554344177246094, "memory(GiB)": 36.53, "step": 8545, "token_acc": 0.7995824634655533, "train_speed(iter/s)": 0.09616 }, { "epoch": 0.39760622451381883, "grad_norm": 9.937764167785645, "learning_rate": 7.044846851187216e-06, "loss": 0.7292638778686523, "memory(GiB)": 36.53, "step": 8550, "token_acc": 0.8106201262532492, "train_speed(iter/s)": 0.096192 }, { "epoch": 0.39760622451381883, "eval_loss": 0.6203534007072449, "eval_runtime": 294.7863, "eval_samples_per_second": 11.788, "eval_steps_per_second": 11.788, "step": 8550 }, { "epoch": 0.39783874277376846, "grad_norm": 7.463900566101074, "learning_rate": 7.0413375887445125e-06, "loss": 0.7390836238861084, "memory(GiB)": 36.53, "step": 8555, "token_acc": 0.8197846190800272, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.39807126103371804, "grad_norm": 5.609832286834717, "learning_rate": 7.037827119195867e-06, "loss": 0.6625056743621827, "memory(GiB)": 36.53, "step": 8560, "token_acc": 0.8261359369132557, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.39830377929366767, "grad_norm": 5.434301376342773, "learning_rate": 7.034315444617129e-06, "loss": 0.5975072860717774, "memory(GiB)": 36.53, "step": 8565, "token_acc": 0.848990953375087, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.39853629755361725, "grad_norm": 6.72133731842041, "learning_rate": 7.03080256708486e-06, "loss": 0.65772385597229, "memory(GiB)": 36.53, "step": 8570, "token_acc": 0.8415803605677024, "train_speed(iter/s)": 0.095998 }, { "epoch": 0.3987688158135669, "grad_norm": 6.663748741149902, "learning_rate": 7.027288488676335e-06, "loss": 0.7755102634429931, "memory(GiB)": 36.53, "step": 8575, "token_acc": 0.7967137944210928, "train_speed(iter/s)": 0.096029 }, { "epoch": 0.39900133407351646, "grad_norm": 8.37984561920166, "learning_rate": 7.023773211469535e-06, "loss": 0.7156404495239258, "memory(GiB)": 36.53, "step": 8580, "token_acc": 0.8284989122552574, "train_speed(iter/s)": 0.096061 }, { "epoch": 0.3992338523334661, "grad_norm": 6.5237908363342285, "learning_rate": 7.020256737543149e-06, "loss": 0.6762457847595215, "memory(GiB)": 36.53, "step": 8585, "token_acc": 0.8411397345823576, "train_speed(iter/s)": 0.096092 }, { "epoch": 0.39946637059341567, "grad_norm": 5.718166828155518, "learning_rate": 7.016739068976583e-06, "loss": 0.7685590744018554, "memory(GiB)": 36.53, "step": 8590, "token_acc": 0.8261183261183261, "train_speed(iter/s)": 0.096122 }, { "epoch": 0.39969888885336524, "grad_norm": 5.6232008934021, "learning_rate": 7.01322020784994e-06, "loss": 0.6538674354553222, "memory(GiB)": 36.53, "step": 8595, "token_acc": 0.8406889128094726, "train_speed(iter/s)": 0.096154 }, { "epoch": 0.3999314071133149, "grad_norm": 8.361418724060059, "learning_rate": 7.00970015624403e-06, "loss": 0.7033608913421631, "memory(GiB)": 36.53, "step": 8600, "token_acc": 0.8283450704225352, "train_speed(iter/s)": 0.096185 }, { "epoch": 0.3999314071133149, "eval_loss": 0.6218823194503784, "eval_runtime": 292.7751, "eval_samples_per_second": 11.869, "eval_steps_per_second": 11.869, "step": 8600 }, { "epoch": 0.40016392537326445, "grad_norm": 7.39115047454834, "learning_rate": 7.0061789162403694e-06, "loss": 0.740311861038208, "memory(GiB)": 36.53, "step": 8605, "token_acc": 0.8190235284670361, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.4003964436332141, "grad_norm": 7.320059299468994, "learning_rate": 7.002656489921177e-06, "loss": 0.6501819133758545, "memory(GiB)": 36.53, "step": 8610, "token_acc": 0.8309228650137741, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.40062896189316366, "grad_norm": 5.359230995178223, "learning_rate": 6.99913287936937e-06, "loss": 0.6049031257629395, "memory(GiB)": 36.53, "step": 8615, "token_acc": 0.8495334370139969, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.4008614801531133, "grad_norm": 7.4673237800598145, "learning_rate": 6.99560808666857e-06, "loss": 0.7724496841430664, "memory(GiB)": 36.53, "step": 8620, "token_acc": 0.8127490039840638, "train_speed(iter/s)": 0.095996 }, { "epoch": 0.40109399841306287, "grad_norm": 7.398331165313721, "learning_rate": 6.992082113903096e-06, "loss": 0.6779186248779296, "memory(GiB)": 36.53, "step": 8625, "token_acc": 0.8232695139911634, "train_speed(iter/s)": 0.096027 }, { "epoch": 0.4013265166730125, "grad_norm": 6.002816677093506, "learning_rate": 6.988554963157962e-06, "loss": 0.8541918754577636, "memory(GiB)": 36.53, "step": 8630, "token_acc": 0.7821246819338422, "train_speed(iter/s)": 0.096057 }, { "epoch": 0.4015590349329621, "grad_norm": 5.535634994506836, "learning_rate": 6.985026636518884e-06, "loss": 0.7069508552551269, "memory(GiB)": 36.53, "step": 8635, "token_acc": 0.8242740134028295, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.40179155319291165, "grad_norm": 9.848203659057617, "learning_rate": 6.9814971360722695e-06, "loss": 0.7899524211883545, "memory(GiB)": 36.53, "step": 8640, "token_acc": 0.8144616607071911, "train_speed(iter/s)": 0.09612 }, { "epoch": 0.4020240714528613, "grad_norm": 4.755890369415283, "learning_rate": 6.977966463905219e-06, "loss": 0.691878080368042, "memory(GiB)": 36.53, "step": 8645, "token_acc": 0.835621387283237, "train_speed(iter/s)": 0.096151 }, { "epoch": 0.40225658971281086, "grad_norm": 7.151211261749268, "learning_rate": 6.974434622105531e-06, "loss": 0.7025826930999756, "memory(GiB)": 36.53, "step": 8650, "token_acc": 0.8290136789056876, "train_speed(iter/s)": 0.096183 }, { "epoch": 0.40225658971281086, "eval_loss": 0.6199798583984375, "eval_runtime": 293.9287, "eval_samples_per_second": 11.823, "eval_steps_per_second": 11.823, "step": 8650 }, { "epoch": 0.4024891079727605, "grad_norm": 5.387206077575684, "learning_rate": 6.970901612761693e-06, "loss": 0.714640760421753, "memory(GiB)": 36.53, "step": 8655, "token_acc": 0.8199699723681899, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.4027216262327101, "grad_norm": 10.001330375671387, "learning_rate": 6.967367437962879e-06, "loss": 0.7553164005279541, "memory(GiB)": 36.53, "step": 8660, "token_acc": 0.8316412859560067, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.4029541444926597, "grad_norm": 4.902223587036133, "learning_rate": 6.963832099798957e-06, "loss": 0.7956992626190186, "memory(GiB)": 36.53, "step": 8665, "token_acc": 0.8115117014547755, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.4031866627526093, "grad_norm": 8.351801872253418, "learning_rate": 6.960295600360484e-06, "loss": 0.7541163921356201, "memory(GiB)": 36.53, "step": 8670, "token_acc": 0.8208117443868739, "train_speed(iter/s)": 0.095994 }, { "epoch": 0.4034191810125589, "grad_norm": 5.438088893890381, "learning_rate": 6.956757941738699e-06, "loss": 0.6900172710418702, "memory(GiB)": 36.53, "step": 8675, "token_acc": 0.8305369127516778, "train_speed(iter/s)": 0.096026 }, { "epoch": 0.4036516992725085, "grad_norm": 6.124330043792725, "learning_rate": 6.953219126025529e-06, "loss": 0.7927371978759765, "memory(GiB)": 36.53, "step": 8680, "token_acc": 0.8033503277494537, "train_speed(iter/s)": 0.096057 }, { "epoch": 0.4038842175324581, "grad_norm": 8.318258285522461, "learning_rate": 6.949679155313585e-06, "loss": 0.7160331726074218, "memory(GiB)": 36.53, "step": 8685, "token_acc": 0.8290909090909091, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.4041167357924077, "grad_norm": 5.619810581207275, "learning_rate": 6.946138031696161e-06, "loss": 0.6082026958465576, "memory(GiB)": 36.53, "step": 8690, "token_acc": 0.8539733763614361, "train_speed(iter/s)": 0.09612 }, { "epoch": 0.4043492540523573, "grad_norm": 6.684939861297607, "learning_rate": 6.942595757267234e-06, "loss": 0.6005841255187988, "memory(GiB)": 36.53, "step": 8695, "token_acc": 0.8429752066115702, "train_speed(iter/s)": 0.09615 }, { "epoch": 0.4045817723123069, "grad_norm": 6.462775707244873, "learning_rate": 6.939052334121458e-06, "loss": 0.6406507968902588, "memory(GiB)": 36.53, "step": 8700, "token_acc": 0.8348567946374162, "train_speed(iter/s)": 0.09618 }, { "epoch": 0.4045817723123069, "eval_loss": 0.6194509267807007, "eval_runtime": 291.2015, "eval_samples_per_second": 11.933, "eval_steps_per_second": 11.933, "step": 8700 }, { "epoch": 0.4048142905722565, "grad_norm": 7.093710422515869, "learning_rate": 6.9355077643541704e-06, "loss": 0.7701839923858642, "memory(GiB)": 36.53, "step": 8705, "token_acc": 0.8190435191607076, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.4050468088322061, "grad_norm": 9.275561332702637, "learning_rate": 6.931962050061384e-06, "loss": 0.7341086864471436, "memory(GiB)": 36.53, "step": 8710, "token_acc": 0.8329238329238329, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.4052793270921557, "grad_norm": 6.438332557678223, "learning_rate": 6.928415193339789e-06, "loss": 0.6380467891693116, "memory(GiB)": 36.53, "step": 8715, "token_acc": 0.8350694444444444, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.4055118453521053, "grad_norm": 6.661995887756348, "learning_rate": 6.924867196286753e-06, "loss": 0.6209036350250244, "memory(GiB)": 36.53, "step": 8720, "token_acc": 0.8397932816537468, "train_speed(iter/s)": 0.095995 }, { "epoch": 0.4057443636120549, "grad_norm": 7.798610687255859, "learning_rate": 6.921318061000313e-06, "loss": 0.6994572639465332, "memory(GiB)": 36.53, "step": 8725, "token_acc": 0.834107498341075, "train_speed(iter/s)": 0.096025 }, { "epoch": 0.40597688187200454, "grad_norm": 8.071057319641113, "learning_rate": 6.917767789579184e-06, "loss": 0.6899877548217773, "memory(GiB)": 36.53, "step": 8730, "token_acc": 0.819412347309343, "train_speed(iter/s)": 0.096055 }, { "epoch": 0.4062094001319541, "grad_norm": 9.451443672180176, "learning_rate": 6.914216384122752e-06, "loss": 0.6809048652648926, "memory(GiB)": 36.53, "step": 8735, "token_acc": 0.839418924224578, "train_speed(iter/s)": 0.096087 }, { "epoch": 0.4064419183919037, "grad_norm": 8.285967826843262, "learning_rate": 6.910663846731072e-06, "loss": 0.6227863311767579, "memory(GiB)": 36.53, "step": 8740, "token_acc": 0.8486526393503138, "train_speed(iter/s)": 0.096117 }, { "epoch": 0.4066744366518533, "grad_norm": 6.145111083984375, "learning_rate": 6.9071101795048665e-06, "loss": 0.8297652244567871, "memory(GiB)": 36.53, "step": 8745, "token_acc": 0.8068181818181818, "train_speed(iter/s)": 0.096148 }, { "epoch": 0.4069069549118029, "grad_norm": 7.013164043426514, "learning_rate": 6.903555384545533e-06, "loss": 0.665109920501709, "memory(GiB)": 36.53, "step": 8750, "token_acc": 0.8276580958999306, "train_speed(iter/s)": 0.096179 }, { "epoch": 0.4069069549118029, "eval_loss": 0.6221299767494202, "eval_runtime": 292.4864, "eval_samples_per_second": 11.881, "eval_steps_per_second": 11.881, "step": 8750 }, { "epoch": 0.40713947317175253, "grad_norm": 7.368844032287598, "learning_rate": 6.899999463955129e-06, "loss": 0.771725845336914, "memory(GiB)": 36.53, "step": 8755, "token_acc": 0.8200054410446805, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.4073719914317021, "grad_norm": 8.730157852172852, "learning_rate": 6.896442419836381e-06, "loss": 0.7538277626037597, "memory(GiB)": 36.53, "step": 8760, "token_acc": 0.8295061340044039, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.40760450969165174, "grad_norm": 5.8263630867004395, "learning_rate": 6.892884254292677e-06, "loss": 0.7268019676208496, "memory(GiB)": 36.53, "step": 8765, "token_acc": 0.8250991705733862, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.4078370279516013, "grad_norm": 7.256455421447754, "learning_rate": 6.889324969428074e-06, "loss": 0.7247865676879883, "memory(GiB)": 36.53, "step": 8770, "token_acc": 0.8247863247863247, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.40806954621155095, "grad_norm": 5.477489471435547, "learning_rate": 6.885764567347284e-06, "loss": 0.6486649990081788, "memory(GiB)": 36.53, "step": 8775, "token_acc": 0.8216751726208346, "train_speed(iter/s)": 0.096022 }, { "epoch": 0.4083020644715005, "grad_norm": 8.233301162719727, "learning_rate": 6.882203050155684e-06, "loss": 0.6443092346191406, "memory(GiB)": 36.53, "step": 8780, "token_acc": 0.8434589800443459, "train_speed(iter/s)": 0.096053 }, { "epoch": 0.4085345827314501, "grad_norm": 9.864056587219238, "learning_rate": 6.878640419959306e-06, "loss": 0.8140290260314942, "memory(GiB)": 36.53, "step": 8785, "token_acc": 0.8026589115081014, "train_speed(iter/s)": 0.096083 }, { "epoch": 0.40876710099139973, "grad_norm": 7.310309886932373, "learning_rate": 6.875076678864847e-06, "loss": 0.6930231094360352, "memory(GiB)": 36.53, "step": 8790, "token_acc": 0.832723644828733, "train_speed(iter/s)": 0.096114 }, { "epoch": 0.4089996192513493, "grad_norm": 7.788050174713135, "learning_rate": 6.8715118289796575e-06, "loss": 0.6872176170349121, "memory(GiB)": 36.53, "step": 8795, "token_acc": 0.8275215598050244, "train_speed(iter/s)": 0.096144 }, { "epoch": 0.40923213751129894, "grad_norm": 6.276396751403809, "learning_rate": 6.867945872411741e-06, "loss": 0.6261724948883056, "memory(GiB)": 36.53, "step": 8800, "token_acc": 0.8405044510385756, "train_speed(iter/s)": 0.096175 }, { "epoch": 0.40923213751129894, "eval_loss": 0.6192914247512817, "eval_runtime": 291.4338, "eval_samples_per_second": 11.924, "eval_steps_per_second": 11.924, "step": 8800 }, { "epoch": 0.4094646557712485, "grad_norm": 6.184392929077148, "learning_rate": 6.8643788112697565e-06, "loss": 0.5885149002075195, "memory(GiB)": 36.53, "step": 8805, "token_acc": 0.8207722865661143, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.40969717403119815, "grad_norm": 7.027329444885254, "learning_rate": 6.860810647663021e-06, "loss": 0.6702795028686523, "memory(GiB)": 36.53, "step": 8810, "token_acc": 0.8245682888540031, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.40992969229114773, "grad_norm": 7.6430983543396, "learning_rate": 6.857241383701498e-06, "loss": 0.7755990028381348, "memory(GiB)": 36.53, "step": 8815, "token_acc": 0.8186856690419636, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.41016221055109736, "grad_norm": 7.117034435272217, "learning_rate": 6.853671021495804e-06, "loss": 0.7607792854309082, "memory(GiB)": 36.53, "step": 8820, "token_acc": 0.8068219088225648, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.41039472881104694, "grad_norm": 5.775667190551758, "learning_rate": 6.850099563157202e-06, "loss": 0.6519227981567383, "memory(GiB)": 36.53, "step": 8825, "token_acc": 0.8289615522817104, "train_speed(iter/s)": 0.096022 }, { "epoch": 0.41062724707099657, "grad_norm": 7.273473739624023, "learning_rate": 6.84652701079761e-06, "loss": 0.7388500690460205, "memory(GiB)": 36.53, "step": 8830, "token_acc": 0.8183768323203432, "train_speed(iter/s)": 0.096052 }, { "epoch": 0.41085976533094615, "grad_norm": 5.887622833251953, "learning_rate": 6.842953366529584e-06, "loss": 0.5621285438537598, "memory(GiB)": 36.53, "step": 8835, "token_acc": 0.8547868061142397, "train_speed(iter/s)": 0.096083 }, { "epoch": 0.4110922835908957, "grad_norm": 7.271026611328125, "learning_rate": 6.839378632466334e-06, "loss": 0.692125940322876, "memory(GiB)": 36.53, "step": 8840, "token_acc": 0.8310536044362292, "train_speed(iter/s)": 0.096114 }, { "epoch": 0.41132480185084536, "grad_norm": 8.459503173828125, "learning_rate": 6.8358028107217065e-06, "loss": 0.7182388305664062, "memory(GiB)": 36.53, "step": 8845, "token_acc": 0.8347509113001215, "train_speed(iter/s)": 0.096145 }, { "epoch": 0.41155732011079493, "grad_norm": 7.945958137512207, "learning_rate": 6.8322259034102e-06, "loss": 0.7506031036376953, "memory(GiB)": 36.53, "step": 8850, "token_acc": 0.819006309148265, "train_speed(iter/s)": 0.096175 }, { "epoch": 0.41155732011079493, "eval_loss": 0.6183744668960571, "eval_runtime": 293.1435, "eval_samples_per_second": 11.854, "eval_steps_per_second": 11.854, "step": 8850 }, { "epoch": 0.41178983837074457, "grad_norm": 7.761203289031982, "learning_rate": 6.828647912646947e-06, "loss": 0.6559175014495849, "memory(GiB)": 36.53, "step": 8855, "token_acc": 0.8202479338842975, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.41202235663069414, "grad_norm": 6.704031944274902, "learning_rate": 6.825068840547726e-06, "loss": 0.7581852912902832, "memory(GiB)": 36.53, "step": 8860, "token_acc": 0.8106565176022835, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.4122548748906438, "grad_norm": 6.423466205596924, "learning_rate": 6.82148868922895e-06, "loss": 0.7496987342834472, "memory(GiB)": 36.53, "step": 8865, "token_acc": 0.816932208684786, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.41248739315059335, "grad_norm": 5.8876953125, "learning_rate": 6.8179074608076755e-06, "loss": 0.6948210716247558, "memory(GiB)": 36.53, "step": 8870, "token_acc": 0.8260869565217391, "train_speed(iter/s)": 0.095991 }, { "epoch": 0.412719911410543, "grad_norm": 7.565408706665039, "learning_rate": 6.8143251574015925e-06, "loss": 0.6806984901428222, "memory(GiB)": 36.53, "step": 8875, "token_acc": 0.8140625, "train_speed(iter/s)": 0.096021 }, { "epoch": 0.41295242967049256, "grad_norm": 9.712871551513672, "learning_rate": 6.810741781129027e-06, "loss": 0.7311966419219971, "memory(GiB)": 36.53, "step": 8880, "token_acc": 0.8348817567567568, "train_speed(iter/s)": 0.096051 }, { "epoch": 0.41318494793044214, "grad_norm": 6.433829307556152, "learning_rate": 6.807157334108941e-06, "loss": 0.7440563678741455, "memory(GiB)": 36.53, "step": 8885, "token_acc": 0.8127053669222344, "train_speed(iter/s)": 0.09608 }, { "epoch": 0.41341746619039177, "grad_norm": 4.980173587799072, "learning_rate": 6.803571818460929e-06, "loss": 0.772585391998291, "memory(GiB)": 36.53, "step": 8890, "token_acc": 0.8182640144665461, "train_speed(iter/s)": 0.09611 }, { "epoch": 0.41364998445034135, "grad_norm": 7.035939693450928, "learning_rate": 6.799985236305217e-06, "loss": 0.6235956192016602, "memory(GiB)": 36.53, "step": 8895, "token_acc": 0.8394070413835701, "train_speed(iter/s)": 0.096141 }, { "epoch": 0.413882502710291, "grad_norm": 8.031867027282715, "learning_rate": 6.796397589762661e-06, "loss": 0.6124022006988525, "memory(GiB)": 36.53, "step": 8900, "token_acc": 0.846942650968477, "train_speed(iter/s)": 0.096172 }, { "epoch": 0.413882502710291, "eval_loss": 0.6191074848175049, "eval_runtime": 295.2805, "eval_samples_per_second": 11.768, "eval_steps_per_second": 11.768, "step": 8900 }, { "epoch": 0.41411502097024055, "grad_norm": 7.602524280548096, "learning_rate": 6.792808880954746e-06, "loss": 0.6715277671813965, "memory(GiB)": 36.53, "step": 8905, "token_acc": 0.8206718594736674, "train_speed(iter/s)": 0.095895 }, { "epoch": 0.4143475392301902, "grad_norm": 6.761972427368164, "learning_rate": 6.789219112003589e-06, "loss": 0.77059907913208, "memory(GiB)": 36.53, "step": 8910, "token_acc": 0.8076407506702413, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.41458005749013976, "grad_norm": 6.945693016052246, "learning_rate": 6.78562828503193e-06, "loss": 0.6703316688537597, "memory(GiB)": 36.53, "step": 8915, "token_acc": 0.8321584424303458, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.4148125757500894, "grad_norm": 6.604681968688965, "learning_rate": 6.782036402163136e-06, "loss": 0.6581210613250732, "memory(GiB)": 36.53, "step": 8920, "token_acc": 0.8386841062227507, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.415045094010039, "grad_norm": 6.909808158874512, "learning_rate": 6.778443465521196e-06, "loss": 0.7041978359222412, "memory(GiB)": 36.53, "step": 8925, "token_acc": 0.8179746011071313, "train_speed(iter/s)": 0.096015 }, { "epoch": 0.4152776122699886, "grad_norm": 8.918962478637695, "learning_rate": 6.77484947723073e-06, "loss": 0.6340020179748536, "memory(GiB)": 36.53, "step": 8930, "token_acc": 0.8308794269797055, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.4155101305299382, "grad_norm": 8.383973121643066, "learning_rate": 6.7712544394169675e-06, "loss": 0.6914999961853028, "memory(GiB)": 36.53, "step": 8935, "token_acc": 0.8301647655259823, "train_speed(iter/s)": 0.096075 }, { "epoch": 0.41574264878988776, "grad_norm": 7.2458038330078125, "learning_rate": 6.7676583542057705e-06, "loss": 0.701669979095459, "memory(GiB)": 36.53, "step": 8940, "token_acc": 0.826133909287257, "train_speed(iter/s)": 0.096106 }, { "epoch": 0.4159751670498374, "grad_norm": 7.525779724121094, "learning_rate": 6.764061223723612e-06, "loss": 0.8554682731628418, "memory(GiB)": 36.53, "step": 8945, "token_acc": 0.7920922570016474, "train_speed(iter/s)": 0.096135 }, { "epoch": 0.41620768530978697, "grad_norm": 6.9160566329956055, "learning_rate": 6.760463050097588e-06, "loss": 0.7539933681488037, "memory(GiB)": 36.53, "step": 8950, "token_acc": 0.8097359210944957, "train_speed(iter/s)": 0.096165 }, { "epoch": 0.41620768530978697, "eval_loss": 0.6194281578063965, "eval_runtime": 293.896, "eval_samples_per_second": 11.824, "eval_steps_per_second": 11.824, "step": 8950 }, { "epoch": 0.4164402035697366, "grad_norm": 6.317086696624756, "learning_rate": 6.75686383545541e-06, "loss": 0.6532005786895752, "memory(GiB)": 36.53, "step": 8955, "token_acc": 0.8211216897351789, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.4166727218296862, "grad_norm": 7.509308815002441, "learning_rate": 6.753263581925403e-06, "loss": 0.7630683898925781, "memory(GiB)": 36.53, "step": 8960, "token_acc": 0.8083182640144665, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.4169052400896358, "grad_norm": 8.196243286132812, "learning_rate": 6.7496622916365085e-06, "loss": 0.6224451541900635, "memory(GiB)": 36.53, "step": 8965, "token_acc": 0.8581512605042017, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.4171377583495854, "grad_norm": 7.792494297027588, "learning_rate": 6.746059966718282e-06, "loss": 0.7187991619110108, "memory(GiB)": 36.53, "step": 8970, "token_acc": 0.8277800080289041, "train_speed(iter/s)": 0.09598 }, { "epoch": 0.417370276609535, "grad_norm": 5.681851387023926, "learning_rate": 6.742456609300888e-06, "loss": 0.7289433479309082, "memory(GiB)": 36.53, "step": 8975, "token_acc": 0.8181182231549493, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.4176027948694846, "grad_norm": 12.690560340881348, "learning_rate": 6.738852221515104e-06, "loss": 0.6835853099822998, "memory(GiB)": 36.53, "step": 8980, "token_acc": 0.8417593528816987, "train_speed(iter/s)": 0.09604 }, { "epoch": 0.41783531312943417, "grad_norm": 6.0064215660095215, "learning_rate": 6.735246805492316e-06, "loss": 0.7906692028045654, "memory(GiB)": 36.53, "step": 8985, "token_acc": 0.8182476466328747, "train_speed(iter/s)": 0.09607 }, { "epoch": 0.4180678313893838, "grad_norm": 6.358526706695557, "learning_rate": 6.731640363364516e-06, "loss": 0.7464702606201172, "memory(GiB)": 36.53, "step": 8990, "token_acc": 0.8180026281208935, "train_speed(iter/s)": 0.0961 }, { "epoch": 0.4183003496493334, "grad_norm": 9.47359561920166, "learning_rate": 6.728032897264307e-06, "loss": 0.7647457122802734, "memory(GiB)": 36.53, "step": 8995, "token_acc": 0.8158473954512105, "train_speed(iter/s)": 0.09613 }, { "epoch": 0.418532867909283, "grad_norm": 5.881313323974609, "learning_rate": 6.724424409324893e-06, "loss": 0.7446362495422363, "memory(GiB)": 36.53, "step": 9000, "token_acc": 0.80649436713055, "train_speed(iter/s)": 0.09616 }, { "epoch": 0.418532867909283, "eval_loss": 0.6171696782112122, "eval_runtime": 297.804, "eval_samples_per_second": 11.669, "eval_steps_per_second": 11.669, "step": 9000 }, { "epoch": 0.4187653861692326, "grad_norm": 10.644405364990234, "learning_rate": 6.720814901680086e-06, "loss": 0.627127742767334, "memory(GiB)": 36.53, "step": 9005, "token_acc": 0.82036708016111, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.4189979044291822, "grad_norm": 5.751759052276611, "learning_rate": 6.717204376464297e-06, "loss": 0.6266797542572021, "memory(GiB)": 36.53, "step": 9010, "token_acc": 0.8363697705802969, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.4192304226891318, "grad_norm": 7.086824893951416, "learning_rate": 6.713592835812543e-06, "loss": 0.6686243057250977, "memory(GiB)": 36.53, "step": 9015, "token_acc": 0.8332337118947998, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.41946294094908143, "grad_norm": 6.7311530113220215, "learning_rate": 6.70998028186044e-06, "loss": 0.6335064888000488, "memory(GiB)": 36.53, "step": 9020, "token_acc": 0.8478342749529191, "train_speed(iter/s)": 0.095972 }, { "epoch": 0.419695459209031, "grad_norm": 6.627511024475098, "learning_rate": 6.706366716744201e-06, "loss": 0.7964089870452881, "memory(GiB)": 36.53, "step": 9025, "token_acc": 0.8168709444844989, "train_speed(iter/s)": 0.096002 }, { "epoch": 0.4199279774689806, "grad_norm": 6.411988258361816, "learning_rate": 6.702752142600639e-06, "loss": 0.691087007522583, "memory(GiB)": 36.53, "step": 9030, "token_acc": 0.8316246056782335, "train_speed(iter/s)": 0.096031 }, { "epoch": 0.4201604957289302, "grad_norm": 5.192274570465088, "learning_rate": 6.699136561567165e-06, "loss": 0.6185484886169433, "memory(GiB)": 36.53, "step": 9035, "token_acc": 0.8502076014053018, "train_speed(iter/s)": 0.09606 }, { "epoch": 0.4203930139888798, "grad_norm": 9.895514488220215, "learning_rate": 6.695519975781782e-06, "loss": 0.8439302444458008, "memory(GiB)": 36.53, "step": 9040, "token_acc": 0.8051863857374392, "train_speed(iter/s)": 0.09609 }, { "epoch": 0.4206255322488294, "grad_norm": 7.7914228439331055, "learning_rate": 6.6919023873830864e-06, "loss": 0.7071369647979736, "memory(GiB)": 36.53, "step": 9045, "token_acc": 0.8248201438848921, "train_speed(iter/s)": 0.096119 }, { "epoch": 0.420858050508779, "grad_norm": 7.353464603424072, "learning_rate": 6.688283798510275e-06, "loss": 0.7173079967498779, "memory(GiB)": 36.53, "step": 9050, "token_acc": 0.8235796668090559, "train_speed(iter/s)": 0.096149 }, { "epoch": 0.420858050508779, "eval_loss": 0.6170545220375061, "eval_runtime": 295.4839, "eval_samples_per_second": 11.76, "eval_steps_per_second": 11.76, "step": 9050 }, { "epoch": 0.42109056876872863, "grad_norm": 6.842942237854004, "learning_rate": 6.684664211303129e-06, "loss": 0.6946396827697754, "memory(GiB)": 36.53, "step": 9055, "token_acc": 0.8207173862646191, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.4213230870286782, "grad_norm": 8.142498016357422, "learning_rate": 6.6810436279020215e-06, "loss": 0.7505970954895019, "memory(GiB)": 36.53, "step": 9060, "token_acc": 0.8142857142857143, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.42155560528862784, "grad_norm": 8.180495262145996, "learning_rate": 6.677422050447915e-06, "loss": 0.699577522277832, "memory(GiB)": 36.53, "step": 9065, "token_acc": 0.8255624388653408, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.4217881235485774, "grad_norm": 9.911946296691895, "learning_rate": 6.673799481082362e-06, "loss": 0.6910494327545166, "memory(GiB)": 36.53, "step": 9070, "token_acc": 0.8324185876326381, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.42202064180852705, "grad_norm": 7.65514612197876, "learning_rate": 6.670175921947497e-06, "loss": 0.6632385730743409, "memory(GiB)": 36.53, "step": 9075, "token_acc": 0.8326345213137666, "train_speed(iter/s)": 0.095996 }, { "epoch": 0.42225316006847663, "grad_norm": 9.681537628173828, "learning_rate": 6.666551375186043e-06, "loss": 0.7314382553100586, "memory(GiB)": 36.53, "step": 9080, "token_acc": 0.8132972555083108, "train_speed(iter/s)": 0.096026 }, { "epoch": 0.4224856783284262, "grad_norm": 6.742770195007324, "learning_rate": 6.662925842941308e-06, "loss": 0.6615162849426269, "memory(GiB)": 36.53, "step": 9085, "token_acc": 0.8341759352881699, "train_speed(iter/s)": 0.096056 }, { "epoch": 0.42271819658837584, "grad_norm": 7.636063575744629, "learning_rate": 6.659299327357181e-06, "loss": 0.6485334873199463, "memory(GiB)": 36.53, "step": 9090, "token_acc": 0.8355714712026261, "train_speed(iter/s)": 0.096085 }, { "epoch": 0.4229507148483254, "grad_norm": 7.497421741485596, "learning_rate": 6.655671830578131e-06, "loss": 0.6972911357879639, "memory(GiB)": 36.53, "step": 9095, "token_acc": 0.8263677811550152, "train_speed(iter/s)": 0.096114 }, { "epoch": 0.42318323310827505, "grad_norm": 8.554731369018555, "learning_rate": 6.6520433547492095e-06, "loss": 0.7575594902038574, "memory(GiB)": 36.53, "step": 9100, "token_acc": 0.8071625344352618, "train_speed(iter/s)": 0.096143 }, { "epoch": 0.42318323310827505, "eval_loss": 0.6152464747428894, "eval_runtime": 294.8661, "eval_samples_per_second": 11.785, "eval_steps_per_second": 11.785, "step": 9100 }, { "epoch": 0.4234157513682246, "grad_norm": 5.9736738204956055, "learning_rate": 6.648413902016047e-06, "loss": 0.7907323360443115, "memory(GiB)": 36.53, "step": 9105, "token_acc": 0.8208651317899057, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.42364826962817426, "grad_norm": 6.825490951538086, "learning_rate": 6.644783474524848e-06, "loss": 0.8026031494140625, "memory(GiB)": 36.53, "step": 9110, "token_acc": 0.8043887147335423, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.42388078788812383, "grad_norm": 7.025318622589111, "learning_rate": 6.641152074422401e-06, "loss": 0.6262815475463868, "memory(GiB)": 36.53, "step": 9115, "token_acc": 0.8432312799704906, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.42411330614807347, "grad_norm": 7.372581481933594, "learning_rate": 6.6375197038560636e-06, "loss": 0.6927440643310547, "memory(GiB)": 36.53, "step": 9120, "token_acc": 0.8218195545488863, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.42434582440802304, "grad_norm": 7.317347526550293, "learning_rate": 6.633886364973767e-06, "loss": 0.7257655143737793, "memory(GiB)": 36.53, "step": 9125, "token_acc": 0.8196777511141584, "train_speed(iter/s)": 0.09599 }, { "epoch": 0.4245783426679726, "grad_norm": 7.706878185272217, "learning_rate": 6.630252059924016e-06, "loss": 0.6121723651885986, "memory(GiB)": 36.53, "step": 9130, "token_acc": 0.8564425770308123, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.42481086092792225, "grad_norm": 7.673420429229736, "learning_rate": 6.626616790855891e-06, "loss": 0.7046977519989014, "memory(GiB)": 36.53, "step": 9135, "token_acc": 0.8263234227701233, "train_speed(iter/s)": 0.096048 }, { "epoch": 0.42504337918787183, "grad_norm": 7.619455814361572, "learning_rate": 6.622980559919037e-06, "loss": 0.7457279682159423, "memory(GiB)": 36.53, "step": 9140, "token_acc": 0.8259125551544324, "train_speed(iter/s)": 0.096077 }, { "epoch": 0.42527589744782146, "grad_norm": 5.783131122589111, "learning_rate": 6.619343369263667e-06, "loss": 0.6203608989715577, "memory(GiB)": 36.53, "step": 9145, "token_acc": 0.8454415954415955, "train_speed(iter/s)": 0.096107 }, { "epoch": 0.42550841570777104, "grad_norm": 6.525480270385742, "learning_rate": 6.615705221040568e-06, "loss": 0.7471608638763427, "memory(GiB)": 36.53, "step": 9150, "token_acc": 0.8127881955118352, "train_speed(iter/s)": 0.096136 }, { "epoch": 0.42550841570777104, "eval_loss": 0.6166175007820129, "eval_runtime": 291.944, "eval_samples_per_second": 11.903, "eval_steps_per_second": 11.903, "step": 9150 }, { "epoch": 0.42574093396772067, "grad_norm": 6.973150253295898, "learning_rate": 6.612066117401088e-06, "loss": 0.6104370594024658, "memory(GiB)": 36.53, "step": 9155, "token_acc": 0.8215181052918232, "train_speed(iter/s)": 0.095871 }, { "epoch": 0.42597345222767025, "grad_norm": 8.265861511230469, "learning_rate": 6.608426060497141e-06, "loss": 0.6862789154052734, "memory(GiB)": 36.53, "step": 9160, "token_acc": 0.8292282430213465, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.4262059704876199, "grad_norm": 8.376321792602539, "learning_rate": 6.604785052481205e-06, "loss": 0.7466615200042724, "memory(GiB)": 36.53, "step": 9165, "token_acc": 0.8263552225650066, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.42643848874756946, "grad_norm": 8.683837890625, "learning_rate": 6.60114309550632e-06, "loss": 0.5966279983520508, "memory(GiB)": 36.53, "step": 9170, "token_acc": 0.8487690504103166, "train_speed(iter/s)": 0.095958 }, { "epoch": 0.42667100700751903, "grad_norm": 6.879874229431152, "learning_rate": 6.59750019172609e-06, "loss": 0.6662249088287353, "memory(GiB)": 36.53, "step": 9175, "token_acc": 0.8512064343163539, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.42690352526746866, "grad_norm": 7.139492034912109, "learning_rate": 6.593856343294674e-06, "loss": 0.6416230201721191, "memory(GiB)": 36.53, "step": 9180, "token_acc": 0.8420463032113518, "train_speed(iter/s)": 0.096016 }, { "epoch": 0.42713604352741824, "grad_norm": 8.100975036621094, "learning_rate": 6.590211552366792e-06, "loss": 0.681126880645752, "memory(GiB)": 36.53, "step": 9185, "token_acc": 0.8341675008341675, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.4273685617873679, "grad_norm": 5.829564094543457, "learning_rate": 6.586565821097722e-06, "loss": 0.67999267578125, "memory(GiB)": 36.53, "step": 9190, "token_acc": 0.8355945730247406, "train_speed(iter/s)": 0.096074 }, { "epoch": 0.42760108004731745, "grad_norm": 8.35464096069336, "learning_rate": 6.5829191516432985e-06, "loss": 0.7434378147125245, "memory(GiB)": 36.53, "step": 9195, "token_acc": 0.8255695341720504, "train_speed(iter/s)": 0.096103 }, { "epoch": 0.4278335983072671, "grad_norm": 6.569089889526367, "learning_rate": 6.57927154615991e-06, "loss": 0.7322329521179199, "memory(GiB)": 36.53, "step": 9200, "token_acc": 0.82040953340047, "train_speed(iter/s)": 0.096132 }, { "epoch": 0.4278335983072671, "eval_loss": 0.6154603958129883, "eval_runtime": 292.0196, "eval_samples_per_second": 11.9, "eval_steps_per_second": 11.9, "step": 9200 }, { "epoch": 0.42806611656721666, "grad_norm": 6.184996128082275, "learning_rate": 6.575623006804495e-06, "loss": 0.7064523696899414, "memory(GiB)": 36.53, "step": 9205, "token_acc": 0.8213706374572378, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.4282986348271663, "grad_norm": 6.941103935241699, "learning_rate": 6.5719735357345505e-06, "loss": 0.6321978092193603, "memory(GiB)": 36.53, "step": 9210, "token_acc": 0.8569169960474309, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.42853115308711587, "grad_norm": 7.983799934387207, "learning_rate": 6.568323135108121e-06, "loss": 0.7142380237579345, "memory(GiB)": 36.53, "step": 9215, "token_acc": 0.825369978858351, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.4287636713470655, "grad_norm": 7.8501482009887695, "learning_rate": 6.564671807083801e-06, "loss": 0.7349348545074463, "memory(GiB)": 36.53, "step": 9220, "token_acc": 0.8120401337792642, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.4289961896070151, "grad_norm": 8.24466323852539, "learning_rate": 6.561019553820732e-06, "loss": 0.6588833808898926, "memory(GiB)": 36.53, "step": 9225, "token_acc": 0.8419503993274485, "train_speed(iter/s)": 0.095985 }, { "epoch": 0.42922870786696465, "grad_norm": 8.526206970214844, "learning_rate": 6.5573663774786045e-06, "loss": 0.7232032775878906, "memory(GiB)": 36.53, "step": 9230, "token_acc": 0.8263358778625954, "train_speed(iter/s)": 0.096015 }, { "epoch": 0.4294612261269143, "grad_norm": 7.539074420928955, "learning_rate": 6.553712280217657e-06, "loss": 0.5962289333343506, "memory(GiB)": 36.53, "step": 9235, "token_acc": 0.8554294975688816, "train_speed(iter/s)": 0.096043 }, { "epoch": 0.42969374438686386, "grad_norm": 7.943596363067627, "learning_rate": 6.550057264198668e-06, "loss": 0.7809437274932861, "memory(GiB)": 36.53, "step": 9240, "token_acc": 0.8077663671373556, "train_speed(iter/s)": 0.096072 }, { "epoch": 0.4299262626468135, "grad_norm": 6.926512718200684, "learning_rate": 6.546401331582962e-06, "loss": 0.7709768295288086, "memory(GiB)": 36.53, "step": 9245, "token_acc": 0.8153540701522171, "train_speed(iter/s)": 0.096101 }, { "epoch": 0.43015878090676307, "grad_norm": 5.472071647644043, "learning_rate": 6.542744484532403e-06, "loss": 0.7952607631683349, "memory(GiB)": 36.53, "step": 9250, "token_acc": 0.8125806451612904, "train_speed(iter/s)": 0.09613 }, { "epoch": 0.43015878090676307, "eval_loss": 0.619757890701294, "eval_runtime": 296.7723, "eval_samples_per_second": 11.709, "eval_steps_per_second": 11.709, "step": 9250 }, { "epoch": 0.4303912991667127, "grad_norm": 7.531529903411865, "learning_rate": 6.539086725209401e-06, "loss": 0.6834822177886963, "memory(GiB)": 36.53, "step": 9255, "token_acc": 0.821519068544119, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.4306238174266623, "grad_norm": 7.340893745422363, "learning_rate": 6.535428055776898e-06, "loss": 0.6262123107910156, "memory(GiB)": 36.53, "step": 9260, "token_acc": 0.8447067502766507, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.4308563356866119, "grad_norm": 7.021424293518066, "learning_rate": 6.531768478398382e-06, "loss": 0.7499904632568359, "memory(GiB)": 36.53, "step": 9265, "token_acc": 0.8159286186384667, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.4310888539465615, "grad_norm": 6.474427223205566, "learning_rate": 6.5281079952378686e-06, "loss": 0.6529985904693604, "memory(GiB)": 36.53, "step": 9270, "token_acc": 0.8382250174703005, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.43132137220651107, "grad_norm": 8.948592185974121, "learning_rate": 6.524446608459922e-06, "loss": 0.5798418521881104, "memory(GiB)": 36.53, "step": 9275, "token_acc": 0.8486916951080774, "train_speed(iter/s)": 0.09598 }, { "epoch": 0.4315538904664607, "grad_norm": 6.600849628448486, "learning_rate": 6.520784320229628e-06, "loss": 0.6643566608428955, "memory(GiB)": 36.53, "step": 9280, "token_acc": 0.825115562403698, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.4317864087264103, "grad_norm": 6.3031206130981445, "learning_rate": 6.517121132712613e-06, "loss": 0.5986540794372559, "memory(GiB)": 36.53, "step": 9285, "token_acc": 0.8465489566613162, "train_speed(iter/s)": 0.096037 }, { "epoch": 0.4320189269863599, "grad_norm": 6.503426551818848, "learning_rate": 6.513457048075031e-06, "loss": 0.6698907375335693, "memory(GiB)": 36.53, "step": 9290, "token_acc": 0.8272375854891466, "train_speed(iter/s)": 0.096067 }, { "epoch": 0.4322514452463095, "grad_norm": 6.880972385406494, "learning_rate": 6.509792068483569e-06, "loss": 0.5666281700134277, "memory(GiB)": 36.53, "step": 9295, "token_acc": 0.8612693246541904, "train_speed(iter/s)": 0.096096 }, { "epoch": 0.4324839635062591, "grad_norm": 8.267417907714844, "learning_rate": 6.506126196105444e-06, "loss": 0.7473461627960205, "memory(GiB)": 36.53, "step": 9300, "token_acc": 0.810893098782138, "train_speed(iter/s)": 0.096124 }, { "epoch": 0.4324839635062591, "eval_loss": 0.6146489977836609, "eval_runtime": 294.9014, "eval_samples_per_second": 11.784, "eval_steps_per_second": 11.784, "step": 9300 }, { "epoch": 0.4327164817662087, "grad_norm": 7.167984485626221, "learning_rate": 6.502459433108398e-06, "loss": 0.6427381992340088, "memory(GiB)": 36.53, "step": 9305, "token_acc": 0.8212586271702882, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.4329490000261583, "grad_norm": 3.9880833625793457, "learning_rate": 6.4987917816607e-06, "loss": 0.8922223091125489, "memory(GiB)": 36.53, "step": 9310, "token_acc": 0.7608077360637088, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.4331815182861079, "grad_norm": 6.413315296173096, "learning_rate": 6.49512324393115e-06, "loss": 0.583865737915039, "memory(GiB)": 36.53, "step": 9315, "token_acc": 0.8495801387367652, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.43341403654605754, "grad_norm": 6.642376899719238, "learning_rate": 6.491453822089065e-06, "loss": 0.6838852405548096, "memory(GiB)": 36.53, "step": 9320, "token_acc": 0.8288527073078379, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.4336465548060071, "grad_norm": 5.483030796051025, "learning_rate": 6.487783518304284e-06, "loss": 0.7265839099884033, "memory(GiB)": 36.53, "step": 9325, "token_acc": 0.8278023598820059, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.4338790730659567, "grad_norm": 7.246890544891357, "learning_rate": 6.484112334747177e-06, "loss": 0.6972790241241456, "memory(GiB)": 36.53, "step": 9330, "token_acc": 0.8220140515222483, "train_speed(iter/s)": 0.096002 }, { "epoch": 0.4341115913259063, "grad_norm": 7.137838840484619, "learning_rate": 6.480440273588624e-06, "loss": 0.5969900131225586, "memory(GiB)": 36.53, "step": 9335, "token_acc": 0.8442120447169131, "train_speed(iter/s)": 0.096031 }, { "epoch": 0.4343441095858559, "grad_norm": 5.954293251037598, "learning_rate": 6.4767673370000305e-06, "loss": 0.7084389209747315, "memory(GiB)": 36.53, "step": 9340, "token_acc": 0.831023102310231, "train_speed(iter/s)": 0.09606 }, { "epoch": 0.43457662784580553, "grad_norm": 8.764803886413574, "learning_rate": 6.473093527153315e-06, "loss": 0.6675600528717041, "memory(GiB)": 36.53, "step": 9345, "token_acc": 0.8438982319965502, "train_speed(iter/s)": 0.096089 }, { "epoch": 0.4348091461057551, "grad_norm": 6.77636194229126, "learning_rate": 6.4694188462209174e-06, "loss": 0.7175776481628418, "memory(GiB)": 36.53, "step": 9350, "token_acc": 0.8179903730445247, "train_speed(iter/s)": 0.096117 }, { "epoch": 0.4348091461057551, "eval_loss": 0.6128469705581665, "eval_runtime": 296.4157, "eval_samples_per_second": 11.723, "eval_steps_per_second": 11.723, "step": 9350 }, { "epoch": 0.43504166436570474, "grad_norm": 8.777336120605469, "learning_rate": 6.465743296375788e-06, "loss": 0.8006362915039062, "memory(GiB)": 36.53, "step": 9355, "token_acc": 0.8213751808712197, "train_speed(iter/s)": 0.095853 }, { "epoch": 0.4352741826256543, "grad_norm": 6.73029899597168, "learning_rate": 6.462066879791393e-06, "loss": 0.5721518039703369, "memory(GiB)": 36.53, "step": 9360, "token_acc": 0.8447519406007425, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.43550670088560395, "grad_norm": 10.870583534240723, "learning_rate": 6.458389598641711e-06, "loss": 0.611840009689331, "memory(GiB)": 36.53, "step": 9365, "token_acc": 0.8376413570274637, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.4357392191455535, "grad_norm": 6.957264423370361, "learning_rate": 6.454711455101232e-06, "loss": 0.6748052597045898, "memory(GiB)": 36.53, "step": 9370, "token_acc": 0.8365800865800865, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.4359717374055031, "grad_norm": 5.895677089691162, "learning_rate": 6.451032451344958e-06, "loss": 0.6640129566192627, "memory(GiB)": 36.53, "step": 9375, "token_acc": 0.8376369327073553, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.43620425566545273, "grad_norm": 6.726668357849121, "learning_rate": 6.447352589548396e-06, "loss": 0.6839561462402344, "memory(GiB)": 36.53, "step": 9380, "token_acc": 0.8363384188626907, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.4364367739254023, "grad_norm": 5.2225775718688965, "learning_rate": 6.443671871887561e-06, "loss": 0.7582714080810546, "memory(GiB)": 36.53, "step": 9385, "token_acc": 0.8162845385067607, "train_speed(iter/s)": 0.096025 }, { "epoch": 0.43666929218535194, "grad_norm": 6.472532749176025, "learning_rate": 6.439990300538975e-06, "loss": 0.6851258754730225, "memory(GiB)": 36.53, "step": 9390, "token_acc": 0.8372978116079924, "train_speed(iter/s)": 0.096053 }, { "epoch": 0.4369018104453015, "grad_norm": 8.415928840637207, "learning_rate": 6.436307877679666e-06, "loss": 0.6269676685333252, "memory(GiB)": 36.53, "step": 9395, "token_acc": 0.854813046937152, "train_speed(iter/s)": 0.096081 }, { "epoch": 0.43713432870525115, "grad_norm": 7.9225687980651855, "learning_rate": 6.4326246054871645e-06, "loss": 0.6694862842559814, "memory(GiB)": 36.53, "step": 9400, "token_acc": 0.8336909871244635, "train_speed(iter/s)": 0.096111 }, { "epoch": 0.43713432870525115, "eval_loss": 0.613532543182373, "eval_runtime": 294.7694, "eval_samples_per_second": 11.789, "eval_steps_per_second": 11.789, "step": 9400 }, { "epoch": 0.43736684696520073, "grad_norm": 6.317355632781982, "learning_rate": 6.428940486139502e-06, "loss": 0.8399629592895508, "memory(GiB)": 36.53, "step": 9405, "token_acc": 0.821104738713915, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.43759936522515036, "grad_norm": 6.062745571136475, "learning_rate": 6.425255521815212e-06, "loss": 0.7100383758544921, "memory(GiB)": 36.53, "step": 9410, "token_acc": 0.8216726326065734, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.43783188348509994, "grad_norm": 9.153746604919434, "learning_rate": 6.4215697146933275e-06, "loss": 0.8023602485656738, "memory(GiB)": 36.53, "step": 9415, "token_acc": 0.8047173083593478, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.4380644017450495, "grad_norm": 6.360644340515137, "learning_rate": 6.417883066953381e-06, "loss": 0.650984811782837, "memory(GiB)": 36.53, "step": 9420, "token_acc": 0.8299910206524993, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.43829692000499915, "grad_norm": 6.684108734130859, "learning_rate": 6.414195580775401e-06, "loss": 0.7176222801208496, "memory(GiB)": 36.53, "step": 9425, "token_acc": 0.8133476088508208, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.4385294382649487, "grad_norm": 6.66787052154541, "learning_rate": 6.410507258339911e-06, "loss": 0.6233195304870606, "memory(GiB)": 36.53, "step": 9430, "token_acc": 0.8431514275388508, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.43876195652489836, "grad_norm": 8.708562850952148, "learning_rate": 6.40681810182793e-06, "loss": 0.6892944812774658, "memory(GiB)": 36.53, "step": 9435, "token_acc": 0.8342415985467757, "train_speed(iter/s)": 0.09602 }, { "epoch": 0.43899447478484793, "grad_norm": 8.571969032287598, "learning_rate": 6.403128113420973e-06, "loss": 0.6714536190032959, "memory(GiB)": 36.53, "step": 9440, "token_acc": 0.8265379113018598, "train_speed(iter/s)": 0.096049 }, { "epoch": 0.43922699304479756, "grad_norm": 5.261947154998779, "learning_rate": 6.399437295301041e-06, "loss": 0.7917817115783692, "memory(GiB)": 36.53, "step": 9445, "token_acc": 0.7919876733436055, "train_speed(iter/s)": 0.096077 }, { "epoch": 0.43945951130474714, "grad_norm": 7.901933670043945, "learning_rate": 6.3957456496506275e-06, "loss": 0.8872608184814453, "memory(GiB)": 36.53, "step": 9450, "token_acc": 0.7660295930949446, "train_speed(iter/s)": 0.096105 }, { "epoch": 0.43945951130474714, "eval_loss": 0.6135982871055603, "eval_runtime": 298.3506, "eval_samples_per_second": 11.647, "eval_steps_per_second": 11.647, "step": 9450 }, { "epoch": 0.4396920295646968, "grad_norm": 6.660861968994141, "learning_rate": 6.39205317865272e-06, "loss": 0.6729079246520996, "memory(GiB)": 36.53, "step": 9455, "token_acc": 0.8219437903348952, "train_speed(iter/s)": 0.095841 }, { "epoch": 0.43992454782464635, "grad_norm": 9.15910530090332, "learning_rate": 6.388359884490789e-06, "loss": 0.7558378219604492, "memory(GiB)": 36.53, "step": 9460, "token_acc": 0.8095046314941603, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.440157066084596, "grad_norm": 8.168127059936523, "learning_rate": 6.3846657693487945e-06, "loss": 0.6436011314392089, "memory(GiB)": 36.53, "step": 9465, "token_acc": 0.8427491903562433, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.44038958434454556, "grad_norm": 7.540072917938232, "learning_rate": 6.3809708354111775e-06, "loss": 0.6013598442077637, "memory(GiB)": 36.53, "step": 9470, "token_acc": 0.8489921421250427, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.44062210260449514, "grad_norm": 9.310900688171387, "learning_rate": 6.3772750848628705e-06, "loss": 0.7716301441192627, "memory(GiB)": 36.53, "step": 9475, "token_acc": 0.8114617940199336, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.44085462086444477, "grad_norm": 7.388217449188232, "learning_rate": 6.373578519889283e-06, "loss": 0.6690125942230225, "memory(GiB)": 36.53, "step": 9480, "token_acc": 0.8285063455906281, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.44108713912439435, "grad_norm": 6.658295631408691, "learning_rate": 6.3698811426763086e-06, "loss": 0.7387238502502441, "memory(GiB)": 36.53, "step": 9485, "token_acc": 0.8200382897255903, "train_speed(iter/s)": 0.096011 }, { "epoch": 0.441319657384344, "grad_norm": 8.474617004394531, "learning_rate": 6.366182955410319e-06, "loss": 0.8051560401916504, "memory(GiB)": 36.53, "step": 9490, "token_acc": 0.7984709480122324, "train_speed(iter/s)": 0.096039 }, { "epoch": 0.44155217564429355, "grad_norm": 6.576582431793213, "learning_rate": 6.362483960278167e-06, "loss": 0.6635289669036866, "memory(GiB)": 36.53, "step": 9495, "token_acc": 0.8355202929508697, "train_speed(iter/s)": 0.096068 }, { "epoch": 0.4417846939042432, "grad_norm": 7.216211318969727, "learning_rate": 6.358784159467186e-06, "loss": 0.5488409996032715, "memory(GiB)": 36.53, "step": 9500, "token_acc": 0.8590710599444223, "train_speed(iter/s)": 0.096096 }, { "epoch": 0.4417846939042432, "eval_loss": 0.6116949915885925, "eval_runtime": 297.3772, "eval_samples_per_second": 11.685, "eval_steps_per_second": 11.685, "step": 9500 }, { "epoch": 0.44201721216419276, "grad_norm": 6.454580307006836, "learning_rate": 6.355083555165179e-06, "loss": 0.6398816585540772, "memory(GiB)": 36.53, "step": 9505, "token_acc": 0.8225794806382518, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.4422497304241424, "grad_norm": 7.4101057052612305, "learning_rate": 6.3513821495604286e-06, "loss": 0.629861307144165, "memory(GiB)": 36.53, "step": 9510, "token_acc": 0.8217360114777619, "train_speed(iter/s)": 0.095864 }, { "epoch": 0.44248224868409197, "grad_norm": 7.9802632331848145, "learning_rate": 6.347679944841689e-06, "loss": 0.6410726070404053, "memory(GiB)": 36.53, "step": 9515, "token_acc": 0.844579226686884, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.44271476694404155, "grad_norm": 7.573753356933594, "learning_rate": 6.34397694319819e-06, "loss": 0.7023254871368408, "memory(GiB)": 36.53, "step": 9520, "token_acc": 0.8184658104824714, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.4429472852039912, "grad_norm": 8.315315246582031, "learning_rate": 6.340273146819631e-06, "loss": 0.6772781848907471, "memory(GiB)": 36.53, "step": 9525, "token_acc": 0.8359739049394221, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.44317980346394076, "grad_norm": 6.7924628257751465, "learning_rate": 6.336568557896178e-06, "loss": 0.8196972846984864, "memory(GiB)": 36.53, "step": 9530, "token_acc": 0.7956465237166992, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.4434123217238904, "grad_norm": 8.114082336425781, "learning_rate": 6.332863178618471e-06, "loss": 0.7693780422210693, "memory(GiB)": 36.53, "step": 9535, "token_acc": 0.8112975849365535, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.44364483998383997, "grad_norm": 9.550496101379395, "learning_rate": 6.329157011177617e-06, "loss": 0.7559969425201416, "memory(GiB)": 36.53, "step": 9540, "token_acc": 0.8270291568163909, "train_speed(iter/s)": 0.096033 }, { "epoch": 0.4438773582437896, "grad_norm": 6.725862979888916, "learning_rate": 6.325450057765184e-06, "loss": 0.7191961288452149, "memory(GiB)": 36.53, "step": 9545, "token_acc": 0.8149694828140058, "train_speed(iter/s)": 0.096061 }, { "epoch": 0.4441098765037392, "grad_norm": 7.213817596435547, "learning_rate": 6.321742320573209e-06, "loss": 0.6995858669281005, "memory(GiB)": 36.53, "step": 9550, "token_acc": 0.83427071616048, "train_speed(iter/s)": 0.09609 }, { "epoch": 0.4441098765037392, "eval_loss": 0.6096996068954468, "eval_runtime": 297.0553, "eval_samples_per_second": 11.698, "eval_steps_per_second": 11.698, "step": 9550 }, { "epoch": 0.4443423947636888, "grad_norm": 7.757943630218506, "learning_rate": 6.318033801794193e-06, "loss": 0.7151656150817871, "memory(GiB)": 36.53, "step": 9555, "token_acc": 0.8218591052905603, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.4445749130236384, "grad_norm": 5.580169200897217, "learning_rate": 6.3143245036210965e-06, "loss": 0.5690258979797364, "memory(GiB)": 36.53, "step": 9560, "token_acc": 0.8539325842696629, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.44480743128358796, "grad_norm": 8.343439102172852, "learning_rate": 6.3106144282473425e-06, "loss": 0.8035446166992187, "memory(GiB)": 36.53, "step": 9565, "token_acc": 0.7925283522348232, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.4450399495435376, "grad_norm": 7.245533466339111, "learning_rate": 6.306903577866811e-06, "loss": 0.7285247325897217, "memory(GiB)": 36.53, "step": 9570, "token_acc": 0.8194254445964432, "train_speed(iter/s)": 0.095915 }, { "epoch": 0.44527246780348717, "grad_norm": 7.128092288970947, "learning_rate": 6.303191954673844e-06, "loss": 0.7278432846069336, "memory(GiB)": 36.53, "step": 9575, "token_acc": 0.8139937651541392, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.4455049860634368, "grad_norm": 7.35676383972168, "learning_rate": 6.29947956086324e-06, "loss": 0.8199946403503418, "memory(GiB)": 36.53, "step": 9580, "token_acc": 0.8070937386898299, "train_speed(iter/s)": 0.095972 }, { "epoch": 0.4457375043233864, "grad_norm": 5.832348346710205, "learning_rate": 6.295766398630251e-06, "loss": 0.6157866477966308, "memory(GiB)": 36.53, "step": 9585, "token_acc": 0.83373063170441, "train_speed(iter/s)": 0.096 }, { "epoch": 0.445970022583336, "grad_norm": 8.000378608703613, "learning_rate": 6.292052470170583e-06, "loss": 0.6848884105682373, "memory(GiB)": 36.53, "step": 9590, "token_acc": 0.8265479219677693, "train_speed(iter/s)": 0.096029 }, { "epoch": 0.4462025408432856, "grad_norm": 9.083320617675781, "learning_rate": 6.2883377776804e-06, "loss": 0.8012693405151368, "memory(GiB)": 36.53, "step": 9595, "token_acc": 0.7966036279428792, "train_speed(iter/s)": 0.096056 }, { "epoch": 0.4464350591032352, "grad_norm": 8.783888816833496, "learning_rate": 6.284622323356312e-06, "loss": 0.7423254489898682, "memory(GiB)": 36.53, "step": 9600, "token_acc": 0.8229587712206953, "train_speed(iter/s)": 0.096083 }, { "epoch": 0.4464350591032352, "eval_loss": 0.6070340871810913, "eval_runtime": 293.5483, "eval_samples_per_second": 11.838, "eval_steps_per_second": 11.838, "step": 9600 }, { "epoch": 0.4466675773631848, "grad_norm": 8.892648696899414, "learning_rate": 6.280906109395382e-06, "loss": 0.6096343040466309, "memory(GiB)": 36.53, "step": 9605, "token_acc": 0.8230499744049142, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.44690009562313443, "grad_norm": 6.9500732421875, "learning_rate": 6.277189137995121e-06, "loss": 0.6724872589111328, "memory(GiB)": 36.53, "step": 9610, "token_acc": 0.833088018840153, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.447132613883084, "grad_norm": 6.852596759796143, "learning_rate": 6.273471411353491e-06, "loss": 0.6281951427459717, "memory(GiB)": 36.53, "step": 9615, "token_acc": 0.8316430020283976, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.4473651321430336, "grad_norm": 7.1367950439453125, "learning_rate": 6.269752931668899e-06, "loss": 0.7241427898406982, "memory(GiB)": 36.53, "step": 9620, "token_acc": 0.8250350631136045, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.4475976504029832, "grad_norm": 6.101066589355469, "learning_rate": 6.266033701140193e-06, "loss": 0.6471002101898193, "memory(GiB)": 36.53, "step": 9625, "token_acc": 0.8366453351296733, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.4478301686629328, "grad_norm": 7.338571548461914, "learning_rate": 6.262313721966673e-06, "loss": 0.7227649211883544, "memory(GiB)": 36.53, "step": 9630, "token_acc": 0.8344453711426189, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.4480626869228824, "grad_norm": 7.873856067657471, "learning_rate": 6.2585929963480764e-06, "loss": 0.7752127647399902, "memory(GiB)": 36.53, "step": 9635, "token_acc": 0.8141263940520446, "train_speed(iter/s)": 0.095998 }, { "epoch": 0.448295205182832, "grad_norm": 7.643828392028809, "learning_rate": 6.254871526484583e-06, "loss": 0.809475040435791, "memory(GiB)": 36.53, "step": 9640, "token_acc": 0.8117283950617284, "train_speed(iter/s)": 0.096026 }, { "epoch": 0.44852772344278163, "grad_norm": 6.188126087188721, "learning_rate": 6.251149314576812e-06, "loss": 0.7115334510803223, "memory(GiB)": 36.53, "step": 9645, "token_acc": 0.8103073579633654, "train_speed(iter/s)": 0.096054 }, { "epoch": 0.4487602417027312, "grad_norm": 6.334682464599609, "learning_rate": 6.247426362825823e-06, "loss": 0.5974188804626465, "memory(GiB)": 36.53, "step": 9650, "token_acc": 0.8622777147181233, "train_speed(iter/s)": 0.096083 }, { "epoch": 0.4487602417027312, "eval_loss": 0.6079947352409363, "eval_runtime": 293.8957, "eval_samples_per_second": 11.824, "eval_steps_per_second": 11.824, "step": 9650 }, { "epoch": 0.44899275996268084, "grad_norm": 6.791869163513184, "learning_rate": 6.243702673433111e-06, "loss": 0.681541919708252, "memory(GiB)": 36.53, "step": 9655, "token_acc": 0.8224100972613259, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.4492252782226304, "grad_norm": 6.402570724487305, "learning_rate": 6.23997824860061e-06, "loss": 0.527205514907837, "memory(GiB)": 36.53, "step": 9660, "token_acc": 0.876510067114094, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.44945779648258, "grad_norm": 7.076948165893555, "learning_rate": 6.236253090530689e-06, "loss": 0.7284140110015869, "memory(GiB)": 36.53, "step": 9665, "token_acc": 0.8071654373024236, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.44969031474252963, "grad_norm": 8.33540153503418, "learning_rate": 6.232527201426145e-06, "loss": 0.6018318176269531, "memory(GiB)": 36.53, "step": 9670, "token_acc": 0.8495754144763445, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.4499228330024792, "grad_norm": 5.779272079467773, "learning_rate": 6.228800583490213e-06, "loss": 0.6930430412292481, "memory(GiB)": 36.53, "step": 9675, "token_acc": 0.8281002220107834, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.45015535126242884, "grad_norm": 6.199838161468506, "learning_rate": 6.225073238926558e-06, "loss": 0.6994569301605225, "memory(GiB)": 36.53, "step": 9680, "token_acc": 0.8282828282828283, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.4503878695223784, "grad_norm": 7.950428009033203, "learning_rate": 6.221345169939274e-06, "loss": 0.705945348739624, "memory(GiB)": 36.53, "step": 9685, "token_acc": 0.8452830188679246, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.45062038778232805, "grad_norm": 6.8860249519348145, "learning_rate": 6.217616378732883e-06, "loss": 0.6749707221984863, "memory(GiB)": 36.53, "step": 9690, "token_acc": 0.8228431904503527, "train_speed(iter/s)": 0.096023 }, { "epoch": 0.4508529060422776, "grad_norm": 7.688700199127197, "learning_rate": 6.213886867512332e-06, "loss": 0.6652534484863282, "memory(GiB)": 36.53, "step": 9695, "token_acc": 0.8338132455779514, "train_speed(iter/s)": 0.096051 }, { "epoch": 0.45108542430222726, "grad_norm": 8.237911224365234, "learning_rate": 6.210156638483e-06, "loss": 0.5874074459075928, "memory(GiB)": 36.53, "step": 9700, "token_acc": 0.8490930142802007, "train_speed(iter/s)": 0.096079 }, { "epoch": 0.45108542430222726, "eval_loss": 0.608859121799469, "eval_runtime": 293.3921, "eval_samples_per_second": 11.844, "eval_steps_per_second": 11.844, "step": 9700 }, { "epoch": 0.45131794256217683, "grad_norm": 7.017812252044678, "learning_rate": 6.206425693850684e-06, "loss": 0.5841714859008789, "memory(GiB)": 36.53, "step": 9705, "token_acc": 0.8230711454870877, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.45155046082212646, "grad_norm": 7.0842719078063965, "learning_rate": 6.202694035821607e-06, "loss": 0.7316049575805664, "memory(GiB)": 36.53, "step": 9710, "token_acc": 0.8187274909963985, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.45178297908207604, "grad_norm": 7.328638076782227, "learning_rate": 6.198961666602416e-06, "loss": 0.7782143592834473, "memory(GiB)": 36.53, "step": 9715, "token_acc": 0.8079444658696491, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.4520154973420256, "grad_norm": 9.605910301208496, "learning_rate": 6.195228588400173e-06, "loss": 0.6689083099365234, "memory(GiB)": 36.53, "step": 9720, "token_acc": 0.8285105086810843, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.45224801560197525, "grad_norm": 9.341135025024414, "learning_rate": 6.191494803422364e-06, "loss": 0.6024797916412353, "memory(GiB)": 36.53, "step": 9725, "token_acc": 0.8464150943396226, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.4524805338619248, "grad_norm": 9.248586654663086, "learning_rate": 6.187760313876891e-06, "loss": 0.7038826942443848, "memory(GiB)": 36.53, "step": 9730, "token_acc": 0.8426349496797805, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.45271305212187446, "grad_norm": 7.3656487464904785, "learning_rate": 6.184025121972072e-06, "loss": 0.7399398803710937, "memory(GiB)": 36.53, "step": 9735, "token_acc": 0.816057293629853, "train_speed(iter/s)": 0.095993 }, { "epoch": 0.45294557038182404, "grad_norm": 7.461027145385742, "learning_rate": 6.180289229916645e-06, "loss": 0.6084781169891358, "memory(GiB)": 36.53, "step": 9740, "token_acc": 0.8564412542500944, "train_speed(iter/s)": 0.096021 }, { "epoch": 0.45317808864177367, "grad_norm": 6.418759822845459, "learning_rate": 6.176552639919754e-06, "loss": 0.7723904609680176, "memory(GiB)": 36.53, "step": 9745, "token_acc": 0.8285714285714286, "train_speed(iter/s)": 0.096048 }, { "epoch": 0.45341060690172325, "grad_norm": 7.4962921142578125, "learning_rate": 6.172815354190961e-06, "loss": 0.5903301239013672, "memory(GiB)": 36.53, "step": 9750, "token_acc": 0.8578680203045685, "train_speed(iter/s)": 0.096076 }, { "epoch": 0.45341060690172325, "eval_loss": 0.6094077825546265, "eval_runtime": 294.2374, "eval_samples_per_second": 11.81, "eval_steps_per_second": 11.81, "step": 9750 }, { "epoch": 0.4536431251616729, "grad_norm": 8.316295623779297, "learning_rate": 6.169077374940239e-06, "loss": 0.704967737197876, "memory(GiB)": 36.53, "step": 9755, "token_acc": 0.8228508585327479, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.45387564342162245, "grad_norm": 7.59016227722168, "learning_rate": 6.165338704377971e-06, "loss": 0.709078311920166, "memory(GiB)": 36.53, "step": 9760, "token_acc": 0.8235915492957746, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.45410816168157203, "grad_norm": 5.767019748687744, "learning_rate": 6.161599344714948e-06, "loss": 0.6274002075195313, "memory(GiB)": 36.53, "step": 9765, "token_acc": 0.8305860805860806, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.45434067994152166, "grad_norm": 7.16868782043457, "learning_rate": 6.15785929816237e-06, "loss": 0.7391060829162598, "memory(GiB)": 36.53, "step": 9770, "token_acc": 0.8283212790255043, "train_speed(iter/s)": 0.095906 }, { "epoch": 0.45457319820147124, "grad_norm": 6.7341694831848145, "learning_rate": 6.154118566931838e-06, "loss": 0.6822587490081787, "memory(GiB)": 36.53, "step": 9775, "token_acc": 0.8355828220858895, "train_speed(iter/s)": 0.095934 }, { "epoch": 0.4548057164614209, "grad_norm": 9.300615310668945, "learning_rate": 6.150377153235368e-06, "loss": 0.626803970336914, "memory(GiB)": 36.53, "step": 9780, "token_acc": 0.8419951168468782, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.45503823472137045, "grad_norm": 5.771785736083984, "learning_rate": 6.146635059285367e-06, "loss": 0.6426148414611816, "memory(GiB)": 36.53, "step": 9785, "token_acc": 0.8419601837672281, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.4552707529813201, "grad_norm": 8.090389251708984, "learning_rate": 6.142892287294656e-06, "loss": 0.7309681892395019, "memory(GiB)": 36.53, "step": 9790, "token_acc": 0.8210048848569435, "train_speed(iter/s)": 0.096017 }, { "epoch": 0.45550327124126966, "grad_norm": 7.393229007720947, "learning_rate": 6.139148839476448e-06, "loss": 0.7353736400604248, "memory(GiB)": 36.53, "step": 9795, "token_acc": 0.8155509065550907, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.4557357895012193, "grad_norm": 6.2574920654296875, "learning_rate": 6.135404718044361e-06, "loss": 0.6523962497711182, "memory(GiB)": 36.53, "step": 9800, "token_acc": 0.8372361954322058, "train_speed(iter/s)": 0.096073 }, { "epoch": 0.4557357895012193, "eval_loss": 0.6095502376556396, "eval_runtime": 292.7828, "eval_samples_per_second": 11.869, "eval_steps_per_second": 11.869, "step": 9800 }, { "epoch": 0.45596830776116887, "grad_norm": 6.883238315582275, "learning_rate": 6.13165992521241e-06, "loss": 0.6746381759643555, "memory(GiB)": 36.53, "step": 9805, "token_acc": 0.8226793375837043, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.45620082602111844, "grad_norm": 7.616454124450684, "learning_rate": 6.127914463195006e-06, "loss": 0.7246517658233642, "memory(GiB)": 36.53, "step": 9810, "token_acc": 0.8232161874334398, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.4564333442810681, "grad_norm": 7.244081974029541, "learning_rate": 6.124168334206955e-06, "loss": 0.7134342193603516, "memory(GiB)": 36.53, "step": 9815, "token_acc": 0.830820770519263, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.45666586254101765, "grad_norm": 7.020895957946777, "learning_rate": 6.1204215404634605e-06, "loss": 0.6642593383789063, "memory(GiB)": 36.53, "step": 9820, "token_acc": 0.8396159317211949, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.4568983808009673, "grad_norm": 8.283989906311035, "learning_rate": 6.116674084180116e-06, "loss": 0.7088629245758057, "memory(GiB)": 36.53, "step": 9825, "token_acc": 0.83529890199268, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.45713089906091686, "grad_norm": 7.5810546875, "learning_rate": 6.112925967572911e-06, "loss": 0.6322180271148682, "memory(GiB)": 36.53, "step": 9830, "token_acc": 0.8513738551207327, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.4573634173208665, "grad_norm": 7.434423446655273, "learning_rate": 6.109177192858218e-06, "loss": 0.7602972984313965, "memory(GiB)": 36.53, "step": 9835, "token_acc": 0.8070408502158751, "train_speed(iter/s)": 0.095991 }, { "epoch": 0.45759593558081607, "grad_norm": 7.4128217697143555, "learning_rate": 6.105427762252807e-06, "loss": 0.6599167823791504, "memory(GiB)": 36.53, "step": 9840, "token_acc": 0.8335649756775538, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.4578284538407657, "grad_norm": 5.756119728088379, "learning_rate": 6.101677677973831e-06, "loss": 0.6275468349456788, "memory(GiB)": 36.53, "step": 9845, "token_acc": 0.8496710526315789, "train_speed(iter/s)": 0.096047 }, { "epoch": 0.4580609721007153, "grad_norm": 7.097313404083252, "learning_rate": 6.09792694223883e-06, "loss": 0.6755487442016601, "memory(GiB)": 36.53, "step": 9850, "token_acc": 0.8331877729257642, "train_speed(iter/s)": 0.096075 }, { "epoch": 0.4580609721007153, "eval_loss": 0.6061334013938904, "eval_runtime": 293.067, "eval_samples_per_second": 11.857, "eval_steps_per_second": 11.857, "step": 9850 }, { "epoch": 0.4582934903606649, "grad_norm": 6.993188381195068, "learning_rate": 6.094175557265729e-06, "loss": 0.6865869998931885, "memory(GiB)": 36.53, "step": 9855, "token_acc": 0.8228351826445551, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.4585260086206145, "grad_norm": 5.587054252624512, "learning_rate": 6.09042352527284e-06, "loss": 0.605366563796997, "memory(GiB)": 36.53, "step": 9860, "token_acc": 0.8427457098283931, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.45875852688056407, "grad_norm": 7.5531721115112305, "learning_rate": 6.086670848478855e-06, "loss": 0.7458683967590332, "memory(GiB)": 36.53, "step": 9865, "token_acc": 0.8089076136021667, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.4589910451405137, "grad_norm": 5.862915515899658, "learning_rate": 6.082917529102846e-06, "loss": 0.7659440040588379, "memory(GiB)": 36.53, "step": 9870, "token_acc": 0.8184658104824714, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.4592235634004633, "grad_norm": 8.969634056091309, "learning_rate": 6.079163569364268e-06, "loss": 0.6593055725097656, "memory(GiB)": 36.53, "step": 9875, "token_acc": 0.831306990881459, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.4594560816604129, "grad_norm": 7.298051357269287, "learning_rate": 6.0754089714829535e-06, "loss": 0.6426519393920899, "memory(GiB)": 36.53, "step": 9880, "token_acc": 0.8413323782234957, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.4596885999203625, "grad_norm": 5.749274253845215, "learning_rate": 6.07165373767911e-06, "loss": 0.7200953960418701, "memory(GiB)": 36.53, "step": 9885, "token_acc": 0.8264546684709067, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.4599211181803121, "grad_norm": 8.445324897766113, "learning_rate": 6.067897870173325e-06, "loss": 0.686259412765503, "memory(GiB)": 36.53, "step": 9890, "token_acc": 0.8217406501223349, "train_speed(iter/s)": 0.096016 }, { "epoch": 0.4601536364402617, "grad_norm": 6.035857677459717, "learning_rate": 6.0641413711865585e-06, "loss": 0.5487040042877197, "memory(GiB)": 36.53, "step": 9895, "token_acc": 0.8626220362622036, "train_speed(iter/s)": 0.096043 }, { "epoch": 0.4603861547002113, "grad_norm": 6.481517791748047, "learning_rate": 6.060384242940146e-06, "loss": 0.7927393436431884, "memory(GiB)": 36.53, "step": 9900, "token_acc": 0.7991266375545851, "train_speed(iter/s)": 0.09607 }, { "epoch": 0.4603861547002113, "eval_loss": 0.6078546643257141, "eval_runtime": 294.4453, "eval_samples_per_second": 11.802, "eval_steps_per_second": 11.802, "step": 9900 }, { "epoch": 0.4606186729601609, "grad_norm": 6.822314739227295, "learning_rate": 6.056626487655791e-06, "loss": 0.6223538398742676, "memory(GiB)": 36.53, "step": 9905, "token_acc": 0.8235760952342779, "train_speed(iter/s)": 0.095823 }, { "epoch": 0.4608511912201105, "grad_norm": 8.354050636291504, "learning_rate": 6.052868107555572e-06, "loss": 0.7600676059722901, "memory(GiB)": 36.53, "step": 9910, "token_acc": 0.8175775480059084, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.4610837094800601, "grad_norm": 8.266427040100098, "learning_rate": 6.0491091048619325e-06, "loss": 0.7591500282287598, "memory(GiB)": 36.53, "step": 9915, "token_acc": 0.824430823117338, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.4613162277400097, "grad_norm": 6.544285774230957, "learning_rate": 6.04534948179769e-06, "loss": 0.7405786991119385, "memory(GiB)": 36.53, "step": 9920, "token_acc": 0.8334106728538283, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.4615487459999593, "grad_norm": 6.957915306091309, "learning_rate": 6.041589240586025e-06, "loss": 0.6144753932952881, "memory(GiB)": 36.53, "step": 9925, "token_acc": 0.8433460076045627, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.4617812642599089, "grad_norm": 7.508825778961182, "learning_rate": 6.037828383450481e-06, "loss": 0.597527551651001, "memory(GiB)": 36.53, "step": 9930, "token_acc": 0.8426294820717132, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.46201378251985853, "grad_norm": 7.72009801864624, "learning_rate": 6.034066912614973e-06, "loss": 0.6360546112060547, "memory(GiB)": 36.53, "step": 9935, "token_acc": 0.8471028037383178, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.4622463007798081, "grad_norm": 8.508076667785645, "learning_rate": 6.030304830303774e-06, "loss": 0.6973794460296631, "memory(GiB)": 36.53, "step": 9940, "token_acc": 0.8105059619722849, "train_speed(iter/s)": 0.096013 }, { "epoch": 0.46247881903975774, "grad_norm": 8.080486297607422, "learning_rate": 6.026542138741518e-06, "loss": 0.7362306594848633, "memory(GiB)": 36.53, "step": 9945, "token_acc": 0.8207620528771384, "train_speed(iter/s)": 0.09604 }, { "epoch": 0.4627113372997073, "grad_norm": 6.59077262878418, "learning_rate": 6.0227788401532025e-06, "loss": 0.6460929870605469, "memory(GiB)": 36.53, "step": 9950, "token_acc": 0.8428842504743833, "train_speed(iter/s)": 0.096068 }, { "epoch": 0.4627113372997073, "eval_loss": 0.6045697331428528, "eval_runtime": 292.8994, "eval_samples_per_second": 11.864, "eval_steps_per_second": 11.864, "step": 9950 }, { "epoch": 0.4629438555596569, "grad_norm": 9.889881134033203, "learning_rate": 6.019014936764179e-06, "loss": 0.6179000854492187, "memory(GiB)": 36.53, "step": 9955, "token_acc": 0.8231968779610388, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.4631763738196065, "grad_norm": 7.871378421783447, "learning_rate": 6.015250430800164e-06, "loss": 0.7012851715087891, "memory(GiB)": 36.53, "step": 9960, "token_acc": 0.8371753720455208, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.4634088920795561, "grad_norm": 6.383567810058594, "learning_rate": 6.011485324487224e-06, "loss": 0.5643723487854004, "memory(GiB)": 36.53, "step": 9965, "token_acc": 0.8545157335512873, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.46364141033950573, "grad_norm": 8.291006088256836, "learning_rate": 6.007719620051781e-06, "loss": 0.6677682399749756, "memory(GiB)": 36.53, "step": 9970, "token_acc": 0.8361702127659575, "train_speed(iter/s)": 0.095906 }, { "epoch": 0.4638739285994553, "grad_norm": 8.129223823547363, "learning_rate": 6.003953319720614e-06, "loss": 0.6844709396362305, "memory(GiB)": 36.53, "step": 9975, "token_acc": 0.8373146622734761, "train_speed(iter/s)": 0.095934 }, { "epoch": 0.46410644685940494, "grad_norm": 5.145586967468262, "learning_rate": 6.000186425720854e-06, "loss": 0.7600801944732666, "memory(GiB)": 36.53, "step": 9980, "token_acc": 0.8042265923099501, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.4643389651193545, "grad_norm": 9.662071228027344, "learning_rate": 5.99641894027998e-06, "loss": 0.835693359375, "memory(GiB)": 36.53, "step": 9985, "token_acc": 0.7827635327635327, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.46457148337930415, "grad_norm": 10.052090644836426, "learning_rate": 5.992650865625823e-06, "loss": 0.6995959281921387, "memory(GiB)": 36.53, "step": 9990, "token_acc": 0.8277608915906788, "train_speed(iter/s)": 0.096015 }, { "epoch": 0.4648040016392537, "grad_norm": 6.37227725982666, "learning_rate": 5.98888220398656e-06, "loss": 0.7162473678588868, "memory(GiB)": 36.53, "step": 9995, "token_acc": 0.8284432171211599, "train_speed(iter/s)": 0.096043 }, { "epoch": 0.46503651989920336, "grad_norm": 6.917495250701904, "learning_rate": 5.985112957590721e-06, "loss": 0.6338780879974365, "memory(GiB)": 36.53, "step": 10000, "token_acc": 0.8304964539007093, "train_speed(iter/s)": 0.09607 }, { "epoch": 0.46503651989920336, "eval_loss": 0.6057087182998657, "eval_runtime": 291.3624, "eval_samples_per_second": 11.927, "eval_steps_per_second": 11.927, "step": 10000 }, { "epoch": 0.46526903815915294, "grad_norm": 7.6043171882629395, "learning_rate": 5.981343128667174e-06, "loss": 0.7458448886871338, "memory(GiB)": 36.53, "step": 10005, "token_acc": 0.8233181843685126, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.4655015564191025, "grad_norm": 7.113598823547363, "learning_rate": 5.977572719445137e-06, "loss": 0.7033159255981445, "memory(GiB)": 36.53, "step": 10010, "token_acc": 0.8245614035087719, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.46573407467905215, "grad_norm": 9.22055721282959, "learning_rate": 5.973801732154168e-06, "loss": 0.5509349822998046, "memory(GiB)": 36.53, "step": 10015, "token_acc": 0.865392965696917, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.4659665929390017, "grad_norm": 5.772355079650879, "learning_rate": 5.97003016902417e-06, "loss": 0.7141910552978515, "memory(GiB)": 36.53, "step": 10020, "token_acc": 0.828882833787466, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.46619911119895135, "grad_norm": 7.265198230743408, "learning_rate": 5.9662580322853825e-06, "loss": 0.6663059711456298, "memory(GiB)": 36.53, "step": 10025, "token_acc": 0.8441601049868767, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.46643162945890093, "grad_norm": 7.731073379516602, "learning_rate": 5.96248532416839e-06, "loss": 0.69374098777771, "memory(GiB)": 36.53, "step": 10030, "token_acc": 0.8324854651162791, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.46666414771885056, "grad_norm": 6.640981197357178, "learning_rate": 5.958712046904107e-06, "loss": 0.7169713020324707, "memory(GiB)": 36.53, "step": 10035, "token_acc": 0.8095913734392736, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.46689666597880014, "grad_norm": 6.7847819328308105, "learning_rate": 5.954938202723794e-06, "loss": 0.7144430160522461, "memory(GiB)": 36.53, "step": 10040, "token_acc": 0.8301005747126436, "train_speed(iter/s)": 0.096013 }, { "epoch": 0.4671291842387498, "grad_norm": 7.647963047027588, "learning_rate": 5.951163793859036e-06, "loss": 0.6398671627044678, "memory(GiB)": 36.53, "step": 10045, "token_acc": 0.8311781609195402, "train_speed(iter/s)": 0.09604 }, { "epoch": 0.46736170249869935, "grad_norm": 4.547146797180176, "learning_rate": 5.947388822541762e-06, "loss": 0.7039501190185546, "memory(GiB)": 36.53, "step": 10050, "token_acc": 0.8191751850546352, "train_speed(iter/s)": 0.096067 }, { "epoch": 0.46736170249869935, "eval_loss": 0.6045349836349487, "eval_runtime": 291.4717, "eval_samples_per_second": 11.922, "eval_steps_per_second": 11.922, "step": 10050 }, { "epoch": 0.4675942207586489, "grad_norm": 9.998385429382324, "learning_rate": 5.943613291004224e-06, "loss": 0.692030668258667, "memory(GiB)": 36.53, "step": 10055, "token_acc": 0.8235185852038571, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.46782673901859856, "grad_norm": 5.997773170471191, "learning_rate": 5.9398372014790175e-06, "loss": 0.8196736335754394, "memory(GiB)": 36.53, "step": 10060, "token_acc": 0.7867219917012448, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.46805925727854814, "grad_norm": 7.602710723876953, "learning_rate": 5.936060556199055e-06, "loss": 0.6392131805419922, "memory(GiB)": 36.53, "step": 10065, "token_acc": 0.8540405838376647, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.46829177553849777, "grad_norm": 5.817196369171143, "learning_rate": 5.932283357397586e-06, "loss": 0.6863467693328857, "memory(GiB)": 36.53, "step": 10070, "token_acc": 0.8298807281858129, "train_speed(iter/s)": 0.095906 }, { "epoch": 0.46852429379844734, "grad_norm": 6.113554000854492, "learning_rate": 5.928505607308182e-06, "loss": 0.6544069766998291, "memory(GiB)": 36.53, "step": 10075, "token_acc": 0.8291605301914581, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.468756812058397, "grad_norm": 7.657655239105225, "learning_rate": 5.924727308164747e-06, "loss": 0.7510591506958008, "memory(GiB)": 36.53, "step": 10080, "token_acc": 0.8130738156445098, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.46898933031834655, "grad_norm": 5.734485626220703, "learning_rate": 5.920948462201503e-06, "loss": 0.7152635097503662, "memory(GiB)": 36.53, "step": 10085, "token_acc": 0.8170918367346939, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.4692218485782962, "grad_norm": 7.42374324798584, "learning_rate": 5.917169071653001e-06, "loss": 0.7044946193695069, "memory(GiB)": 36.53, "step": 10090, "token_acc": 0.8162097017173847, "train_speed(iter/s)": 0.09601 }, { "epoch": 0.46945436683824576, "grad_norm": 6.3649373054504395, "learning_rate": 5.913389138754109e-06, "loss": 0.65935378074646, "memory(GiB)": 36.53, "step": 10095, "token_acc": 0.8460063897763578, "train_speed(iter/s)": 0.096037 }, { "epoch": 0.4696868850981954, "grad_norm": 6.768080234527588, "learning_rate": 5.90960866574002e-06, "loss": 0.6156496047973633, "memory(GiB)": 36.53, "step": 10100, "token_acc": 0.8520084566596194, "train_speed(iter/s)": 0.096063 }, { "epoch": 0.4696868850981954, "eval_loss": 0.6128177046775818, "eval_runtime": 291.3128, "eval_samples_per_second": 11.929, "eval_steps_per_second": 11.929, "step": 10100 }, { "epoch": 0.46991940335814497, "grad_norm": 5.990058422088623, "learning_rate": 5.9058276548462435e-06, "loss": 0.7458582878112793, "memory(GiB)": 36.53, "step": 10105, "token_acc": 0.8227818872371427, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.47015192161809455, "grad_norm": 9.34148120880127, "learning_rate": 5.902046108308607e-06, "loss": 0.7696472644805908, "memory(GiB)": 36.53, "step": 10110, "token_acc": 0.8111810440577564, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.4703844398780442, "grad_norm": 6.020503520965576, "learning_rate": 5.8982640283632555e-06, "loss": 0.7021479606628418, "memory(GiB)": 36.53, "step": 10115, "token_acc": 0.8248538011695906, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.47061695813799376, "grad_norm": 6.6600775718688965, "learning_rate": 5.894481417246652e-06, "loss": 0.6542953491210938, "memory(GiB)": 36.53, "step": 10120, "token_acc": 0.8318471337579618, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.4708494763979434, "grad_norm": 7.627831935882568, "learning_rate": 5.890698277195569e-06, "loss": 0.7104983329772949, "memory(GiB)": 36.53, "step": 10125, "token_acc": 0.8335798816568047, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.47108199465789297, "grad_norm": 6.605009078979492, "learning_rate": 5.886914610447097e-06, "loss": 0.6252718925476074, "memory(GiB)": 36.53, "step": 10130, "token_acc": 0.842741935483871, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.4713145129178426, "grad_norm": 7.564285755157471, "learning_rate": 5.8831304192386295e-06, "loss": 0.6758628368377686, "memory(GiB)": 36.53, "step": 10135, "token_acc": 0.8410499453153482, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.4715470311777922, "grad_norm": 6.068528175354004, "learning_rate": 5.87934570580788e-06, "loss": 0.6484007835388184, "memory(GiB)": 36.53, "step": 10140, "token_acc": 0.8354826103946854, "train_speed(iter/s)": 0.096009 }, { "epoch": 0.4717795494377418, "grad_norm": 5.984204292297363, "learning_rate": 5.875560472392867e-06, "loss": 0.6442727088928223, "memory(GiB)": 36.53, "step": 10145, "token_acc": 0.8396540252827678, "train_speed(iter/s)": 0.096035 }, { "epoch": 0.4720120676976914, "grad_norm": 8.497846603393555, "learning_rate": 5.871774721231913e-06, "loss": 0.6294188499450684, "memory(GiB)": 36.53, "step": 10150, "token_acc": 0.8486238532110092, "train_speed(iter/s)": 0.096062 }, { "epoch": 0.4720120676976914, "eval_loss": 0.6052373051643372, "eval_runtime": 292.5147, "eval_samples_per_second": 11.88, "eval_steps_per_second": 11.88, "step": 10150 }, { "epoch": 0.47224458595764096, "grad_norm": 8.094533920288086, "learning_rate": 5.8679884545636515e-06, "loss": 0.6738255023956299, "memory(GiB)": 36.53, "step": 10155, "token_acc": 0.8238933230759368, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.4724771042175906, "grad_norm": 8.374753952026367, "learning_rate": 5.864201674627017e-06, "loss": 0.7145820617675781, "memory(GiB)": 36.53, "step": 10160, "token_acc": 0.8257487359004279, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.47270962247754017, "grad_norm": 6.898515701293945, "learning_rate": 5.8604143836612515e-06, "loss": 0.6502760887145996, "memory(GiB)": 36.53, "step": 10165, "token_acc": 0.8404459823144944, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.4729421407374898, "grad_norm": 7.775455951690674, "learning_rate": 5.856626583905895e-06, "loss": 0.6345690250396728, "memory(GiB)": 36.53, "step": 10170, "token_acc": 0.8561292865589278, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.4731746589974394, "grad_norm": 7.383172035217285, "learning_rate": 5.8528382776007945e-06, "loss": 0.6680724143981933, "memory(GiB)": 36.53, "step": 10175, "token_acc": 0.830135039090263, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.473407177257389, "grad_norm": 7.9133620262146, "learning_rate": 5.849049466986087e-06, "loss": 0.7379461288452148, "memory(GiB)": 36.53, "step": 10180, "token_acc": 0.8111346018322763, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.4736396955173386, "grad_norm": 10.533130645751953, "learning_rate": 5.845260154302216e-06, "loss": 0.7678250789642334, "memory(GiB)": 36.53, "step": 10185, "token_acc": 0.8148893360160966, "train_speed(iter/s)": 0.09598 }, { "epoch": 0.4738722137772882, "grad_norm": 8.054941177368164, "learning_rate": 5.84147034178992e-06, "loss": 0.6807512760162353, "memory(GiB)": 36.53, "step": 10190, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.096006 }, { "epoch": 0.4741047320372378, "grad_norm": 6.960943698883057, "learning_rate": 5.83768003169023e-06, "loss": 0.6838326454162598, "memory(GiB)": 36.53, "step": 10195, "token_acc": 0.8228460793804453, "train_speed(iter/s)": 0.096032 }, { "epoch": 0.4743372502971874, "grad_norm": 8.947465896606445, "learning_rate": 5.833889226244474e-06, "loss": 0.7164243698120117, "memory(GiB)": 36.53, "step": 10200, "token_acc": 0.8165033911077618, "train_speed(iter/s)": 0.096059 }, { "epoch": 0.4743372502971874, "eval_loss": 0.6035026907920837, "eval_runtime": 291.2419, "eval_samples_per_second": 11.932, "eval_steps_per_second": 11.932, "step": 10200 }, { "epoch": 0.474569768557137, "grad_norm": 6.6514081954956055, "learning_rate": 5.830097927694274e-06, "loss": 0.682344388961792, "memory(GiB)": 36.53, "step": 10205, "token_acc": 0.8235237622652987, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.4748022868170866, "grad_norm": 8.357755661010742, "learning_rate": 5.82630613828154e-06, "loss": 0.6868386745452881, "memory(GiB)": 36.53, "step": 10210, "token_acc": 0.8239069394304052, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.4750348050770362, "grad_norm": 5.058123588562012, "learning_rate": 5.822513860248473e-06, "loss": 0.6764467716217041, "memory(GiB)": 36.53, "step": 10215, "token_acc": 0.8304372197309418, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.4752673233369858, "grad_norm": 6.218808174133301, "learning_rate": 5.818721095837568e-06, "loss": 0.6577483654022217, "memory(GiB)": 36.53, "step": 10220, "token_acc": 0.8478093774019985, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.4754998415969354, "grad_norm": 5.844109535217285, "learning_rate": 5.814927847291601e-06, "loss": 0.8146178245544433, "memory(GiB)": 36.53, "step": 10225, "token_acc": 0.8012820512820513, "train_speed(iter/s)": 0.095928 }, { "epoch": 0.475732359856885, "grad_norm": 6.904383182525635, "learning_rate": 5.811134116853639e-06, "loss": 0.6491562366485596, "memory(GiB)": 36.53, "step": 10230, "token_acc": 0.8438868976503385, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.47596487811683463, "grad_norm": 8.637255668640137, "learning_rate": 5.8073399067670264e-06, "loss": 0.7037137031555176, "memory(GiB)": 36.53, "step": 10235, "token_acc": 0.8274404304381245, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.4761973963767842, "grad_norm": 6.772544860839844, "learning_rate": 5.803545219275404e-06, "loss": 0.6280532360076905, "memory(GiB)": 36.53, "step": 10240, "token_acc": 0.8492176386913229, "train_speed(iter/s)": 0.096007 }, { "epoch": 0.47642991463673384, "grad_norm": 9.061598777770996, "learning_rate": 5.799750056622684e-06, "loss": 0.667292594909668, "memory(GiB)": 36.53, "step": 10245, "token_acc": 0.8214404248257551, "train_speed(iter/s)": 0.096033 }, { "epoch": 0.4766624328966834, "grad_norm": 7.211753845214844, "learning_rate": 5.795954421053064e-06, "loss": 0.7735927104949951, "memory(GiB)": 36.53, "step": 10250, "token_acc": 0.8092586146884789, "train_speed(iter/s)": 0.096059 }, { "epoch": 0.4766624328966834, "eval_loss": 0.6029525995254517, "eval_runtime": 293.6988, "eval_samples_per_second": 11.832, "eval_steps_per_second": 11.832, "step": 10250 }, { "epoch": 0.476894951156633, "grad_norm": 8.002875328063965, "learning_rate": 5.792158314811018e-06, "loss": 0.6148755073547363, "memory(GiB)": 36.53, "step": 10255, "token_acc": 0.8244646359524477, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.47712746941658263, "grad_norm": 8.440995216369629, "learning_rate": 5.788361740141305e-06, "loss": 0.6819697856903076, "memory(GiB)": 36.53, "step": 10260, "token_acc": 0.8247137781287012, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.4773599876765322, "grad_norm": 7.9807844161987305, "learning_rate": 5.784564699288955e-06, "loss": 0.7811295032501221, "memory(GiB)": 36.53, "step": 10265, "token_acc": 0.7987358616101131, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.47759250593648184, "grad_norm": 8.874316215515137, "learning_rate": 5.780767194499275e-06, "loss": 0.7386106014251709, "memory(GiB)": 36.53, "step": 10270, "token_acc": 0.817530695770805, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.4778250241964314, "grad_norm": 7.540121078491211, "learning_rate": 5.776969228017846e-06, "loss": 0.6754724502563476, "memory(GiB)": 36.53, "step": 10275, "token_acc": 0.8390589992531741, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.47805754245638105, "grad_norm": 9.226720809936523, "learning_rate": 5.773170802090526e-06, "loss": 0.6454158306121827, "memory(GiB)": 36.53, "step": 10280, "token_acc": 0.8572533849129593, "train_speed(iter/s)": 0.095952 }, { "epoch": 0.4782900607163306, "grad_norm": 8.085728645324707, "learning_rate": 5.7693719189634375e-06, "loss": 0.6315486431121826, "memory(GiB)": 36.53, "step": 10285, "token_acc": 0.843432289548597, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.47852257897628026, "grad_norm": 9.334856986999512, "learning_rate": 5.76557258088298e-06, "loss": 0.7062033653259278, "memory(GiB)": 36.53, "step": 10290, "token_acc": 0.8273440726972325, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.47875509723622983, "grad_norm": 9.930914878845215, "learning_rate": 5.76177279009582e-06, "loss": 0.693555498123169, "memory(GiB)": 36.53, "step": 10295, "token_acc": 0.8392219134577213, "train_speed(iter/s)": 0.09603 }, { "epoch": 0.4789876154961794, "grad_norm": 6.257607460021973, "learning_rate": 5.757972548848888e-06, "loss": 0.5264789581298828, "memory(GiB)": 36.53, "step": 10300, "token_acc": 0.8754208754208754, "train_speed(iter/s)": 0.096056 }, { "epoch": 0.4789876154961794, "eval_loss": 0.6043440699577332, "eval_runtime": 292.2265, "eval_samples_per_second": 11.891, "eval_steps_per_second": 11.891, "step": 10300 }, { "epoch": 0.47922013375612904, "grad_norm": 6.7262420654296875, "learning_rate": 5.7541718593893865e-06, "loss": 0.6431325912475586, "memory(GiB)": 36.53, "step": 10305, "token_acc": 0.8243854851348974, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.4794526520160786, "grad_norm": 6.871412754058838, "learning_rate": 5.750370723964781e-06, "loss": 0.6176501274108886, "memory(GiB)": 36.53, "step": 10310, "token_acc": 0.8424317617866005, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.47968517027602825, "grad_norm": 7.732744216918945, "learning_rate": 5.7465691448227985e-06, "loss": 0.691460132598877, "memory(GiB)": 36.53, "step": 10315, "token_acc": 0.8188585607940446, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.4799176885359778, "grad_norm": 4.607907772064209, "learning_rate": 5.7427671242114305e-06, "loss": 0.6410884857177734, "memory(GiB)": 36.53, "step": 10320, "token_acc": 0.8241157556270097, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.48015020679592746, "grad_norm": 5.815286636352539, "learning_rate": 5.73896466437893e-06, "loss": 0.627712631225586, "memory(GiB)": 36.53, "step": 10325, "token_acc": 0.848414539829853, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.48038272505587704, "grad_norm": 7.110845565795898, "learning_rate": 5.735161767573809e-06, "loss": 0.8327849388122559, "memory(GiB)": 36.53, "step": 10330, "token_acc": 0.7952969550798915, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.48061524331582667, "grad_norm": 7.159823894500732, "learning_rate": 5.731358436044836e-06, "loss": 0.7731481075286866, "memory(GiB)": 36.53, "step": 10335, "token_acc": 0.8158567774936062, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.48084776157577624, "grad_norm": 5.621152400970459, "learning_rate": 5.7275546720410395e-06, "loss": 0.66044921875, "memory(GiB)": 36.53, "step": 10340, "token_acc": 0.8278571428571428, "train_speed(iter/s)": 0.096002 }, { "epoch": 0.4810802798357259, "grad_norm": 6.352847099304199, "learning_rate": 5.723750477811702e-06, "loss": 0.7206785678863525, "memory(GiB)": 36.53, "step": 10345, "token_acc": 0.8181818181818182, "train_speed(iter/s)": 0.096028 }, { "epoch": 0.48131279809567545, "grad_norm": 6.40288782119751, "learning_rate": 5.719945855606364e-06, "loss": 0.7612641811370849, "memory(GiB)": 36.53, "step": 10350, "token_acc": 0.8156575395295025, "train_speed(iter/s)": 0.096054 }, { "epoch": 0.48131279809567545, "eval_loss": 0.6041258573532104, "eval_runtime": 292.4376, "eval_samples_per_second": 11.883, "eval_steps_per_second": 11.883, "step": 10350 }, { "epoch": 0.48154531635562503, "grad_norm": 6.934345245361328, "learning_rate": 5.716140807674812e-06, "loss": 0.6299592971801757, "memory(GiB)": 36.53, "step": 10355, "token_acc": 0.8244488015685647, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.48177783461557466, "grad_norm": 5.559031963348389, "learning_rate": 5.71233533626709e-06, "loss": 0.7900909900665283, "memory(GiB)": 36.53, "step": 10360, "token_acc": 0.7986577181208053, "train_speed(iter/s)": 0.095845 }, { "epoch": 0.48201035287552424, "grad_norm": 6.343786716461182, "learning_rate": 5.708529443633491e-06, "loss": 0.781171464920044, "memory(GiB)": 36.53, "step": 10365, "token_acc": 0.8162832929782082, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.48224287113547387, "grad_norm": 8.414325714111328, "learning_rate": 5.704723132024557e-06, "loss": 0.6617238044738769, "memory(GiB)": 36.53, "step": 10370, "token_acc": 0.8287714831317632, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.48247538939542345, "grad_norm": 6.823397636413574, "learning_rate": 5.700916403691077e-06, "loss": 0.6314909934997559, "memory(GiB)": 36.53, "step": 10375, "token_acc": 0.8461814270347795, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.4827079076553731, "grad_norm": 6.531882286071777, "learning_rate": 5.697109260884085e-06, "loss": 0.5414093017578125, "memory(GiB)": 36.53, "step": 10380, "token_acc": 0.8636543797066983, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.48294042591532266, "grad_norm": 6.150195121765137, "learning_rate": 5.693301705854867e-06, "loss": 0.6772344589233399, "memory(GiB)": 36.53, "step": 10385, "token_acc": 0.8308388444135051, "train_speed(iter/s)": 0.095974 }, { "epoch": 0.4831729441752723, "grad_norm": 5.847650527954102, "learning_rate": 5.6894937408549435e-06, "loss": 0.7219397068023682, "memory(GiB)": 36.53, "step": 10390, "token_acc": 0.8232174425456689, "train_speed(iter/s)": 0.096 }, { "epoch": 0.48340546243522187, "grad_norm": 5.661290168762207, "learning_rate": 5.6856853681360825e-06, "loss": 0.7071576118469238, "memory(GiB)": 36.53, "step": 10395, "token_acc": 0.8241574908241575, "train_speed(iter/s)": 0.096025 }, { "epoch": 0.48363798069517144, "grad_norm": 7.167285442352295, "learning_rate": 5.681876589950295e-06, "loss": 0.6745022296905517, "memory(GiB)": 36.53, "step": 10400, "token_acc": 0.8311345646437994, "train_speed(iter/s)": 0.096051 }, { "epoch": 0.48363798069517144, "eval_loss": 0.604568600654602, "eval_runtime": 291.1795, "eval_samples_per_second": 11.934, "eval_steps_per_second": 11.934, "step": 10400 }, { "epoch": 0.4838704989551211, "grad_norm": 6.558994293212891, "learning_rate": 5.678067408549828e-06, "loss": 0.7601808547973633, "memory(GiB)": 36.53, "step": 10405, "token_acc": 0.8237313086542231, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.48410301721507065, "grad_norm": 9.058159828186035, "learning_rate": 5.6742578261871665e-06, "loss": 0.6190596580505371, "memory(GiB)": 36.53, "step": 10410, "token_acc": 0.8485485906604964, "train_speed(iter/s)": 0.095845 }, { "epoch": 0.4843355354750203, "grad_norm": 6.316933631896973, "learning_rate": 5.670447845115033e-06, "loss": 0.7364337921142579, "memory(GiB)": 36.53, "step": 10415, "token_acc": 0.8257394084732215, "train_speed(iter/s)": 0.095871 }, { "epoch": 0.48456805373496986, "grad_norm": 9.497570037841797, "learning_rate": 5.66663746758639e-06, "loss": 0.7649255752563476, "memory(GiB)": 36.53, "step": 10420, "token_acc": 0.8071589809738794, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.4848005719949195, "grad_norm": 7.515045166015625, "learning_rate": 5.662826695854431e-06, "loss": 0.6880429744720459, "memory(GiB)": 36.53, "step": 10425, "token_acc": 0.8259047619047619, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.48503309025486907, "grad_norm": 7.987893581390381, "learning_rate": 5.6590155321725825e-06, "loss": 0.8033793449401856, "memory(GiB)": 36.53, "step": 10430, "token_acc": 0.8061657032755298, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.4852656085148187, "grad_norm": 8.104069709777832, "learning_rate": 5.655203978794504e-06, "loss": 0.6667456150054931, "memory(GiB)": 36.53, "step": 10435, "token_acc": 0.8448275862068966, "train_speed(iter/s)": 0.095974 }, { "epoch": 0.4854981267747683, "grad_norm": 7.878139495849609, "learning_rate": 5.6513920379740816e-06, "loss": 0.6613424777984619, "memory(GiB)": 36.53, "step": 10440, "token_acc": 0.8525703200775946, "train_speed(iter/s)": 0.096 }, { "epoch": 0.48573064503471786, "grad_norm": 9.738919258117676, "learning_rate": 5.647579711965438e-06, "loss": 0.6458121299743652, "memory(GiB)": 36.53, "step": 10445, "token_acc": 0.8279947345326898, "train_speed(iter/s)": 0.096026 }, { "epoch": 0.4859631632946675, "grad_norm": 6.289546966552734, "learning_rate": 5.6437670030229155e-06, "loss": 0.6806787014007568, "memory(GiB)": 36.53, "step": 10450, "token_acc": 0.8339513803049031, "train_speed(iter/s)": 0.096052 }, { "epoch": 0.4859631632946675, "eval_loss": 0.6016459465026855, "eval_runtime": 292.0162, "eval_samples_per_second": 11.9, "eval_steps_per_second": 11.9, "step": 10450 }, { "epoch": 0.48619568155461707, "grad_norm": 8.368342399597168, "learning_rate": 5.63995391340109e-06, "loss": 0.5780067443847656, "memory(GiB)": 36.53, "step": 10455, "token_acc": 0.8241071571627706, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.4864281998145667, "grad_norm": 8.043057441711426, "learning_rate": 5.6361404453547545e-06, "loss": 0.7120685577392578, "memory(GiB)": 36.53, "step": 10460, "token_acc": 0.8316590563165905, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.4866607180745163, "grad_norm": 8.706648826599121, "learning_rate": 5.632326601138935e-06, "loss": 0.8613996505737305, "memory(GiB)": 36.53, "step": 10465, "token_acc": 0.7830508474576271, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.4868932363344659, "grad_norm": 7.210156440734863, "learning_rate": 5.628512383008874e-06, "loss": 0.668830156326294, "memory(GiB)": 36.53, "step": 10470, "token_acc": 0.8363457114689451, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.4871257545944155, "grad_norm": 6.831915855407715, "learning_rate": 5.624697793220035e-06, "loss": 0.7384651660919189, "memory(GiB)": 36.53, "step": 10475, "token_acc": 0.8148029477731497, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.4873582728543651, "grad_norm": 8.7930269241333, "learning_rate": 5.620882834028103e-06, "loss": 0.6526782989501954, "memory(GiB)": 36.53, "step": 10480, "token_acc": 0.8457166057653268, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.4875907911143147, "grad_norm": 6.194753170013428, "learning_rate": 5.617067507688983e-06, "loss": 0.7427937984466553, "memory(GiB)": 36.53, "step": 10485, "token_acc": 0.8137285491419657, "train_speed(iter/s)": 0.095977 }, { "epoch": 0.4878233093742643, "grad_norm": 7.289088726043701, "learning_rate": 5.613251816458794e-06, "loss": 0.7535356521606446, "memory(GiB)": 36.53, "step": 10490, "token_acc": 0.8124249699879952, "train_speed(iter/s)": 0.096002 }, { "epoch": 0.4880558276342139, "grad_norm": 8.472503662109375, "learning_rate": 5.609435762593873e-06, "loss": 0.8218043327331543, "memory(GiB)": 36.53, "step": 10495, "token_acc": 0.7921367521367522, "train_speed(iter/s)": 0.096028 }, { "epoch": 0.4882883458941635, "grad_norm": 6.736743450164795, "learning_rate": 5.605619348350768e-06, "loss": 0.668080186843872, "memory(GiB)": 36.53, "step": 10500, "token_acc": 0.8355832467982001, "train_speed(iter/s)": 0.096054 }, { "epoch": 0.4882883458941635, "eval_loss": 0.6022413372993469, "eval_runtime": 293.5956, "eval_samples_per_second": 11.836, "eval_steps_per_second": 11.836, "step": 10500 }, { "epoch": 0.4885208641541131, "grad_norm": 6.929119110107422, "learning_rate": 5.6018025759862445e-06, "loss": 0.7089277267456054, "memory(GiB)": 36.53, "step": 10505, "token_acc": 0.8237999102736653, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.4887533824140627, "grad_norm": 5.99254846572876, "learning_rate": 5.597985447757278e-06, "loss": 0.653064489364624, "memory(GiB)": 36.53, "step": 10510, "token_acc": 0.8375241779497099, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.4889859006740123, "grad_norm": 6.13014030456543, "learning_rate": 5.594167965921055e-06, "loss": 0.7110246658325196, "memory(GiB)": 36.53, "step": 10515, "token_acc": 0.82035494386092, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.4892184189339619, "grad_norm": 8.600988388061523, "learning_rate": 5.590350132734966e-06, "loss": 0.6460587024688721, "memory(GiB)": 36.53, "step": 10520, "token_acc": 0.8511146496815286, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.48945093719391153, "grad_norm": 7.910162925720215, "learning_rate": 5.586531950456619e-06, "loss": 0.7574851512908936, "memory(GiB)": 36.53, "step": 10525, "token_acc": 0.8053097345132744, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.4896834554538611, "grad_norm": 6.965766429901123, "learning_rate": 5.582713421343822e-06, "loss": 0.595002555847168, "memory(GiB)": 36.53, "step": 10530, "token_acc": 0.8482737734706238, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.48991597371381074, "grad_norm": 7.77105188369751, "learning_rate": 5.578894547654586e-06, "loss": 0.7004836082458497, "memory(GiB)": 36.53, "step": 10535, "token_acc": 0.8149184149184149, "train_speed(iter/s)": 0.095974 }, { "epoch": 0.4901484919737603, "grad_norm": 8.512481689453125, "learning_rate": 5.57507533164713e-06, "loss": 0.7674338340759277, "memory(GiB)": 36.53, "step": 10540, "token_acc": 0.8025250890255746, "train_speed(iter/s)": 0.095999 }, { "epoch": 0.4903810102337099, "grad_norm": 4.976019382476807, "learning_rate": 5.571255775579878e-06, "loss": 0.6822651386260986, "memory(GiB)": 36.53, "step": 10545, "token_acc": 0.8155166249553093, "train_speed(iter/s)": 0.096025 }, { "epoch": 0.4906135284936595, "grad_norm": 7.175931930541992, "learning_rate": 5.567435881711446e-06, "loss": 0.7229970932006836, "memory(GiB)": 36.53, "step": 10550, "token_acc": 0.8246963562753037, "train_speed(iter/s)": 0.09605 }, { "epoch": 0.4906135284936595, "eval_loss": 0.599609375, "eval_runtime": 295.1161, "eval_samples_per_second": 11.775, "eval_steps_per_second": 11.775, "step": 10550 }, { "epoch": 0.4908460467536091, "grad_norm": 6.382102012634277, "learning_rate": 5.56361565230066e-06, "loss": 0.6602955341339112, "memory(GiB)": 36.53, "step": 10555, "token_acc": 0.8243287662446986, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.49107856501355873, "grad_norm": 7.595702648162842, "learning_rate": 5.559795089606536e-06, "loss": 0.6507344722747803, "memory(GiB)": 36.53, "step": 10560, "token_acc": 0.8347368421052631, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.4913110832735083, "grad_norm": 8.050884246826172, "learning_rate": 5.555974195888293e-06, "loss": 0.6467938899993897, "memory(GiB)": 36.53, "step": 10565, "token_acc": 0.8394875659382065, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.49154360153345794, "grad_norm": 5.628944396972656, "learning_rate": 5.552152973405343e-06, "loss": 0.6498900890350342, "memory(GiB)": 36.53, "step": 10570, "token_acc": 0.831989247311828, "train_speed(iter/s)": 0.095893 }, { "epoch": 0.4917761197934075, "grad_norm": 7.6785454750061035, "learning_rate": 5.548331424417293e-06, "loss": 0.6663597106933594, "memory(GiB)": 36.53, "step": 10575, "token_acc": 0.8286666666666667, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.49200863805335715, "grad_norm": 6.159049987792969, "learning_rate": 5.54450955118394e-06, "loss": 0.5869782447814942, "memory(GiB)": 36.53, "step": 10580, "token_acc": 0.858295334970186, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.4922411563133067, "grad_norm": 5.802601337432861, "learning_rate": 5.54068735596528e-06, "loss": 0.6343412399291992, "memory(GiB)": 36.53, "step": 10585, "token_acc": 0.8386798272671191, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.4924736745732563, "grad_norm": 7.778030872344971, "learning_rate": 5.536864841021492e-06, "loss": 0.7568382740020752, "memory(GiB)": 36.53, "step": 10590, "token_acc": 0.8063575386410032, "train_speed(iter/s)": 0.095994 }, { "epoch": 0.49270619283320594, "grad_norm": 6.506865978240967, "learning_rate": 5.533042008612949e-06, "loss": 0.6708489418029785, "memory(GiB)": 36.53, "step": 10595, "token_acc": 0.8332750786438309, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.4929387110931555, "grad_norm": 7.4949493408203125, "learning_rate": 5.529218861000208e-06, "loss": 0.6150759220123291, "memory(GiB)": 36.53, "step": 10600, "token_acc": 0.8408723747980614, "train_speed(iter/s)": 0.096045 }, { "epoch": 0.4929387110931555, "eval_loss": 0.6005235910415649, "eval_runtime": 295.1347, "eval_samples_per_second": 11.774, "eval_steps_per_second": 11.774, "step": 10600 }, { "epoch": 0.49317122935310514, "grad_norm": 10.075146675109863, "learning_rate": 5.5253954004440146e-06, "loss": 0.6699877262115479, "memory(GiB)": 36.53, "step": 10605, "token_acc": 0.824314389822924, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.4934037476130547, "grad_norm": 7.288110256195068, "learning_rate": 5.521571629205301e-06, "loss": 0.6553449153900146, "memory(GiB)": 36.53, "step": 10610, "token_acc": 0.8335005015045135, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.49363626587300435, "grad_norm": 6.840569972991943, "learning_rate": 5.517747549545179e-06, "loss": 0.722406530380249, "memory(GiB)": 36.53, "step": 10615, "token_acc": 0.8230827638572513, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.49386878413295393, "grad_norm": 6.8756022453308105, "learning_rate": 5.513923163724946e-06, "loss": 0.6679422855377197, "memory(GiB)": 36.53, "step": 10620, "token_acc": 0.8346281908990011, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.49410130239290356, "grad_norm": 8.059365272521973, "learning_rate": 5.510098474006079e-06, "loss": 0.6776402473449707, "memory(GiB)": 36.53, "step": 10625, "token_acc": 0.8284745762711865, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.49433382065285314, "grad_norm": 8.242023468017578, "learning_rate": 5.506273482650237e-06, "loss": 0.7858617782592774, "memory(GiB)": 36.53, "step": 10630, "token_acc": 0.8071585098612125, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.49456633891280277, "grad_norm": 7.4410786628723145, "learning_rate": 5.502448191919253e-06, "loss": 0.5851879119873047, "memory(GiB)": 36.53, "step": 10635, "token_acc": 0.859278518037049, "train_speed(iter/s)": 0.095963 }, { "epoch": 0.49479885717275235, "grad_norm": 6.335245132446289, "learning_rate": 5.498622604075139e-06, "loss": 0.6306666374206543, "memory(GiB)": 36.53, "step": 10640, "token_acc": 0.8379658875552748, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.4950313754327019, "grad_norm": 9.677199363708496, "learning_rate": 5.4947967213800855e-06, "loss": 0.5713845252990722, "memory(GiB)": 36.53, "step": 10645, "token_acc": 0.8544303797468354, "train_speed(iter/s)": 0.096014 }, { "epoch": 0.49526389369265156, "grad_norm": 7.010994911193848, "learning_rate": 5.490970546096454e-06, "loss": 0.6432509899139405, "memory(GiB)": 36.53, "step": 10650, "token_acc": 0.8430942687128092, "train_speed(iter/s)": 0.096038 }, { "epoch": 0.49526389369265156, "eval_loss": 0.601109504699707, "eval_runtime": 292.8707, "eval_samples_per_second": 11.865, "eval_steps_per_second": 11.865, "step": 10650 }, { "epoch": 0.49549641195260113, "grad_norm": 9.233794212341309, "learning_rate": 5.487144080486781e-06, "loss": 0.7560394287109375, "memory(GiB)": 36.53, "step": 10655, "token_acc": 0.8235558192108512, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.49572893021255077, "grad_norm": 10.122901916503906, "learning_rate": 5.483317326813771e-06, "loss": 0.6702571868896484, "memory(GiB)": 36.53, "step": 10660, "token_acc": 0.8401162790697675, "train_speed(iter/s)": 0.095835 }, { "epoch": 0.49596144847250034, "grad_norm": 7.147398948669434, "learning_rate": 5.479490287340305e-06, "loss": 0.6192568302154541, "memory(GiB)": 36.53, "step": 10665, "token_acc": 0.8434712084347121, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.49619396673245, "grad_norm": 7.782430648803711, "learning_rate": 5.475662964329428e-06, "loss": 0.7436542510986328, "memory(GiB)": 36.53, "step": 10670, "token_acc": 0.814753556070129, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.49642648499239955, "grad_norm": 9.926347732543945, "learning_rate": 5.471835360044354e-06, "loss": 0.7577160358428955, "memory(GiB)": 36.53, "step": 10675, "token_acc": 0.8254643962848297, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.4966590032523492, "grad_norm": 6.964309215545654, "learning_rate": 5.468007476748463e-06, "loss": 0.68472318649292, "memory(GiB)": 36.53, "step": 10680, "token_acc": 0.8404074702886248, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.49689152151229876, "grad_norm": 8.409749031066895, "learning_rate": 5.464179316705302e-06, "loss": 0.8917485237121582, "memory(GiB)": 36.53, "step": 10685, "token_acc": 0.7695954487989887, "train_speed(iter/s)": 0.09596 }, { "epoch": 0.49712403977224834, "grad_norm": 8.865854263305664, "learning_rate": 5.460350882178581e-06, "loss": 0.704495906829834, "memory(GiB)": 36.53, "step": 10690, "token_acc": 0.8437796771130105, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.49735655803219797, "grad_norm": 8.874429702758789, "learning_rate": 5.45652217543217e-06, "loss": 0.704408597946167, "memory(GiB)": 36.53, "step": 10695, "token_acc": 0.8195391106783512, "train_speed(iter/s)": 0.096011 }, { "epoch": 0.49758907629214755, "grad_norm": 7.480283260345459, "learning_rate": 5.452693198730101e-06, "loss": 0.7443026542663574, "memory(GiB)": 36.53, "step": 10700, "token_acc": 0.8162226878180185, "train_speed(iter/s)": 0.096036 }, { "epoch": 0.49758907629214755, "eval_loss": 0.6013534665107727, "eval_runtime": 297.2319, "eval_samples_per_second": 11.691, "eval_steps_per_second": 11.691, "step": 10700 }, { "epoch": 0.4978215945520972, "grad_norm": 4.385597229003906, "learning_rate": 5.448863954336568e-06, "loss": 0.7800351142883301, "memory(GiB)": 36.53, "step": 10705, "token_acc": 0.8231241158987237, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.49805411281204676, "grad_norm": 7.6840291023254395, "learning_rate": 5.445034444515919e-06, "loss": 0.6394901752471924, "memory(GiB)": 36.53, "step": 10710, "token_acc": 0.8491237677984665, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.4982866310719964, "grad_norm": 8.534746170043945, "learning_rate": 5.441204671532664e-06, "loss": 0.5974376201629639, "memory(GiB)": 36.53, "step": 10715, "token_acc": 0.8384531984098301, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.49851914933194597, "grad_norm": 6.349865436553955, "learning_rate": 5.437374637651463e-06, "loss": 0.611474084854126, "memory(GiB)": 36.53, "step": 10720, "token_acc": 0.8560975609756097, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.4987516675918956, "grad_norm": 8.979010581970215, "learning_rate": 5.433544345137137e-06, "loss": 0.7590946197509766, "memory(GiB)": 36.53, "step": 10725, "token_acc": 0.8189058171745153, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.4989841858518452, "grad_norm": 6.776154518127441, "learning_rate": 5.429713796254654e-06, "loss": 0.6038641452789306, "memory(GiB)": 40.03, "step": 10730, "token_acc": 0.8517397881996974, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.4992167041117948, "grad_norm": 5.198429584503174, "learning_rate": 5.425882993269136e-06, "loss": 0.6112579345703125, "memory(GiB)": 40.03, "step": 10735, "token_acc": 0.853542234332425, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.4994492223717444, "grad_norm": 8.502971649169922, "learning_rate": 5.4220519384458545e-06, "loss": 0.6879833221435547, "memory(GiB)": 40.03, "step": 10740, "token_acc": 0.8312799452429842, "train_speed(iter/s)": 0.095975 }, { "epoch": 0.49968174063169396, "grad_norm": 8.349090576171875, "learning_rate": 5.418220634050232e-06, "loss": 0.7486342430114746, "memory(GiB)": 40.03, "step": 10745, "token_acc": 0.8212837837837837, "train_speed(iter/s)": 0.095999 }, { "epoch": 0.4999142588916436, "grad_norm": 5.16649866104126, "learning_rate": 5.414389082347836e-06, "loss": 0.6454334735870362, "memory(GiB)": 40.03, "step": 10750, "token_acc": 0.8207745421795257, "train_speed(iter/s)": 0.096024 }, { "epoch": 0.4999142588916436, "eval_loss": 0.6007801294326782, "eval_runtime": 295.4874, "eval_samples_per_second": 11.76, "eval_steps_per_second": 11.76, "step": 10750 }, { "epoch": 0.5001467771515932, "grad_norm": 6.761124134063721, "learning_rate": 5.410557285604382e-06, "loss": 0.6681477546691894, "memory(GiB)": 40.03, "step": 10755, "token_acc": 0.8252858146515936, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.5003792954115428, "grad_norm": 6.287301063537598, "learning_rate": 5.406725246085728e-06, "loss": 0.8242059707641601, "memory(GiB)": 40.03, "step": 10760, "token_acc": 0.801002358490566, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.5006118136714924, "grad_norm": 6.488293170928955, "learning_rate": 5.40289296605788e-06, "loss": 0.6869512557983398, "memory(GiB)": 40.03, "step": 10765, "token_acc": 0.8194488438390878, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.500844331931442, "grad_norm": 5.360726833343506, "learning_rate": 5.399060447786982e-06, "loss": 0.7333622932434082, "memory(GiB)": 40.03, "step": 10770, "token_acc": 0.8263157894736842, "train_speed(iter/s)": 0.095871 }, { "epoch": 0.5010768501913916, "grad_norm": 9.921550750732422, "learning_rate": 5.39522769353932e-06, "loss": 0.6461817741394043, "memory(GiB)": 40.03, "step": 10775, "token_acc": 0.837942955920484, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.5013093684513412, "grad_norm": 6.710071563720703, "learning_rate": 5.39139470558132e-06, "loss": 0.7817147254943848, "memory(GiB)": 40.03, "step": 10780, "token_acc": 0.8170890188434048, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.5015418867112907, "grad_norm": 8.08139419555664, "learning_rate": 5.3875614861795466e-06, "loss": 0.6603563308715821, "memory(GiB)": 40.03, "step": 10785, "token_acc": 0.8272024729520866, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.5017744049712404, "grad_norm": 6.168495178222656, "learning_rate": 5.383728037600702e-06, "loss": 0.670966386795044, "memory(GiB)": 40.03, "step": 10790, "token_acc": 0.8350973028337316, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.50200692323119, "grad_norm": 5.787970542907715, "learning_rate": 5.379894362111621e-06, "loss": 0.5740962982177734, "memory(GiB)": 40.03, "step": 10795, "token_acc": 0.8688090737240075, "train_speed(iter/s)": 0.095997 }, { "epoch": 0.5022394414911396, "grad_norm": 7.659470558166504, "learning_rate": 5.376060461979272e-06, "loss": 0.7049031257629395, "memory(GiB)": 40.03, "step": 10800, "token_acc": 0.8216096324461344, "train_speed(iter/s)": 0.096022 }, { "epoch": 0.5022394414911396, "eval_loss": 0.5980068445205688, "eval_runtime": 292.8256, "eval_samples_per_second": 11.867, "eval_steps_per_second": 11.867, "step": 10800 }, { "epoch": 0.5024719597510892, "grad_norm": 9.11786937713623, "learning_rate": 5.372226339470764e-06, "loss": 0.6317077159881592, "memory(GiB)": 40.03, "step": 10805, "token_acc": 0.8252225018290575, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.5027044780110388, "grad_norm": 9.503100395202637, "learning_rate": 5.368391996853328e-06, "loss": 0.6980850219726562, "memory(GiB)": 40.03, "step": 10810, "token_acc": 0.8356107660455486, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.5029369962709884, "grad_norm": 10.004739761352539, "learning_rate": 5.364557436394331e-06, "loss": 0.7618881225585937, "memory(GiB)": 40.03, "step": 10815, "token_acc": 0.8205022643062989, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.5031695145309381, "grad_norm": 6.893964767456055, "learning_rate": 5.360722660361266e-06, "loss": 0.8586786270141602, "memory(GiB)": 40.03, "step": 10820, "token_acc": 0.8097094259390503, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.5034020327908876, "grad_norm": 6.224417209625244, "learning_rate": 5.3568876710217545e-06, "loss": 0.6337433815002441, "memory(GiB)": 40.03, "step": 10825, "token_acc": 0.845340383344349, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.5036345510508372, "grad_norm": 6.450043678283691, "learning_rate": 5.353052470643545e-06, "loss": 0.6762244224548339, "memory(GiB)": 40.03, "step": 10830, "token_acc": 0.8395061728395061, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.5038670693107868, "grad_norm": 7.789783000946045, "learning_rate": 5.349217061494509e-06, "loss": 0.7612596035003663, "memory(GiB)": 40.03, "step": 10835, "token_acc": 0.8089563019140484, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.5040995875707364, "grad_norm": 8.258014678955078, "learning_rate": 5.345381445842644e-06, "loss": 0.5151895046234131, "memory(GiB)": 40.03, "step": 10840, "token_acc": 0.8606260296540362, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.504332105830686, "grad_norm": 6.241860389709473, "learning_rate": 5.341545625956064e-06, "loss": 0.7108976364135742, "memory(GiB)": 40.03, "step": 10845, "token_acc": 0.8180952380952381, "train_speed(iter/s)": 0.095996 }, { "epoch": 0.5045646240906356, "grad_norm": 6.1465301513671875, "learning_rate": 5.337709604103013e-06, "loss": 0.6371690273284912, "memory(GiB)": 40.03, "step": 10850, "token_acc": 0.8452544704264099, "train_speed(iter/s)": 0.096021 }, { "epoch": 0.5045646240906356, "eval_loss": 0.5983362793922424, "eval_runtime": 290.2492, "eval_samples_per_second": 11.972, "eval_steps_per_second": 11.972, "step": 10850 }, { "epoch": 0.5047971423505853, "grad_norm": 7.173838138580322, "learning_rate": 5.3338733825518454e-06, "loss": 0.5999796867370606, "memory(GiB)": 40.03, "step": 10855, "token_acc": 0.8251333823200536, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.5050296606105348, "grad_norm": 6.503340721130371, "learning_rate": 5.330036963571039e-06, "loss": 0.519847059249878, "memory(GiB)": 40.03, "step": 10860, "token_acc": 0.8625856164383562, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.5052621788704844, "grad_norm": 7.504303455352783, "learning_rate": 5.326200349429185e-06, "loss": 0.5535295486450196, "memory(GiB)": 40.03, "step": 10865, "token_acc": 0.8465544871794872, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.505494697130434, "grad_norm": 7.1386799812316895, "learning_rate": 5.322363542394994e-06, "loss": 0.7013700485229493, "memory(GiB)": 40.03, "step": 10870, "token_acc": 0.8328748280605227, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.5057272153903837, "grad_norm": 6.778848171234131, "learning_rate": 5.318526544737288e-06, "loss": 0.6343185424804687, "memory(GiB)": 40.03, "step": 10875, "token_acc": 0.8425501937301867, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.5059597336503332, "grad_norm": 6.905848026275635, "learning_rate": 5.314689358725002e-06, "loss": 0.7110846996307373, "memory(GiB)": 40.03, "step": 10880, "token_acc": 0.8171044202434337, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.5061922519102828, "grad_norm": 10.56027889251709, "learning_rate": 5.31085198662718e-06, "loss": 0.6131344795227051, "memory(GiB)": 40.03, "step": 10885, "token_acc": 0.8370015948963317, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.5064247701702325, "grad_norm": 8.732833862304688, "learning_rate": 5.3070144307129834e-06, "loss": 0.6883892059326172, "memory(GiB)": 40.03, "step": 10890, "token_acc": 0.8293180890159249, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.506657288430182, "grad_norm": 7.830416679382324, "learning_rate": 5.303176693251675e-06, "loss": 0.6924023151397705, "memory(GiB)": 40.03, "step": 10895, "token_acc": 0.8322422258592471, "train_speed(iter/s)": 0.095998 }, { "epoch": 0.5068898066901316, "grad_norm": 7.292912483215332, "learning_rate": 5.2993387765126255e-06, "loss": 0.6057341575622559, "memory(GiB)": 40.03, "step": 10900, "token_acc": 0.8391360412637009, "train_speed(iter/s)": 0.096023 }, { "epoch": 0.5068898066901316, "eval_loss": 0.6003403663635254, "eval_runtime": 290.5621, "eval_samples_per_second": 11.96, "eval_steps_per_second": 11.96, "step": 10900 }, { "epoch": 0.5071223249500812, "grad_norm": 9.880396842956543, "learning_rate": 5.295500682765318e-06, "loss": 0.7102957725524902, "memory(GiB)": 40.03, "step": 10905, "token_acc": 0.8242088112303457, "train_speed(iter/s)": 0.095802 }, { "epoch": 0.5073548432100309, "grad_norm": 7.285651206970215, "learning_rate": 5.291662414279332e-06, "loss": 0.7098144054412842, "memory(GiB)": 40.03, "step": 10910, "token_acc": 0.8158052884615384, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.5075873614699804, "grad_norm": 7.897537708282471, "learning_rate": 5.287823973324355e-06, "loss": 0.6831938743591308, "memory(GiB)": 40.03, "step": 10915, "token_acc": 0.8302300109529025, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.50781987972993, "grad_norm": 7.627477645874023, "learning_rate": 5.283985362170176e-06, "loss": 0.592428731918335, "memory(GiB)": 40.03, "step": 10920, "token_acc": 0.8552695483244294, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.5080523979898797, "grad_norm": 8.800142288208008, "learning_rate": 5.280146583086686e-06, "loss": 0.6185301303863525, "memory(GiB)": 40.03, "step": 10925, "token_acc": 0.8554865424430642, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.5082849162498293, "grad_norm": 7.865505695343018, "learning_rate": 5.276307638343871e-06, "loss": 0.8139777183532715, "memory(GiB)": 40.03, "step": 10930, "token_acc": 0.8035950303991541, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.5085174345097788, "grad_norm": 6.902219295501709, "learning_rate": 5.272468530211821e-06, "loss": 0.7540923118591308, "memory(GiB)": 40.03, "step": 10935, "token_acc": 0.8090858416945373, "train_speed(iter/s)": 0.09595 }, { "epoch": 0.5087499527697285, "grad_norm": 7.471006870269775, "learning_rate": 5.268629260960714e-06, "loss": 0.6599215030670166, "memory(GiB)": 40.03, "step": 10940, "token_acc": 0.8369609856262834, "train_speed(iter/s)": 0.095975 }, { "epoch": 0.5089824710296781, "grad_norm": 7.935706615447998, "learning_rate": 5.2647898328608315e-06, "loss": 0.6189352035522461, "memory(GiB)": 40.03, "step": 10945, "token_acc": 0.8472222222222222, "train_speed(iter/s)": 0.096 }, { "epoch": 0.5092149892896276, "grad_norm": 7.151297092437744, "learning_rate": 5.260950248182546e-06, "loss": 0.6623498439788819, "memory(GiB)": 40.03, "step": 10950, "token_acc": 0.8259187620889749, "train_speed(iter/s)": 0.096024 }, { "epoch": 0.5092149892896276, "eval_loss": 0.5966300368309021, "eval_runtime": 293.6356, "eval_samples_per_second": 11.834, "eval_steps_per_second": 11.834, "step": 10950 }, { "epoch": 0.5094475075495772, "grad_norm": 7.964531421661377, "learning_rate": 5.257110509196322e-06, "loss": 0.622746467590332, "memory(GiB)": 40.03, "step": 10955, "token_acc": 0.825282466675729, "train_speed(iter/s)": 0.095802 }, { "epoch": 0.5096800258095269, "grad_norm": 8.218613624572754, "learning_rate": 5.253270618172717e-06, "loss": 0.5788079261779785, "memory(GiB)": 40.03, "step": 10960, "token_acc": 0.8508771929824561, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.5099125440694765, "grad_norm": 7.500878810882568, "learning_rate": 5.249430577382373e-06, "loss": 0.6466068744659423, "memory(GiB)": 40.03, "step": 10965, "token_acc": 0.84496996996997, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.510145062329426, "grad_norm": 9.07535171508789, "learning_rate": 5.245590389096031e-06, "loss": 0.6920000076293945, "memory(GiB)": 40.03, "step": 10970, "token_acc": 0.8427947598253275, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.5103775805893757, "grad_norm": 7.486384391784668, "learning_rate": 5.241750055584507e-06, "loss": 0.7849728107452393, "memory(GiB)": 40.03, "step": 10975, "token_acc": 0.8100734522560336, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.5106100988493253, "grad_norm": 8.976439476013184, "learning_rate": 5.237909579118713e-06, "loss": 0.6393332004547119, "memory(GiB)": 40.03, "step": 10980, "token_acc": 0.8386505317198386, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.5108426171092748, "grad_norm": 5.604680061340332, "learning_rate": 5.2340689619696375e-06, "loss": 0.7520250797271728, "memory(GiB)": 40.03, "step": 10985, "token_acc": 0.8177858439201452, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.5110751353692244, "grad_norm": 6.231554985046387, "learning_rate": 5.23022820640836e-06, "loss": 0.743222713470459, "memory(GiB)": 40.03, "step": 10990, "token_acc": 0.823906083244397, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.5113076536291741, "grad_norm": 6.983364105224609, "learning_rate": 5.226387314706035e-06, "loss": 0.6753977298736572, "memory(GiB)": 40.03, "step": 10995, "token_acc": 0.8220467658669905, "train_speed(iter/s)": 0.095998 }, { "epoch": 0.5115401718891237, "grad_norm": 7.330320358276367, "learning_rate": 5.222546289133902e-06, "loss": 0.686239767074585, "memory(GiB)": 40.03, "step": 11000, "token_acc": 0.8274095421069484, "train_speed(iter/s)": 0.096023 }, { "epoch": 0.5115401718891237, "eval_loss": 0.599044919013977, "eval_runtime": 293.7218, "eval_samples_per_second": 11.831, "eval_steps_per_second": 11.831, "step": 11000 }, { "epoch": 0.5117726901490732, "grad_norm": 6.857234001159668, "learning_rate": 5.218705131963275e-06, "loss": 0.589632225036621, "memory(GiB)": 40.03, "step": 11005, "token_acc": 0.825489095574086, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.5120052084090229, "grad_norm": 7.7748613357543945, "learning_rate": 5.214863845465553e-06, "loss": 0.5865228652954102, "memory(GiB)": 40.03, "step": 11010, "token_acc": 0.8419864559819413, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.5122377266689725, "grad_norm": 8.886736869812012, "learning_rate": 5.211022431912205e-06, "loss": 0.6716622829437255, "memory(GiB)": 40.03, "step": 11015, "token_acc": 0.8304431599229287, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.5124702449289221, "grad_norm": 6.145265102386475, "learning_rate": 5.207180893574778e-06, "loss": 0.6420755386352539, "memory(GiB)": 40.03, "step": 11020, "token_acc": 0.8376664552948636, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.5127027631888716, "grad_norm": 8.65272331237793, "learning_rate": 5.203339232724892e-06, "loss": 0.6653483867645263, "memory(GiB)": 40.03, "step": 11025, "token_acc": 0.8366606170598911, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.5129352814488213, "grad_norm": 8.551532745361328, "learning_rate": 5.19949745163424e-06, "loss": 0.6726420402526856, "memory(GiB)": 40.03, "step": 11030, "token_acc": 0.8368983957219251, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.5131677997087709, "grad_norm": 8.132074356079102, "learning_rate": 5.195655552574585e-06, "loss": 0.5192743301391601, "memory(GiB)": 40.03, "step": 11035, "token_acc": 0.8695299837925445, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.5134003179687204, "grad_norm": 5.116249084472656, "learning_rate": 5.1918135378177615e-06, "loss": 0.6704733848571778, "memory(GiB)": 40.03, "step": 11040, "token_acc": 0.8200234879624193, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.5136328362286701, "grad_norm": 7.496029853820801, "learning_rate": 5.1879714096356695e-06, "loss": 0.6737764358520508, "memory(GiB)": 40.03, "step": 11045, "token_acc": 0.8183986371379898, "train_speed(iter/s)": 0.095995 }, { "epoch": 0.5138653544886197, "grad_norm": 8.291666030883789, "learning_rate": 5.184129170300281e-06, "loss": 0.6830921649932862, "memory(GiB)": 40.03, "step": 11050, "token_acc": 0.8283981448448091, "train_speed(iter/s)": 0.096019 }, { "epoch": 0.5138653544886197, "eval_loss": 0.5962172746658325, "eval_runtime": 296.7148, "eval_samples_per_second": 11.712, "eval_steps_per_second": 11.712, "step": 11050 }, { "epoch": 0.5140978727485693, "grad_norm": 8.21020221710205, "learning_rate": 5.180286822083629e-06, "loss": 0.5733434200286865, "memory(GiB)": 40.03, "step": 11055, "token_acc": 0.8254607972492104, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.5143303910085189, "grad_norm": 7.53735876083374, "learning_rate": 5.176444367257812e-06, "loss": 0.7004610538482666, "memory(GiB)": 40.03, "step": 11060, "token_acc": 0.8246505717916137, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.5145629092684685, "grad_norm": 8.39875316619873, "learning_rate": 5.172601808094994e-06, "loss": 0.6619822025299072, "memory(GiB)": 40.03, "step": 11065, "token_acc": 0.8304177079614423, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.5147954275284181, "grad_norm": 4.925565242767334, "learning_rate": 5.168759146867397e-06, "loss": 0.6136856555938721, "memory(GiB)": 40.03, "step": 11070, "token_acc": 0.8474870017331022, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.5150279457883677, "grad_norm": 6.475769519805908, "learning_rate": 5.164916385847307e-06, "loss": 0.6914380550384521, "memory(GiB)": 40.03, "step": 11075, "token_acc": 0.8183244430498902, "train_speed(iter/s)": 0.095893 }, { "epoch": 0.5152604640483173, "grad_norm": 6.658247947692871, "learning_rate": 5.161073527307065e-06, "loss": 0.6014257907867432, "memory(GiB)": 40.03, "step": 11080, "token_acc": 0.857245337159254, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.5154929823082669, "grad_norm": 7.923861503601074, "learning_rate": 5.157230573519074e-06, "loss": 0.695302152633667, "memory(GiB)": 40.03, "step": 11085, "token_acc": 0.8259504708754796, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.5157255005682165, "grad_norm": 5.608233451843262, "learning_rate": 5.153387526755791e-06, "loss": 0.47870712280273436, "memory(GiB)": 40.03, "step": 11090, "token_acc": 0.877984952567877, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.515958018828166, "grad_norm": 6.8638129234313965, "learning_rate": 5.149544389289728e-06, "loss": 0.6515926837921142, "memory(GiB)": 40.03, "step": 11095, "token_acc": 0.8164341085271318, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.5161905370881157, "grad_norm": 9.983770370483398, "learning_rate": 5.145701163393449e-06, "loss": 0.7209710121154785, "memory(GiB)": 40.03, "step": 11100, "token_acc": 0.8370177719982661, "train_speed(iter/s)": 0.096014 }, { "epoch": 0.5161905370881157, "eval_loss": 0.5951548218727112, "eval_runtime": 292.8437, "eval_samples_per_second": 11.866, "eval_steps_per_second": 11.866, "step": 11100 }, { "epoch": 0.5164230553480653, "grad_norm": 7.779696464538574, "learning_rate": 5.141857851339574e-06, "loss": 0.5973493576049804, "memory(GiB)": 40.03, "step": 11105, "token_acc": 0.8255119248373886, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.516655573608015, "grad_norm": 8.005353927612305, "learning_rate": 5.138014455400773e-06, "loss": 0.6842738151550293, "memory(GiB)": 40.03, "step": 11110, "token_acc": 0.8142916493560449, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.5168880918679645, "grad_norm": 7.510136127471924, "learning_rate": 5.134170977849763e-06, "loss": 0.6187559604644776, "memory(GiB)": 40.03, "step": 11115, "token_acc": 0.8454123527311674, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.5171206101279141, "grad_norm": 7.755091667175293, "learning_rate": 5.130327420959311e-06, "loss": 0.6718905448913575, "memory(GiB)": 40.03, "step": 11120, "token_acc": 0.8301941466241669, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.5173531283878637, "grad_norm": 6.52587890625, "learning_rate": 5.126483787002231e-06, "loss": 0.6481473922729493, "memory(GiB)": 40.03, "step": 11125, "token_acc": 0.8374822190611664, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.5175856466478133, "grad_norm": 9.18529224395752, "learning_rate": 5.122640078251383e-06, "loss": 0.6470609188079834, "memory(GiB)": 40.03, "step": 11130, "token_acc": 0.8411287205257054, "train_speed(iter/s)": 0.095915 }, { "epoch": 0.5178181649077629, "grad_norm": 6.860177516937256, "learning_rate": 5.118796296979671e-06, "loss": 0.6926799297332764, "memory(GiB)": 40.03, "step": 11135, "token_acc": 0.8272789581905414, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.5180506831677125, "grad_norm": 6.139833927154541, "learning_rate": 5.11495244546004e-06, "loss": 0.5168371200561523, "memory(GiB)": 40.03, "step": 11140, "token_acc": 0.874955595026643, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.5182832014276622, "grad_norm": 7.04421854019165, "learning_rate": 5.111108525965478e-06, "loss": 0.703952693939209, "memory(GiB)": 40.03, "step": 11145, "token_acc": 0.8202391118701964, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.5185157196876117, "grad_norm": 8.804143905639648, "learning_rate": 5.107264540769016e-06, "loss": 0.6630299091339111, "memory(GiB)": 40.03, "step": 11150, "token_acc": 0.8420208500400962, "train_speed(iter/s)": 0.096012 }, { "epoch": 0.5185157196876117, "eval_loss": 0.5949863791465759, "eval_runtime": 298.501, "eval_samples_per_second": 11.642, "eval_steps_per_second": 11.642, "step": 11150 }, { "epoch": 0.5187482379475613, "grad_norm": 7.5537028312683105, "learning_rate": 5.103420492143718e-06, "loss": 0.6842432975769043, "memory(GiB)": 40.03, "step": 11155, "token_acc": 0.8257057996934083, "train_speed(iter/s)": 0.095789 }, { "epoch": 0.5189807562075109, "grad_norm": 7.630836486816406, "learning_rate": 5.0995763823626905e-06, "loss": 0.6884300708770752, "memory(GiB)": 40.03, "step": 11160, "token_acc": 0.8314069350338781, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.5192132744674606, "grad_norm": 6.869633674621582, "learning_rate": 5.0957322136990724e-06, "loss": 0.5924717903137207, "memory(GiB)": 40.03, "step": 11165, "token_acc": 0.8458379992534528, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.5194457927274101, "grad_norm": 7.150241374969482, "learning_rate": 5.091887988426043e-06, "loss": 0.7052815914154053, "memory(GiB)": 40.03, "step": 11170, "token_acc": 0.8319569120287253, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.5196783109873597, "grad_norm": 7.968418598175049, "learning_rate": 5.088043708816807e-06, "loss": 0.651512622833252, "memory(GiB)": 40.03, "step": 11175, "token_acc": 0.833808844507846, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.5199108292473094, "grad_norm": 8.40583324432373, "learning_rate": 5.08419937714461e-06, "loss": 0.6126296520233154, "memory(GiB)": 40.03, "step": 11180, "token_acc": 0.8552202283849919, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.5201433475072589, "grad_norm": 7.4501519203186035, "learning_rate": 5.0803549956827196e-06, "loss": 0.6180335998535156, "memory(GiB)": 40.03, "step": 11185, "token_acc": 0.8421973407977607, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.5203758657672085, "grad_norm": 6.713845729827881, "learning_rate": 5.07651056670444e-06, "loss": 0.6531442642211914, "memory(GiB)": 40.03, "step": 11190, "token_acc": 0.8341672623302359, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.5206083840271581, "grad_norm": 8.381232261657715, "learning_rate": 5.072666092483101e-06, "loss": 0.7437289237976075, "memory(GiB)": 40.03, "step": 11195, "token_acc": 0.8160783150400475, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.5208409022871078, "grad_norm": 7.857691287994385, "learning_rate": 5.068821575292057e-06, "loss": 0.7598735809326171, "memory(GiB)": 40.03, "step": 11200, "token_acc": 0.8187071144817624, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.5208409022871078, "eval_loss": 0.5962448120117188, "eval_runtime": 296.2073, "eval_samples_per_second": 11.732, "eval_steps_per_second": 11.732, "step": 11200 }, { "epoch": 0.5210734205470573, "grad_norm": 7.9069132804870605, "learning_rate": 5.06497701740469e-06, "loss": 0.6560911178588867, "memory(GiB)": 40.03, "step": 11205, "token_acc": 0.824959984710576, "train_speed(iter/s)": 0.095784 }, { "epoch": 0.5213059388070069, "grad_norm": 6.1809258460998535, "learning_rate": 5.061132421094408e-06, "loss": 0.7430883884429932, "memory(GiB)": 40.03, "step": 11210, "token_acc": 0.7977564102564103, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.5215384570669566, "grad_norm": 9.742097854614258, "learning_rate": 5.057287788634636e-06, "loss": 0.5030066490173339, "memory(GiB)": 40.03, "step": 11215, "token_acc": 0.880013596193066, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.5217709753269062, "grad_norm": 7.676933288574219, "learning_rate": 5.053443122298827e-06, "loss": 0.5586160659790039, "memory(GiB)": 40.03, "step": 11220, "token_acc": 0.8607882052736037, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.5220034935868557, "grad_norm": 8.416631698608398, "learning_rate": 5.049598424360449e-06, "loss": 0.7418983936309814, "memory(GiB)": 40.03, "step": 11225, "token_acc": 0.8017057569296375, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.5222360118468053, "grad_norm": 5.841848373413086, "learning_rate": 5.045753697092993e-06, "loss": 0.7004424095153808, "memory(GiB)": 40.03, "step": 11230, "token_acc": 0.8233865371269952, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.522468530106755, "grad_norm": 9.230901718139648, "learning_rate": 5.041908942769963e-06, "loss": 0.6646398544311524, "memory(GiB)": 40.03, "step": 11235, "token_acc": 0.8307048599935629, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.5227010483667045, "grad_norm": 8.23694896697998, "learning_rate": 5.038064163664881e-06, "loss": 0.5914628028869628, "memory(GiB)": 40.03, "step": 11240, "token_acc": 0.854236034036256, "train_speed(iter/s)": 0.095953 }, { "epoch": 0.5229335666266541, "grad_norm": 6.6772308349609375, "learning_rate": 5.0342193620512825e-06, "loss": 0.7465476989746094, "memory(GiB)": 40.03, "step": 11245, "token_acc": 0.8043956043956044, "train_speed(iter/s)": 0.095977 }, { "epoch": 0.5231660848866038, "grad_norm": 7.63891077041626, "learning_rate": 5.03037454020272e-06, "loss": 0.6081331253051758, "memory(GiB)": 40.03, "step": 11250, "token_acc": 0.8427753023551878, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.5231660848866038, "eval_loss": 0.5921972393989563, "eval_runtime": 290.7907, "eval_samples_per_second": 11.95, "eval_steps_per_second": 11.95, "step": 11250 }, { "epoch": 0.5233986031465534, "grad_norm": 7.560066223144531, "learning_rate": 5.026529700392754e-06, "loss": 0.7295114040374756, "memory(GiB)": 40.03, "step": 11255, "token_acc": 0.8256236840426211, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.5236311214065029, "grad_norm": 6.873581409454346, "learning_rate": 5.022684844894957e-06, "loss": 0.7038755893707276, "memory(GiB)": 40.03, "step": 11260, "token_acc": 0.8330605564648118, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.5238636396664526, "grad_norm": 6.969046115875244, "learning_rate": 5.0188399759829106e-06, "loss": 0.6631447792053222, "memory(GiB)": 40.03, "step": 11265, "token_acc": 0.8317152103559871, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.5240961579264022, "grad_norm": 6.5216779708862305, "learning_rate": 5.014995095930205e-06, "loss": 0.6236719608306884, "memory(GiB)": 40.03, "step": 11270, "token_acc": 0.8280766852195424, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.5243286761863517, "grad_norm": 9.973017692565918, "learning_rate": 5.011150207010437e-06, "loss": 0.6285340785980225, "memory(GiB)": 40.03, "step": 11275, "token_acc": 0.8375690607734807, "train_speed(iter/s)": 0.095883 }, { "epoch": 0.5245611944463013, "grad_norm": 9.72734260559082, "learning_rate": 5.007305311497206e-06, "loss": 0.6574903964996338, "memory(GiB)": 40.03, "step": 11280, "token_acc": 0.8370991253644315, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.524793712706251, "grad_norm": 8.460036277770996, "learning_rate": 5.003460411664118e-06, "loss": 0.6372312068939209, "memory(GiB)": 40.03, "step": 11285, "token_acc": 0.8438914027149321, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.5250262309662006, "grad_norm": 7.125070095062256, "learning_rate": 4.9996155097847834e-06, "loss": 0.6930059432983399, "memory(GiB)": 40.03, "step": 11290, "token_acc": 0.8248175182481752, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.5252587492261501, "grad_norm": 4.946784019470215, "learning_rate": 4.995770608132809e-06, "loss": 0.7608030796051025, "memory(GiB)": 40.03, "step": 11295, "token_acc": 0.8132656109949208, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.5254912674860998, "grad_norm": 6.038403511047363, "learning_rate": 4.991925708981806e-06, "loss": 0.7435698509216309, "memory(GiB)": 40.03, "step": 11300, "token_acc": 0.8154296875, "train_speed(iter/s)": 0.096003 }, { "epoch": 0.5254912674860998, "eval_loss": 0.5952001810073853, "eval_runtime": 291.1034, "eval_samples_per_second": 11.937, "eval_steps_per_second": 11.937, "step": 11300 }, { "epoch": 0.5257237857460494, "grad_norm": 8.406840324401855, "learning_rate": 4.9880808146053785e-06, "loss": 0.5746410369873047, "memory(GiB)": 40.03, "step": 11305, "token_acc": 0.826413462924119, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.525956304005999, "grad_norm": 7.57234525680542, "learning_rate": 4.984235927277135e-06, "loss": 0.6861493587493896, "memory(GiB)": 40.03, "step": 11310, "token_acc": 0.8226790876967556, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.5261888222659485, "grad_norm": 7.232485294342041, "learning_rate": 4.980391049270673e-06, "loss": 0.6427253246307373, "memory(GiB)": 40.03, "step": 11315, "token_acc": 0.8424116424116425, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.5264213405258982, "grad_norm": 6.939469337463379, "learning_rate": 4.976546182859591e-06, "loss": 0.7365629196166992, "memory(GiB)": 40.03, "step": 11320, "token_acc": 0.8238817891373802, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.5266538587858478, "grad_norm": 9.328145027160645, "learning_rate": 4.972701330317472e-06, "loss": 0.7492372989654541, "memory(GiB)": 40.03, "step": 11325, "token_acc": 0.818961818961819, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.5268863770457973, "grad_norm": 5.637638568878174, "learning_rate": 4.968856493917902e-06, "loss": 0.6783174991607666, "memory(GiB)": 40.03, "step": 11330, "token_acc": 0.8413669064748202, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.527118895305747, "grad_norm": 6.447477340698242, "learning_rate": 4.965011675934447e-06, "loss": 0.6275362014770508, "memory(GiB)": 40.03, "step": 11335, "token_acc": 0.8448905109489051, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.5273514135656966, "grad_norm": 6.677289009094238, "learning_rate": 4.961166878640671e-06, "loss": 0.6838769912719727, "memory(GiB)": 40.03, "step": 11340, "token_acc": 0.80778739184178, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.5275839318256462, "grad_norm": 8.42552661895752, "learning_rate": 4.957322104310115e-06, "loss": 0.6776114940643311, "memory(GiB)": 40.03, "step": 11345, "token_acc": 0.8311827956989247, "train_speed(iter/s)": 0.09598 }, { "epoch": 0.5278164500855957, "grad_norm": 6.210771560668945, "learning_rate": 4.953477355216318e-06, "loss": 0.6211733818054199, "memory(GiB)": 40.03, "step": 11350, "token_acc": 0.8471243042671615, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.5278164500855957, "eval_loss": 0.5934306383132935, "eval_runtime": 291.2941, "eval_samples_per_second": 11.93, "eval_steps_per_second": 11.93, "step": 11350 }, { "epoch": 0.5280489683455454, "grad_norm": 9.925871849060059, "learning_rate": 4.949632633632797e-06, "loss": 0.672046422958374, "memory(GiB)": 40.03, "step": 11355, "token_acc": 0.8262175103694569, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.528281486605495, "grad_norm": 9.350289344787598, "learning_rate": 4.945787941833056e-06, "loss": 0.6249475479125977, "memory(GiB)": 40.03, "step": 11360, "token_acc": 0.8458480565371025, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.5285140048654446, "grad_norm": 9.15757942199707, "learning_rate": 4.941943282090578e-06, "loss": 0.7165769577026367, "memory(GiB)": 40.03, "step": 11365, "token_acc": 0.8218045112781955, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.5287465231253942, "grad_norm": 8.200997352600098, "learning_rate": 4.9380986566788296e-06, "loss": 0.7071670055389404, "memory(GiB)": 40.03, "step": 11370, "token_acc": 0.8172683289914403, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.5289790413853438, "grad_norm": 6.972626209259033, "learning_rate": 4.934254067871255e-06, "loss": 0.5738145351409912, "memory(GiB)": 40.03, "step": 11375, "token_acc": 0.8671679197994987, "train_speed(iter/s)": 0.095887 }, { "epoch": 0.5292115596452934, "grad_norm": 6.229203224182129, "learning_rate": 4.930409517941284e-06, "loss": 0.6540433406829834, "memory(GiB)": 40.03, "step": 11380, "token_acc": 0.8375670840787119, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.529444077905243, "grad_norm": 7.376658916473389, "learning_rate": 4.926565009162309e-06, "loss": 0.620716142654419, "memory(GiB)": 40.03, "step": 11385, "token_acc": 0.8458440131819847, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.5296765961651926, "grad_norm": 9.04910659790039, "learning_rate": 4.9227205438077145e-06, "loss": 0.5833307266235351, "memory(GiB)": 40.03, "step": 11390, "token_acc": 0.8477661334804192, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.5299091144251422, "grad_norm": 8.151079177856445, "learning_rate": 4.918876124150846e-06, "loss": 0.6190403938293457, "memory(GiB)": 40.03, "step": 11395, "token_acc": 0.8311345646437994, "train_speed(iter/s)": 0.09598 }, { "epoch": 0.5301416326850918, "grad_norm": 6.338625907897949, "learning_rate": 4.915031752465033e-06, "loss": 0.6119202613830567, "memory(GiB)": 40.03, "step": 11400, "token_acc": 0.8471164309031556, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.5301416326850918, "eval_loss": 0.5921990871429443, "eval_runtime": 291.4397, "eval_samples_per_second": 11.924, "eval_steps_per_second": 11.924, "step": 11400 }, { "epoch": 0.5303741509450414, "grad_norm": 7.004538059234619, "learning_rate": 4.911187431023565e-06, "loss": 0.5625624179840087, "memory(GiB)": 40.03, "step": 11405, "token_acc": 0.8265071992197182, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.530606669204991, "grad_norm": 7.3375701904296875, "learning_rate": 4.907343162099712e-06, "loss": 0.6645450115203857, "memory(GiB)": 40.03, "step": 11410, "token_acc": 0.8184036249564308, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.5308391874649406, "grad_norm": 7.548114776611328, "learning_rate": 4.90349894796671e-06, "loss": 0.6787260055541993, "memory(GiB)": 40.03, "step": 11415, "token_acc": 0.8279603223806572, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.5310717057248902, "grad_norm": 6.177823066711426, "learning_rate": 4.899654790897757e-06, "loss": 0.7073424339294434, "memory(GiB)": 40.03, "step": 11420, "token_acc": 0.8293135435992579, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.5313042239848398, "grad_norm": 6.371845245361328, "learning_rate": 4.895810693166026e-06, "loss": 0.6852655410766602, "memory(GiB)": 40.03, "step": 11425, "token_acc": 0.83436087135224, "train_speed(iter/s)": 0.095887 }, { "epoch": 0.5315367422447894, "grad_norm": 4.985559463500977, "learning_rate": 4.891966657044647e-06, "loss": 0.656800365447998, "memory(GiB)": 40.03, "step": 11430, "token_acc": 0.8411088573360379, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.531769260504739, "grad_norm": 8.7366943359375, "learning_rate": 4.888122684806721e-06, "loss": 0.5947208404541016, "memory(GiB)": 40.03, "step": 11435, "token_acc": 0.858000858000858, "train_speed(iter/s)": 0.095934 }, { "epoch": 0.5320017787646886, "grad_norm": 8.291216850280762, "learning_rate": 4.884278778725304e-06, "loss": 0.6520028114318848, "memory(GiB)": 40.03, "step": 11440, "token_acc": 0.8529519618628529, "train_speed(iter/s)": 0.095958 }, { "epoch": 0.5322342970246382, "grad_norm": 7.761775970458984, "learning_rate": 4.8804349410734195e-06, "loss": 0.5053174018859863, "memory(GiB)": 40.03, "step": 11445, "token_acc": 0.8743396226415094, "train_speed(iter/s)": 0.095982 }, { "epoch": 0.5324668152845878, "grad_norm": 7.208600044250488, "learning_rate": 4.876591174124045e-06, "loss": 0.5161089420318603, "memory(GiB)": 40.03, "step": 11450, "token_acc": 0.8536853685368537, "train_speed(iter/s)": 0.096005 }, { "epoch": 0.5324668152845878, "eval_loss": 0.5929927229881287, "eval_runtime": 290.0243, "eval_samples_per_second": 11.982, "eval_steps_per_second": 11.982, "step": 11450 }, { "epoch": 0.5326993335445375, "grad_norm": 8.39334774017334, "learning_rate": 4.872747480150121e-06, "loss": 0.6529646396636963, "memory(GiB)": 40.03, "step": 11455, "token_acc": 0.8263123836610131, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.532931851804487, "grad_norm": 7.040056228637695, "learning_rate": 4.8689038614245384e-06, "loss": 0.7571589946746826, "memory(GiB)": 40.03, "step": 11460, "token_acc": 0.8244972577696527, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.5331643700644366, "grad_norm": 6.09945821762085, "learning_rate": 4.865060320220151e-06, "loss": 0.5826333999633789, "memory(GiB)": 40.03, "step": 11465, "token_acc": 0.8391376451077943, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.5333968883243863, "grad_norm": 8.370718002319336, "learning_rate": 4.861216858809762e-06, "loss": 0.6425962448120117, "memory(GiB)": 40.03, "step": 11470, "token_acc": 0.8435257943286641, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.5336294065843358, "grad_norm": 8.308968544006348, "learning_rate": 4.857373479466132e-06, "loss": 0.6420434474945068, "memory(GiB)": 40.03, "step": 11475, "token_acc": 0.8433778419343197, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.5338619248442854, "grad_norm": 6.884545803070068, "learning_rate": 4.853530184461964e-06, "loss": 0.6627838134765625, "memory(GiB)": 40.03, "step": 11480, "token_acc": 0.8381901840490797, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.534094443104235, "grad_norm": 8.793212890625, "learning_rate": 4.8496869760699206e-06, "loss": 0.6520689487457275, "memory(GiB)": 40.03, "step": 11485, "token_acc": 0.8455631399317406, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.5343269613641847, "grad_norm": 9.045661926269531, "learning_rate": 4.845843856562609e-06, "loss": 0.5819193363189697, "memory(GiB)": 40.03, "step": 11490, "token_acc": 0.851056338028169, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.5345594796241342, "grad_norm": 7.690134525299072, "learning_rate": 4.842000828212586e-06, "loss": 0.7085441589355469, "memory(GiB)": 40.03, "step": 11495, "token_acc": 0.8147433423388653, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.5347919978840838, "grad_norm": 7.208715438842773, "learning_rate": 4.83815789329235e-06, "loss": 0.6616110801696777, "memory(GiB)": 40.03, "step": 11500, "token_acc": 0.8222143364088006, "train_speed(iter/s)": 0.096006 }, { "epoch": 0.5347919978840838, "eval_loss": 0.5930191874504089, "eval_runtime": 290.603, "eval_samples_per_second": 11.958, "eval_steps_per_second": 11.958, "step": 11500 }, { "epoch": 0.5350245161440335, "grad_norm": 6.396140098571777, "learning_rate": 4.8343150540743485e-06, "loss": 0.6394748687744141, "memory(GiB)": 40.03, "step": 11505, "token_acc": 0.8267575437585998, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.5352570344039831, "grad_norm": 7.510823726654053, "learning_rate": 4.830472312830971e-06, "loss": 0.6311664581298828, "memory(GiB)": 40.03, "step": 11510, "token_acc": 0.8289065194894791, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.5354895526639326, "grad_norm": 7.6930084228515625, "learning_rate": 4.8266296718345505e-06, "loss": 0.8164526939392089, "memory(GiB)": 40.03, "step": 11515, "token_acc": 0.8156723063223509, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.5357220709238822, "grad_norm": 9.583854675292969, "learning_rate": 4.822787133357356e-06, "loss": 0.7618258953094482, "memory(GiB)": 40.03, "step": 11520, "token_acc": 0.8230184581976113, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.5359545891838319, "grad_norm": 7.062460899353027, "learning_rate": 4.818944699671602e-06, "loss": 0.6841589927673339, "memory(GiB)": 40.03, "step": 11525, "token_acc": 0.8321906627489389, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.5361871074437814, "grad_norm": 9.660597801208496, "learning_rate": 4.815102373049435e-06, "loss": 0.7356135368347168, "memory(GiB)": 40.03, "step": 11530, "token_acc": 0.8156158357771262, "train_speed(iter/s)": 0.095914 }, { "epoch": 0.536419625703731, "grad_norm": 5.609492778778076, "learning_rate": 4.811260155762947e-06, "loss": 0.6264122009277344, "memory(GiB)": 40.03, "step": 11535, "token_acc": 0.8382756727073036, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.5366521439636807, "grad_norm": 8.03640079498291, "learning_rate": 4.8074180500841535e-06, "loss": 0.5942182064056396, "memory(GiB)": 40.03, "step": 11540, "token_acc": 0.8525579917381634, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.5368846622236303, "grad_norm": 6.38782262802124, "learning_rate": 4.8035760582850124e-06, "loss": 0.7426403522491455, "memory(GiB)": 40.03, "step": 11545, "token_acc": 0.8122102009273571, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.5371171804835798, "grad_norm": 7.080733299255371, "learning_rate": 4.799734182637413e-06, "loss": 0.6498960494995117, "memory(GiB)": 40.03, "step": 11550, "token_acc": 0.824015748031496, "train_speed(iter/s)": 0.096007 }, { "epoch": 0.5371171804835798, "eval_loss": 0.5918628573417664, "eval_runtime": 288.3253, "eval_samples_per_second": 12.052, "eval_steps_per_second": 12.052, "step": 11550 }, { "epoch": 0.5373496987435294, "grad_norm": 9.150640487670898, "learning_rate": 4.795892425413175e-06, "loss": 0.5990890979766845, "memory(GiB)": 40.03, "step": 11555, "token_acc": 0.8264292393641254, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.5375822170034791, "grad_norm": 8.70228099822998, "learning_rate": 4.792050788884049e-06, "loss": 0.6391530513763428, "memory(GiB)": 40.03, "step": 11560, "token_acc": 0.8391319324836376, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.5378147352634287, "grad_norm": 8.723603248596191, "learning_rate": 4.78820927532171e-06, "loss": 0.7898457527160645, "memory(GiB)": 40.03, "step": 11565, "token_acc": 0.8129608071400853, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.5380472535233782, "grad_norm": 7.317963600158691, "learning_rate": 4.784367886997766e-06, "loss": 0.6194943428039551, "memory(GiB)": 40.03, "step": 11570, "token_acc": 0.8442244224422443, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.5382797717833279, "grad_norm": 10.04311466217041, "learning_rate": 4.780526626183746e-06, "loss": 0.6792384147644043, "memory(GiB)": 40.03, "step": 11575, "token_acc": 0.8399412628487518, "train_speed(iter/s)": 0.095895 }, { "epoch": 0.5385122900432775, "grad_norm": 6.099349498748779, "learning_rate": 4.7766854951511115e-06, "loss": 0.5884709835052491, "memory(GiB)": 40.03, "step": 11580, "token_acc": 0.8562348668280871, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.538744808303227, "grad_norm": 9.135714530944824, "learning_rate": 4.772844496171236e-06, "loss": 0.5978000640869141, "memory(GiB)": 40.03, "step": 11585, "token_acc": 0.8556073092081691, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.5389773265631767, "grad_norm": 6.890235900878906, "learning_rate": 4.769003631515424e-06, "loss": 0.6094280242919922, "memory(GiB)": 40.03, "step": 11590, "token_acc": 0.8448572411420708, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.5392098448231263, "grad_norm": 7.566539287567139, "learning_rate": 4.765162903454896e-06, "loss": 0.5553547859191894, "memory(GiB)": 40.03, "step": 11595, "token_acc": 0.8636524196397033, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.5394423630830759, "grad_norm": 7.855575084686279, "learning_rate": 4.761322314260795e-06, "loss": 0.6286415100097656, "memory(GiB)": 40.03, "step": 11600, "token_acc": 0.8369634849455477, "train_speed(iter/s)": 0.096012 }, { "epoch": 0.5394423630830759, "eval_loss": 0.5954886078834534, "eval_runtime": 290.9344, "eval_samples_per_second": 11.944, "eval_steps_per_second": 11.944, "step": 11600 }, { "epoch": 0.5396748813430254, "grad_norm": 7.707352161407471, "learning_rate": 4.757481866204178e-06, "loss": 0.6133537292480469, "memory(GiB)": 40.03, "step": 11605, "token_acc": 0.8269178291089458, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.5399073996029751, "grad_norm": 8.7269926071167, "learning_rate": 4.75364156155602e-06, "loss": 0.6341817378997803, "memory(GiB)": 40.03, "step": 11610, "token_acc": 0.8378782218901756, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.5401399178629247, "grad_norm": 6.603315830230713, "learning_rate": 4.749801402587214e-06, "loss": 0.6399006366729736, "memory(GiB)": 40.03, "step": 11615, "token_acc": 0.8493666552550496, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.5403724361228742, "grad_norm": 6.983320713043213, "learning_rate": 4.745961391568564e-06, "loss": 0.7938889980316162, "memory(GiB)": 40.03, "step": 11620, "token_acc": 0.8095981271946937, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.5406049543828239, "grad_norm": 7.961282730102539, "learning_rate": 4.7421215307707846e-06, "loss": 0.546476936340332, "memory(GiB)": 40.03, "step": 11625, "token_acc": 0.86741494212557, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.5408374726427735, "grad_norm": 8.474226951599121, "learning_rate": 4.738281822464508e-06, "loss": 0.7628999710083008, "memory(GiB)": 40.03, "step": 11630, "token_acc": 0.8200692041522492, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.5410699909027231, "grad_norm": 6.8097639083862305, "learning_rate": 4.734442268920268e-06, "loss": 0.5580487251281738, "memory(GiB)": 40.03, "step": 11635, "token_acc": 0.8571891191709845, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.5413025091626726, "grad_norm": 6.000946521759033, "learning_rate": 4.730602872408516e-06, "loss": 0.720289659500122, "memory(GiB)": 40.03, "step": 11640, "token_acc": 0.8221534227726178, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.5415350274226223, "grad_norm": 6.937013149261475, "learning_rate": 4.7267636351996e-06, "loss": 0.656560754776001, "memory(GiB)": 40.03, "step": 11645, "token_acc": 0.8447912273302404, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.5417675456825719, "grad_norm": 8.265849113464355, "learning_rate": 4.722924559563784e-06, "loss": 0.7568728923797607, "memory(GiB)": 40.03, "step": 11650, "token_acc": 0.805571510626077, "train_speed(iter/s)": 0.096013 }, { "epoch": 0.5417675456825719, "eval_loss": 0.5918522477149963, "eval_runtime": 296.0765, "eval_samples_per_second": 11.737, "eval_steps_per_second": 11.737, "step": 11650 }, { "epoch": 0.5420000639425215, "grad_norm": 11.34377384185791, "learning_rate": 4.71908564777123e-06, "loss": 0.6885035514831543, "memory(GiB)": 40.03, "step": 11655, "token_acc": 0.8265431850211395, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.5422325822024711, "grad_norm": 8.528449058532715, "learning_rate": 4.7152469020920075e-06, "loss": 0.7828842163085937, "memory(GiB)": 40.03, "step": 11660, "token_acc": 0.8166794773251345, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.5424651004624207, "grad_norm": 8.462496757507324, "learning_rate": 4.711408324796081e-06, "loss": 0.7147085666656494, "memory(GiB)": 40.03, "step": 11665, "token_acc": 0.8189625558227414, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.5426976187223703, "grad_norm": 7.3300371170043945, "learning_rate": 4.707569918153323e-06, "loss": 0.6366332054138184, "memory(GiB)": 40.03, "step": 11670, "token_acc": 0.8369641602248771, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.5429301369823198, "grad_norm": 7.166123390197754, "learning_rate": 4.7037316844335e-06, "loss": 0.7396623611450195, "memory(GiB)": 40.03, "step": 11675, "token_acc": 0.8181201221581269, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.5431626552422695, "grad_norm": 8.054522514343262, "learning_rate": 4.699893625906279e-06, "loss": 0.653428602218628, "memory(GiB)": 40.03, "step": 11680, "token_acc": 0.8363309352517986, "train_speed(iter/s)": 0.095915 }, { "epoch": 0.5433951735022191, "grad_norm": 6.426419734954834, "learning_rate": 4.696055744841225e-06, "loss": 0.7306111335754395, "memory(GiB)": 40.03, "step": 11685, "token_acc": 0.8120168657765284, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.5436276917621687, "grad_norm": 8.696972846984863, "learning_rate": 4.692218043507791e-06, "loss": 0.5761496067047119, "memory(GiB)": 40.03, "step": 11690, "token_acc": 0.8655980271270037, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.5438602100221183, "grad_norm": 6.541189193725586, "learning_rate": 4.688380524175332e-06, "loss": 0.6662922859191894, "memory(GiB)": 40.03, "step": 11695, "token_acc": 0.8318356867779204, "train_speed(iter/s)": 0.095985 }, { "epoch": 0.5440927282820679, "grad_norm": 8.999557495117188, "learning_rate": 4.684543189113089e-06, "loss": 0.5919151782989502, "memory(GiB)": 40.03, "step": 11700, "token_acc": 0.8485790408525755, "train_speed(iter/s)": 0.096007 }, { "epoch": 0.5440927282820679, "eval_loss": 0.5910959839820862, "eval_runtime": 295.316, "eval_samples_per_second": 11.767, "eval_steps_per_second": 11.767, "step": 11700 }, { "epoch": 0.5443252465420175, "grad_norm": 11.527779579162598, "learning_rate": 4.6807060405902e-06, "loss": 0.6689294815063477, "memory(GiB)": 40.03, "step": 11705, "token_acc": 0.825695442970865, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.5445577648019672, "grad_norm": 7.013660430908203, "learning_rate": 4.6768690808756835e-06, "loss": 0.7742821216583252, "memory(GiB)": 40.03, "step": 11710, "token_acc": 0.8161696895173686, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.5447902830619167, "grad_norm": 6.9611735343933105, "learning_rate": 4.673032312238459e-06, "loss": 0.7051783084869385, "memory(GiB)": 40.03, "step": 11715, "token_acc": 0.8249914879128363, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.5450228013218663, "grad_norm": 8.261186599731445, "learning_rate": 4.669195736947321e-06, "loss": 0.6473368167877197, "memory(GiB)": 40.03, "step": 11720, "token_acc": 0.8500555349870418, "train_speed(iter/s)": 0.095867 }, { "epoch": 0.545255319581816, "grad_norm": 7.994866371154785, "learning_rate": 4.665359357270959e-06, "loss": 0.7445518493652343, "memory(GiB)": 40.03, "step": 11725, "token_acc": 0.8127433628318584, "train_speed(iter/s)": 0.095889 }, { "epoch": 0.5454878378417655, "grad_norm": 6.611346244812012, "learning_rate": 4.661523175477939e-06, "loss": 0.8385189056396485, "memory(GiB)": 40.03, "step": 11730, "token_acc": 0.8054982817869416, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.5457203561017151, "grad_norm": 6.896411418914795, "learning_rate": 4.657687193836718e-06, "loss": 0.5892057418823242, "memory(GiB)": 40.03, "step": 11735, "token_acc": 0.8499684144030322, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.5459528743616647, "grad_norm": 6.74559211730957, "learning_rate": 4.653851414615626e-06, "loss": 0.5817788600921631, "memory(GiB)": 40.03, "step": 11740, "token_acc": 0.8544546850998463, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.5461853926216144, "grad_norm": 5.590111255645752, "learning_rate": 4.650015840082881e-06, "loss": 0.7610962390899658, "memory(GiB)": 40.03, "step": 11745, "token_acc": 0.8103011539544047, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.5464179108815639, "grad_norm": 8.093123435974121, "learning_rate": 4.646180472506573e-06, "loss": 0.6921517848968506, "memory(GiB)": 40.03, "step": 11750, "token_acc": 0.8304626815265113, "train_speed(iter/s)": 0.096004 }, { "epoch": 0.5464179108815639, "eval_loss": 0.5902931094169617, "eval_runtime": 294.1316, "eval_samples_per_second": 11.814, "eval_steps_per_second": 11.814, "step": 11750 }, { "epoch": 0.5466504291415135, "grad_norm": 6.688976287841797, "learning_rate": 4.6423453141546795e-06, "loss": 0.65995774269104, "memory(GiB)": 40.03, "step": 11755, "token_acc": 0.8267956965306658, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.5468829474014631, "grad_norm": 7.701316833496094, "learning_rate": 4.638510367295041e-06, "loss": 0.6725353240966797, "memory(GiB)": 40.03, "step": 11760, "token_acc": 0.8244811818501583, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.5471154656614127, "grad_norm": 7.668078422546387, "learning_rate": 4.6346756341953844e-06, "loss": 0.6790274620056153, "memory(GiB)": 40.03, "step": 11765, "token_acc": 0.8225134008338296, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.5473479839213623, "grad_norm": 8.048089981079102, "learning_rate": 4.630841117123303e-06, "loss": 0.6271292686462402, "memory(GiB)": 40.03, "step": 11770, "token_acc": 0.8438142211261817, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.5475805021813119, "grad_norm": 9.47883415222168, "learning_rate": 4.6270068183462695e-06, "loss": 0.7323870182037353, "memory(GiB)": 40.03, "step": 11775, "token_acc": 0.8140096618357487, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.5478130204412616, "grad_norm": 8.775604248046875, "learning_rate": 4.623172740131617e-06, "loss": 0.5636983394622803, "memory(GiB)": 40.03, "step": 11780, "token_acc": 0.8589160115846091, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.5480455387012111, "grad_norm": 7.393858432769775, "learning_rate": 4.61933888474656e-06, "loss": 0.7762670993804932, "memory(GiB)": 40.03, "step": 11785, "token_acc": 0.8098495212038304, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.5482780569611607, "grad_norm": 8.181929588317871, "learning_rate": 4.615505254458171e-06, "loss": 0.6364175796508789, "memory(GiB)": 40.03, "step": 11790, "token_acc": 0.834214002642008, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.5485105752211104, "grad_norm": 6.371438026428223, "learning_rate": 4.6116718515333986e-06, "loss": 0.7543970108032226, "memory(GiB)": 40.03, "step": 11795, "token_acc": 0.8123417721518987, "train_speed(iter/s)": 0.095978 }, { "epoch": 0.54874309348106, "grad_norm": 7.374037742614746, "learning_rate": 4.607838678239048e-06, "loss": 0.6098850727081299, "memory(GiB)": 40.03, "step": 11800, "token_acc": 0.8399006034788783, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.54874309348106, "eval_loss": 0.5927510857582092, "eval_runtime": 294.3173, "eval_samples_per_second": 11.807, "eval_steps_per_second": 11.807, "step": 11800 }, { "epoch": 0.5489756117410095, "grad_norm": 11.768540382385254, "learning_rate": 4.604005736841794e-06, "loss": 0.762086009979248, "memory(GiB)": 40.03, "step": 11805, "token_acc": 0.8265537176810663, "train_speed(iter/s)": 0.095794 }, { "epoch": 0.5492081300009591, "grad_norm": 6.83040189743042, "learning_rate": 4.6001730296081755e-06, "loss": 0.6055526256561279, "memory(GiB)": 40.03, "step": 11810, "token_acc": 0.8458769633507853, "train_speed(iter/s)": 0.095817 }, { "epoch": 0.5494406482609088, "grad_norm": 7.891283988952637, "learning_rate": 4.596340558804588e-06, "loss": 0.7261328220367431, "memory(GiB)": 40.03, "step": 11815, "token_acc": 0.8048154093097913, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.5496731665208583, "grad_norm": 6.938855171203613, "learning_rate": 4.592508326697292e-06, "loss": 0.8132460594177247, "memory(GiB)": 40.03, "step": 11820, "token_acc": 0.8011363636363636, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.5499056847808079, "grad_norm": 6.9001336097717285, "learning_rate": 4.588676335552403e-06, "loss": 0.5898480892181397, "memory(GiB)": 40.03, "step": 11825, "token_acc": 0.8442064264849075, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.5501382030407576, "grad_norm": 6.822132587432861, "learning_rate": 4.584844587635896e-06, "loss": 0.6609106540679932, "memory(GiB)": 40.03, "step": 11830, "token_acc": 0.8293972506168488, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.5503707213007072, "grad_norm": 7.090336799621582, "learning_rate": 4.581013085213601e-06, "loss": 0.6086909770965576, "memory(GiB)": 40.03, "step": 11835, "token_acc": 0.8393269548003959, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.5506032395606567, "grad_norm": 7.868709564208984, "learning_rate": 4.577181830551208e-06, "loss": 0.7239909648895264, "memory(GiB)": 40.03, "step": 11840, "token_acc": 0.8222000664672648, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.5508357578206063, "grad_norm": 7.852298259735107, "learning_rate": 4.573350825914249e-06, "loss": 0.723827314376831, "memory(GiB)": 40.03, "step": 11845, "token_acc": 0.8253404381290704, "train_speed(iter/s)": 0.095977 }, { "epoch": 0.551068276080556, "grad_norm": 8.472371101379395, "learning_rate": 4.569520073568121e-06, "loss": 0.62130126953125, "memory(GiB)": 40.03, "step": 11850, "token_acc": 0.8579749103942652, "train_speed(iter/s)": 0.096001 }, { "epoch": 0.551068276080556, "eval_loss": 0.5887280702590942, "eval_runtime": 293.7275, "eval_samples_per_second": 11.831, "eval_steps_per_second": 11.831, "step": 11850 }, { "epoch": 0.5513007943405056, "grad_norm": 8.165416717529297, "learning_rate": 4.565689575778064e-06, "loss": 0.5155246734619141, "memory(GiB)": 40.03, "step": 11855, "token_acc": 0.8275770606438131, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.5515333126004551, "grad_norm": 7.239565372467041, "learning_rate": 4.561859334809172e-06, "loss": 0.5977548599243164, "memory(GiB)": 40.03, "step": 11860, "token_acc": 0.8501628664495114, "train_speed(iter/s)": 0.095817 }, { "epoch": 0.5517658308604048, "grad_norm": 8.126626968383789, "learning_rate": 4.558029352926379e-06, "loss": 0.8376049041748047, "memory(GiB)": 40.03, "step": 11865, "token_acc": 0.8080301129234629, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.5519983491203544, "grad_norm": 6.358272552490234, "learning_rate": 4.5541996323944775e-06, "loss": 0.6366981506347656, "memory(GiB)": 40.03, "step": 11870, "token_acc": 0.834573043736983, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.5522308673803039, "grad_norm": 7.45095682144165, "learning_rate": 4.550370175478096e-06, "loss": 0.7741560459136962, "memory(GiB)": 40.03, "step": 11875, "token_acc": 0.8243014394580863, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.5524633856402535, "grad_norm": 7.986889362335205, "learning_rate": 4.546540984441713e-06, "loss": 0.7031302452087402, "memory(GiB)": 40.03, "step": 11880, "token_acc": 0.8319423368740516, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.5526959039002032, "grad_norm": 7.913536548614502, "learning_rate": 4.542712061549646e-06, "loss": 0.679871940612793, "memory(GiB)": 40.03, "step": 11885, "token_acc": 0.8283246977547496, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.5529284221601528, "grad_norm": 7.337793350219727, "learning_rate": 4.538883409066055e-06, "loss": 0.6509832859039306, "memory(GiB)": 40.03, "step": 11890, "token_acc": 0.831765935214211, "train_speed(iter/s)": 0.095952 }, { "epoch": 0.5531609404201023, "grad_norm": 8.420065879821777, "learning_rate": 4.53505502925494e-06, "loss": 0.6353929996490478, "memory(GiB)": 40.03, "step": 11895, "token_acc": 0.8337801608579088, "train_speed(iter/s)": 0.095975 }, { "epoch": 0.553393458680052, "grad_norm": 6.823483943939209, "learning_rate": 4.531226924380144e-06, "loss": 0.7875633239746094, "memory(GiB)": 40.03, "step": 11900, "token_acc": 0.8031420327027894, "train_speed(iter/s)": 0.095998 }, { "epoch": 0.553393458680052, "eval_loss": 0.5897703170776367, "eval_runtime": 295.526, "eval_samples_per_second": 11.759, "eval_steps_per_second": 11.759, "step": 11900 }, { "epoch": 0.5536259769400016, "grad_norm": 8.597670555114746, "learning_rate": 4.527399096705338e-06, "loss": 0.6930715560913085, "memory(GiB)": 40.03, "step": 11905, "token_acc": 0.8261207670600308, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.5538584951999511, "grad_norm": 8.751448631286621, "learning_rate": 4.523571548494039e-06, "loss": 0.6656043052673339, "memory(GiB)": 40.03, "step": 11910, "token_acc": 0.8314430973797419, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.5540910134599007, "grad_norm": 6.747947692871094, "learning_rate": 4.5197442820095906e-06, "loss": 0.911475658416748, "memory(GiB)": 40.03, "step": 11915, "token_acc": 0.7812105926860026, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.5543235317198504, "grad_norm": 7.468943119049072, "learning_rate": 4.5159172995151786e-06, "loss": 0.767233419418335, "memory(GiB)": 40.03, "step": 11920, "token_acc": 0.814017094017094, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.5545560499798, "grad_norm": 7.417344093322754, "learning_rate": 4.512090603273811e-06, "loss": 0.7732417106628418, "memory(GiB)": 40.03, "step": 11925, "token_acc": 0.8210053126277074, "train_speed(iter/s)": 0.095883 }, { "epoch": 0.5547885682397495, "grad_norm": 7.629657745361328, "learning_rate": 4.508264195548336e-06, "loss": 0.7324337959289551, "memory(GiB)": 40.03, "step": 11930, "token_acc": 0.8334142787761049, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.5550210864996992, "grad_norm": 7.580898284912109, "learning_rate": 4.504438078601421e-06, "loss": 0.7392070770263672, "memory(GiB)": 40.03, "step": 11935, "token_acc": 0.822000711997152, "train_speed(iter/s)": 0.095928 }, { "epoch": 0.5552536047596488, "grad_norm": 8.211389541625977, "learning_rate": 4.500612254695571e-06, "loss": 0.8082739830017089, "memory(GiB)": 40.03, "step": 11940, "token_acc": 0.794435857805255, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.5554861230195984, "grad_norm": 6.073451519012451, "learning_rate": 4.496786726093116e-06, "loss": 0.686737060546875, "memory(GiB)": 40.03, "step": 11945, "token_acc": 0.8204729309271935, "train_speed(iter/s)": 0.095974 }, { "epoch": 0.555718641279548, "grad_norm": 6.639056205749512, "learning_rate": 4.492961495056204e-06, "loss": 0.7988576412200927, "memory(GiB)": 40.03, "step": 11950, "token_acc": 0.8012508686587908, "train_speed(iter/s)": 0.095996 }, { "epoch": 0.555718641279548, "eval_loss": 0.588585615158081, "eval_runtime": 295.4879, "eval_samples_per_second": 11.76, "eval_steps_per_second": 11.76, "step": 11950 }, { "epoch": 0.5559511595394976, "grad_norm": 6.849501609802246, "learning_rate": 4.489136563846814e-06, "loss": 0.7492703914642334, "memory(GiB)": 40.03, "step": 11955, "token_acc": 0.8257185645526841, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.5561836777994472, "grad_norm": 8.924171447753906, "learning_rate": 4.485311934726747e-06, "loss": 0.7083686351776123, "memory(GiB)": 40.03, "step": 11960, "token_acc": 0.8093106535362579, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.5564161960593967, "grad_norm": 5.738160133361816, "learning_rate": 4.4814876099576254e-06, "loss": 0.6795273780822754, "memory(GiB)": 40.03, "step": 11965, "token_acc": 0.822052067381317, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.5566487143193464, "grad_norm": 9.783073425292969, "learning_rate": 4.477663591800887e-06, "loss": 0.7246876716613769, "memory(GiB)": 40.03, "step": 11970, "token_acc": 0.8208363374188897, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.556881232579296, "grad_norm": 7.146920204162598, "learning_rate": 4.473839882517794e-06, "loss": 0.6436521530151367, "memory(GiB)": 40.03, "step": 11975, "token_acc": 0.8347107438016529, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.5571137508392456, "grad_norm": 7.048880577087402, "learning_rate": 4.470016484369423e-06, "loss": 0.695180606842041, "memory(GiB)": 40.03, "step": 11980, "token_acc": 0.8230152949745084, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.5573462690991952, "grad_norm": 5.961655139923096, "learning_rate": 4.466193399616669e-06, "loss": 0.6891386032104492, "memory(GiB)": 40.03, "step": 11985, "token_acc": 0.814453125, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.5575787873591448, "grad_norm": 6.8944878578186035, "learning_rate": 4.462370630520237e-06, "loss": 0.6040480613708497, "memory(GiB)": 40.03, "step": 11990, "token_acc": 0.848985208118335, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.5578113056190944, "grad_norm": 6.749599456787109, "learning_rate": 4.458548179340651e-06, "loss": 0.7882990360260009, "memory(GiB)": 40.03, "step": 11995, "token_acc": 0.8040925863804093, "train_speed(iter/s)": 0.09597 }, { "epoch": 0.558043823879044, "grad_norm": 8.951676368713379, "learning_rate": 4.4547260483382435e-06, "loss": 0.7259221076965332, "memory(GiB)": 40.03, "step": 12000, "token_acc": 0.825390625, "train_speed(iter/s)": 0.095992 }, { "epoch": 0.558043823879044, "eval_loss": 0.5902968049049377, "eval_runtime": 294.0747, "eval_samples_per_second": 11.817, "eval_steps_per_second": 11.817, "step": 12000 }, { "epoch": 0.5582763421389936, "grad_norm": 6.93931245803833, "learning_rate": 4.45090423977316e-06, "loss": 0.5834178447723388, "memory(GiB)": 40.03, "step": 12005, "token_acc": 0.827311676684998, "train_speed(iter/s)": 0.095789 }, { "epoch": 0.5585088603989432, "grad_norm": 9.755097389221191, "learning_rate": 4.447082755905351e-06, "loss": 0.7193635940551758, "memory(GiB)": 40.03, "step": 12010, "token_acc": 0.8245924875974486, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.5587413786588928, "grad_norm": 6.74008321762085, "learning_rate": 4.4432615989945794e-06, "loss": 0.5961836814880371, "memory(GiB)": 40.03, "step": 12015, "token_acc": 0.846640872317974, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.5589738969188424, "grad_norm": 10.712708473205566, "learning_rate": 4.439440771300412e-06, "loss": 0.5513602256774902, "memory(GiB)": 40.03, "step": 12020, "token_acc": 0.8571959836370397, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.559206415178792, "grad_norm": 7.622487545013428, "learning_rate": 4.435620275082227e-06, "loss": 0.5959040641784668, "memory(GiB)": 40.03, "step": 12025, "token_acc": 0.8629697525206232, "train_speed(iter/s)": 0.095878 }, { "epoch": 0.5594389334387416, "grad_norm": 7.523995399475098, "learning_rate": 4.431800112599195e-06, "loss": 0.7422564506530762, "memory(GiB)": 40.03, "step": 12030, "token_acc": 0.8241792929292929, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.5596714516986913, "grad_norm": 7.105021953582764, "learning_rate": 4.427980286110301e-06, "loss": 0.6166975498199463, "memory(GiB)": 40.03, "step": 12035, "token_acc": 0.8445110528287748, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.5599039699586408, "grad_norm": 7.011575222015381, "learning_rate": 4.424160797874323e-06, "loss": 0.5584123134613037, "memory(GiB)": 40.03, "step": 12040, "token_acc": 0.85, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.5601364882185904, "grad_norm": 8.523414611816406, "learning_rate": 4.420341650149847e-06, "loss": 0.7120149612426758, "memory(GiB)": 40.03, "step": 12045, "token_acc": 0.8180006642311525, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.56036900647854, "grad_norm": 10.580933570861816, "learning_rate": 4.416522845195247e-06, "loss": 0.6449977874755859, "memory(GiB)": 40.03, "step": 12050, "token_acc": 0.8413223140495868, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.56036900647854, "eval_loss": 0.5899468660354614, "eval_runtime": 293.9174, "eval_samples_per_second": 11.823, "eval_steps_per_second": 11.823, "step": 12050 }, { "epoch": 0.5606015247384896, "grad_norm": 6.270550727844238, "learning_rate": 4.4127043852687045e-06, "loss": 0.7345793724060059, "memory(GiB)": 40.03, "step": 12055, "token_acc": 0.8270697756325593, "train_speed(iter/s)": 0.095786 }, { "epoch": 0.5608340429984392, "grad_norm": 6.681027889251709, "learning_rate": 4.40888627262819e-06, "loss": 0.6348593711853028, "memory(GiB)": 40.03, "step": 12060, "token_acc": 0.8431644691186676, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.5610665612583888, "grad_norm": 8.465786933898926, "learning_rate": 4.4050685095314755e-06, "loss": 0.650502061843872, "memory(GiB)": 40.03, "step": 12065, "token_acc": 0.8298568507157464, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.5612990795183385, "grad_norm": 7.157223701477051, "learning_rate": 4.401251098236116e-06, "loss": 0.8636740684509278, "memory(GiB)": 40.03, "step": 12070, "token_acc": 0.8011363636363636, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.561531597778288, "grad_norm": 6.018905162811279, "learning_rate": 4.397434040999469e-06, "loss": 0.7139524936676025, "memory(GiB)": 40.03, "step": 12075, "token_acc": 0.8330578512396695, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.5617641160382376, "grad_norm": 6.583132743835449, "learning_rate": 4.39361734007868e-06, "loss": 0.8445042610168457, "memory(GiB)": 40.03, "step": 12080, "token_acc": 0.7940761636107193, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.5619966342981872, "grad_norm": 8.308008193969727, "learning_rate": 4.389800997730677e-06, "loss": 0.6492765426635743, "memory(GiB)": 40.03, "step": 12085, "token_acc": 0.8364485981308412, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.5622291525581369, "grad_norm": 7.715389728546143, "learning_rate": 4.385985016212184e-06, "loss": 0.5420804023742676, "memory(GiB)": 40.03, "step": 12090, "token_acc": 0.8639160332971408, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.5624616708180864, "grad_norm": 7.167847633361816, "learning_rate": 4.382169397779708e-06, "loss": 0.7379115104675293, "memory(GiB)": 40.03, "step": 12095, "token_acc": 0.8219735503560529, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.562694189078036, "grad_norm": 7.906615734100342, "learning_rate": 4.378354144689544e-06, "loss": 0.6950104236602783, "memory(GiB)": 40.03, "step": 12100, "token_acc": 0.8354203935599285, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.562694189078036, "eval_loss": 0.588950514793396, "eval_runtime": 292.1716, "eval_samples_per_second": 11.894, "eval_steps_per_second": 11.894, "step": 12100 }, { "epoch": 0.5629267073379857, "grad_norm": 6.199409484863281, "learning_rate": 4.374539259197766e-06, "loss": 0.6853800296783448, "memory(GiB)": 40.03, "step": 12105, "token_acc": 0.8269081789938584, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.5631592255979352, "grad_norm": 5.901808738708496, "learning_rate": 4.370724743560235e-06, "loss": 0.608131742477417, "memory(GiB)": 40.03, "step": 12110, "token_acc": 0.8353520060560181, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.5633917438578848, "grad_norm": 7.537030220031738, "learning_rate": 4.36691060003259e-06, "loss": 0.699574613571167, "memory(GiB)": 40.03, "step": 12115, "token_acc": 0.8168229777256741, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.5636242621178345, "grad_norm": 7.507444858551025, "learning_rate": 4.363096830870257e-06, "loss": 0.5918978691101074, "memory(GiB)": 40.03, "step": 12120, "token_acc": 0.8476223533495314, "train_speed(iter/s)": 0.095853 }, { "epoch": 0.5638567803777841, "grad_norm": 7.615386962890625, "learning_rate": 4.35928343832843e-06, "loss": 0.7293277740478515, "memory(GiB)": 40.03, "step": 12125, "token_acc": 0.8251136761105281, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.5640892986377336, "grad_norm": 7.207439422607422, "learning_rate": 4.355470424662087e-06, "loss": 0.6662022590637207, "memory(GiB)": 40.03, "step": 12130, "token_acc": 0.8318496538081108, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.5643218168976832, "grad_norm": 11.291145324707031, "learning_rate": 4.351657792125981e-06, "loss": 0.6410336971282959, "memory(GiB)": 40.03, "step": 12135, "token_acc": 0.8378037235721048, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.5645543351576329, "grad_norm": 8.108573913574219, "learning_rate": 4.347845542974642e-06, "loss": 0.7120551109313965, "memory(GiB)": 40.03, "step": 12140, "token_acc": 0.8150989099717401, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.5647868534175825, "grad_norm": 8.063129425048828, "learning_rate": 4.344033679462367e-06, "loss": 0.6626528263092041, "memory(GiB)": 40.03, "step": 12145, "token_acc": 0.8348591549295775, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.565019371677532, "grad_norm": 7.968915939331055, "learning_rate": 4.3402222038432295e-06, "loss": 0.6640778541564941, "memory(GiB)": 40.03, "step": 12150, "token_acc": 0.8185752330226365, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.565019371677532, "eval_loss": 0.5861648917198181, "eval_runtime": 290.5533, "eval_samples_per_second": 11.96, "eval_steps_per_second": 11.96, "step": 12150 }, { "epoch": 0.5652518899374817, "grad_norm": 8.727378845214844, "learning_rate": 4.336411118371073e-06, "loss": 0.6298631191253662, "memory(GiB)": 40.03, "step": 12155, "token_acc": 0.8282516248094359, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.5654844081974313, "grad_norm": 8.191335678100586, "learning_rate": 4.332600425299512e-06, "loss": 0.6278055667877197, "memory(GiB)": 40.03, "step": 12160, "token_acc": 0.8424785367674505, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.5657169264573808, "grad_norm": 9.833742141723633, "learning_rate": 4.328790126881923e-06, "loss": 0.6580445289611816, "memory(GiB)": 40.03, "step": 12165, "token_acc": 0.8379080118694362, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.5659494447173304, "grad_norm": 6.148544788360596, "learning_rate": 4.324980225371456e-06, "loss": 0.5997506618499756, "memory(GiB)": 40.03, "step": 12170, "token_acc": 0.8537455410225921, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.5661819629772801, "grad_norm": 6.668766975402832, "learning_rate": 4.321170723021022e-06, "loss": 0.5688246250152588, "memory(GiB)": 40.03, "step": 12175, "token_acc": 0.8473724884080371, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.5664144812372297, "grad_norm": 8.070734977722168, "learning_rate": 4.3173616220833e-06, "loss": 0.7162200927734375, "memory(GiB)": 40.03, "step": 12180, "token_acc": 0.8325041459369817, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.5666469994971792, "grad_norm": 8.31356143951416, "learning_rate": 4.3135529248107245e-06, "loss": 0.7518483161926269, "memory(GiB)": 40.03, "step": 12185, "token_acc": 0.8049171566007483, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.5668795177571289, "grad_norm": 5.885660648345947, "learning_rate": 4.3097446334555e-06, "loss": 0.8494339942932129, "memory(GiB)": 40.03, "step": 12190, "token_acc": 0.7792833483890395, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.5671120360170785, "grad_norm": 9.467564582824707, "learning_rate": 4.305936750269583e-06, "loss": 0.6303043842315674, "memory(GiB)": 40.03, "step": 12195, "token_acc": 0.8442650521358896, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.567344554277028, "grad_norm": 5.601880073547363, "learning_rate": 4.302129277504696e-06, "loss": 0.6847558975219726, "memory(GiB)": 40.03, "step": 12200, "token_acc": 0.8275607958732498, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.567344554277028, "eval_loss": 0.5879337787628174, "eval_runtime": 294.3451, "eval_samples_per_second": 11.806, "eval_steps_per_second": 11.806, "step": 12200 }, { "epoch": 0.5675770725369776, "grad_norm": 9.660368919372559, "learning_rate": 4.298322217412312e-06, "loss": 0.6761251926422119, "memory(GiB)": 40.03, "step": 12205, "token_acc": 0.8278712867313605, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.5678095907969273, "grad_norm": 8.12336254119873, "learning_rate": 4.294515572243665e-06, "loss": 0.6749348163604736, "memory(GiB)": 40.03, "step": 12210, "token_acc": 0.8432692307692308, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.5680421090568769, "grad_norm": 8.090500831604004, "learning_rate": 4.290709344249743e-06, "loss": 0.6176517009735107, "memory(GiB)": 40.03, "step": 12215, "token_acc": 0.837516960651289, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.5682746273168264, "grad_norm": 7.9386982917785645, "learning_rate": 4.286903535681282e-06, "loss": 0.6293925285339356, "memory(GiB)": 40.03, "step": 12220, "token_acc": 0.8361252731245448, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.5685071455767761, "grad_norm": 8.970488548278809, "learning_rate": 4.283098148788781e-06, "loss": 0.5741978168487549, "memory(GiB)": 40.03, "step": 12225, "token_acc": 0.8548273431994362, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.5687396638367257, "grad_norm": 6.578322887420654, "learning_rate": 4.279293185822476e-06, "loss": 0.6272590160369873, "memory(GiB)": 40.03, "step": 12230, "token_acc": 0.837742980561555, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.5689721820966753, "grad_norm": 9.832868576049805, "learning_rate": 4.275488649032362e-06, "loss": 0.6227746963500976, "memory(GiB)": 40.03, "step": 12235, "token_acc": 0.8389763779527559, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.5692047003566248, "grad_norm": 7.5642900466918945, "learning_rate": 4.271684540668178e-06, "loss": 0.6691617488861084, "memory(GiB)": 40.03, "step": 12240, "token_acc": 0.8332721263312522, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.5694372186165745, "grad_norm": 7.92784309387207, "learning_rate": 4.267880862979414e-06, "loss": 0.6101456165313721, "memory(GiB)": 40.03, "step": 12245, "token_acc": 0.8562476962771839, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.5696697368765241, "grad_norm": 7.504398345947266, "learning_rate": 4.264077618215296e-06, "loss": 0.6823426246643066, "memory(GiB)": 40.03, "step": 12250, "token_acc": 0.8348559381588194, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.5696697368765241, "eval_loss": 0.5861591100692749, "eval_runtime": 295.5018, "eval_samples_per_second": 11.76, "eval_steps_per_second": 11.76, "step": 12250 }, { "epoch": 0.5699022551364736, "grad_norm": 5.381181716918945, "learning_rate": 4.260274808624805e-06, "loss": 0.6528043746948242, "memory(GiB)": 40.03, "step": 12255, "token_acc": 0.8278091189490675, "train_speed(iter/s)": 0.095786 }, { "epoch": 0.5701347733964233, "grad_norm": 6.923305034637451, "learning_rate": 4.256472436456658e-06, "loss": 0.7982216835021972, "memory(GiB)": 40.03, "step": 12260, "token_acc": 0.7843494085532302, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.5703672916563729, "grad_norm": 8.104715347290039, "learning_rate": 4.252670503959317e-06, "loss": 0.6475061893463134, "memory(GiB)": 40.03, "step": 12265, "token_acc": 0.8468261269549218, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.5705998099163225, "grad_norm": 7.670345306396484, "learning_rate": 4.248869013380977e-06, "loss": 0.6680463790893555, "memory(GiB)": 40.03, "step": 12270, "token_acc": 0.8414048059149722, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.570832328176272, "grad_norm": 10.764370918273926, "learning_rate": 4.245067966969583e-06, "loss": 0.7138540267944335, "memory(GiB)": 40.03, "step": 12275, "token_acc": 0.8249346771183277, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.5710648464362217, "grad_norm": 9.680176734924316, "learning_rate": 4.241267366972806e-06, "loss": 0.5867207050323486, "memory(GiB)": 40.03, "step": 12280, "token_acc": 0.8530480419862737, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.5712973646961713, "grad_norm": 6.827235698699951, "learning_rate": 4.237467215638064e-06, "loss": 0.6321615219116211, "memory(GiB)": 40.03, "step": 12285, "token_acc": 0.8348656294200849, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.571529882956121, "grad_norm": 7.801530838012695, "learning_rate": 4.233667515212496e-06, "loss": 0.7946747779846192, "memory(GiB)": 40.03, "step": 12290, "token_acc": 0.8071505958829902, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.5717624012160705, "grad_norm": 7.390880107879639, "learning_rate": 4.229868267942988e-06, "loss": 0.6462111473083496, "memory(GiB)": 40.03, "step": 12295, "token_acc": 0.8338423946243128, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.5719949194760201, "grad_norm": 9.422018051147461, "learning_rate": 4.226069476076151e-06, "loss": 0.6602601051330567, "memory(GiB)": 40.03, "step": 12300, "token_acc": 0.8338995847489619, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.5719949194760201, "eval_loss": 0.5856576561927795, "eval_runtime": 291.848, "eval_samples_per_second": 11.907, "eval_steps_per_second": 11.907, "step": 12300 }, { "epoch": 0.5722274377359697, "grad_norm": 7.992288112640381, "learning_rate": 4.222271141858328e-06, "loss": 0.5441146850585937, "memory(GiB)": 40.03, "step": 12305, "token_acc": 0.8283153004022423, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.5724599559959193, "grad_norm": 7.502604961395264, "learning_rate": 4.218473267535589e-06, "loss": 0.6329378128051758, "memory(GiB)": 40.03, "step": 12310, "token_acc": 0.8429888084265964, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.5726924742558689, "grad_norm": 6.395047187805176, "learning_rate": 4.214675855353737e-06, "loss": 0.781887435913086, "memory(GiB)": 40.03, "step": 12315, "token_acc": 0.8022071307300509, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.5729249925158185, "grad_norm": 12.194600105285645, "learning_rate": 4.210878907558298e-06, "loss": 0.7144025325775146, "memory(GiB)": 40.03, "step": 12320, "token_acc": 0.8248885285772193, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.5731575107757682, "grad_norm": 7.285346508026123, "learning_rate": 4.207082426394525e-06, "loss": 0.6607818126678466, "memory(GiB)": 40.03, "step": 12325, "token_acc": 0.8305728088336783, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.5733900290357177, "grad_norm": 6.923439025878906, "learning_rate": 4.203286414107394e-06, "loss": 0.7327235698699951, "memory(GiB)": 40.03, "step": 12330, "token_acc": 0.8300500834724541, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.5736225472956673, "grad_norm": 9.087661743164062, "learning_rate": 4.199490872941603e-06, "loss": 0.6531758308410645, "memory(GiB)": 40.03, "step": 12335, "token_acc": 0.8412962193601994, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.5738550655556169, "grad_norm": 8.431983947753906, "learning_rate": 4.195695805141575e-06, "loss": 0.6540855407714844, "memory(GiB)": 40.03, "step": 12340, "token_acc": 0.8342618384401114, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.5740875838155666, "grad_norm": 7.269530773162842, "learning_rate": 4.1919012129514494e-06, "loss": 0.6406298637390136, "memory(GiB)": 40.03, "step": 12345, "token_acc": 0.8300678221552373, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.5743201020755161, "grad_norm": 8.626265525817871, "learning_rate": 4.188107098615088e-06, "loss": 0.7785932064056397, "memory(GiB)": 40.03, "step": 12350, "token_acc": 0.7945383615084526, "train_speed(iter/s)": 0.095985 }, { "epoch": 0.5743201020755161, "eval_loss": 0.5844415426254272, "eval_runtime": 291.785, "eval_samples_per_second": 11.909, "eval_steps_per_second": 11.909, "step": 12350 }, { "epoch": 0.5745526203354657, "grad_norm": 9.09610366821289, "learning_rate": 4.1843134643760645e-06, "loss": 0.6106031894683838, "memory(GiB)": 40.03, "step": 12355, "token_acc": 0.8282516834546166, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.5747851385954154, "grad_norm": 6.92556095123291, "learning_rate": 4.180520312477674e-06, "loss": 0.7823381423950195, "memory(GiB)": 40.03, "step": 12360, "token_acc": 0.8039964736996768, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.5750176568553649, "grad_norm": 7.292800426483154, "learning_rate": 4.176727645162922e-06, "loss": 0.6464691638946534, "memory(GiB)": 40.03, "step": 12365, "token_acc": 0.8313665778454511, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.5752501751153145, "grad_norm": 8.345467567443848, "learning_rate": 4.172935464674535e-06, "loss": 0.6117934226989746, "memory(GiB)": 40.03, "step": 12370, "token_acc": 0.8359663865546219, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.5754826933752641, "grad_norm": 5.228003978729248, "learning_rate": 4.16914377325494e-06, "loss": 0.5495120048522949, "memory(GiB)": 40.03, "step": 12375, "token_acc": 0.8590203106332138, "train_speed(iter/s)": 0.095878 }, { "epoch": 0.5757152116352138, "grad_norm": 10.157156944274902, "learning_rate": 4.165352573146285e-06, "loss": 0.7139785766601563, "memory(GiB)": 40.03, "step": 12380, "token_acc": 0.8175206611570248, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.5759477298951633, "grad_norm": 8.81885051727295, "learning_rate": 4.161561866590421e-06, "loss": 0.5811410427093506, "memory(GiB)": 40.03, "step": 12385, "token_acc": 0.8459538511537211, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.5761802481551129, "grad_norm": 7.776773929595947, "learning_rate": 4.157771655828915e-06, "loss": 0.6779936790466309, "memory(GiB)": 40.03, "step": 12390, "token_acc": 0.8202472435683261, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.5764127664150626, "grad_norm": 6.842904567718506, "learning_rate": 4.1539819431030295e-06, "loss": 0.5791988849639893, "memory(GiB)": 40.03, "step": 12395, "token_acc": 0.854490337248958, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.5766452846750121, "grad_norm": 6.902646064758301, "learning_rate": 4.150192730653742e-06, "loss": 0.7370592594146729, "memory(GiB)": 40.03, "step": 12400, "token_acc": 0.8119092627599244, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.5766452846750121, "eval_loss": 0.5841361880302429, "eval_runtime": 291.209, "eval_samples_per_second": 11.933, "eval_steps_per_second": 11.933, "step": 12400 }, { "epoch": 0.5768778029349617, "grad_norm": 8.645942687988281, "learning_rate": 4.14640402072173e-06, "loss": 0.5885090827941895, "memory(GiB)": 40.03, "step": 12405, "token_acc": 0.8285636281878332, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.5771103211949113, "grad_norm": 9.0007963180542, "learning_rate": 4.142615815547376e-06, "loss": 0.5649767398834229, "memory(GiB)": 40.03, "step": 12410, "token_acc": 0.8557068741893644, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.577342839454861, "grad_norm": 8.417078971862793, "learning_rate": 4.138828117370759e-06, "loss": 0.5798979282379151, "memory(GiB)": 40.03, "step": 12415, "token_acc": 0.8643841707425522, "train_speed(iter/s)": 0.095835 }, { "epoch": 0.5775753577148105, "grad_norm": 7.812051296234131, "learning_rate": 4.135040928431667e-06, "loss": 0.7484992980957031, "memory(GiB)": 40.03, "step": 12420, "token_acc": 0.8171667829727843, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.5778078759747601, "grad_norm": 6.615225791931152, "learning_rate": 4.131254250969578e-06, "loss": 0.6321574211120605, "memory(GiB)": 40.03, "step": 12425, "token_acc": 0.8439282803585982, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.5780403942347098, "grad_norm": 9.891707420349121, "learning_rate": 4.1274680872236724e-06, "loss": 0.6489062309265137, "memory(GiB)": 40.03, "step": 12430, "token_acc": 0.8401946107784432, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.5782729124946594, "grad_norm": 6.718799591064453, "learning_rate": 4.123682439432826e-06, "loss": 0.7149152755737305, "memory(GiB)": 40.03, "step": 12435, "token_acc": 0.819376026272578, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.5785054307546089, "grad_norm": 7.437344074249268, "learning_rate": 4.1198973098356095e-06, "loss": 0.5938013553619385, "memory(GiB)": 40.03, "step": 12440, "token_acc": 0.8444444444444444, "train_speed(iter/s)": 0.095946 }, { "epoch": 0.5787379490145586, "grad_norm": 6.788064002990723, "learning_rate": 4.116112700670285e-06, "loss": 0.612579345703125, "memory(GiB)": 40.03, "step": 12445, "token_acc": 0.8387524883875249, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.5789704672745082, "grad_norm": 9.03939437866211, "learning_rate": 4.112328614174811e-06, "loss": 0.6132975578308105, "memory(GiB)": 40.03, "step": 12450, "token_acc": 0.8432807085098191, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.5789704672745082, "eval_loss": 0.5835011601448059, "eval_runtime": 291.6729, "eval_samples_per_second": 11.914, "eval_steps_per_second": 11.914, "step": 12450 }, { "epoch": 0.5792029855344577, "grad_norm": 6.867413520812988, "learning_rate": 4.108545052586833e-06, "loss": 0.7212032794952392, "memory(GiB)": 40.03, "step": 12455, "token_acc": 0.8281243760033865, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.5794355037944073, "grad_norm": 8.361393928527832, "learning_rate": 4.10476201814369e-06, "loss": 0.7784335613250732, "memory(GiB)": 40.03, "step": 12460, "token_acc": 0.8015446608462055, "train_speed(iter/s)": 0.095817 }, { "epoch": 0.579668022054357, "grad_norm": 6.556368350982666, "learning_rate": 4.100979513082404e-06, "loss": 0.6438935279846192, "memory(GiB)": 40.03, "step": 12465, "token_acc": 0.8374913374913375, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.5799005403143066, "grad_norm": 6.511188507080078, "learning_rate": 4.0971975396396894e-06, "loss": 0.6152307987213135, "memory(GiB)": 40.03, "step": 12470, "token_acc": 0.8479876160990713, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.5801330585742561, "grad_norm": 9.094870567321777, "learning_rate": 4.093416100051943e-06, "loss": 0.717622709274292, "memory(GiB)": 40.03, "step": 12475, "token_acc": 0.8252267106347898, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.5803655768342058, "grad_norm": 8.88550853729248, "learning_rate": 4.089635196555246e-06, "loss": 0.5561723232269287, "memory(GiB)": 40.03, "step": 12480, "token_acc": 0.8629441624365483, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.5805980950941554, "grad_norm": 8.385820388793945, "learning_rate": 4.085854831385367e-06, "loss": 0.7090948581695556, "memory(GiB)": 40.03, "step": 12485, "token_acc": 0.8193054738081225, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.580830613354105, "grad_norm": 8.583136558532715, "learning_rate": 4.082075006777747e-06, "loss": 0.7321601390838623, "memory(GiB)": 40.03, "step": 12490, "token_acc": 0.8369313801079414, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.5810631316140545, "grad_norm": 7.687808036804199, "learning_rate": 4.078295724967517e-06, "loss": 0.642839765548706, "memory(GiB)": 40.03, "step": 12495, "token_acc": 0.8467590857999251, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.5812956498740042, "grad_norm": 8.041131019592285, "learning_rate": 4.074516988189482e-06, "loss": 0.6586986064910889, "memory(GiB)": 40.03, "step": 12500, "token_acc": 0.8389610389610389, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.5812956498740042, "eval_loss": 0.5843559503555298, "eval_runtime": 290.4963, "eval_samples_per_second": 11.962, "eval_steps_per_second": 11.962, "step": 12500 }, { "epoch": 0.5815281681339538, "grad_norm": 8.2643404006958, "learning_rate": 4.070738798678126e-06, "loss": 0.6503505229949951, "memory(GiB)": 40.03, "step": 12505, "token_acc": 0.8280452326951281, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.5817606863939033, "grad_norm": 8.272042274475098, "learning_rate": 4.066961158667609e-06, "loss": 0.6874100685119628, "memory(GiB)": 40.03, "step": 12510, "token_acc": 0.8293502613890963, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.581993204653853, "grad_norm": 7.307247638702393, "learning_rate": 4.063184070391765e-06, "loss": 0.6720140457153321, "memory(GiB)": 40.03, "step": 12515, "token_acc": 0.839740995548361, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.5822257229138026, "grad_norm": 5.765705108642578, "learning_rate": 4.0594075360841035e-06, "loss": 0.6570749282836914, "memory(GiB)": 40.03, "step": 12520, "token_acc": 0.82145236508994, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.5824582411737522, "grad_norm": 10.116802215576172, "learning_rate": 4.055631557977808e-06, "loss": 0.6857599258422852, "memory(GiB)": 40.03, "step": 12525, "token_acc": 0.8437649307214524, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.5826907594337017, "grad_norm": 9.83465576171875, "learning_rate": 4.051856138305727e-06, "loss": 0.6695918560028076, "memory(GiB)": 40.03, "step": 12530, "token_acc": 0.8444125044915559, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.5829232776936514, "grad_norm": 9.249621391296387, "learning_rate": 4.048081279300386e-06, "loss": 0.6529331207275391, "memory(GiB)": 40.03, "step": 12535, "token_acc": 0.8379413015737984, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.583155795953601, "grad_norm": 8.802436828613281, "learning_rate": 4.044306983193973e-06, "loss": 0.5567544460296631, "memory(GiB)": 40.03, "step": 12540, "token_acc": 0.8690082644628099, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.5833883142135505, "grad_norm": 8.022923469543457, "learning_rate": 4.04053325221835e-06, "loss": 0.615961742401123, "memory(GiB)": 40.03, "step": 12545, "token_acc": 0.8515337423312883, "train_speed(iter/s)": 0.09597 }, { "epoch": 0.5836208324735002, "grad_norm": 8.537787437438965, "learning_rate": 4.036760088605035e-06, "loss": 0.7440320968627929, "memory(GiB)": 40.03, "step": 12550, "token_acc": 0.8174474959612278, "train_speed(iter/s)": 0.095991 }, { "epoch": 0.5836208324735002, "eval_loss": 0.5858432054519653, "eval_runtime": 291.3931, "eval_samples_per_second": 11.925, "eval_steps_per_second": 11.925, "step": 12550 }, { "epoch": 0.5838533507334498, "grad_norm": 9.390138626098633, "learning_rate": 4.032987494585221e-06, "loss": 0.6783319473266601, "memory(GiB)": 40.03, "step": 12555, "token_acc": 0.828230028373896, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.5840858689933994, "grad_norm": 9.66921615600586, "learning_rate": 4.029215472389756e-06, "loss": 0.7414599418640136, "memory(GiB)": 40.03, "step": 12560, "token_acc": 0.8162729658792651, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.584318387253349, "grad_norm": 6.690301895141602, "learning_rate": 4.0254440242491565e-06, "loss": 0.6200287342071533, "memory(GiB)": 40.03, "step": 12565, "token_acc": 0.8414403032217309, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.5845509055132986, "grad_norm": 7.844669342041016, "learning_rate": 4.0216731523935925e-06, "loss": 0.5903035163879394, "memory(GiB)": 40.03, "step": 12570, "token_acc": 0.8531626506024096, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.5847834237732482, "grad_norm": 8.23584270477295, "learning_rate": 4.0179028590529e-06, "loss": 0.5819211959838867, "memory(GiB)": 40.03, "step": 12575, "token_acc": 0.8545808966861599, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.5850159420331978, "grad_norm": 8.184552192687988, "learning_rate": 4.014133146456568e-06, "loss": 0.7459378719329834, "memory(GiB)": 40.03, "step": 12580, "token_acc": 0.8132008971483499, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.5852484602931474, "grad_norm": 6.241691589355469, "learning_rate": 4.010364016833745e-06, "loss": 0.6224756240844727, "memory(GiB)": 40.03, "step": 12585, "token_acc": 0.8384336952945047, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.585480978553097, "grad_norm": 8.508033752441406, "learning_rate": 4.006595472413233e-06, "loss": 0.5763284206390381, "memory(GiB)": 40.03, "step": 12590, "token_acc": 0.8516377649325626, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.5857134968130466, "grad_norm": 8.421253204345703, "learning_rate": 4.0028275154234885e-06, "loss": 0.525826358795166, "memory(GiB)": 40.03, "step": 12595, "token_acc": 0.8619173262972736, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.5859460150729962, "grad_norm": 7.334120273590088, "learning_rate": 3.999060148092621e-06, "loss": 0.6416743278503418, "memory(GiB)": 40.03, "step": 12600, "token_acc": 0.835724043715847, "train_speed(iter/s)": 0.09599 }, { "epoch": 0.5859460150729962, "eval_loss": 0.5827152132987976, "eval_runtime": 292.4291, "eval_samples_per_second": 11.883, "eval_steps_per_second": 11.883, "step": 12600 }, { "epoch": 0.5861785333329458, "grad_norm": 8.851205825805664, "learning_rate": 3.995293372648391e-06, "loss": 0.5905053615570068, "memory(GiB)": 40.03, "step": 12605, "token_acc": 0.8291728479210938, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.5864110515928954, "grad_norm": 6.9212493896484375, "learning_rate": 3.9915271913182115e-06, "loss": 0.7106484413146973, "memory(GiB)": 40.03, "step": 12610, "token_acc": 0.8066332916145181, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.586643569852845, "grad_norm": 7.967902660369873, "learning_rate": 3.987761606329138e-06, "loss": 0.631581974029541, "memory(GiB)": 40.03, "step": 12615, "token_acc": 0.8341113105924596, "train_speed(iter/s)": 0.095841 }, { "epoch": 0.5868760881127946, "grad_norm": 7.79425573348999, "learning_rate": 3.98399661990788e-06, "loss": 0.5869490623474121, "memory(GiB)": 40.03, "step": 12620, "token_acc": 0.8605851979345955, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.5871086063727442, "grad_norm": 10.994945526123047, "learning_rate": 3.980232234280788e-06, "loss": 0.6822981834411621, "memory(GiB)": 40.03, "step": 12625, "token_acc": 0.8275714895433205, "train_speed(iter/s)": 0.095883 }, { "epoch": 0.5873411246326938, "grad_norm": 7.380897521972656, "learning_rate": 3.976468451673864e-06, "loss": 0.6111745834350586, "memory(GiB)": 40.03, "step": 12630, "token_acc": 0.8547326279668287, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.5875736428926435, "grad_norm": 6.481570243835449, "learning_rate": 3.972705274312741e-06, "loss": 0.6143715381622314, "memory(GiB)": 40.03, "step": 12635, "token_acc": 0.8486401261332283, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.587806161152593, "grad_norm": 8.564242362976074, "learning_rate": 3.968942704422709e-06, "loss": 0.6853072166442871, "memory(GiB)": 40.03, "step": 12640, "token_acc": 0.8207063084817386, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.5880386794125426, "grad_norm": 6.765157222747803, "learning_rate": 3.965180744228688e-06, "loss": 0.5456812381744385, "memory(GiB)": 40.03, "step": 12645, "token_acc": 0.8647798742138365, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.5882711976724923, "grad_norm": 8.51491641998291, "learning_rate": 3.961419395955244e-06, "loss": 0.6321295261383056, "memory(GiB)": 40.03, "step": 12650, "token_acc": 0.8409266409266409, "train_speed(iter/s)": 0.095989 }, { "epoch": 0.5882711976724923, "eval_loss": 0.5844881534576416, "eval_runtime": 291.3342, "eval_samples_per_second": 11.928, "eval_steps_per_second": 11.928, "step": 12650 }, { "epoch": 0.5885037159324418, "grad_norm": 8.120543479919434, "learning_rate": 3.957658661826575e-06, "loss": 0.5859105587005615, "memory(GiB)": 40.03, "step": 12655, "token_acc": 0.8287495797781228, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.5887362341923914, "grad_norm": 9.27128791809082, "learning_rate": 3.953898544066522e-06, "loss": 0.5901806354522705, "memory(GiB)": 40.03, "step": 12660, "token_acc": 0.84593837535014, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.588968752452341, "grad_norm": 6.452368259429932, "learning_rate": 3.9501390448985565e-06, "loss": 0.6747704982757569, "memory(GiB)": 40.03, "step": 12665, "token_acc": 0.8348032564450475, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.5892012707122907, "grad_norm": 5.982104778289795, "learning_rate": 3.946380166545789e-06, "loss": 0.6910391330718995, "memory(GiB)": 40.03, "step": 12670, "token_acc": 0.8227188081936685, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.5894337889722402, "grad_norm": 9.366355895996094, "learning_rate": 3.9426219112309585e-06, "loss": 0.5870296001434326, "memory(GiB)": 40.03, "step": 12675, "token_acc": 0.85423197492163, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.5896663072321898, "grad_norm": 8.060824394226074, "learning_rate": 3.938864281176438e-06, "loss": 0.6449044704437256, "memory(GiB)": 40.03, "step": 12680, "token_acc": 0.8468025298664793, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.5898988254921395, "grad_norm": 8.35085678100586, "learning_rate": 3.935107278604229e-06, "loss": 0.6621711730957032, "memory(GiB)": 40.03, "step": 12685, "token_acc": 0.8270584634220998, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.590131343752089, "grad_norm": 7.202229976654053, "learning_rate": 3.931350905735965e-06, "loss": 0.6891547203063965, "memory(GiB)": 40.03, "step": 12690, "token_acc": 0.8342198581560284, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.5903638620120386, "grad_norm": 7.003572463989258, "learning_rate": 3.9275951647929034e-06, "loss": 0.6476888656616211, "memory(GiB)": 40.03, "step": 12695, "token_acc": 0.8447937131630648, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.5905963802719882, "grad_norm": 9.953988075256348, "learning_rate": 3.9238400579959316e-06, "loss": 0.7107308387756348, "memory(GiB)": 40.03, "step": 12700, "token_acc": 0.8269881556683587, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.5905963802719882, "eval_loss": 0.5821236968040466, "eval_runtime": 291.7847, "eval_samples_per_second": 11.909, "eval_steps_per_second": 11.909, "step": 12700 }, { "epoch": 0.5908288985319379, "grad_norm": 5.7399187088012695, "learning_rate": 3.920085587565558e-06, "loss": 0.5839637756347656, "memory(GiB)": 40.03, "step": 12705, "token_acc": 0.8290176177249842, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.5910614167918874, "grad_norm": 8.689284324645996, "learning_rate": 3.916331755721921e-06, "loss": 0.5548437595367431, "memory(GiB)": 40.03, "step": 12710, "token_acc": 0.8688783570300158, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.591293935051837, "grad_norm": 8.828099250793457, "learning_rate": 3.912578564684772e-06, "loss": 0.660148286819458, "memory(GiB)": 40.03, "step": 12715, "token_acc": 0.8491717523975588, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.5915264533117867, "grad_norm": 8.250561714172363, "learning_rate": 3.908826016673493e-06, "loss": 0.6170506954193116, "memory(GiB)": 40.03, "step": 12720, "token_acc": 0.8587921847246892, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.5917589715717363, "grad_norm": 6.9805073738098145, "learning_rate": 3.905074113907077e-06, "loss": 0.7492640495300293, "memory(GiB)": 40.03, "step": 12725, "token_acc": 0.8194444444444444, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.5919914898316858, "grad_norm": 8.539275169372559, "learning_rate": 3.901322858604144e-06, "loss": 0.6248363018035888, "memory(GiB)": 40.03, "step": 12730, "token_acc": 0.8336466165413534, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.5922240080916354, "grad_norm": 7.449262619018555, "learning_rate": 3.897572252982927e-06, "loss": 0.6663394927978515, "memory(GiB)": 40.03, "step": 12735, "token_acc": 0.8276753960556095, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.5924565263515851, "grad_norm": 8.026508331298828, "learning_rate": 3.893822299261271e-06, "loss": 0.63345365524292, "memory(GiB)": 40.03, "step": 12740, "token_acc": 0.8415265200517464, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.5926890446115346, "grad_norm": 8.786938667297363, "learning_rate": 3.890072999656645e-06, "loss": 0.6941215515136718, "memory(GiB)": 40.03, "step": 12745, "token_acc": 0.8345984818218138, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.5929215628714842, "grad_norm": 6.686178684234619, "learning_rate": 3.886324356386121e-06, "loss": 0.6077319145202636, "memory(GiB)": 40.03, "step": 12750, "token_acc": 0.8468640560331104, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.5929215628714842, "eval_loss": 0.582139790058136, "eval_runtime": 291.4567, "eval_samples_per_second": 11.923, "eval_steps_per_second": 11.923, "step": 12750 }, { "epoch": 0.5931540811314339, "grad_norm": 7.007806301116943, "learning_rate": 3.8825763716663895e-06, "loss": 0.5417373180389404, "memory(GiB)": 40.03, "step": 12755, "token_acc": 0.8294199720948774, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.5933865993913835, "grad_norm": 6.337910175323486, "learning_rate": 3.878829047713748e-06, "loss": 0.6792127609252929, "memory(GiB)": 40.03, "step": 12760, "token_acc": 0.8312189740761169, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.593619117651333, "grad_norm": 8.660492897033691, "learning_rate": 3.875082386744109e-06, "loss": 0.7165204048156738, "memory(GiB)": 40.03, "step": 12765, "token_acc": 0.8216374269005848, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.5938516359112826, "grad_norm": 7.1973419189453125, "learning_rate": 3.871336390972983e-06, "loss": 0.6989931106567383, "memory(GiB)": 40.03, "step": 12770, "token_acc": 0.8199863107460643, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.5940841541712323, "grad_norm": 6.249396800994873, "learning_rate": 3.867591062615497e-06, "loss": 0.6296061992645263, "memory(GiB)": 40.03, "step": 12775, "token_acc": 0.8337129840546698, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.5943166724311819, "grad_norm": 6.071090221405029, "learning_rate": 3.8638464038863785e-06, "loss": 0.6844027042388916, "memory(GiB)": 40.03, "step": 12780, "token_acc": 0.836848635235732, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.5945491906911314, "grad_norm": 7.891200065612793, "learning_rate": 3.8601024169999605e-06, "loss": 0.8278802871704102, "memory(GiB)": 40.03, "step": 12785, "token_acc": 0.7972350230414746, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.5947817089510811, "grad_norm": 9.561638832092285, "learning_rate": 3.856359104170174e-06, "loss": 0.6455109119415283, "memory(GiB)": 40.03, "step": 12790, "token_acc": 0.8415918845103394, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.5950142272110307, "grad_norm": 7.377737045288086, "learning_rate": 3.852616467610561e-06, "loss": 0.7129979610443116, "memory(GiB)": 40.03, "step": 12795, "token_acc": 0.7946912242686891, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.5952467454709802, "grad_norm": 7.447319030761719, "learning_rate": 3.848874509534254e-06, "loss": 0.7946955680847168, "memory(GiB)": 40.03, "step": 12800, "token_acc": 0.8070114543561263, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.5952467454709802, "eval_loss": 0.5835894346237183, "eval_runtime": 290.9893, "eval_samples_per_second": 11.942, "eval_steps_per_second": 11.942, "step": 12800 }, { "epoch": 0.5954792637309299, "grad_norm": 8.0943603515625, "learning_rate": 3.8451332321539915e-06, "loss": 0.5825368404388428, "memory(GiB)": 40.03, "step": 12805, "token_acc": 0.8292804408775778, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.5957117819908795, "grad_norm": 7.130446434020996, "learning_rate": 3.841392637682103e-06, "loss": 0.6323969841003418, "memory(GiB)": 40.03, "step": 12810, "token_acc": 0.8424920127795528, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.5959443002508291, "grad_norm": 8.11547565460205, "learning_rate": 3.83765272833052e-06, "loss": 0.6336262702941895, "memory(GiB)": 40.03, "step": 12815, "token_acc": 0.8367215230719587, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.5961768185107786, "grad_norm": 9.913498878479004, "learning_rate": 3.833913506310765e-06, "loss": 0.6936050415039062, "memory(GiB)": 40.03, "step": 12820, "token_acc": 0.8213120695904313, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.5964093367707283, "grad_norm": 8.244888305664062, "learning_rate": 3.830174973833956e-06, "loss": 0.5694565773010254, "memory(GiB)": 40.03, "step": 12825, "token_acc": 0.8766328011611031, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.5966418550306779, "grad_norm": 6.958888530731201, "learning_rate": 3.826437133110803e-06, "loss": 0.7474677562713623, "memory(GiB)": 40.03, "step": 12830, "token_acc": 0.8297191610380377, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.5968743732906274, "grad_norm": 7.2198052406311035, "learning_rate": 3.822699986351607e-06, "loss": 0.6751950263977051, "memory(GiB)": 40.03, "step": 12835, "token_acc": 0.8365192582025678, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.5971068915505771, "grad_norm": 7.975522518157959, "learning_rate": 3.818963535766255e-06, "loss": 0.6447204113006592, "memory(GiB)": 40.03, "step": 12840, "token_acc": 0.8336427775714816, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.5973394098105267, "grad_norm": 8.00240421295166, "learning_rate": 3.8152277835642315e-06, "loss": 0.6401217460632325, "memory(GiB)": 40.03, "step": 12845, "token_acc": 0.8358505564387917, "train_speed(iter/s)": 0.095963 }, { "epoch": 0.5975719280704763, "grad_norm": 8.330437660217285, "learning_rate": 3.8114927319545962e-06, "loss": 0.5822761535644532, "memory(GiB)": 40.03, "step": 12850, "token_acc": 0.8531309297912714, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.5975719280704763, "eval_loss": 0.581326961517334, "eval_runtime": 290.2078, "eval_samples_per_second": 11.974, "eval_steps_per_second": 11.974, "step": 12850 }, { "epoch": 0.5978044463304258, "grad_norm": 6.689164638519287, "learning_rate": 3.807758383146004e-06, "loss": 0.6984948635101318, "memory(GiB)": 40.03, "step": 12855, "token_acc": 0.8286753629691356, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.5980369645903755, "grad_norm": 7.692912578582764, "learning_rate": 3.804024739346689e-06, "loss": 0.718368673324585, "memory(GiB)": 40.03, "step": 12860, "token_acc": 0.8182618907809748, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.5982694828503251, "grad_norm": 7.318233966827393, "learning_rate": 3.8002918027644697e-06, "loss": 0.7070892333984375, "memory(GiB)": 40.03, "step": 12865, "token_acc": 0.8297933409873708, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.5985020011102747, "grad_norm": 8.469462394714355, "learning_rate": 3.7965595756067507e-06, "loss": 0.7083279609680175, "memory(GiB)": 40.03, "step": 12870, "token_acc": 0.831043445971036, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.5987345193702243, "grad_norm": 9.082457542419434, "learning_rate": 3.792828060080508e-06, "loss": 0.5696929931640625, "memory(GiB)": 40.03, "step": 12875, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.5989670376301739, "grad_norm": 9.04974365234375, "learning_rate": 3.789097258392305e-06, "loss": 0.751149845123291, "memory(GiB)": 40.03, "step": 12880, "token_acc": 0.8173846740373618, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.5991995558901235, "grad_norm": 9.381528854370117, "learning_rate": 3.7853671727482775e-06, "loss": 0.5614064693450928, "memory(GiB)": 40.03, "step": 12885, "token_acc": 0.8481468154012235, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.599432074150073, "grad_norm": 8.049118041992188, "learning_rate": 3.7816378053541446e-06, "loss": 0.5766382217407227, "memory(GiB)": 40.03, "step": 12890, "token_acc": 0.8552249637155298, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.5996645924100227, "grad_norm": 6.832623481750488, "learning_rate": 3.7779091584151912e-06, "loss": 0.6892680644989013, "memory(GiB)": 40.03, "step": 12895, "token_acc": 0.8299130434782609, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.5998971106699723, "grad_norm": 8.119552612304688, "learning_rate": 3.7741812341362848e-06, "loss": 0.6701316833496094, "memory(GiB)": 40.03, "step": 12900, "token_acc": 0.8302945301542777, "train_speed(iter/s)": 0.095987 }, { "epoch": 0.5998971106699723, "eval_loss": 0.5804882645606995, "eval_runtime": 290.8058, "eval_samples_per_second": 11.95, "eval_steps_per_second": 11.95, "step": 12900 }, { "epoch": 0.6001296289299219, "grad_norm": 9.428837776184082, "learning_rate": 3.7704540347218598e-06, "loss": 0.650011682510376, "memory(GiB)": 40.03, "step": 12905, "token_acc": 0.8292998026030717, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.6003621471898715, "grad_norm": 8.723655700683594, "learning_rate": 3.766727562375928e-06, "loss": 0.5859549045562744, "memory(GiB)": 40.03, "step": 12910, "token_acc": 0.8322303110522833, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.6005946654498211, "grad_norm": 7.602426528930664, "learning_rate": 3.7630018193020635e-06, "loss": 0.7068216323852539, "memory(GiB)": 40.03, "step": 12915, "token_acc": 0.828998505231689, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.6008271837097707, "grad_norm": 7.832668304443359, "learning_rate": 3.759276807703415e-06, "loss": 0.7375150680541992, "memory(GiB)": 40.03, "step": 12920, "token_acc": 0.8146705615060046, "train_speed(iter/s)": 0.095864 }, { "epoch": 0.6010597019697204, "grad_norm": 8.143230438232422, "learning_rate": 3.7555525297826963e-06, "loss": 0.8059114456176758, "memory(GiB)": 40.03, "step": 12925, "token_acc": 0.7878280290340591, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.6012922202296699, "grad_norm": 6.759118556976318, "learning_rate": 3.7518289877421898e-06, "loss": 0.5784881114959717, "memory(GiB)": 40.03, "step": 12930, "token_acc": 0.8483215913800248, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.6015247384896195, "grad_norm": 7.4658122062683105, "learning_rate": 3.748106183783738e-06, "loss": 0.6297882556915283, "memory(GiB)": 40.03, "step": 12935, "token_acc": 0.8391791044776119, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.6017572567495691, "grad_norm": 7.309219837188721, "learning_rate": 3.7443841201087515e-06, "loss": 0.745161485671997, "memory(GiB)": 40.03, "step": 12940, "token_acc": 0.8164102564102564, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.6019897750095187, "grad_norm": 8.289355278015137, "learning_rate": 3.740662798918201e-06, "loss": 0.7899196147918701, "memory(GiB)": 40.03, "step": 12945, "token_acc": 0.8073065902578797, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.6022222932694683, "grad_norm": 8.108407974243164, "learning_rate": 3.7369422224126205e-06, "loss": 0.654511833190918, "memory(GiB)": 40.03, "step": 12950, "token_acc": 0.8181179775280899, "train_speed(iter/s)": 0.095988 }, { "epoch": 0.6022222932694683, "eval_loss": 0.5823829770088196, "eval_runtime": 292.6367, "eval_samples_per_second": 11.875, "eval_steps_per_second": 11.875, "step": 12950 }, { "epoch": 0.6024548115294179, "grad_norm": 8.292499542236328, "learning_rate": 3.733222392792098e-06, "loss": 0.6521989822387695, "memory(GiB)": 40.03, "step": 12955, "token_acc": 0.8286181547070283, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.6026873297893676, "grad_norm": 10.976821899414062, "learning_rate": 3.729503312256287e-06, "loss": 0.6724837303161622, "memory(GiB)": 40.03, "step": 12960, "token_acc": 0.843737882900349, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.6029198480493171, "grad_norm": 7.344293117523193, "learning_rate": 3.7257849830043913e-06, "loss": 0.7633102893829345, "memory(GiB)": 40.03, "step": 12965, "token_acc": 0.8059814023624026, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.6031523663092667, "grad_norm": 8.665361404418945, "learning_rate": 3.722067407235179e-06, "loss": 0.7092705726623535, "memory(GiB)": 40.03, "step": 12970, "token_acc": 0.8277597986335851, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.6033848845692164, "grad_norm": 8.475383758544922, "learning_rate": 3.7183505871469618e-06, "loss": 0.6085611820220947, "memory(GiB)": 40.03, "step": 12975, "token_acc": 0.8396268325188805, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.603617402829166, "grad_norm": 5.808564186096191, "learning_rate": 3.7146345249376132e-06, "loss": 0.7104721546173096, "memory(GiB)": 40.03, "step": 12980, "token_acc": 0.8194636439037877, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.6038499210891155, "grad_norm": 6.993358135223389, "learning_rate": 3.7109192228045542e-06, "loss": 0.6546235084533691, "memory(GiB)": 40.03, "step": 12985, "token_acc": 0.828125, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.6040824393490651, "grad_norm": 8.322504043579102, "learning_rate": 3.7072046829447607e-06, "loss": 0.6930451393127441, "memory(GiB)": 40.03, "step": 12990, "token_acc": 0.8300180831826401, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.6043149576090148, "grad_norm": 9.36544132232666, "learning_rate": 3.7034909075547498e-06, "loss": 0.592886209487915, "memory(GiB)": 40.03, "step": 12995, "token_acc": 0.859493670886076, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.6045474758689643, "grad_norm": 7.081161975860596, "learning_rate": 3.6997778988305944e-06, "loss": 0.6275952816009521, "memory(GiB)": 40.03, "step": 13000, "token_acc": 0.8473684210526315, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.6045474758689643, "eval_loss": 0.5824636220932007, "eval_runtime": 290.8141, "eval_samples_per_second": 11.949, "eval_steps_per_second": 11.949, "step": 13000 }, { "epoch": 0.6047799941289139, "grad_norm": 6.186984062194824, "learning_rate": 3.6960656589679124e-06, "loss": 0.6313210010528565, "memory(GiB)": 40.03, "step": 13005, "token_acc": 0.8288772616527539, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.6050125123888636, "grad_norm": 9.29638671875, "learning_rate": 3.692354190161863e-06, "loss": 0.8241156578063965, "memory(GiB)": 40.03, "step": 13010, "token_acc": 0.8011299435028248, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.6052450306488132, "grad_norm": 7.913212299346924, "learning_rate": 3.688643494607156e-06, "loss": 0.6520851612091064, "memory(GiB)": 40.03, "step": 13015, "token_acc": 0.8400690846286701, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.6054775489087627, "grad_norm": 7.703744411468506, "learning_rate": 3.6849335744980364e-06, "loss": 0.6411514282226562, "memory(GiB)": 40.03, "step": 13020, "token_acc": 0.831096196868009, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.6057100671687123, "grad_norm": 9.644156455993652, "learning_rate": 3.6812244320282965e-06, "loss": 0.6027958393096924, "memory(GiB)": 40.03, "step": 13025, "token_acc": 0.848517327617006, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.605942585428662, "grad_norm": 10.0267333984375, "learning_rate": 3.677516069391266e-06, "loss": 0.6812788963317871, "memory(GiB)": 40.03, "step": 13030, "token_acc": 0.8278537125969708, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.6061751036886115, "grad_norm": 8.00976276397705, "learning_rate": 3.673808488779816e-06, "loss": 0.6315018653869628, "memory(GiB)": 40.03, "step": 13035, "token_acc": 0.840867992766727, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.6064076219485611, "grad_norm": 9.463610649108887, "learning_rate": 3.6701016923863495e-06, "loss": 0.8386247634887696, "memory(GiB)": 40.03, "step": 13040, "token_acc": 0.8117283950617284, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.6066401402085108, "grad_norm": 7.517702102661133, "learning_rate": 3.6663956824028145e-06, "loss": 0.6400721549987793, "memory(GiB)": 40.03, "step": 13045, "token_acc": 0.8454281567489115, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.6068726584684604, "grad_norm": 8.288093566894531, "learning_rate": 3.6626904610206847e-06, "loss": 0.6020816326141357, "memory(GiB)": 40.03, "step": 13050, "token_acc": 0.844327990135635, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.6068726584684604, "eval_loss": 0.5813098549842834, "eval_runtime": 292.0744, "eval_samples_per_second": 11.898, "eval_steps_per_second": 11.898, "step": 13050 }, { "epoch": 0.6071051767284099, "grad_norm": 6.002388000488281, "learning_rate": 3.6589860304309767e-06, "loss": 0.6771800994873047, "memory(GiB)": 40.03, "step": 13055, "token_acc": 0.8286898971373894, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.6073376949883595, "grad_norm": 8.348504066467285, "learning_rate": 3.655282392824229e-06, "loss": 0.7498507976531983, "memory(GiB)": 40.03, "step": 13060, "token_acc": 0.815230961298377, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.6075702132483092, "grad_norm": 6.031515121459961, "learning_rate": 3.6515795503905216e-06, "loss": 0.6623213291168213, "memory(GiB)": 40.03, "step": 13065, "token_acc": 0.8269918699186992, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.6078027315082588, "grad_norm": 8.771651268005371, "learning_rate": 3.647877505319456e-06, "loss": 0.5782103538513184, "memory(GiB)": 40.03, "step": 13070, "token_acc": 0.8602808786460209, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.6080352497682083, "grad_norm": 9.957589149475098, "learning_rate": 3.6441762598001706e-06, "loss": 0.5717226505279541, "memory(GiB)": 40.03, "step": 13075, "token_acc": 0.8529185867895546, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.608267768028158, "grad_norm": 9.755043983459473, "learning_rate": 3.640475816021319e-06, "loss": 0.6282239437103272, "memory(GiB)": 40.03, "step": 13080, "token_acc": 0.8373353989155693, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.6085002862881076, "grad_norm": 9.720813751220703, "learning_rate": 3.636776176171095e-06, "loss": 0.724711799621582, "memory(GiB)": 40.03, "step": 13085, "token_acc": 0.8197911938266, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.6087328045480571, "grad_norm": 7.896881580352783, "learning_rate": 3.6330773424372055e-06, "loss": 0.6664868354797363, "memory(GiB)": 40.03, "step": 13090, "token_acc": 0.8358640636297903, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.6089653228080067, "grad_norm": 7.925256729125977, "learning_rate": 3.6293793170068883e-06, "loss": 0.8178078651428222, "memory(GiB)": 40.03, "step": 13095, "token_acc": 0.7784163473818646, "train_speed(iter/s)": 0.095963 }, { "epoch": 0.6091978410679564, "grad_norm": 8.73812484741211, "learning_rate": 3.6256821020668944e-06, "loss": 0.5127421855926514, "memory(GiB)": 40.03, "step": 13100, "token_acc": 0.8643181025462155, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.6091978410679564, "eval_loss": 0.5807544589042664, "eval_runtime": 294.0126, "eval_samples_per_second": 11.819, "eval_steps_per_second": 11.819, "step": 13100 }, { "epoch": 0.609430359327906, "grad_norm": 7.597916126251221, "learning_rate": 3.621985699803508e-06, "loss": 0.6496970176696777, "memory(GiB)": 40.03, "step": 13105, "token_acc": 0.829230387169648, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.6096628775878555, "grad_norm": 7.8660101890563965, "learning_rate": 3.6182901124025205e-06, "loss": 0.5560397148132324, "memory(GiB)": 40.03, "step": 13110, "token_acc": 0.8621080468454855, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.6098953958478052, "grad_norm": 10.034387588500977, "learning_rate": 3.6145953420492506e-06, "loss": 0.5755732536315918, "memory(GiB)": 40.03, "step": 13115, "token_acc": 0.858712236801953, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.6101279141077548, "grad_norm": 6.889771938323975, "learning_rate": 3.6109013909285275e-06, "loss": 0.5321535587310791, "memory(GiB)": 40.03, "step": 13120, "token_acc": 0.8643453028654334, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.6103604323677044, "grad_norm": 9.340177536010742, "learning_rate": 3.6072082612247006e-06, "loss": 0.7592248916625977, "memory(GiB)": 40.03, "step": 13125, "token_acc": 0.804501175680215, "train_speed(iter/s)": 0.09588 }, { "epoch": 0.610592950627654, "grad_norm": 9.322178840637207, "learning_rate": 3.603515955121629e-06, "loss": 0.5940544128417968, "memory(GiB)": 40.03, "step": 13130, "token_acc": 0.8639663737103553, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.6108254688876036, "grad_norm": 9.264908790588379, "learning_rate": 3.599824474802689e-06, "loss": 0.6264768123626709, "memory(GiB)": 40.03, "step": 13135, "token_acc": 0.8332737030411449, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.6110579871475532, "grad_norm": 8.198921203613281, "learning_rate": 3.596133822450768e-06, "loss": 0.7603225231170654, "memory(GiB)": 40.03, "step": 13140, "token_acc": 0.8068370394955194, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.6112905054075027, "grad_norm": 10.895846366882324, "learning_rate": 3.5924440002482595e-06, "loss": 0.6804223537445069, "memory(GiB)": 40.03, "step": 13145, "token_acc": 0.8257232485186476, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.6115230236674524, "grad_norm": 11.369791030883789, "learning_rate": 3.588755010377074e-06, "loss": 0.7144616603851318, "memory(GiB)": 40.03, "step": 13150, "token_acc": 0.8176609369733738, "train_speed(iter/s)": 0.095982 }, { "epoch": 0.6115230236674524, "eval_loss": 0.5825229287147522, "eval_runtime": 293.787, "eval_samples_per_second": 11.828, "eval_steps_per_second": 11.828, "step": 13150 }, { "epoch": 0.611755541927402, "grad_norm": 8.719143867492676, "learning_rate": 3.58506685501862e-06, "loss": 0.7275121688842774, "memory(GiB)": 40.03, "step": 13155, "token_acc": 0.8287984493019456, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.6119880601873516, "grad_norm": 7.496494293212891, "learning_rate": 3.58137953635382e-06, "loss": 0.6511481285095215, "memory(GiB)": 40.03, "step": 13160, "token_acc": 0.8354237932345115, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.6122205784473012, "grad_norm": 7.9762773513793945, "learning_rate": 3.5776930565630985e-06, "loss": 0.571917200088501, "memory(GiB)": 40.03, "step": 13165, "token_acc": 0.8398208749569411, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.6124530967072508, "grad_norm": 8.024611473083496, "learning_rate": 3.5740074178263883e-06, "loss": 0.6731865406036377, "memory(GiB)": 40.03, "step": 13170, "token_acc": 0.838391502276176, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.6126856149672004, "grad_norm": 7.202920913696289, "learning_rate": 3.5703226223231164e-06, "loss": 0.622395133972168, "memory(GiB)": 40.03, "step": 13175, "token_acc": 0.8469915600326708, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.6129181332271499, "grad_norm": 10.924602508544922, "learning_rate": 3.5666386722322187e-06, "loss": 0.5626607418060303, "memory(GiB)": 40.03, "step": 13180, "token_acc": 0.8612159329140461, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.6131506514870996, "grad_norm": 7.208611965179443, "learning_rate": 3.5629555697321284e-06, "loss": 0.7295779228210449, "memory(GiB)": 40.03, "step": 13185, "token_acc": 0.8218286953379111, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.6133831697470492, "grad_norm": 9.755807876586914, "learning_rate": 3.559273317000779e-06, "loss": 0.5655300617218018, "memory(GiB)": 40.03, "step": 13190, "token_acc": 0.8583294877711122, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.6136156880069988, "grad_norm": 8.565410614013672, "learning_rate": 3.5555919162155968e-06, "loss": 0.5593877792358398, "memory(GiB)": 40.03, "step": 13195, "token_acc": 0.86209216279852, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.6138482062669484, "grad_norm": 9.327910423278809, "learning_rate": 3.5519113695535105e-06, "loss": 0.7032355785369873, "memory(GiB)": 40.03, "step": 13200, "token_acc": 0.8238464129336477, "train_speed(iter/s)": 0.095982 }, { "epoch": 0.6138482062669484, "eval_loss": 0.581161379814148, "eval_runtime": 292.8198, "eval_samples_per_second": 11.867, "eval_steps_per_second": 11.867, "step": 13200 }, { "epoch": 0.614080724526898, "grad_norm": 5.984133243560791, "learning_rate": 3.54823167919094e-06, "loss": 0.6683717727661133, "memory(GiB)": 40.03, "step": 13205, "token_acc": 0.8294327679302749, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.6143132427868476, "grad_norm": 7.557715892791748, "learning_rate": 3.5445528473038016e-06, "loss": 0.7758615970611572, "memory(GiB)": 40.03, "step": 13210, "token_acc": 0.8118092832333439, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.6145457610467973, "grad_norm": 7.7075514793396, "learning_rate": 3.540874876067499e-06, "loss": 0.5775607585906982, "memory(GiB)": 40.03, "step": 13215, "token_acc": 0.8622920517560074, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.6147782793067468, "grad_norm": 6.942788124084473, "learning_rate": 3.5371977676569323e-06, "loss": 0.5903857707977295, "memory(GiB)": 40.03, "step": 13220, "token_acc": 0.8548087634608244, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.6150107975666964, "grad_norm": 7.968829154968262, "learning_rate": 3.533521524246488e-06, "loss": 0.7749314785003663, "memory(GiB)": 40.03, "step": 13225, "token_acc": 0.8158175988599928, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.615243315826646, "grad_norm": 7.358787536621094, "learning_rate": 3.5298461480100456e-06, "loss": 0.7289624214172363, "memory(GiB)": 40.03, "step": 13230, "token_acc": 0.8090257023311417, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.6154758340865956, "grad_norm": 6.563849449157715, "learning_rate": 3.5261716411209632e-06, "loss": 0.6173213481903076, "memory(GiB)": 40.03, "step": 13235, "token_acc": 0.8423772609819121, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.6157083523465452, "grad_norm": 9.109136581420898, "learning_rate": 3.522498005752094e-06, "loss": 0.6493964195251465, "memory(GiB)": 40.03, "step": 13240, "token_acc": 0.8377281947261663, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.6159408706064948, "grad_norm": 9.26567554473877, "learning_rate": 3.5188252440757707e-06, "loss": 0.6601822376251221, "memory(GiB)": 40.03, "step": 13245, "token_acc": 0.8382547377699427, "train_speed(iter/s)": 0.09596 }, { "epoch": 0.6161733888664445, "grad_norm": 8.409039497375488, "learning_rate": 3.515153358263813e-06, "loss": 0.6446426391601563, "memory(GiB)": 40.03, "step": 13250, "token_acc": 0.8314216197427876, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.6161733888664445, "eval_loss": 0.5819464325904846, "eval_runtime": 294.1628, "eval_samples_per_second": 11.813, "eval_steps_per_second": 11.813, "step": 13250 }, { "epoch": 0.616405907126394, "grad_norm": 6.008672714233398, "learning_rate": 3.511482350487516e-06, "loss": 0.7331273078918457, "memory(GiB)": 40.03, "step": 13255, "token_acc": 0.8281281177418969, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.6166384253863436, "grad_norm": 6.8742828369140625, "learning_rate": 3.507812222917662e-06, "loss": 0.5720431327819824, "memory(GiB)": 40.03, "step": 13260, "token_acc": 0.8560767590618337, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.6168709436462932, "grad_norm": 9.703290939331055, "learning_rate": 3.504142977724512e-06, "loss": 0.6921139717102051, "memory(GiB)": 40.03, "step": 13265, "token_acc": 0.8224266006367174, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.6171034619062429, "grad_norm": 7.2310709953308105, "learning_rate": 3.5004746170778024e-06, "loss": 0.6525343418121338, "memory(GiB)": 40.03, "step": 13270, "token_acc": 0.8408477842003853, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.6173359801661924, "grad_norm": 7.915954113006592, "learning_rate": 3.496807143146751e-06, "loss": 0.7520557880401612, "memory(GiB)": 40.03, "step": 13275, "token_acc": 0.8154848046309696, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.617568498426142, "grad_norm": 7.646523475646973, "learning_rate": 3.493140558100043e-06, "loss": 0.5795305252075196, "memory(GiB)": 40.03, "step": 13280, "token_acc": 0.8563643441027637, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.6178010166860917, "grad_norm": 8.842355728149414, "learning_rate": 3.4894748641058483e-06, "loss": 0.7470812320709228, "memory(GiB)": 40.03, "step": 13285, "token_acc": 0.8251192368839427, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.6180335349460412, "grad_norm": 6.617765426635742, "learning_rate": 3.4858100633318014e-06, "loss": 0.7513682842254639, "memory(GiB)": 40.03, "step": 13290, "token_acc": 0.8246217331499313, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.6182660532059908, "grad_norm": 11.116613388061523, "learning_rate": 3.4821461579450166e-06, "loss": 0.6056669235229493, "memory(GiB)": 40.03, "step": 13295, "token_acc": 0.8522577816747041, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.6184985714659404, "grad_norm": 8.063444137573242, "learning_rate": 3.4784831501120687e-06, "loss": 0.6425219058990479, "memory(GiB)": 40.03, "step": 13300, "token_acc": 0.8359303391384051, "train_speed(iter/s)": 0.095976 }, { "epoch": 0.6184985714659404, "eval_loss": 0.5811650156974792, "eval_runtime": 293.3389, "eval_samples_per_second": 11.846, "eval_steps_per_second": 11.846, "step": 13300 }, { "epoch": 0.6187310897258901, "grad_norm": 8.379485130310059, "learning_rate": 3.4748210419990116e-06, "loss": 0.7794717311859131, "memory(GiB)": 40.03, "step": 13305, "token_acc": 0.8279750441660964, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.6189636079858396, "grad_norm": 8.657197952270508, "learning_rate": 3.4711598357713607e-06, "loss": 0.775357723236084, "memory(GiB)": 40.03, "step": 13310, "token_acc": 0.7870988242770893, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.6191961262457892, "grad_norm": 7.7351765632629395, "learning_rate": 3.467499533594102e-06, "loss": 0.6825555324554443, "memory(GiB)": 40.03, "step": 13315, "token_acc": 0.8237533307955843, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.6194286445057389, "grad_norm": 11.018149375915527, "learning_rate": 3.463840137631682e-06, "loss": 0.6458070278167725, "memory(GiB)": 40.03, "step": 13320, "token_acc": 0.8359375, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.6196611627656884, "grad_norm": 5.231941223144531, "learning_rate": 3.4601816500480188e-06, "loss": 0.643475866317749, "memory(GiB)": 40.03, "step": 13325, "token_acc": 0.8257418909592823, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.619893681025638, "grad_norm": 9.33646297454834, "learning_rate": 3.456524073006485e-06, "loss": 0.6933014869689942, "memory(GiB)": 40.03, "step": 13330, "token_acc": 0.829021372328459, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.6201261992855877, "grad_norm": 7.163576126098633, "learning_rate": 3.4528674086699234e-06, "loss": 0.6457261085510254, "memory(GiB)": 40.03, "step": 13335, "token_acc": 0.8344663494221618, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.6203587175455373, "grad_norm": 5.749244689941406, "learning_rate": 3.4492116592006274e-06, "loss": 0.6414624214172363, "memory(GiB)": 40.03, "step": 13340, "token_acc": 0.8356120826709063, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.6205912358054868, "grad_norm": 8.368809700012207, "learning_rate": 3.4455568267603577e-06, "loss": 0.5333932399749756, "memory(GiB)": 40.03, "step": 13345, "token_acc": 0.8651933701657458, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.6208237540654364, "grad_norm": 6.098763942718506, "learning_rate": 3.4419029135103288e-06, "loss": 0.7009202003479004, "memory(GiB)": 40.03, "step": 13350, "token_acc": 0.8158658497258948, "train_speed(iter/s)": 0.095977 }, { "epoch": 0.6208237540654364, "eval_loss": 0.5797573328018188, "eval_runtime": 293.8327, "eval_samples_per_second": 11.826, "eval_steps_per_second": 11.826, "step": 13350 }, { "epoch": 0.6210562723253861, "grad_norm": 5.722322463989258, "learning_rate": 3.438249921611214e-06, "loss": 0.6894258499145508, "memory(GiB)": 40.03, "step": 13355, "token_acc": 0.8296364362764682, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.6212887905853357, "grad_norm": 8.34636402130127, "learning_rate": 3.4345978532231367e-06, "loss": 0.6312924861907959, "memory(GiB)": 40.03, "step": 13360, "token_acc": 0.8543487307532251, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.6215213088452852, "grad_norm": 6.663353443145752, "learning_rate": 3.4309467105056802e-06, "loss": 0.616228437423706, "memory(GiB)": 40.03, "step": 13365, "token_acc": 0.8325808878856283, "train_speed(iter/s)": 0.095835 }, { "epoch": 0.6217538271052349, "grad_norm": 9.392459869384766, "learning_rate": 3.4272964956178774e-06, "loss": 0.7160651683807373, "memory(GiB)": 40.03, "step": 13370, "token_acc": 0.825006825006825, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.6219863453651845, "grad_norm": 6.7092366218566895, "learning_rate": 3.423647210718214e-06, "loss": 0.6473873138427735, "memory(GiB)": 40.03, "step": 13375, "token_acc": 0.8419076229815997, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.622218863625134, "grad_norm": 8.367863655090332, "learning_rate": 3.4199988579646226e-06, "loss": 0.7488775253295898, "memory(GiB)": 40.03, "step": 13380, "token_acc": 0.8174308137133416, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.6224513818850836, "grad_norm": 7.5913777351379395, "learning_rate": 3.4163514395144892e-06, "loss": 0.5292999744415283, "memory(GiB)": 40.03, "step": 13385, "token_acc": 0.8726533166458073, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.6226839001450333, "grad_norm": 7.731056213378906, "learning_rate": 3.4127049575246417e-06, "loss": 0.7689545154571533, "memory(GiB)": 40.03, "step": 13390, "token_acc": 0.821762349799733, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.6229164184049829, "grad_norm": 8.29542064666748, "learning_rate": 3.409059414151361e-06, "loss": 0.6694583892822266, "memory(GiB)": 40.03, "step": 13395, "token_acc": 0.830316047867444, "train_speed(iter/s)": 0.095958 }, { "epoch": 0.6231489366649324, "grad_norm": 8.759634017944336, "learning_rate": 3.4054148115503695e-06, "loss": 0.7301998138427734, "memory(GiB)": 40.03, "step": 13400, "token_acc": 0.8189996401583304, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.6231489366649324, "eval_loss": 0.5786752700805664, "eval_runtime": 293.4147, "eval_samples_per_second": 11.843, "eval_steps_per_second": 11.843, "step": 13400 }, { "epoch": 0.6233814549248821, "grad_norm": 7.837320327758789, "learning_rate": 3.4017711518768293e-06, "loss": 0.787266206741333, "memory(GiB)": 40.03, "step": 13405, "token_acc": 0.8289615867959114, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.6236139731848317, "grad_norm": 9.125218391418457, "learning_rate": 3.398128437285353e-06, "loss": 0.6132421016693115, "memory(GiB)": 40.03, "step": 13410, "token_acc": 0.8308845577211394, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.6238464914447813, "grad_norm": 6.4724273681640625, "learning_rate": 3.3944866699299872e-06, "loss": 0.49155464172363283, "memory(GiB)": 40.03, "step": 13415, "token_acc": 0.8774436090225564, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.6240790097047308, "grad_norm": 7.698394298553467, "learning_rate": 3.3908458519642252e-06, "loss": 0.6231107711791992, "memory(GiB)": 40.03, "step": 13420, "token_acc": 0.8484301696138579, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.6243115279646805, "grad_norm": 9.802258491516113, "learning_rate": 3.3872059855409893e-06, "loss": 0.6324323177337646, "memory(GiB)": 40.03, "step": 13425, "token_acc": 0.8554396423248882, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.6245440462246301, "grad_norm": 8.112380981445312, "learning_rate": 3.383567072812651e-06, "loss": 0.6585474491119385, "memory(GiB)": 40.03, "step": 13430, "token_acc": 0.8506078055022392, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.6247765644845796, "grad_norm": 7.874328136444092, "learning_rate": 3.3799291159310077e-06, "loss": 0.5667964935302734, "memory(GiB)": 40.03, "step": 13435, "token_acc": 0.8522376543209876, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.6250090827445293, "grad_norm": 9.904825210571289, "learning_rate": 3.3762921170472973e-06, "loss": 0.6364833354949951, "memory(GiB)": 40.03, "step": 13440, "token_acc": 0.8434704830053668, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.6252416010044789, "grad_norm": 11.402176856994629, "learning_rate": 3.372656078312189e-06, "loss": 0.7669492244720459, "memory(GiB)": 40.03, "step": 13445, "token_acc": 0.8166441136671178, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.6254741192644285, "grad_norm": 8.771894454956055, "learning_rate": 3.3690210018757864e-06, "loss": 0.5411834239959716, "memory(GiB)": 40.03, "step": 13450, "token_acc": 0.8685134607881388, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.6254741192644285, "eval_loss": 0.5777615904808044, "eval_runtime": 290.3126, "eval_samples_per_second": 11.97, "eval_steps_per_second": 11.97, "step": 13450 }, { "epoch": 0.625706637524378, "grad_norm": 9.126480102539062, "learning_rate": 3.3653868898876187e-06, "loss": 0.5853212833404541, "memory(GiB)": 40.03, "step": 13455, "token_acc": 0.8296562545029539, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.6259391557843277, "grad_norm": 9.626083374023438, "learning_rate": 3.3617537444966515e-06, "loss": 0.576531457901001, "memory(GiB)": 40.03, "step": 13460, "token_acc": 0.8597442851607904, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.6261716740442773, "grad_norm": 11.668168067932129, "learning_rate": 3.358121567851274e-06, "loss": 0.6511495590209961, "memory(GiB)": 40.03, "step": 13465, "token_acc": 0.8036732108929703, "train_speed(iter/s)": 0.095841 }, { "epoch": 0.6264041923042268, "grad_norm": 6.140625, "learning_rate": 3.354490362099308e-06, "loss": 0.7017635822296142, "memory(GiB)": 40.03, "step": 13470, "token_acc": 0.8272281511393135, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.6266367105641765, "grad_norm": 8.743377685546875, "learning_rate": 3.350860129387993e-06, "loss": 0.7101643562316895, "memory(GiB)": 40.03, "step": 13475, "token_acc": 0.8201840894148587, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.6268692288241261, "grad_norm": 6.7029337882995605, "learning_rate": 3.347230871864e-06, "loss": 0.6393361568450928, "memory(GiB)": 40.03, "step": 13480, "token_acc": 0.835424883470778, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.6271017470840757, "grad_norm": 9.114521026611328, "learning_rate": 3.3436025916734207e-06, "loss": 0.6931623458862305, "memory(GiB)": 40.03, "step": 13485, "token_acc": 0.8219291014014839, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.6273342653440253, "grad_norm": 7.915518283843994, "learning_rate": 3.339975290961771e-06, "loss": 0.6445385456085205, "memory(GiB)": 40.03, "step": 13490, "token_acc": 0.8270348837209303, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.6275667836039749, "grad_norm": 10.833139419555664, "learning_rate": 3.3363489718739817e-06, "loss": 0.7753934383392334, "memory(GiB)": 43.68, "step": 13495, "token_acc": 0.7683600947617795, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.6277993018639245, "grad_norm": 7.672884464263916, "learning_rate": 3.3327236365544115e-06, "loss": 0.7468546390533447, "memory(GiB)": 43.68, "step": 13500, "token_acc": 0.8008849557522124, "train_speed(iter/s)": 0.095979 }, { "epoch": 0.6277993018639245, "eval_loss": 0.5770713686943054, "eval_runtime": 293.1909, "eval_samples_per_second": 11.852, "eval_steps_per_second": 11.852, "step": 13500 }, { "epoch": 0.6280318201238742, "grad_norm": 8.18341064453125, "learning_rate": 3.3290992871468286e-06, "loss": 0.5357805252075195, "memory(GiB)": 43.68, "step": 13505, "token_acc": 0.8304309882384967, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.6282643383838237, "grad_norm": 8.50880241394043, "learning_rate": 3.3254759257944284e-06, "loss": 0.5876242637634277, "memory(GiB)": 43.68, "step": 13510, "token_acc": 0.8617998163452709, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.6284968566437733, "grad_norm": 8.151193618774414, "learning_rate": 3.321853554639811e-06, "loss": 0.6524288654327393, "memory(GiB)": 43.68, "step": 13515, "token_acc": 0.8302622253720765, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.6287293749037229, "grad_norm": 11.721169471740723, "learning_rate": 3.3182321758249997e-06, "loss": 0.5465070724487304, "memory(GiB)": 43.68, "step": 13520, "token_acc": 0.8517136070580251, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.6289618931636725, "grad_norm": 5.964034557342529, "learning_rate": 3.3146117914914257e-06, "loss": 0.628056812286377, "memory(GiB)": 43.68, "step": 13525, "token_acc": 0.84767393989296, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.6291944114236221, "grad_norm": 6.987983703613281, "learning_rate": 3.310992403779934e-06, "loss": 0.6603175640106201, "memory(GiB)": 43.68, "step": 13530, "token_acc": 0.8353322528363047, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.6294269296835717, "grad_norm": 8.833318710327148, "learning_rate": 3.3073740148307833e-06, "loss": 0.6114417552947998, "memory(GiB)": 43.68, "step": 13535, "token_acc": 0.8437158469945355, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.6296594479435214, "grad_norm": 7.671047210693359, "learning_rate": 3.3037566267836347e-06, "loss": 0.6404377460479737, "memory(GiB)": 43.68, "step": 13540, "token_acc": 0.8379737045630317, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.6298919662034709, "grad_norm": 8.291642189025879, "learning_rate": 3.300140241777564e-06, "loss": 0.5734952449798584, "memory(GiB)": 43.68, "step": 13545, "token_acc": 0.8604651162790697, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.6301244844634205, "grad_norm": 8.439014434814453, "learning_rate": 3.2965248619510494e-06, "loss": 0.6413207530975342, "memory(GiB)": 43.68, "step": 13550, "token_acc": 0.8401782871696912, "train_speed(iter/s)": 0.095982 }, { "epoch": 0.6301244844634205, "eval_loss": 0.578080952167511, "eval_runtime": 289.0829, "eval_samples_per_second": 12.021, "eval_steps_per_second": 12.021, "step": 13550 }, { "epoch": 0.6303570027233701, "grad_norm": 6.432934761047363, "learning_rate": 3.2929104894419806e-06, "loss": 0.7349601745605469, "memory(GiB)": 43.68, "step": 13555, "token_acc": 0.8285882287503278, "train_speed(iter/s)": 0.095805 }, { "epoch": 0.6305895209833198, "grad_norm": 7.739002227783203, "learning_rate": 3.2892971263876416e-06, "loss": 0.5691585540771484, "memory(GiB)": 43.68, "step": 13560, "token_acc": 0.8558882235528942, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.6308220392432693, "grad_norm": 8.967711448669434, "learning_rate": 3.28568477492473e-06, "loss": 0.7265130996704101, "memory(GiB)": 43.68, "step": 13565, "token_acc": 0.8324140857021638, "train_speed(iter/s)": 0.095845 }, { "epoch": 0.6310545575032189, "grad_norm": 7.715224266052246, "learning_rate": 3.2820734371893394e-06, "loss": 0.6721633434295654, "memory(GiB)": 43.68, "step": 13570, "token_acc": 0.8380987746008169, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.6312870757631686, "grad_norm": 6.480725288391113, "learning_rate": 3.2784631153169667e-06, "loss": 0.6947125434875489, "memory(GiB)": 43.68, "step": 13575, "token_acc": 0.8141641504254735, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.6315195940231181, "grad_norm": 8.174640655517578, "learning_rate": 3.274853811442503e-06, "loss": 0.6244683742523194, "memory(GiB)": 43.68, "step": 13580, "token_acc": 0.843908135461269, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.6317521122830677, "grad_norm": 10.560827255249023, "learning_rate": 3.271245527700245e-06, "loss": 0.6520066738128663, "memory(GiB)": 43.68, "step": 13585, "token_acc": 0.8298475717830557, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.6319846305430173, "grad_norm": 6.402472972869873, "learning_rate": 3.2676382662238792e-06, "loss": 0.6191014289855957, "memory(GiB)": 43.68, "step": 13590, "token_acc": 0.8441325768886234, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.632217148802967, "grad_norm": 9.326212882995605, "learning_rate": 3.264032029146495e-06, "loss": 0.8372744560241699, "memory(GiB)": 43.68, "step": 13595, "token_acc": 0.80083857442348, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.6324496670629165, "grad_norm": 5.53291130065918, "learning_rate": 3.260426818600566e-06, "loss": 0.6795487403869629, "memory(GiB)": 43.68, "step": 13600, "token_acc": 0.8197094844773569, "train_speed(iter/s)": 0.095986 }, { "epoch": 0.6324496670629165, "eval_loss": 0.5772190690040588, "eval_runtime": 295.8738, "eval_samples_per_second": 11.745, "eval_steps_per_second": 11.745, "step": 13600 }, { "epoch": 0.6326821853228661, "grad_norm": 5.555092811584473, "learning_rate": 3.2568226367179695e-06, "loss": 0.7151205539703369, "memory(GiB)": 43.68, "step": 13605, "token_acc": 0.8287276524178828, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.6329147035828158, "grad_norm": 8.747469902038574, "learning_rate": 3.253219485629966e-06, "loss": 0.7140115737915039, "memory(GiB)": 43.68, "step": 13610, "token_acc": 0.8367752184273233, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.6331472218427653, "grad_norm": 6.2085747718811035, "learning_rate": 3.249617367467214e-06, "loss": 0.6731038570404053, "memory(GiB)": 43.68, "step": 13615, "token_acc": 0.8313609467455622, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.6333797401027149, "grad_norm": 9.040670394897461, "learning_rate": 3.246016284359752e-06, "loss": 0.7263114929199219, "memory(GiB)": 43.68, "step": 13620, "token_acc": 0.8259759211966435, "train_speed(iter/s)": 0.095864 }, { "epoch": 0.6336122583626645, "grad_norm": 9.780359268188477, "learning_rate": 3.242416238437015e-06, "loss": 0.631324291229248, "memory(GiB)": 43.68, "step": 13625, "token_acc": 0.8524399690162665, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.6338447766226142, "grad_norm": 8.346222877502441, "learning_rate": 3.23881723182782e-06, "loss": 0.5538475036621093, "memory(GiB)": 43.68, "step": 13630, "token_acc": 0.8669991687448046, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.6340772948825637, "grad_norm": 7.389737129211426, "learning_rate": 3.2352192666603733e-06, "loss": 0.6332985877990722, "memory(GiB)": 43.68, "step": 13635, "token_acc": 0.8622803432774826, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.6343098131425133, "grad_norm": 7.637555122375488, "learning_rate": 3.231622345062259e-06, "loss": 0.5670449733734131, "memory(GiB)": 43.68, "step": 13640, "token_acc": 0.8635014836795252, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.634542331402463, "grad_norm": 8.666919708251953, "learning_rate": 3.2280264691604505e-06, "loss": 0.6867550373077392, "memory(GiB)": 43.68, "step": 13645, "token_acc": 0.8319641523525019, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.6347748496624126, "grad_norm": 9.447999000549316, "learning_rate": 3.224431641081298e-06, "loss": 0.6106939792633057, "memory(GiB)": 43.68, "step": 13650, "token_acc": 0.8417582417582418, "train_speed(iter/s)": 0.095983 }, { "epoch": 0.6347748496624126, "eval_loss": 0.5787781476974487, "eval_runtime": 291.6325, "eval_samples_per_second": 11.916, "eval_steps_per_second": 11.916, "step": 13650 }, { "epoch": 0.6350073679223621, "grad_norm": 8.59967041015625, "learning_rate": 3.2208378629505366e-06, "loss": 0.7382328510284424, "memory(GiB)": 43.68, "step": 13655, "token_acc": 0.8293154714326821, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.6352398861823118, "grad_norm": 9.68194580078125, "learning_rate": 3.217245136893279e-06, "loss": 0.7147025108337403, "memory(GiB)": 43.68, "step": 13660, "token_acc": 0.8270995059985886, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.6354724044422614, "grad_norm": 6.674454212188721, "learning_rate": 3.2136534650340117e-06, "loss": 0.700990867614746, "memory(GiB)": 43.68, "step": 13665, "token_acc": 0.8329571106094809, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.6357049227022109, "grad_norm": 7.11359167098999, "learning_rate": 3.2100628494966033e-06, "loss": 0.7956765651702881, "memory(GiB)": 43.68, "step": 13670, "token_acc": 0.7988103568929321, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.6359374409621605, "grad_norm": 8.432509422302246, "learning_rate": 3.206473292404295e-06, "loss": 0.6146938323974609, "memory(GiB)": 43.68, "step": 13675, "token_acc": 0.8451695457453615, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.6361699592221102, "grad_norm": 7.3090901374816895, "learning_rate": 3.202884795879705e-06, "loss": 0.6174387454986572, "memory(GiB)": 43.68, "step": 13680, "token_acc": 0.8505385996409336, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.6364024774820598, "grad_norm": 9.991720199584961, "learning_rate": 3.1992973620448178e-06, "loss": 0.6521985054016113, "memory(GiB)": 43.68, "step": 13685, "token_acc": 0.8370843130668458, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.6366349957420093, "grad_norm": 6.456875324249268, "learning_rate": 3.1957109930209972e-06, "loss": 0.6120789051055908, "memory(GiB)": 43.68, "step": 13690, "token_acc": 0.845568783068783, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.636867514001959, "grad_norm": 6.283238410949707, "learning_rate": 3.1921256909289717e-06, "loss": 0.5926040172576904, "memory(GiB)": 43.68, "step": 13695, "token_acc": 0.8515030785947121, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.6371000322619086, "grad_norm": 6.858433246612549, "learning_rate": 3.188541457888844e-06, "loss": 0.6875529289245605, "memory(GiB)": 43.68, "step": 13700, "token_acc": 0.8313755210306935, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.6371000322619086, "eval_loss": 0.5758278965950012, "eval_runtime": 293.8539, "eval_samples_per_second": 11.826, "eval_steps_per_second": 11.826, "step": 13700 }, { "epoch": 0.6373325505218582, "grad_norm": 8.783116340637207, "learning_rate": 3.184958296020078e-06, "loss": 0.5677198886871337, "memory(GiB)": 43.68, "step": 13705, "token_acc": 0.8306381276255251, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.6375650687818077, "grad_norm": 8.005859375, "learning_rate": 3.181376207441511e-06, "loss": 0.647659158706665, "memory(GiB)": 43.68, "step": 13710, "token_acc": 0.840589417280643, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.6377975870417574, "grad_norm": 6.116490840911865, "learning_rate": 3.1777951942713407e-06, "loss": 0.643339729309082, "memory(GiB)": 43.68, "step": 13715, "token_acc": 0.8404487379245871, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.638030105301707, "grad_norm": 8.654963493347168, "learning_rate": 3.1742152586271336e-06, "loss": 0.6308553695678711, "memory(GiB)": 43.68, "step": 13720, "token_acc": 0.8474514118078474, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.6382626235616565, "grad_norm": 8.51541519165039, "learning_rate": 3.170636402625812e-06, "loss": 0.6695310592651367, "memory(GiB)": 43.68, "step": 13725, "token_acc": 0.8303973781237198, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.6384951418216062, "grad_norm": 8.290445327758789, "learning_rate": 3.167058628383667e-06, "loss": 0.6182727336883544, "memory(GiB)": 43.68, "step": 13730, "token_acc": 0.8479587048334115, "train_speed(iter/s)": 0.095906 }, { "epoch": 0.6387276600815558, "grad_norm": 6.582568645477295, "learning_rate": 3.163481938016345e-06, "loss": 0.5673922538757324, "memory(GiB)": 43.68, "step": 13735, "token_acc": 0.8598392170569731, "train_speed(iter/s)": 0.095925 }, { "epoch": 0.6389601783415054, "grad_norm": 7.43300724029541, "learning_rate": 3.159906333638856e-06, "loss": 0.6439460754394531, "memory(GiB)": 43.68, "step": 13740, "token_acc": 0.8429054054054054, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.639192696601455, "grad_norm": 9.422440528869629, "learning_rate": 3.1563318173655623e-06, "loss": 0.5517416000366211, "memory(GiB)": 43.68, "step": 13745, "token_acc": 0.8609561752988047, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.6394252148614046, "grad_norm": 9.674040794372559, "learning_rate": 3.1527583913101878e-06, "loss": 0.6156484127044678, "memory(GiB)": 43.68, "step": 13750, "token_acc": 0.8474264705882353, "train_speed(iter/s)": 0.095984 }, { "epoch": 0.6394252148614046, "eval_loss": 0.5760475397109985, "eval_runtime": 294.3994, "eval_samples_per_second": 11.804, "eval_steps_per_second": 11.804, "step": 13750 }, { "epoch": 0.6396577331213542, "grad_norm": 7.030127048492432, "learning_rate": 3.1491860575858084e-06, "loss": 0.6011571884155273, "memory(GiB)": 43.68, "step": 13755, "token_acc": 0.8305973680200807, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.6398902513813038, "grad_norm": 9.282323837280273, "learning_rate": 3.1456148183048583e-06, "loss": 0.6616458892822266, "memory(GiB)": 43.68, "step": 13760, "token_acc": 0.8436350257542311, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.6401227696412534, "grad_norm": 6.603138446807861, "learning_rate": 3.1420446755791157e-06, "loss": 0.7302883625030517, "memory(GiB)": 43.68, "step": 13765, "token_acc": 0.8173349534224382, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.640355287901203, "grad_norm": 5.0972137451171875, "learning_rate": 3.138475631519723e-06, "loss": 0.859835147857666, "memory(GiB)": 43.68, "step": 13770, "token_acc": 0.7932816537467701, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.6405878061611526, "grad_norm": 8.140726089477539, "learning_rate": 3.1349076882371597e-06, "loss": 0.6677399635314941, "memory(GiB)": 43.68, "step": 13775, "token_acc": 0.8311345646437994, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.6408203244211021, "grad_norm": 9.439065933227539, "learning_rate": 3.1313408478412677e-06, "loss": 0.5588486671447754, "memory(GiB)": 43.68, "step": 13780, "token_acc": 0.8525798525798526, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.6410528426810518, "grad_norm": 6.783348083496094, "learning_rate": 3.127775112441222e-06, "loss": 0.696048355102539, "memory(GiB)": 43.68, "step": 13785, "token_acc": 0.8311071534579995, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.6412853609410014, "grad_norm": 6.2189531326293945, "learning_rate": 3.124210484145558e-06, "loss": 0.7082841396331787, "memory(GiB)": 43.68, "step": 13790, "token_acc": 0.8230918499353169, "train_speed(iter/s)": 0.095942 }, { "epoch": 0.641517879200951, "grad_norm": 8.979401588439941, "learning_rate": 3.1206469650621496e-06, "loss": 0.7509594440460206, "memory(GiB)": 43.68, "step": 13795, "token_acc": 0.8189786585365854, "train_speed(iter/s)": 0.095962 }, { "epoch": 0.6417503974609006, "grad_norm": 6.314192295074463, "learning_rate": 3.117084557298213e-06, "loss": 0.6499105453491211, "memory(GiB)": 43.68, "step": 13800, "token_acc": 0.8217032184562802, "train_speed(iter/s)": 0.095981 }, { "epoch": 0.6417503974609006, "eval_loss": 0.5744973421096802, "eval_runtime": 294.748, "eval_samples_per_second": 11.79, "eval_steps_per_second": 11.79, "step": 13800 }, { "epoch": 0.6419829157208502, "grad_norm": 6.625218391418457, "learning_rate": 3.113523262960313e-06, "loss": 0.7195854663848877, "memory(GiB)": 43.68, "step": 13805, "token_acc": 0.8297727636421828, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.6422154339807998, "grad_norm": 7.311429500579834, "learning_rate": 3.1099630841543504e-06, "loss": 0.6338868618011475, "memory(GiB)": 43.68, "step": 13810, "token_acc": 0.8433029908972692, "train_speed(iter/s)": 0.095823 }, { "epoch": 0.6424479522407494, "grad_norm": 12.813942909240723, "learning_rate": 3.106404022985572e-06, "loss": 0.6633894443511963, "memory(GiB)": 43.68, "step": 13815, "token_acc": 0.8306645316253003, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.642680470500699, "grad_norm": 6.483264446258545, "learning_rate": 3.102846081558556e-06, "loss": 0.7106656551361084, "memory(GiB)": 43.68, "step": 13820, "token_acc": 0.8238815374921235, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.6429129887606486, "grad_norm": 8.264899253845215, "learning_rate": 3.099289261977227e-06, "loss": 0.8299205780029297, "memory(GiB)": 43.68, "step": 13825, "token_acc": 0.7785035629453682, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.6431455070205983, "grad_norm": 8.50506591796875, "learning_rate": 3.0957335663448397e-06, "loss": 0.6548618793487548, "memory(GiB)": 43.68, "step": 13830, "token_acc": 0.8282962470939887, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.6433780252805478, "grad_norm": 6.441330432891846, "learning_rate": 3.0921789967639893e-06, "loss": 0.6297261238098144, "memory(GiB)": 43.68, "step": 13835, "token_acc": 0.8375950241879752, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.6436105435404974, "grad_norm": 9.392109870910645, "learning_rate": 3.088625555336599e-06, "loss": 0.768109655380249, "memory(GiB)": 43.68, "step": 13840, "token_acc": 0.8078817733990148, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.643843061800447, "grad_norm": 8.62150764465332, "learning_rate": 3.085073244163932e-06, "loss": 0.7165932178497314, "memory(GiB)": 43.68, "step": 13845, "token_acc": 0.8293402137854773, "train_speed(iter/s)": 0.095958 }, { "epoch": 0.6440755800603967, "grad_norm": 8.874046325683594, "learning_rate": 3.081522065346576e-06, "loss": 0.6085056304931641, "memory(GiB)": 43.68, "step": 13850, "token_acc": 0.8465703971119134, "train_speed(iter/s)": 0.095978 }, { "epoch": 0.6440755800603967, "eval_loss": 0.5765819549560547, "eval_runtime": 291.2604, "eval_samples_per_second": 11.931, "eval_steps_per_second": 11.931, "step": 13850 }, { "epoch": 0.6443080983203462, "grad_norm": 8.001680374145508, "learning_rate": 3.077972020984458e-06, "loss": 0.6871384620666504, "memory(GiB)": 43.68, "step": 13855, "token_acc": 0.8298419082732809, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.6445406165802958, "grad_norm": 5.726327419281006, "learning_rate": 3.074423113176822e-06, "loss": 0.6752862453460693, "memory(GiB)": 43.68, "step": 13860, "token_acc": 0.8349722103463019, "train_speed(iter/s)": 0.095823 }, { "epoch": 0.6447731348402455, "grad_norm": 6.493947505950928, "learning_rate": 3.070875344022252e-06, "loss": 0.6672042369842529, "memory(GiB)": 43.68, "step": 13865, "token_acc": 0.8307134220072552, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.645005653100195, "grad_norm": 8.697665214538574, "learning_rate": 3.0673287156186503e-06, "loss": 0.6020650386810302, "memory(GiB)": 43.68, "step": 13870, "token_acc": 0.8491048593350383, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.6452381713601446, "grad_norm": 7.544933319091797, "learning_rate": 3.063783230063252e-06, "loss": 0.6294498920440674, "memory(GiB)": 43.68, "step": 13875, "token_acc": 0.8423893486865779, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.6454706896200942, "grad_norm": 9.882427215576172, "learning_rate": 3.060238889452607e-06, "loss": 0.6869438648223877, "memory(GiB)": 43.68, "step": 13880, "token_acc": 0.8407671721677074, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.6457032078800439, "grad_norm": 7.94822359085083, "learning_rate": 3.0566956958825965e-06, "loss": 0.6243470191955567, "memory(GiB)": 43.68, "step": 13885, "token_acc": 0.8395860284605433, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.6459357261399934, "grad_norm": 8.722342491149902, "learning_rate": 3.0531536514484183e-06, "loss": 0.759521484375, "memory(GiB)": 43.68, "step": 13890, "token_acc": 0.8233936129280492, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.646168244399943, "grad_norm": 8.995368003845215, "learning_rate": 3.0496127582445955e-06, "loss": 0.6832744598388671, "memory(GiB)": 43.68, "step": 13895, "token_acc": 0.8318135764944276, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.6464007626598927, "grad_norm": 8.118508338928223, "learning_rate": 3.0460730183649646e-06, "loss": 0.6283859252929688, "memory(GiB)": 43.68, "step": 13900, "token_acc": 0.8495370370370371, "train_speed(iter/s)": 0.095978 }, { "epoch": 0.6464007626598927, "eval_loss": 0.5750031471252441, "eval_runtime": 292.3705, "eval_samples_per_second": 11.886, "eval_steps_per_second": 11.886, "step": 13900 }, { "epoch": 0.6466332809198423, "grad_norm": 6.906221389770508, "learning_rate": 3.0425344339026842e-06, "loss": 0.6433838367462158, "memory(GiB)": 43.68, "step": 13905, "token_acc": 0.8298588838999359, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.6468657991797918, "grad_norm": 7.201536178588867, "learning_rate": 3.0389970069502282e-06, "loss": 0.5808377742767334, "memory(GiB)": 43.68, "step": 13910, "token_acc": 0.8401525658807212, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.6470983174397414, "grad_norm": 6.924560070037842, "learning_rate": 3.0354607395993897e-06, "loss": 0.5945795059204102, "memory(GiB)": 43.68, "step": 13915, "token_acc": 0.849733570159858, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.6473308356996911, "grad_norm": 9.050676345825195, "learning_rate": 3.031925633941267e-06, "loss": 0.68319673538208, "memory(GiB)": 43.68, "step": 13920, "token_acc": 0.8365591397849462, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.6475633539596406, "grad_norm": 8.45664119720459, "learning_rate": 3.02839169206628e-06, "loss": 0.5608391761779785, "memory(GiB)": 43.68, "step": 13925, "token_acc": 0.8556187766714083, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.6477958722195902, "grad_norm": 6.573840618133545, "learning_rate": 3.024858916064158e-06, "loss": 0.664864444732666, "memory(GiB)": 43.68, "step": 13930, "token_acc": 0.8237134909596662, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.6480283904795399, "grad_norm": 9.13306713104248, "learning_rate": 3.0213273080239407e-06, "loss": 0.6471784114837646, "memory(GiB)": 43.68, "step": 13935, "token_acc": 0.8430813124108416, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.6482609087394895, "grad_norm": 8.26430892944336, "learning_rate": 3.0177968700339783e-06, "loss": 0.6823818683624268, "memory(GiB)": 43.68, "step": 13940, "token_acc": 0.8463696948439144, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.648493426999439, "grad_norm": 7.3765435218811035, "learning_rate": 3.0142676041819235e-06, "loss": 0.5814319610595703, "memory(GiB)": 43.68, "step": 13945, "token_acc": 0.8620386643233744, "train_speed(iter/s)": 0.095959 }, { "epoch": 0.6487259452593886, "grad_norm": 6.571820259094238, "learning_rate": 3.010739512554744e-06, "loss": 0.6527836322784424, "memory(GiB)": 43.68, "step": 13950, "token_acc": 0.8295980078263963, "train_speed(iter/s)": 0.095978 }, { "epoch": 0.6487259452593886, "eval_loss": 0.574194610118866, "eval_runtime": 295.9557, "eval_samples_per_second": 11.742, "eval_steps_per_second": 11.742, "step": 13950 }, { "epoch": 0.6489584635193383, "grad_norm": 7.557634353637695, "learning_rate": 3.0072125972387066e-06, "loss": 0.562659215927124, "memory(GiB)": 43.68, "step": 13955, "token_acc": 0.8313536907910556, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.6491909817792878, "grad_norm": 10.294422149658203, "learning_rate": 3.0036868603193894e-06, "loss": 0.664063835144043, "memory(GiB)": 43.68, "step": 13960, "token_acc": 0.8424485699949824, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.6494235000392374, "grad_norm": 9.17708969116211, "learning_rate": 3.000162303881664e-06, "loss": 0.6378396511077881, "memory(GiB)": 43.68, "step": 13965, "token_acc": 0.8550347222222222, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.6496560182991871, "grad_norm": 7.761695384979248, "learning_rate": 2.996638930009713e-06, "loss": 0.6696043491363526, "memory(GiB)": 43.68, "step": 13970, "token_acc": 0.8345890410958904, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.6498885365591367, "grad_norm": 8.447530746459961, "learning_rate": 2.9931167407870142e-06, "loss": 0.7301533222198486, "memory(GiB)": 43.68, "step": 13975, "token_acc": 0.8180955393862702, "train_speed(iter/s)": 0.095878 }, { "epoch": 0.6501210548190862, "grad_norm": 5.710422992706299, "learning_rate": 2.9895957382963507e-06, "loss": 0.6815677642822265, "memory(GiB)": 43.68, "step": 13980, "token_acc": 0.8277608915906788, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.6503535730790359, "grad_norm": 7.333799839019775, "learning_rate": 2.9860759246197948e-06, "loss": 0.6125518321990967, "memory(GiB)": 43.68, "step": 13985, "token_acc": 0.839831401475237, "train_speed(iter/s)": 0.095915 }, { "epoch": 0.6505860913389855, "grad_norm": 5.481876850128174, "learning_rate": 2.9825573018387245e-06, "loss": 0.781313419342041, "memory(GiB)": 43.68, "step": 13990, "token_acc": 0.7800857237059018, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.6508186095989351, "grad_norm": 7.97532844543457, "learning_rate": 2.9790398720338076e-06, "loss": 0.5176995277404786, "memory(GiB)": 43.68, "step": 13995, "token_acc": 0.8639365918097754, "train_speed(iter/s)": 0.095953 }, { "epoch": 0.6510511278588846, "grad_norm": 9.163570404052734, "learning_rate": 2.975523637285013e-06, "loss": 0.6193868160247803, "memory(GiB)": 43.68, "step": 14000, "token_acc": 0.8512965964343598, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.6510511278588846, "eval_loss": 0.5734658241271973, "eval_runtime": 295.8049, "eval_samples_per_second": 11.748, "eval_steps_per_second": 11.748, "step": 14000 }, { "epoch": 0.6512836461188343, "grad_norm": 6.884223937988281, "learning_rate": 2.9720085996715934e-06, "loss": 0.7771946430206299, "memory(GiB)": 43.68, "step": 14005, "token_acc": 0.8298910314151567, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.6515161643787839, "grad_norm": 6.871092319488525, "learning_rate": 2.968494761272104e-06, "loss": 0.6113077640533447, "memory(GiB)": 43.68, "step": 14010, "token_acc": 0.8521489971346705, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.6517486826387334, "grad_norm": 9.444489479064941, "learning_rate": 2.9649821241643815e-06, "loss": 0.7457359790802002, "memory(GiB)": 43.68, "step": 14015, "token_acc": 0.8287831513260531, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.651981200898683, "grad_norm": 8.061306953430176, "learning_rate": 2.9614706904255618e-06, "loss": 0.7024789333343506, "memory(GiB)": 43.68, "step": 14020, "token_acc": 0.8234827449424832, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.6522137191586327, "grad_norm": 6.835376739501953, "learning_rate": 2.957960462132059e-06, "loss": 0.737022066116333, "memory(GiB)": 43.68, "step": 14025, "token_acc": 0.8126159554730983, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.6524462374185823, "grad_norm": 8.676056861877441, "learning_rate": 2.9544514413595826e-06, "loss": 0.6453481674194336, "memory(GiB)": 43.68, "step": 14030, "token_acc": 0.8382838283828383, "train_speed(iter/s)": 0.095894 }, { "epoch": 0.6526787556785318, "grad_norm": 5.367279529571533, "learning_rate": 2.950943630183123e-06, "loss": 0.6223780155181885, "memory(GiB)": 43.68, "step": 14035, "token_acc": 0.8357348703170029, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.6529112739384815, "grad_norm": 6.280850410461426, "learning_rate": 2.947437030676961e-06, "loss": 0.6186736583709717, "memory(GiB)": 43.68, "step": 14040, "token_acc": 0.844964314036479, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.6531437921984311, "grad_norm": 7.411701679229736, "learning_rate": 2.9439316449146515e-06, "loss": 0.6844239234924316, "memory(GiB)": 43.68, "step": 14045, "token_acc": 0.8381320224719101, "train_speed(iter/s)": 0.095952 }, { "epoch": 0.6533763104583807, "grad_norm": 6.25632381439209, "learning_rate": 2.940427474969042e-06, "loss": 0.45699324607849123, "memory(GiB)": 43.68, "step": 14050, "token_acc": 0.8847736625514403, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.6533763104583807, "eval_loss": 0.5746976137161255, "eval_runtime": 295.928, "eval_samples_per_second": 11.743, "eval_steps_per_second": 11.743, "step": 14050 }, { "epoch": 0.6536088287183303, "grad_norm": 9.883111953735352, "learning_rate": 2.9369245229122532e-06, "loss": 0.5956651210784912, "memory(GiB)": 43.68, "step": 14055, "token_acc": 0.8314616620441049, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.6538413469782799, "grad_norm": 6.37638521194458, "learning_rate": 2.93342279081569e-06, "loss": 0.7650864124298096, "memory(GiB)": 43.68, "step": 14060, "token_acc": 0.7998368234974164, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.6540738652382295, "grad_norm": 7.586302757263184, "learning_rate": 2.929922280750037e-06, "loss": 0.559014081954956, "memory(GiB)": 43.68, "step": 14065, "token_acc": 0.8657630083078268, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.654306383498179, "grad_norm": 8.94262409210205, "learning_rate": 2.9264229947852506e-06, "loss": 0.5777543067932129, "memory(GiB)": 43.68, "step": 14070, "token_acc": 0.8611764705882353, "train_speed(iter/s)": 0.095853 }, { "epoch": 0.6545389017581287, "grad_norm": 7.716413497924805, "learning_rate": 2.9229249349905686e-06, "loss": 0.6495201587677002, "memory(GiB)": 43.68, "step": 14075, "token_acc": 0.8377230246389125, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.6547714200180783, "grad_norm": 6.944495677947998, "learning_rate": 2.9194281034344995e-06, "loss": 0.5988493919372558, "memory(GiB)": 43.68, "step": 14080, "token_acc": 0.8444821731748726, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.6550039382780279, "grad_norm": 8.53555965423584, "learning_rate": 2.9159325021848305e-06, "loss": 0.6086976528167725, "memory(GiB)": 43.68, "step": 14085, "token_acc": 0.8399395998489996, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.6552364565379775, "grad_norm": 7.036818981170654, "learning_rate": 2.9124381333086173e-06, "loss": 0.6461452007293701, "memory(GiB)": 43.68, "step": 14090, "token_acc": 0.8341232227488151, "train_speed(iter/s)": 0.095928 }, { "epoch": 0.6554689747979271, "grad_norm": 9.432230949401855, "learning_rate": 2.9089449988721883e-06, "loss": 0.6820971965789795, "memory(GiB)": 43.68, "step": 14095, "token_acc": 0.8365271802894017, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.6557014930578767, "grad_norm": 7.816703796386719, "learning_rate": 2.9054531009411414e-06, "loss": 0.6649023532867432, "memory(GiB)": 43.68, "step": 14100, "token_acc": 0.835795836131632, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.6557014930578767, "eval_loss": 0.572761595249176, "eval_runtime": 296.5335, "eval_samples_per_second": 11.719, "eval_steps_per_second": 11.719, "step": 14100 }, { "epoch": 0.6559340113178262, "grad_norm": 9.045654296875, "learning_rate": 2.901962441580345e-06, "loss": 0.6884272575378418, "memory(GiB)": 43.68, "step": 14105, "token_acc": 0.8311472914179493, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.6561665295777759, "grad_norm": 8.36978816986084, "learning_rate": 2.8984730228539304e-06, "loss": 0.6390267372131347, "memory(GiB)": 43.68, "step": 14110, "token_acc": 0.844022770398482, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.6563990478377255, "grad_norm": 9.524076461791992, "learning_rate": 2.894984846825303e-06, "loss": 0.6867616653442383, "memory(GiB)": 43.68, "step": 14115, "token_acc": 0.8194259012016022, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.6566315660976751, "grad_norm": 4.460529804229736, "learning_rate": 2.8914979155571227e-06, "loss": 0.8245270729064942, "memory(GiB)": 43.68, "step": 14120, "token_acc": 0.7884366087632876, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.6568640843576247, "grad_norm": 8.338356971740723, "learning_rate": 2.888012231111328e-06, "loss": 0.7064074516296387, "memory(GiB)": 43.68, "step": 14125, "token_acc": 0.8225, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.6570966026175743, "grad_norm": 8.58897590637207, "learning_rate": 2.8845277955491046e-06, "loss": 0.6155064582824707, "memory(GiB)": 43.68, "step": 14130, "token_acc": 0.8537222464083587, "train_speed(iter/s)": 0.095887 }, { "epoch": 0.6573291208775239, "grad_norm": 6.647374153137207, "learning_rate": 2.8810446109309128e-06, "loss": 0.6504099845886231, "memory(GiB)": 43.68, "step": 14135, "token_acc": 0.8369781312127237, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.6575616391374736, "grad_norm": 6.2928547859191895, "learning_rate": 2.8775626793164613e-06, "loss": 0.7517566204071044, "memory(GiB)": 43.68, "step": 14140, "token_acc": 0.8156510980513455, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.6577941573974231, "grad_norm": 6.533952236175537, "learning_rate": 2.8740820027647287e-06, "loss": 0.6002644062042236, "memory(GiB)": 43.68, "step": 14145, "token_acc": 0.8577603143418467, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.6580266756573727, "grad_norm": 8.172672271728516, "learning_rate": 2.8706025833339426e-06, "loss": 0.6353270530700683, "memory(GiB)": 43.68, "step": 14150, "token_acc": 0.8471074380165289, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.6580266756573727, "eval_loss": 0.5736487507820129, "eval_runtime": 291.4912, "eval_samples_per_second": 11.921, "eval_steps_per_second": 11.921, "step": 14150 }, { "epoch": 0.6582591939173223, "grad_norm": 8.132963180541992, "learning_rate": 2.867124423081592e-06, "loss": 0.6144407749176025, "memory(GiB)": 43.68, "step": 14155, "token_acc": 0.8309699194599739, "train_speed(iter/s)": 0.095794 }, { "epoch": 0.6584917121772719, "grad_norm": 8.013964653015137, "learning_rate": 2.8636475240644224e-06, "loss": 0.7096580982208252, "memory(GiB)": 43.68, "step": 14160, "token_acc": 0.8162005085361423, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.6587242304372215, "grad_norm": 6.895023345947266, "learning_rate": 2.8601718883384323e-06, "loss": 0.6625951766967774, "memory(GiB)": 43.68, "step": 14165, "token_acc": 0.8329945799457995, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.6589567486971711, "grad_norm": 5.4100260734558105, "learning_rate": 2.856697517958868e-06, "loss": 0.7325220108032227, "memory(GiB)": 43.68, "step": 14170, "token_acc": 0.8107576254509675, "train_speed(iter/s)": 0.095853 }, { "epoch": 0.6591892669571208, "grad_norm": 7.335725784301758, "learning_rate": 2.853224414980237e-06, "loss": 0.6887342453002929, "memory(GiB)": 43.68, "step": 14175, "token_acc": 0.8252652519893899, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.6594217852170703, "grad_norm": 8.461999893188477, "learning_rate": 2.849752581456288e-06, "loss": 0.7595938682556153, "memory(GiB)": 43.68, "step": 14180, "token_acc": 0.8095395826432593, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.6596543034770199, "grad_norm": 8.143224716186523, "learning_rate": 2.846282019440024e-06, "loss": 0.7325577735900879, "memory(GiB)": 43.68, "step": 14185, "token_acc": 0.8076923076923077, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.6598868217369696, "grad_norm": 8.338869094848633, "learning_rate": 2.8428127309837e-06, "loss": 0.6581947326660156, "memory(GiB)": 43.68, "step": 14190, "token_acc": 0.8365147783251231, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.6601193399969192, "grad_norm": 7.938302516937256, "learning_rate": 2.839344718138808e-06, "loss": 0.7366507530212403, "memory(GiB)": 43.68, "step": 14195, "token_acc": 0.8035604665438919, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.6603518582568687, "grad_norm": 8.492761611938477, "learning_rate": 2.8358779829560925e-06, "loss": 0.6317257404327392, "memory(GiB)": 43.68, "step": 14200, "token_acc": 0.835741980965809, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.6603518582568687, "eval_loss": 0.5748091340065002, "eval_runtime": 292.2811, "eval_samples_per_second": 11.889, "eval_steps_per_second": 11.889, "step": 14200 }, { "epoch": 0.6605843765168183, "grad_norm": 9.483651161193848, "learning_rate": 2.8324125274855417e-06, "loss": 0.6270530700683594, "memory(GiB)": 43.68, "step": 14205, "token_acc": 0.8306502044415939, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.660816894776768, "grad_norm": 11.275638580322266, "learning_rate": 2.8289483537763896e-06, "loss": 0.6266158103942872, "memory(GiB)": 43.68, "step": 14210, "token_acc": 0.8413255360623781, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.6610494130367175, "grad_norm": 6.486075401306152, "learning_rate": 2.8254854638771024e-06, "loss": 0.5724782943725586, "memory(GiB)": 43.68, "step": 14215, "token_acc": 0.8566433566433567, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.6612819312966671, "grad_norm": 8.653912544250488, "learning_rate": 2.8220238598354e-06, "loss": 0.7338716983795166, "memory(GiB)": 43.68, "step": 14220, "token_acc": 0.8108202443280977, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.6615144495566168, "grad_norm": 8.97234058380127, "learning_rate": 2.8185635436982304e-06, "loss": 0.7180376052856445, "memory(GiB)": 43.68, "step": 14225, "token_acc": 0.8164863856769862, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.6617469678165664, "grad_norm": 7.8891730308532715, "learning_rate": 2.815104517511791e-06, "loss": 0.7036499977111816, "memory(GiB)": 43.68, "step": 14230, "token_acc": 0.8244940683879972, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.6619794860765159, "grad_norm": 10.813043594360352, "learning_rate": 2.8116467833215056e-06, "loss": 0.5858555316925049, "memory(GiB)": 43.68, "step": 14235, "token_acc": 0.8422420193021529, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.6622120043364655, "grad_norm": 8.522062301635742, "learning_rate": 2.8081903431720403e-06, "loss": 0.6352302074432373, "memory(GiB)": 43.68, "step": 14240, "token_acc": 0.8485938521909745, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.6624445225964152, "grad_norm": 8.544513702392578, "learning_rate": 2.804735199107297e-06, "loss": 0.6336312294006348, "memory(GiB)": 43.68, "step": 14245, "token_acc": 0.8369250562881956, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.6626770408563647, "grad_norm": 9.956283569335938, "learning_rate": 2.8012813531704097e-06, "loss": 0.642596435546875, "memory(GiB)": 43.68, "step": 14250, "token_acc": 0.8246120534103212, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.6626770408563647, "eval_loss": 0.5738605856895447, "eval_runtime": 291.7651, "eval_samples_per_second": 11.91, "eval_steps_per_second": 11.91, "step": 14250 }, { "epoch": 0.6629095591163143, "grad_norm": 9.033814430236816, "learning_rate": 2.7978288074037397e-06, "loss": 0.6293091773986816, "memory(GiB)": 43.68, "step": 14255, "token_acc": 0.8305877812884397, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.663142077376264, "grad_norm": 7.8562703132629395, "learning_rate": 2.7943775638488896e-06, "loss": 0.5436077117919922, "memory(GiB)": 43.68, "step": 14260, "token_acc": 0.8550860719874804, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.6633745956362136, "grad_norm": 8.680620193481445, "learning_rate": 2.790927624546681e-06, "loss": 0.6706278324127197, "memory(GiB)": 43.68, "step": 14265, "token_acc": 0.834733893557423, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.6636071138961631, "grad_norm": 8.097391128540039, "learning_rate": 2.7874789915371736e-06, "loss": 0.6728082656860351, "memory(GiB)": 43.68, "step": 14270, "token_acc": 0.8357296908698778, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.6638396321561127, "grad_norm": 6.790459632873535, "learning_rate": 2.7840316668596468e-06, "loss": 0.7238803386688233, "memory(GiB)": 43.68, "step": 14275, "token_acc": 0.8292777134028583, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.6640721504160624, "grad_norm": 7.590424060821533, "learning_rate": 2.7805856525526125e-06, "loss": 0.6790872573852539, "memory(GiB)": 43.68, "step": 14280, "token_acc": 0.83116095786602, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.664304668676012, "grad_norm": 8.506343841552734, "learning_rate": 2.777140950653805e-06, "loss": 0.6780567646026612, "memory(GiB)": 43.68, "step": 14285, "token_acc": 0.8256519102486355, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.6645371869359615, "grad_norm": 8.424605369567871, "learning_rate": 2.7736975632001843e-06, "loss": 0.7088619709014893, "memory(GiB)": 43.68, "step": 14290, "token_acc": 0.8216023353876095, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.6647697051959112, "grad_norm": 8.809297561645508, "learning_rate": 2.770255492227929e-06, "loss": 0.6911493301391601, "memory(GiB)": 43.68, "step": 14295, "token_acc": 0.8151062155782848, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.6650022234558608, "grad_norm": 6.6492509841918945, "learning_rate": 2.766814739772444e-06, "loss": 0.5668047904968262, "memory(GiB)": 43.68, "step": 14300, "token_acc": 0.8501394978078916, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.6650022234558608, "eval_loss": 0.5735751986503601, "eval_runtime": 292.2407, "eval_samples_per_second": 11.891, "eval_steps_per_second": 11.891, "step": 14300 }, { "epoch": 0.6652347417158103, "grad_norm": 9.933414459228516, "learning_rate": 2.763375307868351e-06, "loss": 0.6078849792480469, "memory(GiB)": 43.68, "step": 14305, "token_acc": 0.8308808820823032, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.66546725997576, "grad_norm": 9.008543014526367, "learning_rate": 2.7599371985494936e-06, "loss": 0.7673866748809814, "memory(GiB)": 43.68, "step": 14310, "token_acc": 0.8110333470564018, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.6656997782357096, "grad_norm": 8.77670955657959, "learning_rate": 2.75650041384893e-06, "loss": 0.5974670886993408, "memory(GiB)": 43.68, "step": 14315, "token_acc": 0.8492482730597318, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.6659322964956592, "grad_norm": 9.20003604888916, "learning_rate": 2.7530649557989392e-06, "loss": 0.655994987487793, "memory(GiB)": 43.68, "step": 14320, "token_acc": 0.8544620517097581, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.6661648147556087, "grad_norm": 8.369126319885254, "learning_rate": 2.7496308264310124e-06, "loss": 0.6682295322418212, "memory(GiB)": 43.68, "step": 14325, "token_acc": 0.8262195121951219, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.6663973330155584, "grad_norm": 9.203055381774902, "learning_rate": 2.7461980277758566e-06, "loss": 0.7167951583862304, "memory(GiB)": 43.68, "step": 14330, "token_acc": 0.8232542599398597, "train_speed(iter/s)": 0.095894 }, { "epoch": 0.666629851275508, "grad_norm": 8.59557056427002, "learning_rate": 2.7427665618633938e-06, "loss": 0.6695326805114746, "memory(GiB)": 43.68, "step": 14335, "token_acc": 0.83780276816609, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.6668623695354576, "grad_norm": 10.057333946228027, "learning_rate": 2.7393364307227516e-06, "loss": 0.5996942043304443, "memory(GiB)": 43.68, "step": 14340, "token_acc": 0.8568464730290456, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.6670948877954072, "grad_norm": 7.976700782775879, "learning_rate": 2.7359076363822767e-06, "loss": 0.635762882232666, "memory(GiB)": 43.68, "step": 14345, "token_acc": 0.8433179723502304, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.6673274060553568, "grad_norm": 8.137279510498047, "learning_rate": 2.7324801808695168e-06, "loss": 0.6215654850006104, "memory(GiB)": 43.68, "step": 14350, "token_acc": 0.8468543046357616, "train_speed(iter/s)": 0.095969 }, { "epoch": 0.6673274060553568, "eval_loss": 0.5712202191352844, "eval_runtime": 294.1138, "eval_samples_per_second": 11.815, "eval_steps_per_second": 11.815, "step": 14350 }, { "epoch": 0.6675599243153064, "grad_norm": 7.595315933227539, "learning_rate": 2.7290540662112363e-06, "loss": 0.6087830543518067, "memory(GiB)": 43.68, "step": 14355, "token_acc": 0.8307561379000273, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.6677924425752559, "grad_norm": 9.151399612426758, "learning_rate": 2.7256292944333983e-06, "loss": 0.6383302688598633, "memory(GiB)": 43.68, "step": 14360, "token_acc": 0.8437638703950289, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.6680249608352056, "grad_norm": 8.153346061706543, "learning_rate": 2.722205867561179e-06, "loss": 0.7106346607208252, "memory(GiB)": 43.68, "step": 14365, "token_acc": 0.8124318429661941, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.6682574790951552, "grad_norm": 9.878233909606934, "learning_rate": 2.718783787618956e-06, "loss": 0.5851165294647217, "memory(GiB)": 43.68, "step": 14370, "token_acc": 0.847217298830202, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.6684899973551048, "grad_norm": 7.442888259887695, "learning_rate": 2.715363056630312e-06, "loss": 0.6458406448364258, "memory(GiB)": 43.68, "step": 14375, "token_acc": 0.8395563770794824, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.6687225156150544, "grad_norm": 8.392727851867676, "learning_rate": 2.7119436766180273e-06, "loss": 0.6156882286071778, "memory(GiB)": 43.68, "step": 14380, "token_acc": 0.8352173913043478, "train_speed(iter/s)": 0.095893 }, { "epoch": 0.668955033875004, "grad_norm": 7.722550392150879, "learning_rate": 2.7085256496040914e-06, "loss": 0.5698171615600586, "memory(GiB)": 43.68, "step": 14385, "token_acc": 0.8557172557172558, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.6691875521349536, "grad_norm": 7.979699611663818, "learning_rate": 2.7051089776096846e-06, "loss": 0.6607985973358155, "memory(GiB)": 43.68, "step": 14390, "token_acc": 0.8231414868105515, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.6694200703949033, "grad_norm": 7.391639709472656, "learning_rate": 2.701693662655195e-06, "loss": 0.5928860187530518, "memory(GiB)": 43.68, "step": 14395, "token_acc": 0.8518250813155042, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.6696525886548528, "grad_norm": 8.907283782958984, "learning_rate": 2.6982797067601997e-06, "loss": 0.538227128982544, "memory(GiB)": 43.68, "step": 14400, "token_acc": 0.8650433347090384, "train_speed(iter/s)": 0.095967 }, { "epoch": 0.6696525886548528, "eval_loss": 0.5737596154212952, "eval_runtime": 297.3936, "eval_samples_per_second": 11.685, "eval_steps_per_second": 11.685, "step": 14400 }, { "epoch": 0.6698851069148024, "grad_norm": 6.146030902862549, "learning_rate": 2.694867111943478e-06, "loss": 0.6597367763519287, "memory(GiB)": 43.68, "step": 14405, "token_acc": 0.8308298941037208, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.670117625174752, "grad_norm": 6.513291835784912, "learning_rate": 2.6914558802230018e-06, "loss": 0.5947196006774902, "memory(GiB)": 43.68, "step": 14410, "token_acc": 0.8473229706390328, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.6703501434347016, "grad_norm": 9.858675003051758, "learning_rate": 2.6880460136159415e-06, "loss": 0.5992252349853515, "memory(GiB)": 43.68, "step": 14415, "token_acc": 0.8595160707836764, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.6705826616946512, "grad_norm": 6.774270534515381, "learning_rate": 2.684637514138651e-06, "loss": 0.6149023056030274, "memory(GiB)": 43.68, "step": 14420, "token_acc": 0.8390723822909346, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.6708151799546008, "grad_norm": 9.595948219299316, "learning_rate": 2.6812303838066863e-06, "loss": 0.5945103645324707, "memory(GiB)": 43.68, "step": 14425, "token_acc": 0.8649948471315699, "train_speed(iter/s)": 0.095871 }, { "epoch": 0.6710476982145505, "grad_norm": 10.324151039123535, "learning_rate": 2.677824624634784e-06, "loss": 0.6018610954284668, "memory(GiB)": 43.68, "step": 14430, "token_acc": 0.8574784651527017, "train_speed(iter/s)": 0.09589 }, { "epoch": 0.6712802164745, "grad_norm": 6.825169563293457, "learning_rate": 2.674420238636879e-06, "loss": 0.6257966518402099, "memory(GiB)": 43.68, "step": 14435, "token_acc": 0.8469807145115397, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.6715127347344496, "grad_norm": 6.618413925170898, "learning_rate": 2.671017227826086e-06, "loss": 0.6099302768707275, "memory(GiB)": 43.68, "step": 14440, "token_acc": 0.842526436124607, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.6717452529943992, "grad_norm": 8.459787368774414, "learning_rate": 2.667615594214712e-06, "loss": 0.627414321899414, "memory(GiB)": 43.68, "step": 14445, "token_acc": 0.8332191780821918, "train_speed(iter/s)": 0.095946 }, { "epoch": 0.6719777712543488, "grad_norm": 7.22158670425415, "learning_rate": 2.664215339814248e-06, "loss": 0.5792049407958985, "memory(GiB)": 43.68, "step": 14450, "token_acc": 0.8588117489986649, "train_speed(iter/s)": 0.095965 }, { "epoch": 0.6719777712543488, "eval_loss": 0.573811411857605, "eval_runtime": 294.5723, "eval_samples_per_second": 11.797, "eval_steps_per_second": 11.797, "step": 14450 }, { "epoch": 0.6722102895142984, "grad_norm": 9.8744535446167, "learning_rate": 2.66081646663537e-06, "loss": 0.6201520442962647, "memory(GiB)": 43.68, "step": 14455, "token_acc": 0.8305132160526884, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.672442807774248, "grad_norm": 6.522641658782959, "learning_rate": 2.6574189766879377e-06, "loss": 0.6753710746765137, "memory(GiB)": 43.68, "step": 14460, "token_acc": 0.8397988505747126, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.6726753260341977, "grad_norm": 7.374817371368408, "learning_rate": 2.654022871980989e-06, "loss": 0.6833240032196045, "memory(GiB)": 43.68, "step": 14465, "token_acc": 0.8307291666666666, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.6729078442941472, "grad_norm": 7.5340118408203125, "learning_rate": 2.650628154522748e-06, "loss": 0.726350975036621, "memory(GiB)": 43.68, "step": 14470, "token_acc": 0.8182887386062923, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.6731403625540968, "grad_norm": 7.650021076202393, "learning_rate": 2.647234826320613e-06, "loss": 0.7587420463562011, "memory(GiB)": 43.68, "step": 14475, "token_acc": 0.8096377306903623, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.6733728808140464, "grad_norm": 8.88990306854248, "learning_rate": 2.6438428893811657e-06, "loss": 0.7666681289672852, "memory(GiB)": 43.68, "step": 14480, "token_acc": 0.8095693779904306, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.6736053990739961, "grad_norm": 8.150176048278809, "learning_rate": 2.640452345710163e-06, "loss": 0.710899019241333, "memory(GiB)": 43.68, "step": 14485, "token_acc": 0.8158881691101262, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.6738379173339456, "grad_norm": 8.934723854064941, "learning_rate": 2.6370631973125394e-06, "loss": 0.7278150081634521, "memory(GiB)": 43.68, "step": 14490, "token_acc": 0.7667876588021778, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.6740704355938952, "grad_norm": 11.422002792358398, "learning_rate": 2.6336754461923997e-06, "loss": 0.5746397495269775, "memory(GiB)": 43.68, "step": 14495, "token_acc": 0.8616751269035533, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.6743029538538449, "grad_norm": 9.185924530029297, "learning_rate": 2.6302890943530294e-06, "loss": 0.7092298984527587, "memory(GiB)": 43.68, "step": 14500, "token_acc": 0.8205882352941176, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.6743029538538449, "eval_loss": 0.5714608430862427, "eval_runtime": 291.671, "eval_samples_per_second": 11.914, "eval_steps_per_second": 11.914, "step": 14500 }, { "epoch": 0.6745354721137944, "grad_norm": 10.095582008361816, "learning_rate": 2.6269041437968794e-06, "loss": 0.6917707920074463, "memory(GiB)": 43.68, "step": 14505, "token_acc": 0.8309227052367162, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.674767990373744, "grad_norm": 9.785516738891602, "learning_rate": 2.6235205965255794e-06, "loss": 0.6394733428955078, "memory(GiB)": 43.68, "step": 14510, "token_acc": 0.8392096086788067, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.6750005086336937, "grad_norm": 7.499686241149902, "learning_rate": 2.6201384545399205e-06, "loss": 0.6602446556091308, "memory(GiB)": 43.68, "step": 14515, "token_acc": 0.8349798755945848, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.6752330268936433, "grad_norm": 8.838566780090332, "learning_rate": 2.616757719839871e-06, "loss": 0.710049057006836, "memory(GiB)": 43.68, "step": 14520, "token_acc": 0.8274932614555256, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.6754655451535928, "grad_norm": 10.444742202758789, "learning_rate": 2.6133783944245617e-06, "loss": 0.67647705078125, "memory(GiB)": 43.68, "step": 14525, "token_acc": 0.8286158631415241, "train_speed(iter/s)": 0.095871 }, { "epoch": 0.6756980634135424, "grad_norm": 7.48112154006958, "learning_rate": 2.6100004802922963e-06, "loss": 0.6187932968139649, "memory(GiB)": 43.68, "step": 14530, "token_acc": 0.8535674262233844, "train_speed(iter/s)": 0.09589 }, { "epoch": 0.6759305816734921, "grad_norm": 7.320490837097168, "learning_rate": 2.6066239794405346e-06, "loss": 0.5680778503417969, "memory(GiB)": 43.68, "step": 14535, "token_acc": 0.8603896103896104, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.6761630999334417, "grad_norm": 9.288532257080078, "learning_rate": 2.6032488938659096e-06, "loss": 0.6736385345458984, "memory(GiB)": 43.68, "step": 14540, "token_acc": 0.8226904376012966, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.6763956181933912, "grad_norm": 8.691412925720215, "learning_rate": 2.59987522556421e-06, "loss": 0.5699333667755127, "memory(GiB)": 43.68, "step": 14545, "token_acc": 0.8680815647107782, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.6766281364533409, "grad_norm": 6.899057388305664, "learning_rate": 2.596502976530394e-06, "loss": 0.5840402603149414, "memory(GiB)": 43.68, "step": 14550, "token_acc": 0.8597560975609756, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.6766281364533409, "eval_loss": 0.5713512301445007, "eval_runtime": 293.982, "eval_samples_per_second": 11.82, "eval_steps_per_second": 11.82, "step": 14550 }, { "epoch": 0.6768606547132905, "grad_norm": 7.8339080810546875, "learning_rate": 2.593132148758573e-06, "loss": 0.6375294685363769, "memory(GiB)": 43.68, "step": 14555, "token_acc": 0.831493550740322, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.67709317297324, "grad_norm": 10.36740493774414, "learning_rate": 2.5897627442420224e-06, "loss": 0.6267732143402099, "memory(GiB)": 43.68, "step": 14560, "token_acc": 0.8491691521090754, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.6773256912331896, "grad_norm": 8.344315528869629, "learning_rate": 2.586394764973177e-06, "loss": 0.6922134399414063, "memory(GiB)": 43.68, "step": 14565, "token_acc": 0.838535164377861, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.6775582094931393, "grad_norm": 8.412717819213867, "learning_rate": 2.583028212943627e-06, "loss": 0.6234781265258789, "memory(GiB)": 43.68, "step": 14570, "token_acc": 0.8586094260382641, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.6777907277530889, "grad_norm": 6.043086051940918, "learning_rate": 2.5796630901441144e-06, "loss": 0.6654970645904541, "memory(GiB)": 43.68, "step": 14575, "token_acc": 0.8365173817726276, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.6780232460130384, "grad_norm": 7.909218788146973, "learning_rate": 2.576299398564544e-06, "loss": 0.635191822052002, "memory(GiB)": 43.68, "step": 14580, "token_acc": 0.82876254180602, "train_speed(iter/s)": 0.095889 }, { "epoch": 0.6782557642729881, "grad_norm": 7.435328960418701, "learning_rate": 2.572937140193972e-06, "loss": 0.6927794456481934, "memory(GiB)": 43.68, "step": 14585, "token_acc": 0.8353344768439108, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.6784882825329377, "grad_norm": 9.59237289428711, "learning_rate": 2.5695763170206e-06, "loss": 0.6599873065948486, "memory(GiB)": 43.68, "step": 14590, "token_acc": 0.8305149884704074, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.6787208007928872, "grad_norm": 7.779507637023926, "learning_rate": 2.5662169310317923e-06, "loss": 0.6364209651947021, "memory(GiB)": 43.68, "step": 14595, "token_acc": 0.831943981327109, "train_speed(iter/s)": 0.095946 }, { "epoch": 0.6789533190528368, "grad_norm": 7.693490505218506, "learning_rate": 2.5628589842140528e-06, "loss": 0.6035785675048828, "memory(GiB)": 43.68, "step": 14600, "token_acc": 0.8487282463186078, "train_speed(iter/s)": 0.095964 }, { "epoch": 0.6789533190528368, "eval_loss": 0.5727770328521729, "eval_runtime": 291.4508, "eval_samples_per_second": 11.923, "eval_steps_per_second": 11.923, "step": 14600 }, { "epoch": 0.6791858373127865, "grad_norm": 9.653355598449707, "learning_rate": 2.5595024785530415e-06, "loss": 0.6479739189147949, "memory(GiB)": 43.68, "step": 14605, "token_acc": 0.8313283952202489, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.6794183555727361, "grad_norm": 5.43248987197876, "learning_rate": 2.5561474160335633e-06, "loss": 0.6288150787353516, "memory(GiB)": 43.68, "step": 14610, "token_acc": 0.8369351669941061, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.6796508738326856, "grad_norm": 8.895267486572266, "learning_rate": 2.5527937986395722e-06, "loss": 0.7126208782196045, "memory(GiB)": 43.68, "step": 14615, "token_acc": 0.8321744627054362, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.6798833920926353, "grad_norm": 6.117085933685303, "learning_rate": 2.549441628354163e-06, "loss": 0.5901892185211182, "memory(GiB)": 43.68, "step": 14620, "token_acc": 0.8526863084922011, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.6801159103525849, "grad_norm": 7.739634037017822, "learning_rate": 2.5460909071595795e-06, "loss": 0.6293376922607422, "memory(GiB)": 43.68, "step": 14625, "token_acc": 0.8395632087358252, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.6803484286125345, "grad_norm": 9.097126007080078, "learning_rate": 2.542741637037204e-06, "loss": 0.5195389270782471, "memory(GiB)": 43.68, "step": 14630, "token_acc": 0.8595877090626215, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.680580946872484, "grad_norm": 7.753961563110352, "learning_rate": 2.5393938199675673e-06, "loss": 0.7167007923126221, "memory(GiB)": 43.68, "step": 14635, "token_acc": 0.8415798611111112, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.6808134651324337, "grad_norm": 8.212662696838379, "learning_rate": 2.5360474579303323e-06, "loss": 0.643263578414917, "memory(GiB)": 43.68, "step": 14640, "token_acc": 0.8508230452674898, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.6810459833923833, "grad_norm": 8.01890754699707, "learning_rate": 2.5327025529043083e-06, "loss": 0.6267054080963135, "memory(GiB)": 43.68, "step": 14645, "token_acc": 0.8474452554744526, "train_speed(iter/s)": 0.095949 }, { "epoch": 0.6812785016523328, "grad_norm": 8.446707725524902, "learning_rate": 2.5293591068674418e-06, "loss": 0.7221577167510986, "memory(GiB)": 43.68, "step": 14650, "token_acc": 0.8208, "train_speed(iter/s)": 0.095968 }, { "epoch": 0.6812785016523328, "eval_loss": 0.5712907910346985, "eval_runtime": 292.4027, "eval_samples_per_second": 11.884, "eval_steps_per_second": 11.884, "step": 14650 }, { "epoch": 0.6815110199122825, "grad_norm": 8.568792343139648, "learning_rate": 2.5260171217968164e-06, "loss": 0.5040855407714844, "memory(GiB)": 43.68, "step": 14655, "token_acc": 0.8315160352189971, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.6817435381722321, "grad_norm": 8.438779830932617, "learning_rate": 2.5226765996686477e-06, "loss": 0.6068079471588135, "memory(GiB)": 43.68, "step": 14660, "token_acc": 0.8387342737323675, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.6819760564321817, "grad_norm": 7.259321212768555, "learning_rate": 2.5193375424582933e-06, "loss": 0.6753248691558837, "memory(GiB)": 43.68, "step": 14665, "token_acc": 0.8390367553865653, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.6822085746921313, "grad_norm": 9.771519660949707, "learning_rate": 2.5159999521402377e-06, "loss": 0.6740274429321289, "memory(GiB)": 43.68, "step": 14670, "token_acc": 0.839384878257155, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.6824410929520809, "grad_norm": 10.084802627563477, "learning_rate": 2.512663830688104e-06, "loss": 0.637111759185791, "memory(GiB)": 43.68, "step": 14675, "token_acc": 0.8309859154929577, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.6826736112120305, "grad_norm": 11.251710891723633, "learning_rate": 2.5093291800746404e-06, "loss": 0.6224228858947753, "memory(GiB)": 43.68, "step": 14680, "token_acc": 0.8412305516265912, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.6829061294719802, "grad_norm": 6.67312479019165, "learning_rate": 2.505996002271731e-06, "loss": 0.6442068576812744, "memory(GiB)": 43.68, "step": 14685, "token_acc": 0.842228935884525, "train_speed(iter/s)": 0.095914 }, { "epoch": 0.6831386477319297, "grad_norm": 8.050361633300781, "learning_rate": 2.502664299250386e-06, "loss": 0.684641170501709, "memory(GiB)": 43.68, "step": 14690, "token_acc": 0.8318224445139106, "train_speed(iter/s)": 0.095933 }, { "epoch": 0.6833711659918793, "grad_norm": 8.623970031738281, "learning_rate": 2.4993340729807463e-06, "loss": 0.6142326354980469, "memory(GiB)": 43.68, "step": 14695, "token_acc": 0.8480436760691538, "train_speed(iter/s)": 0.095952 }, { "epoch": 0.6836036842518289, "grad_norm": 8.763551712036133, "learning_rate": 2.496005325432074e-06, "loss": 0.6094542026519776, "memory(GiB)": 43.68, "step": 14700, "token_acc": 0.8512938093678349, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.6836036842518289, "eval_loss": 0.569529116153717, "eval_runtime": 291.8592, "eval_samples_per_second": 11.906, "eval_steps_per_second": 11.906, "step": 14700 }, { "epoch": 0.6838362025117785, "grad_norm": 7.144622325897217, "learning_rate": 2.492678058572765e-06, "loss": 0.5630511283874512, "memory(GiB)": 43.68, "step": 14705, "token_acc": 0.8320883273685138, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.6840687207717281, "grad_norm": 9.401549339294434, "learning_rate": 2.4893522743703293e-06, "loss": 0.6558740615844727, "memory(GiB)": 43.68, "step": 14710, "token_acc": 0.8358806404657934, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.6843012390316777, "grad_norm": 8.555597305297852, "learning_rate": 2.4860279747914083e-06, "loss": 0.7010702610015869, "memory(GiB)": 43.68, "step": 14715, "token_acc": 0.8217913204062789, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.6845337572916274, "grad_norm": 7.839824199676514, "learning_rate": 2.482705161801766e-06, "loss": 0.7387192249298096, "memory(GiB)": 43.68, "step": 14720, "token_acc": 0.8162778366914104, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.6847662755515769, "grad_norm": 7.019917011260986, "learning_rate": 2.4793838373662787e-06, "loss": 0.6605482578277588, "memory(GiB)": 43.68, "step": 14725, "token_acc": 0.8397932816537468, "train_speed(iter/s)": 0.09588 }, { "epoch": 0.6849987938115265, "grad_norm": 10.25475025177002, "learning_rate": 2.476064003448952e-06, "loss": 0.6720106124877929, "memory(GiB)": 43.68, "step": 14730, "token_acc": 0.8458235753317721, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.6852313120714761, "grad_norm": 9.277771949768066, "learning_rate": 2.472745662012904e-06, "loss": 0.7551665306091309, "memory(GiB)": 43.68, "step": 14735, "token_acc": 0.7976900149031296, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.6854638303314257, "grad_norm": 9.814400672912598, "learning_rate": 2.469428815020376e-06, "loss": 0.6167050361633301, "memory(GiB)": 43.68, "step": 14740, "token_acc": 0.849003984063745, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.6856963485913753, "grad_norm": 6.580095291137695, "learning_rate": 2.466113464432718e-06, "loss": 0.6547997474670411, "memory(GiB)": 43.68, "step": 14745, "token_acc": 0.8346456692913385, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.6859288668513249, "grad_norm": 8.515876770019531, "learning_rate": 2.462799612210402e-06, "loss": 0.641704797744751, "memory(GiB)": 43.68, "step": 14750, "token_acc": 0.8397271952259164, "train_speed(iter/s)": 0.095972 }, { "epoch": 0.6859288668513249, "eval_loss": 0.5697548985481262, "eval_runtime": 292.9245, "eval_samples_per_second": 11.863, "eval_steps_per_second": 11.863, "step": 14750 }, { "epoch": 0.6861613851112746, "grad_norm": 7.716666221618652, "learning_rate": 2.459487260313008e-06, "loss": 0.5847614765167236, "memory(GiB)": 43.68, "step": 14755, "token_acc": 0.8317114187568244, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.6863939033712241, "grad_norm": 9.125048637390137, "learning_rate": 2.4561764106992364e-06, "loss": 0.5890578746795654, "memory(GiB)": 43.68, "step": 14760, "token_acc": 0.853467073702573, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.6866264216311737, "grad_norm": 10.58772087097168, "learning_rate": 2.45286706532689e-06, "loss": 0.5699628829956055, "memory(GiB)": 43.68, "step": 14765, "token_acc": 0.8517412935323383, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.6868589398911233, "grad_norm": 9.083113670349121, "learning_rate": 2.449559226152889e-06, "loss": 0.559787654876709, "memory(GiB)": 43.68, "step": 14770, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.687091458151073, "grad_norm": 7.75703239440918, "learning_rate": 2.446252895133261e-06, "loss": 0.5728509426116943, "memory(GiB)": 43.68, "step": 14775, "token_acc": 0.8530890804597702, "train_speed(iter/s)": 0.09588 }, { "epoch": 0.6873239764110225, "grad_norm": 9.354948997497559, "learning_rate": 2.4429480742231433e-06, "loss": 0.6882061004638672, "memory(GiB)": 43.68, "step": 14780, "token_acc": 0.8113659705580281, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.6875564946709721, "grad_norm": 7.420952320098877, "learning_rate": 2.4396447653767746e-06, "loss": 0.811635684967041, "memory(GiB)": 43.68, "step": 14785, "token_acc": 0.7832310838445807, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.6877890129309218, "grad_norm": 7.162716865539551, "learning_rate": 2.4363429705475082e-06, "loss": 0.7320106506347657, "memory(GiB)": 43.68, "step": 14790, "token_acc": 0.8040944881889763, "train_speed(iter/s)": 0.095934 }, { "epoch": 0.6880215311908713, "grad_norm": 9.993701934814453, "learning_rate": 2.4330426916877927e-06, "loss": 0.6562893867492676, "memory(GiB)": 43.68, "step": 14795, "token_acc": 0.832933653077538, "train_speed(iter/s)": 0.095952 }, { "epoch": 0.6882540494508209, "grad_norm": 6.7831950187683105, "learning_rate": 2.429743930749189e-06, "loss": 0.6879189014434814, "memory(GiB)": 43.68, "step": 14800, "token_acc": 0.8200647249190939, "train_speed(iter/s)": 0.09597 }, { "epoch": 0.6882540494508209, "eval_loss": 0.571153461933136, "eval_runtime": 291.7537, "eval_samples_per_second": 11.911, "eval_steps_per_second": 11.911, "step": 14800 }, { "epoch": 0.6884865677107705, "grad_norm": 8.59450626373291, "learning_rate": 2.4264466896823494e-06, "loss": 0.5881685256958008, "memory(GiB)": 43.68, "step": 14805, "token_acc": 0.831436996201984, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.6887190859707202, "grad_norm": 6.911591053009033, "learning_rate": 2.4231509704370438e-06, "loss": 0.6806635856628418, "memory(GiB)": 43.68, "step": 14810, "token_acc": 0.8354381936471489, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.6889516042306697, "grad_norm": 10.008810043334961, "learning_rate": 2.419856774962126e-06, "loss": 0.6647405624389648, "memory(GiB)": 43.68, "step": 14815, "token_acc": 0.8392990305741984, "train_speed(iter/s)": 0.095845 }, { "epoch": 0.6891841224906193, "grad_norm": 7.691843509674072, "learning_rate": 2.4165641052055592e-06, "loss": 0.6704154014587402, "memory(GiB)": 43.68, "step": 14820, "token_acc": 0.83171657528469, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.689416640750569, "grad_norm": 7.425168037414551, "learning_rate": 2.4132729631143974e-06, "loss": 0.6791874408721924, "memory(GiB)": 43.68, "step": 14825, "token_acc": 0.8379405666897028, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.6896491590105186, "grad_norm": 7.715751647949219, "learning_rate": 2.4099833506347984e-06, "loss": 0.6338499546051025, "memory(GiB)": 43.68, "step": 14830, "token_acc": 0.8356896010053408, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.6898816772704681, "grad_norm": 10.072502136230469, "learning_rate": 2.4066952697120073e-06, "loss": 0.6765162467956543, "memory(GiB)": 43.68, "step": 14835, "token_acc": 0.8386277001270648, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.6901141955304178, "grad_norm": 10.854269027709961, "learning_rate": 2.4034087222903703e-06, "loss": 0.5723191738128662, "memory(GiB)": 43.68, "step": 14840, "token_acc": 0.8614406779661017, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.6903467137903674, "grad_norm": 9.747332572937012, "learning_rate": 2.4001237103133233e-06, "loss": 0.6472094535827637, "memory(GiB)": 43.68, "step": 14845, "token_acc": 0.8457767722473605, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.6905792320503169, "grad_norm": 7.175398826599121, "learning_rate": 2.3968402357233966e-06, "loss": 0.6688910007476807, "memory(GiB)": 43.68, "step": 14850, "token_acc": 0.8367983367983368, "train_speed(iter/s)": 0.095971 }, { "epoch": 0.6905792320503169, "eval_loss": 0.5700727105140686, "eval_runtime": 291.5929, "eval_samples_per_second": 11.917, "eval_steps_per_second": 11.917, "step": 14850 }, { "epoch": 0.6908117503102665, "grad_norm": 9.811661720275879, "learning_rate": 2.3935583004622117e-06, "loss": 0.642160701751709, "memory(GiB)": 43.68, "step": 14855, "token_acc": 0.8314214066791387, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.6910442685702162, "grad_norm": 9.53758430480957, "learning_rate": 2.3902779064704733e-06, "loss": 0.6385757923126221, "memory(GiB)": 43.68, "step": 14860, "token_acc": 0.8374074074074074, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.6912767868301658, "grad_norm": 7.413933753967285, "learning_rate": 2.386999055687985e-06, "loss": 0.6519462108612061, "memory(GiB)": 43.68, "step": 14865, "token_acc": 0.8399087055754809, "train_speed(iter/s)": 0.095845 }, { "epoch": 0.6915093050901153, "grad_norm": 8.833547592163086, "learning_rate": 2.3837217500536283e-06, "loss": 0.6524216175079346, "memory(GiB)": 43.68, "step": 14870, "token_acc": 0.8381706244503079, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.691741823350065, "grad_norm": 7.5595173835754395, "learning_rate": 2.3804459915053777e-06, "loss": 0.6694557666778564, "memory(GiB)": 43.68, "step": 14875, "token_acc": 0.8293471234647706, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.6919743416100146, "grad_norm": 9.778063774108887, "learning_rate": 2.3771717819802885e-06, "loss": 0.665160083770752, "memory(GiB)": 43.68, "step": 14880, "token_acc": 0.8377831715210357, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.6922068598699641, "grad_norm": 5.986135959625244, "learning_rate": 2.3738991234145025e-06, "loss": 0.6411314964294433, "memory(GiB)": 43.68, "step": 14885, "token_acc": 0.8561580882352942, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.6924393781299137, "grad_norm": 7.493768692016602, "learning_rate": 2.3706280177432444e-06, "loss": 0.6026975631713867, "memory(GiB)": 43.68, "step": 14890, "token_acc": 0.8538228359717076, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.6926718963898634, "grad_norm": 9.839439392089844, "learning_rate": 2.367358466900822e-06, "loss": 0.603148365020752, "memory(GiB)": 43.68, "step": 14895, "token_acc": 0.8563741721854304, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.692904414649813, "grad_norm": 9.162924766540527, "learning_rate": 2.364090472820617e-06, "loss": 0.7842535972595215, "memory(GiB)": 43.68, "step": 14900, "token_acc": 0.7974545454545454, "train_speed(iter/s)": 0.095973 }, { "epoch": 0.692904414649813, "eval_loss": 0.5699092149734497, "eval_runtime": 292.1984, "eval_samples_per_second": 11.893, "eval_steps_per_second": 11.893, "step": 14900 }, { "epoch": 0.6931369329097625, "grad_norm": 6.95413064956665, "learning_rate": 2.3608240374350994e-06, "loss": 0.7118723869323731, "memory(GiB)": 43.68, "step": 14905, "token_acc": 0.8312244376278118, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.6933694511697122, "grad_norm": 7.15910530090332, "learning_rate": 2.35755916267581e-06, "loss": 0.6422323703765869, "memory(GiB)": 43.68, "step": 14910, "token_acc": 0.8529996027016289, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.6936019694296618, "grad_norm": 8.158949851989746, "learning_rate": 2.3542958504733733e-06, "loss": 0.6427113533020019, "memory(GiB)": 43.68, "step": 14915, "token_acc": 0.8367631670735961, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.6938344876896114, "grad_norm": 9.417699813842773, "learning_rate": 2.3510341027574824e-06, "loss": 0.8722169876098633, "memory(GiB)": 43.68, "step": 14920, "token_acc": 0.8015364916773368, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.694067005949561, "grad_norm": 9.309544563293457, "learning_rate": 2.3477739214569124e-06, "loss": 0.7727357864379882, "memory(GiB)": 43.68, "step": 14925, "token_acc": 0.8069565217391305, "train_speed(iter/s)": 0.095883 }, { "epoch": 0.6942995242095106, "grad_norm": 6.384006023406982, "learning_rate": 2.3445153084995083e-06, "loss": 0.6519456386566163, "memory(GiB)": 43.68, "step": 14930, "token_acc": 0.8368974266038419, "train_speed(iter/s)": 0.095901 }, { "epoch": 0.6945320424694602, "grad_norm": 7.078763961791992, "learning_rate": 2.3412582658121907e-06, "loss": 0.7300206184387207, "memory(GiB)": 43.68, "step": 14935, "token_acc": 0.8188202247191011, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.6947645607294097, "grad_norm": 9.662609100341797, "learning_rate": 2.3380027953209463e-06, "loss": 0.6980655670166016, "memory(GiB)": 43.68, "step": 14940, "token_acc": 0.8221408221408222, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.6949970789893594, "grad_norm": 9.974004745483398, "learning_rate": 2.3347488989508377e-06, "loss": 0.6442455768585205, "memory(GiB)": 43.68, "step": 14945, "token_acc": 0.8377208799134511, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.695229597249309, "grad_norm": 9.717050552368164, "learning_rate": 2.3314965786259918e-06, "loss": 0.6342096328735352, "memory(GiB)": 43.68, "step": 14950, "token_acc": 0.8540729635182409, "train_speed(iter/s)": 0.095972 }, { "epoch": 0.695229597249309, "eval_loss": 0.5690205097198486, "eval_runtime": 296.494, "eval_samples_per_second": 11.72, "eval_steps_per_second": 11.72, "step": 14950 }, { "epoch": 0.6954621155092586, "grad_norm": 8.165386199951172, "learning_rate": 2.328245836269609e-06, "loss": 0.6690054893493652, "memory(GiB)": 43.68, "step": 14955, "token_acc": 0.8313861648130477, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.6956946337692081, "grad_norm": 8.748926162719727, "learning_rate": 2.32499667380395e-06, "loss": 0.7164567470550537, "memory(GiB)": 43.68, "step": 14960, "token_acc": 0.8199805384365877, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.6959271520291578, "grad_norm": 9.91205883026123, "learning_rate": 2.3217490931503478e-06, "loss": 0.6394514083862305, "memory(GiB)": 43.68, "step": 14965, "token_acc": 0.8360460500198491, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.6961596702891074, "grad_norm": 7.21843147277832, "learning_rate": 2.3185030962291954e-06, "loss": 0.6726161479949951, "memory(GiB)": 43.68, "step": 14970, "token_acc": 0.8351713859910581, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.696392188549057, "grad_norm": 9.273655891418457, "learning_rate": 2.3152586849599544e-06, "loss": 0.6527256488800048, "memory(GiB)": 43.68, "step": 14975, "token_acc": 0.8383581547402833, "train_speed(iter/s)": 0.09588 }, { "epoch": 0.6966247068090066, "grad_norm": 8.348166465759277, "learning_rate": 2.3120158612611406e-06, "loss": 0.7497655868530273, "memory(GiB)": 43.68, "step": 14980, "token_acc": 0.8282306163021869, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.6968572250689562, "grad_norm": 9.885611534118652, "learning_rate": 2.308774627050338e-06, "loss": 0.7358698368072509, "memory(GiB)": 43.68, "step": 14985, "token_acc": 0.8208015899304405, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.6970897433289058, "grad_norm": 10.031953811645508, "learning_rate": 2.3055349842441903e-06, "loss": 0.6113825798034668, "memory(GiB)": 43.68, "step": 14990, "token_acc": 0.8391099700470689, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.6973222615888554, "grad_norm": 7.13198184967041, "learning_rate": 2.3022969347583944e-06, "loss": 0.6385971069335937, "memory(GiB)": 43.68, "step": 14995, "token_acc": 0.8495164410058027, "train_speed(iter/s)": 0.095953 }, { "epoch": 0.697554779848805, "grad_norm": 10.534452438354492, "learning_rate": 2.299060480507713e-06, "loss": 0.7708720684051513, "memory(GiB)": 43.68, "step": 15000, "token_acc": 0.8125, "train_speed(iter/s)": 0.09597 }, { "epoch": 0.697554779848805, "eval_loss": 0.5687591433525085, "eval_runtime": 297.5632, "eval_samples_per_second": 11.678, "eval_steps_per_second": 11.678, "step": 15000 }, { "epoch": 0.6977872981087546, "grad_norm": 10.113677978515625, "learning_rate": 2.295825623405958e-06, "loss": 0.6230460166931152, "memory(GiB)": 43.68, "step": 15005, "token_acc": 0.8312627079297482, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.6980198163687042, "grad_norm": 6.392548084259033, "learning_rate": 2.2925923653660017e-06, "loss": 0.717192268371582, "memory(GiB)": 43.68, "step": 15010, "token_acc": 0.8032128514056225, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.6982523346286538, "grad_norm": 8.732665061950684, "learning_rate": 2.2893607082997686e-06, "loss": 0.7233646392822266, "memory(GiB)": 43.68, "step": 15015, "token_acc": 0.802689075630252, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.6984848528886034, "grad_norm": 7.695556163787842, "learning_rate": 2.2861306541182403e-06, "loss": 0.6288101196289062, "memory(GiB)": 43.68, "step": 15020, "token_acc": 0.8389084507042254, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.698717371148553, "grad_norm": 8.413089752197266, "learning_rate": 2.2829022047314436e-06, "loss": 0.6322573661804199, "memory(GiB)": 43.68, "step": 15025, "token_acc": 0.8373626373626374, "train_speed(iter/s)": 0.095878 }, { "epoch": 0.6989498894085026, "grad_norm": 8.214163780212402, "learning_rate": 2.2796753620484636e-06, "loss": 0.6683392524719238, "memory(GiB)": 43.68, "step": 15030, "token_acc": 0.8358092259577795, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.6991824076684522, "grad_norm": 6.662692546844482, "learning_rate": 2.2764501279774288e-06, "loss": 0.6149757862091064, "memory(GiB)": 43.68, "step": 15035, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.6994149259284018, "grad_norm": 8.121337890625, "learning_rate": 2.273226504425523e-06, "loss": 0.621696949005127, "memory(GiB)": 43.68, "step": 15040, "token_acc": 0.8443677439598368, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.6996474441883515, "grad_norm": 7.089735507965088, "learning_rate": 2.2700044932989713e-06, "loss": 0.6983431816101074, "memory(GiB)": 43.68, "step": 15045, "token_acc": 0.8254545454545454, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.699879962448301, "grad_norm": 9.890140533447266, "learning_rate": 2.26678409650305e-06, "loss": 0.4932220458984375, "memory(GiB)": 43.68, "step": 15050, "token_acc": 0.8757187085360459, "train_speed(iter/s)": 0.095966 }, { "epoch": 0.699879962448301, "eval_loss": 0.569486141204834, "eval_runtime": 295.1351, "eval_samples_per_second": 11.774, "eval_steps_per_second": 11.774, "step": 15050 }, { "epoch": 0.7001124807082506, "grad_norm": 9.600886344909668, "learning_rate": 2.263565315942078e-06, "loss": 0.7439414501190186, "memory(GiB)": 43.68, "step": 15055, "token_acc": 0.8303879017496743, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.7003449989682002, "grad_norm": 7.0749287605285645, "learning_rate": 2.260348153519423e-06, "loss": 0.6322244167327881, "memory(GiB)": 43.68, "step": 15060, "token_acc": 0.8393371757925072, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.7005775172281499, "grad_norm": 9.142419815063477, "learning_rate": 2.2571326111374876e-06, "loss": 0.6422764301300049, "memory(GiB)": 43.68, "step": 15065, "token_acc": 0.8480373105324523, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.7008100354880994, "grad_norm": 8.40816593170166, "learning_rate": 2.2539186906977256e-06, "loss": 0.7075412750244141, "memory(GiB)": 43.68, "step": 15070, "token_acc": 0.8350113673270543, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.701042553748049, "grad_norm": 9.429239273071289, "learning_rate": 2.2507063941006237e-06, "loss": 0.5903666973114013, "memory(GiB)": 43.68, "step": 15075, "token_acc": 0.8476021314387211, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.7012750720079987, "grad_norm": 6.772191524505615, "learning_rate": 2.2474957232457157e-06, "loss": 0.6024794101715087, "memory(GiB)": 43.68, "step": 15080, "token_acc": 0.8567311650885138, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.7015075902679482, "grad_norm": 6.209893226623535, "learning_rate": 2.2442866800315666e-06, "loss": 0.5590320587158203, "memory(GiB)": 43.68, "step": 15085, "token_acc": 0.8531830642704843, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.7017401085278978, "grad_norm": 7.735403060913086, "learning_rate": 2.2410792663557847e-06, "loss": 0.654276704788208, "memory(GiB)": 43.68, "step": 15090, "token_acc": 0.8297485610421085, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.7019726267878474, "grad_norm": 8.271126747131348, "learning_rate": 2.2378734841150124e-06, "loss": 0.6791478633880615, "memory(GiB)": 43.68, "step": 15095, "token_acc": 0.826361721336142, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.7022051450477971, "grad_norm": 7.209582805633545, "learning_rate": 2.2346693352049305e-06, "loss": 0.7014167785644532, "memory(GiB)": 43.68, "step": 15100, "token_acc": 0.8235294117647058, "train_speed(iter/s)": 0.095961 }, { "epoch": 0.7022051450477971, "eval_loss": 0.5675148367881775, "eval_runtime": 293.9971, "eval_samples_per_second": 11.82, "eval_steps_per_second": 11.82, "step": 15100 }, { "epoch": 0.7024376633077466, "grad_norm": 8.525975227355957, "learning_rate": 2.2314668215202463e-06, "loss": 0.6555490016937255, "memory(GiB)": 43.68, "step": 15105, "token_acc": 0.8320721442885771, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.7026701815676962, "grad_norm": 7.5753397941589355, "learning_rate": 2.2282659449547074e-06, "loss": 0.7085586071014405, "memory(GiB)": 43.68, "step": 15110, "token_acc": 0.8353581901968998, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.7029026998276459, "grad_norm": 8.003477096557617, "learning_rate": 2.2250667074010927e-06, "loss": 0.6282653331756591, "memory(GiB)": 43.68, "step": 15115, "token_acc": 0.848521668691778, "train_speed(iter/s)": 0.095835 }, { "epoch": 0.7031352180875955, "grad_norm": 9.937396049499512, "learning_rate": 2.221869110751207e-06, "loss": 0.6093691349029541, "memory(GiB)": 43.68, "step": 15120, "token_acc": 0.836764705882353, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.703367736347545, "grad_norm": 8.043498992919922, "learning_rate": 2.2186731568958907e-06, "loss": 0.6808287143707276, "memory(GiB)": 43.68, "step": 15125, "token_acc": 0.8264291632145816, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.7036002546074946, "grad_norm": 7.490905284881592, "learning_rate": 2.215478847725005e-06, "loss": 0.6184853553771973, "memory(GiB)": 43.68, "step": 15130, "token_acc": 0.8535514764565044, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.7038327728674443, "grad_norm": 8.327759742736816, "learning_rate": 2.2122861851274507e-06, "loss": 0.7480375289916992, "memory(GiB)": 43.68, "step": 15135, "token_acc": 0.8175512665862484, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.7040652911273938, "grad_norm": 9.702696800231934, "learning_rate": 2.2090951709911423e-06, "loss": 0.6199995994567871, "memory(GiB)": 43.68, "step": 15140, "token_acc": 0.8450704225352113, "train_speed(iter/s)": 0.095923 }, { "epoch": 0.7042978093873434, "grad_norm": 6.733327388763428, "learning_rate": 2.2059058072030286e-06, "loss": 0.712891960144043, "memory(GiB)": 43.68, "step": 15145, "token_acc": 0.8355555555555556, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.7045303276472931, "grad_norm": 5.543945789337158, "learning_rate": 2.2027180956490756e-06, "loss": 0.6161305427551269, "memory(GiB)": 43.68, "step": 15150, "token_acc": 0.8320469280642173, "train_speed(iter/s)": 0.095958 }, { "epoch": 0.7045303276472931, "eval_loss": 0.5683196783065796, "eval_runtime": 291.8022, "eval_samples_per_second": 11.909, "eval_steps_per_second": 11.909, "step": 15150 }, { "epoch": 0.7047628459072427, "grad_norm": 8.165233612060547, "learning_rate": 2.199532038214279e-06, "loss": 0.7056881427764893, "memory(GiB)": 43.68, "step": 15155, "token_acc": 0.831401147343477, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.7049953641671922, "grad_norm": 6.688375949859619, "learning_rate": 2.1963476367826488e-06, "loss": 0.6474667549133301, "memory(GiB)": 43.68, "step": 15160, "token_acc": 0.8420677361853832, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.7052278824271419, "grad_norm": 8.083352088928223, "learning_rate": 2.1931648932372222e-06, "loss": 0.6967349052429199, "memory(GiB)": 43.68, "step": 15165, "token_acc": 0.835667215815486, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.7054604006870915, "grad_norm": 8.43288803100586, "learning_rate": 2.189983809460054e-06, "loss": 0.6387832641601563, "memory(GiB)": 43.68, "step": 15170, "token_acc": 0.8391264226391879, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.7056929189470411, "grad_norm": 10.101707458496094, "learning_rate": 2.186804387332218e-06, "loss": 0.6357023239135742, "memory(GiB)": 43.68, "step": 15175, "token_acc": 0.8369157284203343, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.7059254372069906, "grad_norm": 9.89687728881836, "learning_rate": 2.1836266287338026e-06, "loss": 0.7231058120727539, "memory(GiB)": 43.68, "step": 15180, "token_acc": 0.8384531984098301, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.7061579554669403, "grad_norm": 7.171428680419922, "learning_rate": 2.1804505355439167e-06, "loss": 0.7421711921691895, "memory(GiB)": 43.68, "step": 15185, "token_acc": 0.809905316824472, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.7063904737268899, "grad_norm": 9.13636589050293, "learning_rate": 2.177276109640679e-06, "loss": 0.5785239696502685, "memory(GiB)": 43.68, "step": 15190, "token_acc": 0.8532567049808429, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.7066229919868394, "grad_norm": 7.7410197257995605, "learning_rate": 2.1741033529012303e-06, "loss": 0.7573292255401611, "memory(GiB)": 43.68, "step": 15195, "token_acc": 0.8209109730848861, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.706855510246789, "grad_norm": 8.934343338012695, "learning_rate": 2.1709322672017146e-06, "loss": 0.685581636428833, "memory(GiB)": 43.68, "step": 15200, "token_acc": 0.8298518799848082, "train_speed(iter/s)": 0.095957 }, { "epoch": 0.706855510246789, "eval_loss": 0.567160427570343, "eval_runtime": 292.0237, "eval_samples_per_second": 11.9, "eval_steps_per_second": 11.9, "step": 15200 }, { "epoch": 0.7070880285067387, "grad_norm": 11.455964088439941, "learning_rate": 2.167762854417295e-06, "loss": 0.5708463668823243, "memory(GiB)": 43.68, "step": 15205, "token_acc": 0.8325553885993798, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.7073205467666883, "grad_norm": 7.0011091232299805, "learning_rate": 2.1645951164221435e-06, "loss": 0.7240097999572754, "memory(GiB)": 43.68, "step": 15210, "token_acc": 0.8169977206121785, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.7075530650266378, "grad_norm": 8.62884521484375, "learning_rate": 2.161429055089443e-06, "loss": 0.5619840145111084, "memory(GiB)": 43.68, "step": 15215, "token_acc": 0.8600823045267489, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.7077855832865875, "grad_norm": 7.956699371337891, "learning_rate": 2.1582646722913797e-06, "loss": 0.5228055000305176, "memory(GiB)": 43.68, "step": 15220, "token_acc": 0.8676521141285665, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.7080181015465371, "grad_norm": 11.059106826782227, "learning_rate": 2.1551019698991536e-06, "loss": 0.6586771011352539, "memory(GiB)": 43.68, "step": 15225, "token_acc": 0.846788990825688, "train_speed(iter/s)": 0.095869 }, { "epoch": 0.7082506198064866, "grad_norm": 8.307846069335938, "learning_rate": 2.1519409497829662e-06, "loss": 0.6575278282165528, "memory(GiB)": 43.68, "step": 15230, "token_acc": 0.815359477124183, "train_speed(iter/s)": 0.095887 }, { "epoch": 0.7084831380664363, "grad_norm": 7.24730110168457, "learning_rate": 2.1487816138120295e-06, "loss": 0.6653165817260742, "memory(GiB)": 43.68, "step": 15235, "token_acc": 0.8274193548387097, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.7087156563263859, "grad_norm": 8.117443084716797, "learning_rate": 2.1456239638545517e-06, "loss": 0.6554098129272461, "memory(GiB)": 43.68, "step": 15240, "token_acc": 0.843871975019516, "train_speed(iter/s)": 0.095922 }, { "epoch": 0.7089481745863355, "grad_norm": 8.179558753967285, "learning_rate": 2.1424680017777517e-06, "loss": 0.6279301643371582, "memory(GiB)": 43.68, "step": 15245, "token_acc": 0.8371501272264631, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.709180692846285, "grad_norm": 9.259252548217773, "learning_rate": 2.139313729447847e-06, "loss": 0.7471514225006104, "memory(GiB)": 43.68, "step": 15250, "token_acc": 0.819258693160107, "train_speed(iter/s)": 0.095958 }, { "epoch": 0.709180692846285, "eval_loss": 0.5682738423347473, "eval_runtime": 294.5503, "eval_samples_per_second": 11.798, "eval_steps_per_second": 11.798, "step": 15250 }, { "epoch": 0.7094132111062347, "grad_norm": 9.733885765075684, "learning_rate": 2.1361611487300552e-06, "loss": 0.6511485576629639, "memory(GiB)": 43.68, "step": 15255, "token_acc": 0.8323060028695785, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.7096457293661843, "grad_norm": 6.685081481933594, "learning_rate": 2.1330102614885983e-06, "loss": 0.6439203262329102, "memory(GiB)": 43.68, "step": 15260, "token_acc": 0.8359621451104101, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.7098782476261339, "grad_norm": 6.545475006103516, "learning_rate": 2.1298610695866883e-06, "loss": 0.6626582145690918, "memory(GiB)": 43.68, "step": 15265, "token_acc": 0.848257006151743, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.7101107658860835, "grad_norm": 8.421649932861328, "learning_rate": 2.1267135748865434e-06, "loss": 0.668232011795044, "memory(GiB)": 43.68, "step": 15270, "token_acc": 0.824332712600869, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.7103432841460331, "grad_norm": 7.355252265930176, "learning_rate": 2.1235677792493707e-06, "loss": 0.7098326683044434, "memory(GiB)": 43.68, "step": 15275, "token_acc": 0.8215933558904952, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.7105758024059827, "grad_norm": 8.247864723205566, "learning_rate": 2.120423684535381e-06, "loss": 0.6275835037231445, "memory(GiB)": 43.68, "step": 15280, "token_acc": 0.8363309352517986, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.7108083206659322, "grad_norm": 10.8916597366333, "learning_rate": 2.1172812926037693e-06, "loss": 0.7184661388397217, "memory(GiB)": 43.68, "step": 15285, "token_acc": 0.7979899497487437, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.7110408389258819, "grad_norm": 8.518646240234375, "learning_rate": 2.114140605312732e-06, "loss": 0.5484371185302734, "memory(GiB)": 43.68, "step": 15290, "token_acc": 0.8663251047973917, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.7112733571858315, "grad_norm": 8.54529857635498, "learning_rate": 2.1110016245194533e-06, "loss": 0.44772658348083494, "memory(GiB)": 43.68, "step": 15295, "token_acc": 0.8883770375620128, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.7115058754457811, "grad_norm": 9.231746673583984, "learning_rate": 2.1078643520801124e-06, "loss": 0.6469157695770263, "memory(GiB)": 43.68, "step": 15300, "token_acc": 0.8370279146141215, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.7115058754457811, "eval_loss": 0.5666142702102661, "eval_runtime": 292.2752, "eval_samples_per_second": 11.889, "eval_steps_per_second": 11.889, "step": 15300 }, { "epoch": 0.7117383937057307, "grad_norm": 8.66163158416748, "learning_rate": 2.1047287898498714e-06, "loss": 0.6179323673248291, "memory(GiB)": 43.68, "step": 15305, "token_acc": 0.8329655040061411, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.7119709119656803, "grad_norm": 9.693718910217285, "learning_rate": 2.1015949396828884e-06, "loss": 0.5869213581085205, "memory(GiB)": 43.68, "step": 15310, "token_acc": 0.8610321007720438, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.7122034302256299, "grad_norm": 10.758035659790039, "learning_rate": 2.0984628034323025e-06, "loss": 0.6068830490112305, "memory(GiB)": 43.68, "step": 15315, "token_acc": 0.8376518218623482, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.7124359484855796, "grad_norm": 7.330872058868408, "learning_rate": 2.095332382950246e-06, "loss": 0.797172737121582, "memory(GiB)": 43.68, "step": 15320, "token_acc": 0.7954419121734296, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.7126684667455291, "grad_norm": 8.344979286193848, "learning_rate": 2.092203680087829e-06, "loss": 0.8205219268798828, "memory(GiB)": 43.68, "step": 15325, "token_acc": 0.7884267631103075, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.7129009850054787, "grad_norm": 8.697916984558105, "learning_rate": 2.089076696695153e-06, "loss": 0.7596703052520752, "memory(GiB)": 47.44, "step": 15330, "token_acc": 0.764751552795031, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.7131335032654283, "grad_norm": 9.916091918945312, "learning_rate": 2.0859514346212993e-06, "loss": 0.6388274192810058, "memory(GiB)": 47.44, "step": 15335, "token_acc": 0.8622060284862537, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.7133660215253779, "grad_norm": 8.304482460021973, "learning_rate": 2.0828278957143332e-06, "loss": 0.6312067985534668, "memory(GiB)": 47.44, "step": 15340, "token_acc": 0.8490203611217826, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.7135985397853275, "grad_norm": 9.36806869506836, "learning_rate": 2.0797060818212972e-06, "loss": 0.659159803390503, "memory(GiB)": 47.44, "step": 15345, "token_acc": 0.839056681836988, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.7138310580452771, "grad_norm": 6.6365509033203125, "learning_rate": 2.0765859947882188e-06, "loss": 0.6138392925262451, "memory(GiB)": 47.44, "step": 15350, "token_acc": 0.8444, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.7138310580452771, "eval_loss": 0.5672307014465332, "eval_runtime": 292.2735, "eval_samples_per_second": 11.89, "eval_steps_per_second": 11.89, "step": 15350 }, { "epoch": 0.7140635763052268, "grad_norm": 7.136357307434082, "learning_rate": 2.0734676364600986e-06, "loss": 0.7098967075347901, "memory(GiB)": 47.44, "step": 15355, "token_acc": 0.8321157404453519, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.7142960945651763, "grad_norm": 6.81723690032959, "learning_rate": 2.070351008680922e-06, "loss": 0.6600059509277344, "memory(GiB)": 47.44, "step": 15360, "token_acc": 0.8320908768193114, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.7145286128251259, "grad_norm": 6.2631707191467285, "learning_rate": 2.067236113293643e-06, "loss": 0.6850215911865234, "memory(GiB)": 47.44, "step": 15365, "token_acc": 0.829415501905972, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.7147611310850756, "grad_norm": 7.0183796882629395, "learning_rate": 2.064122952140198e-06, "loss": 0.6342861175537109, "memory(GiB)": 47.44, "step": 15370, "token_acc": 0.8410546139359699, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.7149936493450251, "grad_norm": 11.088750839233398, "learning_rate": 2.061011527061495e-06, "loss": 0.6724984645843506, "memory(GiB)": 47.44, "step": 15375, "token_acc": 0.8334586466165413, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.7152261676049747, "grad_norm": 9.443142890930176, "learning_rate": 2.0579018398974147e-06, "loss": 0.6077555656433106, "memory(GiB)": 47.44, "step": 15380, "token_acc": 0.8423803779654202, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.7154586858649243, "grad_norm": 9.812091827392578, "learning_rate": 2.054793892486815e-06, "loss": 0.6703172206878663, "memory(GiB)": 47.44, "step": 15385, "token_acc": 0.8240412504028359, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.715691204124874, "grad_norm": 7.7283477783203125, "learning_rate": 2.0516876866675155e-06, "loss": 0.6120262145996094, "memory(GiB)": 47.44, "step": 15390, "token_acc": 0.8498862774829417, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.7159237223848235, "grad_norm": 6.389163970947266, "learning_rate": 2.0485832242763164e-06, "loss": 0.6700059413909912, "memory(GiB)": 47.44, "step": 15395, "token_acc": 0.8351304347826087, "train_speed(iter/s)": 0.095939 }, { "epoch": 0.7161562406447731, "grad_norm": 6.533292770385742, "learning_rate": 2.0454805071489785e-06, "loss": 0.6029557228088379, "memory(GiB)": 47.44, "step": 15400, "token_acc": 0.8440822111977321, "train_speed(iter/s)": 0.095956 }, { "epoch": 0.7161562406447731, "eval_loss": 0.5675437450408936, "eval_runtime": 292.8256, "eval_samples_per_second": 11.867, "eval_steps_per_second": 11.867, "step": 15400 }, { "epoch": 0.7163887589047228, "grad_norm": 7.733583450317383, "learning_rate": 2.042379537120237e-06, "loss": 0.4989192008972168, "memory(GiB)": 47.44, "step": 15405, "token_acc": 0.8333587351499643, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.7166212771646724, "grad_norm": 10.462811470031738, "learning_rate": 2.0392803160237888e-06, "loss": 0.670336389541626, "memory(GiB)": 47.44, "step": 15410, "token_acc": 0.8332145402708482, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.7168537954246219, "grad_norm": 10.108728408813477, "learning_rate": 2.0361828456923e-06, "loss": 0.6763839721679688, "memory(GiB)": 47.44, "step": 15415, "token_acc": 0.8427698574338085, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.7170863136845715, "grad_norm": 7.717390537261963, "learning_rate": 2.0330871279574006e-06, "loss": 0.6465532302856445, "memory(GiB)": 47.44, "step": 15420, "token_acc": 0.8464551508029768, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.7173188319445212, "grad_norm": 7.598498344421387, "learning_rate": 2.0299931646496864e-06, "loss": 0.8090932846069336, "memory(GiB)": 47.44, "step": 15425, "token_acc": 0.8031572164948454, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.7175513502044707, "grad_norm": 7.445656776428223, "learning_rate": 2.0269009575987087e-06, "loss": 0.6042123794555664, "memory(GiB)": 47.44, "step": 15430, "token_acc": 0.85587018771874, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.7177838684644203, "grad_norm": 8.982229232788086, "learning_rate": 2.0238105086329894e-06, "loss": 0.6927988052368164, "memory(GiB)": 47.44, "step": 15435, "token_acc": 0.8335028823329942, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.71801638672437, "grad_norm": 8.382378578186035, "learning_rate": 2.020721819580003e-06, "loss": 0.6854721546173096, "memory(GiB)": 47.44, "step": 15440, "token_acc": 0.8381818181818181, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.7182489049843196, "grad_norm": 6.911797046661377, "learning_rate": 2.0176348922661893e-06, "loss": 0.6210799217224121, "memory(GiB)": 47.44, "step": 15445, "token_acc": 0.8527648234510327, "train_speed(iter/s)": 0.095938 }, { "epoch": 0.7184814232442691, "grad_norm": 8.026445388793945, "learning_rate": 2.014549728516941e-06, "loss": 0.7180139064788819, "memory(GiB)": 47.44, "step": 15450, "token_acc": 0.8164603058994901, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.7184814232442691, "eval_loss": 0.5675672888755798, "eval_runtime": 294.7585, "eval_samples_per_second": 11.789, "eval_steps_per_second": 11.789, "step": 15450 }, { "epoch": 0.7187139415042187, "grad_norm": 9.904377937316895, "learning_rate": 2.0114663301566128e-06, "loss": 0.6216643333435059, "memory(GiB)": 47.44, "step": 15455, "token_acc": 0.832423668411083, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.7189464597641684, "grad_norm": 6.8368988037109375, "learning_rate": 2.0083846990085125e-06, "loss": 0.6121196746826172, "memory(GiB)": 47.44, "step": 15460, "token_acc": 0.8402468289338362, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.719178978024118, "grad_norm": 11.11691665649414, "learning_rate": 2.005304836894906e-06, "loss": 0.5897928714752197, "memory(GiB)": 47.44, "step": 15465, "token_acc": 0.8488927485887973, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.7194114962840675, "grad_norm": 9.302391052246094, "learning_rate": 2.002226745637007e-06, "loss": 0.5943363666534424, "memory(GiB)": 47.44, "step": 15470, "token_acc": 0.8488794669897032, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.7196440145440172, "grad_norm": 9.435709953308105, "learning_rate": 1.9991504270549895e-06, "loss": 0.5556567668914795, "memory(GiB)": 47.44, "step": 15475, "token_acc": 0.8607260726072608, "train_speed(iter/s)": 0.095867 }, { "epoch": 0.7198765328039668, "grad_norm": 9.0478515625, "learning_rate": 1.996075882967972e-06, "loss": 0.6758971214294434, "memory(GiB)": 47.44, "step": 15480, "token_acc": 0.831566994700984, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.7201090510639163, "grad_norm": 9.4107084274292, "learning_rate": 1.99300311519403e-06, "loss": 0.5821828365325927, "memory(GiB)": 47.44, "step": 15485, "token_acc": 0.8466947960618847, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.720341569323866, "grad_norm": 9.905683517456055, "learning_rate": 1.9899321255501845e-06, "loss": 0.6144163608551025, "memory(GiB)": 47.44, "step": 15490, "token_acc": 0.8548801369863014, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.7205740875838156, "grad_norm": 9.403578758239746, "learning_rate": 1.9868629158524093e-06, "loss": 0.5417950630187989, "memory(GiB)": 47.44, "step": 15495, "token_acc": 0.8679031037093111, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.7208066058437652, "grad_norm": 7.02471923828125, "learning_rate": 1.983795487915619e-06, "loss": 0.593695592880249, "memory(GiB)": 47.44, "step": 15500, "token_acc": 0.859472049689441, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.7208066058437652, "eval_loss": 0.5664077401161194, "eval_runtime": 292.9703, "eval_samples_per_second": 11.861, "eval_steps_per_second": 11.861, "step": 15500 }, { "epoch": 0.7210391241037147, "grad_norm": 9.336894989013672, "learning_rate": 1.9807298435536803e-06, "loss": 0.5820582866668701, "memory(GiB)": 47.44, "step": 15505, "token_acc": 0.8331124976912637, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.7212716423636644, "grad_norm": 7.727779865264893, "learning_rate": 1.977665984579405e-06, "loss": 0.5849403858184814, "memory(GiB)": 47.44, "step": 15510, "token_acc": 0.8396624472573839, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.721504160623614, "grad_norm": 8.475208282470703, "learning_rate": 1.974603912804544e-06, "loss": 0.6912332057952881, "memory(GiB)": 47.44, "step": 15515, "token_acc": 0.8325024925224327, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.7217366788835635, "grad_norm": 7.069519519805908, "learning_rate": 1.971543630039799e-06, "loss": 0.6353631019592285, "memory(GiB)": 47.44, "step": 15520, "token_acc": 0.8291793313069908, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.7219691971435132, "grad_norm": 11.572453498840332, "learning_rate": 1.968485138094805e-06, "loss": 0.7156434059143066, "memory(GiB)": 47.44, "step": 15525, "token_acc": 0.8214027476500362, "train_speed(iter/s)": 0.095867 }, { "epoch": 0.7222017154034628, "grad_norm": 11.58348560333252, "learning_rate": 1.9654284387781453e-06, "loss": 0.7011518001556396, "memory(GiB)": 47.44, "step": 15530, "token_acc": 0.8055172413793104, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.7224342336634124, "grad_norm": 10.236895561218262, "learning_rate": 1.9623735338973404e-06, "loss": 0.675053882598877, "memory(GiB)": 47.44, "step": 15535, "token_acc": 0.8261376896149358, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.7226667519233619, "grad_norm": 7.544062614440918, "learning_rate": 1.9593204252588515e-06, "loss": 0.593741512298584, "memory(GiB)": 47.44, "step": 15540, "token_acc": 0.8604933279417711, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.7228992701833116, "grad_norm": 7.399377822875977, "learning_rate": 1.956269114668073e-06, "loss": 0.6229902744293213, "memory(GiB)": 47.44, "step": 15545, "token_acc": 0.8310586499446698, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.7231317884432612, "grad_norm": 9.306867599487305, "learning_rate": 1.9532196039293415e-06, "loss": 0.6560436248779297, "memory(GiB)": 47.44, "step": 15550, "token_acc": 0.8388037928519329, "train_speed(iter/s)": 0.095955 }, { "epoch": 0.7231317884432612, "eval_loss": 0.5657230019569397, "eval_runtime": 291.9442, "eval_samples_per_second": 11.903, "eval_steps_per_second": 11.903, "step": 15550 }, { "epoch": 0.7233643067032108, "grad_norm": 7.044973373413086, "learning_rate": 1.950171894845924e-06, "loss": 0.5830237388610839, "memory(GiB)": 47.44, "step": 15555, "token_acc": 0.8333811299110984, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.7235968249631604, "grad_norm": 6.381565570831299, "learning_rate": 1.9471259892200296e-06, "loss": 0.673720407485962, "memory(GiB)": 47.44, "step": 15560, "token_acc": 0.8369430693069307, "train_speed(iter/s)": 0.095817 }, { "epoch": 0.72382934322311, "grad_norm": 9.84653377532959, "learning_rate": 1.9440818888527908e-06, "loss": 0.7157990455627441, "memory(GiB)": 47.44, "step": 15565, "token_acc": 0.8328240942819729, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.7240618614830596, "grad_norm": 7.861911296844482, "learning_rate": 1.941039595544281e-06, "loss": 0.8552507400512696, "memory(GiB)": 47.44, "step": 15570, "token_acc": 0.7770975056689342, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.7242943797430091, "grad_norm": 7.185626983642578, "learning_rate": 1.937999111093502e-06, "loss": 0.628577184677124, "memory(GiB)": 47.44, "step": 15575, "token_acc": 0.8374384236453202, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.7245268980029588, "grad_norm": 7.608919620513916, "learning_rate": 1.9349604372983877e-06, "loss": 0.8854595184326172, "memory(GiB)": 47.44, "step": 15580, "token_acc": 0.7832712495767017, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.7247594162629084, "grad_norm": 8.92005443572998, "learning_rate": 1.9319235759557964e-06, "loss": 0.5188089847564697, "memory(GiB)": 47.44, "step": 15585, "token_acc": 0.8663994655978624, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.724991934522858, "grad_norm": 7.952560901641846, "learning_rate": 1.9288885288615216e-06, "loss": 0.6974950790405273, "memory(GiB)": 47.44, "step": 15590, "token_acc": 0.8319559228650137, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.7252244527828076, "grad_norm": 7.014359474182129, "learning_rate": 1.925855297810277e-06, "loss": 0.7252201080322266, "memory(GiB)": 47.44, "step": 15595, "token_acc": 0.8212915601023018, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.7254569710427572, "grad_norm": 7.969059944152832, "learning_rate": 1.922823884595708e-06, "loss": 0.644140625, "memory(GiB)": 47.44, "step": 15600, "token_acc": 0.8409179307662388, "train_speed(iter/s)": 0.095954 }, { "epoch": 0.7254569710427572, "eval_loss": 0.5666154026985168, "eval_runtime": 294.9961, "eval_samples_per_second": 11.78, "eval_steps_per_second": 11.78, "step": 15600 }, { "epoch": 0.7256894893027068, "grad_norm": 11.729976654052734, "learning_rate": 1.919794291010381e-06, "loss": 0.7216911315917969, "memory(GiB)": 47.44, "step": 15605, "token_acc": 0.8319603824591902, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.7259220075626565, "grad_norm": 8.04870319366455, "learning_rate": 1.9167665188457894e-06, "loss": 0.6861124992370605, "memory(GiB)": 47.44, "step": 15610, "token_acc": 0.8221914008321776, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.726154525822606, "grad_norm": 6.789844512939453, "learning_rate": 1.9137405698923476e-06, "loss": 0.7341386795043945, "memory(GiB)": 47.44, "step": 15615, "token_acc": 0.826115061409179, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.7263870440825556, "grad_norm": 7.037516117095947, "learning_rate": 1.9107164459393956e-06, "loss": 0.6245403289794922, "memory(GiB)": 47.44, "step": 15620, "token_acc": 0.8459883184752536, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.7266195623425052, "grad_norm": 10.631331443786621, "learning_rate": 1.907694148775187e-06, "loss": 0.683773422241211, "memory(GiB)": 47.44, "step": 15625, "token_acc": 0.8298048982980489, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.7268520806024548, "grad_norm": 8.501079559326172, "learning_rate": 1.9046736801869037e-06, "loss": 0.5465147972106934, "memory(GiB)": 47.44, "step": 15630, "token_acc": 0.86484375, "train_speed(iter/s)": 0.095883 }, { "epoch": 0.7270845988624044, "grad_norm": 8.950760841369629, "learning_rate": 1.9016550419606372e-06, "loss": 0.6169103622436524, "memory(GiB)": 47.44, "step": 15635, "token_acc": 0.8555262165220672, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.727317117122354, "grad_norm": 8.562825202941895, "learning_rate": 1.8986382358814043e-06, "loss": 0.660850715637207, "memory(GiB)": 47.44, "step": 15640, "token_acc": 0.8426395939086294, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.7275496353823037, "grad_norm": 9.356790542602539, "learning_rate": 1.8956232637331368e-06, "loss": 0.6509898185729981, "memory(GiB)": 47.44, "step": 15645, "token_acc": 0.8205387205387206, "train_speed(iter/s)": 0.095934 }, { "epoch": 0.7277821536422532, "grad_norm": 11.473599433898926, "learning_rate": 1.8926101272986775e-06, "loss": 0.5593137264251709, "memory(GiB)": 47.44, "step": 15650, "token_acc": 0.869598180439727, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.7277821536422532, "eval_loss": 0.5655349493026733, "eval_runtime": 294.0032, "eval_samples_per_second": 11.82, "eval_steps_per_second": 11.82, "step": 15650 }, { "epoch": 0.7280146719022028, "grad_norm": 5.624059200286865, "learning_rate": 1.8895988283597894e-06, "loss": 0.6395047187805176, "memory(GiB)": 47.44, "step": 15655, "token_acc": 0.8326598337554284, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.7282471901621524, "grad_norm": 8.765081405639648, "learning_rate": 1.8865893686971454e-06, "loss": 0.6849233150482178, "memory(GiB)": 47.44, "step": 15660, "token_acc": 0.8255968169761273, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.728479708422102, "grad_norm": 9.399160385131836, "learning_rate": 1.883581750090334e-06, "loss": 0.5533103466033935, "memory(GiB)": 47.44, "step": 15665, "token_acc": 0.8568696206566784, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.7287122266820516, "grad_norm": 7.852560520172119, "learning_rate": 1.8805759743178497e-06, "loss": 0.5719121932983399, "memory(GiB)": 47.44, "step": 15670, "token_acc": 0.8539235412474849, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.7289447449420012, "grad_norm": 9.14884090423584, "learning_rate": 1.8775720431571042e-06, "loss": 0.5568684101104736, "memory(GiB)": 47.44, "step": 15675, "token_acc": 0.8567848191908343, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.7291772632019509, "grad_norm": 6.8728766441345215, "learning_rate": 1.8745699583844108e-06, "loss": 0.6052249908447266, "memory(GiB)": 47.44, "step": 15680, "token_acc": 0.8487972508591065, "train_speed(iter/s)": 0.095883 }, { "epoch": 0.7294097814619004, "grad_norm": 7.464291572570801, "learning_rate": 1.8715697217749985e-06, "loss": 0.599236249923706, "memory(GiB)": 47.44, "step": 15685, "token_acc": 0.8551859099804305, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.72964229972185, "grad_norm": 10.347436904907227, "learning_rate": 1.8685713351029965e-06, "loss": 0.6958876132965088, "memory(GiB)": 47.44, "step": 15690, "token_acc": 0.8292985723153321, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.7298748179817997, "grad_norm": 7.473763942718506, "learning_rate": 1.8655748001414452e-06, "loss": 0.7635407447814941, "memory(GiB)": 47.44, "step": 15695, "token_acc": 0.8141470180305131, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.7301073362417493, "grad_norm": 6.556323051452637, "learning_rate": 1.8625801186622883e-06, "loss": 0.6744859218597412, "memory(GiB)": 47.44, "step": 15700, "token_acc": 0.836890243902439, "train_speed(iter/s)": 0.095952 }, { "epoch": 0.7301073362417493, "eval_loss": 0.565443217754364, "eval_runtime": 295.1311, "eval_samples_per_second": 11.774, "eval_steps_per_second": 11.774, "step": 15700 }, { "epoch": 0.7303398545016988, "grad_norm": 7.942391395568848, "learning_rate": 1.8595872924363744e-06, "loss": 0.6304332733154296, "memory(GiB)": 47.44, "step": 15705, "token_acc": 0.8331690112620737, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.7305723727616484, "grad_norm": 10.444586753845215, "learning_rate": 1.8565963232334516e-06, "loss": 0.6448088169097901, "memory(GiB)": 47.44, "step": 15710, "token_acc": 0.8362779740871613, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.7308048910215981, "grad_norm": 7.157534599304199, "learning_rate": 1.853607212822175e-06, "loss": 0.6816494941711426, "memory(GiB)": 47.44, "step": 15715, "token_acc": 0.8259624562519885, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.7310374092815476, "grad_norm": 7.530759811401367, "learning_rate": 1.8506199629700949e-06, "loss": 0.6713624000549316, "memory(GiB)": 47.44, "step": 15720, "token_acc": 0.8371550719005052, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.7312699275414972, "grad_norm": 8.83544635772705, "learning_rate": 1.847634575443668e-06, "loss": 0.7240818977355957, "memory(GiB)": 47.44, "step": 15725, "token_acc": 0.8081967213114755, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.7315024458014469, "grad_norm": 9.414497375488281, "learning_rate": 1.8446510520082423e-06, "loss": 0.6359312057495117, "memory(GiB)": 47.44, "step": 15730, "token_acc": 0.8445309964297306, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.7317349640613965, "grad_norm": 10.996722221374512, "learning_rate": 1.8416693944280689e-06, "loss": 0.6379516124725342, "memory(GiB)": 47.44, "step": 15735, "token_acc": 0.8364099299809039, "train_speed(iter/s)": 0.095899 }, { "epoch": 0.731967482321346, "grad_norm": 10.735776901245117, "learning_rate": 1.8386896044662944e-06, "loss": 0.6942223072052002, "memory(GiB)": 47.44, "step": 15740, "token_acc": 0.8277654046028211, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.7322000005812956, "grad_norm": 8.521059036254883, "learning_rate": 1.835711683884962e-06, "loss": 0.6801161766052246, "memory(GiB)": 47.44, "step": 15745, "token_acc": 0.8320294523699954, "train_speed(iter/s)": 0.095934 }, { "epoch": 0.7324325188412453, "grad_norm": 6.813046455383301, "learning_rate": 1.8327356344450048e-06, "loss": 0.6779103755950928, "memory(GiB)": 47.44, "step": 15750, "token_acc": 0.8271536102592696, "train_speed(iter/s)": 0.095951 }, { "epoch": 0.7324325188412453, "eval_loss": 0.5668007135391235, "eval_runtime": 294.0271, "eval_samples_per_second": 11.819, "eval_steps_per_second": 11.819, "step": 15750 }, { "epoch": 0.7326650371011949, "grad_norm": 9.11612319946289, "learning_rate": 1.8297614579062557e-06, "loss": 0.5452903270721435, "memory(GiB)": 47.44, "step": 15755, "token_acc": 0.8330476627316548, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.7328975553611444, "grad_norm": 5.6731038093566895, "learning_rate": 1.8267891560274342e-06, "loss": 0.7143843173980713, "memory(GiB)": 47.44, "step": 15760, "token_acc": 0.8132650156561344, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.7331300736210941, "grad_norm": 8.81519889831543, "learning_rate": 1.823818730566158e-06, "loss": 0.6255356311798096, "memory(GiB)": 47.44, "step": 15765, "token_acc": 0.8302339532093581, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.7333625918810437, "grad_norm": 7.650895118713379, "learning_rate": 1.8208501832789271e-06, "loss": 0.5208076953887939, "memory(GiB)": 47.44, "step": 15770, "token_acc": 0.8714511041009464, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.7335951101409932, "grad_norm": 6.7183756828308105, "learning_rate": 1.8178835159211371e-06, "loss": 0.5086612224578857, "memory(GiB)": 47.44, "step": 15775, "token_acc": 0.8686751641560448, "train_speed(iter/s)": 0.095864 }, { "epoch": 0.7338276284009428, "grad_norm": 9.220132827758789, "learning_rate": 1.8149187302470706e-06, "loss": 0.6746468544006348, "memory(GiB)": 47.44, "step": 15780, "token_acc": 0.8361801242236024, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.7340601466608925, "grad_norm": 7.657320976257324, "learning_rate": 1.811955828009896e-06, "loss": 0.5522412300109864, "memory(GiB)": 47.44, "step": 15785, "token_acc": 0.8663440059568132, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.7342926649208421, "grad_norm": 10.659828186035156, "learning_rate": 1.8089948109616712e-06, "loss": 0.6790434837341308, "memory(GiB)": 47.44, "step": 15790, "token_acc": 0.8299460804645376, "train_speed(iter/s)": 0.095915 }, { "epoch": 0.7345251831807916, "grad_norm": 5.570415019989014, "learning_rate": 1.806035680853333e-06, "loss": 0.780366849899292, "memory(GiB)": 47.44, "step": 15795, "token_acc": 0.800187617260788, "train_speed(iter/s)": 0.095931 }, { "epoch": 0.7347577014407413, "grad_norm": 11.067488670349121, "learning_rate": 1.8030784394347106e-06, "loss": 0.7626240730285645, "memory(GiB)": 47.44, "step": 15800, "token_acc": 0.7994816974408812, "train_speed(iter/s)": 0.095948 }, { "epoch": 0.7347577014407413, "eval_loss": 0.564397931098938, "eval_runtime": 296.0923, "eval_samples_per_second": 11.736, "eval_steps_per_second": 11.736, "step": 15800 }, { "epoch": 0.7349902197006909, "grad_norm": 10.09076976776123, "learning_rate": 1.8001230884545084e-06, "loss": 0.6031111240386963, "memory(GiB)": 47.44, "step": 15805, "token_acc": 0.8330454392783456, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.7352227379606405, "grad_norm": 8.190320014953613, "learning_rate": 1.797169629660318e-06, "loss": 0.6044882297515869, "memory(GiB)": 47.44, "step": 15810, "token_acc": 0.8356687898089172, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.73545525622059, "grad_norm": 6.573525905609131, "learning_rate": 1.7942180647986113e-06, "loss": 0.6540366649627686, "memory(GiB)": 47.44, "step": 15815, "token_acc": 0.8233638282899367, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.7356877744805397, "grad_norm": 10.954277038574219, "learning_rate": 1.7912683956147415e-06, "loss": 0.62742018699646, "memory(GiB)": 47.44, "step": 15820, "token_acc": 0.8497256226255804, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.7359202927404893, "grad_norm": 7.76631498336792, "learning_rate": 1.788320623852935e-06, "loss": 0.6723296642303467, "memory(GiB)": 47.44, "step": 15825, "token_acc": 0.821697803998689, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.7361528110004388, "grad_norm": 10.623085021972656, "learning_rate": 1.7853747512563042e-06, "loss": 0.7741125583648681, "memory(GiB)": 47.44, "step": 15830, "token_acc": 0.804755944931164, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.7363853292603885, "grad_norm": 8.324301719665527, "learning_rate": 1.782430779566831e-06, "loss": 0.6203536033630371, "memory(GiB)": 47.44, "step": 15835, "token_acc": 0.8429657794676806, "train_speed(iter/s)": 0.095894 }, { "epoch": 0.7366178475203381, "grad_norm": 6.4376115798950195, "learning_rate": 1.77948871052538e-06, "loss": 0.6823951244354248, "memory(GiB)": 47.44, "step": 15840, "token_acc": 0.8315889628924833, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.7368503657802877, "grad_norm": 7.326390266418457, "learning_rate": 1.7765485458716842e-06, "loss": 0.6224531650543212, "memory(GiB)": 47.44, "step": 15845, "token_acc": 0.8392242727557084, "train_speed(iter/s)": 0.095928 }, { "epoch": 0.7370828840402373, "grad_norm": 8.882484436035156, "learning_rate": 1.7736102873443555e-06, "loss": 0.7446016311645508, "memory(GiB)": 47.44, "step": 15850, "token_acc": 0.8142804291527932, "train_speed(iter/s)": 0.095945 }, { "epoch": 0.7370828840402373, "eval_loss": 0.5673614144325256, "eval_runtime": 293.6289, "eval_samples_per_second": 11.835, "eval_steps_per_second": 11.835, "step": 15850 }, { "epoch": 0.7373154023001869, "grad_norm": 8.337072372436523, "learning_rate": 1.7706739366808768e-06, "loss": 0.739326810836792, "memory(GiB)": 47.44, "step": 15855, "token_acc": 0.8325765038482177, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.7375479205601365, "grad_norm": 6.729470729827881, "learning_rate": 1.7677394956176042e-06, "loss": 0.5923904418945313, "memory(GiB)": 47.44, "step": 15860, "token_acc": 0.8559451219512195, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.737780438820086, "grad_norm": 10.68433666229248, "learning_rate": 1.7648069658897605e-06, "loss": 0.8138419151306152, "memory(GiB)": 47.44, "step": 15865, "token_acc": 0.8052335210334548, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.7380129570800357, "grad_norm": 9.127498626708984, "learning_rate": 1.7618763492314444e-06, "loss": 0.6489748001098633, "memory(GiB)": 47.44, "step": 15870, "token_acc": 0.8489361702127659, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.7382454753399853, "grad_norm": 13.219034194946289, "learning_rate": 1.7589476473756167e-06, "loss": 0.5790813446044922, "memory(GiB)": 47.44, "step": 15875, "token_acc": 0.8512685914260717, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.7384779935999349, "grad_norm": 7.873594284057617, "learning_rate": 1.756020862054112e-06, "loss": 0.6810230731964111, "memory(GiB)": 47.44, "step": 15880, "token_acc": 0.8141503046716316, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.7387105118598845, "grad_norm": 7.8127522468566895, "learning_rate": 1.7530959949976262e-06, "loss": 0.719043493270874, "memory(GiB)": 47.44, "step": 15885, "token_acc": 0.8200278164116829, "train_speed(iter/s)": 0.095893 }, { "epoch": 0.7389430301198341, "grad_norm": 8.471165657043457, "learning_rate": 1.7501730479357242e-06, "loss": 0.6684861660003663, "memory(GiB)": 47.44, "step": 15890, "token_acc": 0.834314880251276, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.7391755483797837, "grad_norm": 9.835054397583008, "learning_rate": 1.747252022596836e-06, "loss": 0.6790287494659424, "memory(GiB)": 47.44, "step": 15895, "token_acc": 0.8317520556609741, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.7394080666397334, "grad_norm": 8.918397903442383, "learning_rate": 1.7443329207082548e-06, "loss": 0.6621670246124267, "memory(GiB)": 47.44, "step": 15900, "token_acc": 0.8318614130434783, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.7394080666397334, "eval_loss": 0.5644251108169556, "eval_runtime": 292.8436, "eval_samples_per_second": 11.866, "eval_steps_per_second": 11.866, "step": 15900 }, { "epoch": 0.7396405848996829, "grad_norm": 7.586838245391846, "learning_rate": 1.7414157439961332e-06, "loss": 0.6244229793548584, "memory(GiB)": 47.44, "step": 15905, "token_acc": 0.8333346639309579, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.7398731031596325, "grad_norm": 7.999522686004639, "learning_rate": 1.7385004941854888e-06, "loss": 0.6422570228576661, "memory(GiB)": 47.44, "step": 15910, "token_acc": 0.8473042109405746, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.7401056214195821, "grad_norm": 7.450502395629883, "learning_rate": 1.735587173000201e-06, "loss": 0.7661757946014405, "memory(GiB)": 47.44, "step": 15915, "token_acc": 0.8147856861974047, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.7403381396795317, "grad_norm": 9.627984046936035, "learning_rate": 1.7326757821630026e-06, "loss": 0.6477952480316163, "memory(GiB)": 47.44, "step": 15920, "token_acc": 0.8338368580060423, "train_speed(iter/s)": 0.095841 }, { "epoch": 0.7405706579394813, "grad_norm": 9.843599319458008, "learning_rate": 1.729766323395493e-06, "loss": 0.6486878395080566, "memory(GiB)": 47.44, "step": 15925, "token_acc": 0.8337614678899082, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.7408031761994309, "grad_norm": 9.124186515808105, "learning_rate": 1.7268587984181213e-06, "loss": 0.7141555309295654, "memory(GiB)": 47.44, "step": 15930, "token_acc": 0.8325718015665796, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.7410356944593806, "grad_norm": 9.008081436157227, "learning_rate": 1.7239532089501982e-06, "loss": 0.6716857433319092, "memory(GiB)": 47.44, "step": 15935, "token_acc": 0.8405483405483406, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.7412682127193301, "grad_norm": 8.329229354858398, "learning_rate": 1.7210495567098885e-06, "loss": 0.6629110813140869, "memory(GiB)": 47.44, "step": 15940, "token_acc": 0.8348040945993647, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.7415007309792797, "grad_norm": 7.304653644561768, "learning_rate": 1.7181478434142134e-06, "loss": 0.49824161529541017, "memory(GiB)": 47.44, "step": 15945, "token_acc": 0.8744855967078189, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.7417332492392293, "grad_norm": 6.444869041442871, "learning_rate": 1.715248070779042e-06, "loss": 0.6452473640441895, "memory(GiB)": 47.44, "step": 15950, "token_acc": 0.8382570162481536, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.7417332492392293, "eval_loss": 0.5636093020439148, "eval_runtime": 291.7132, "eval_samples_per_second": 11.912, "eval_steps_per_second": 11.912, "step": 15950 }, { "epoch": 0.741965767499179, "grad_norm": 7.70983362197876, "learning_rate": 1.712350240519103e-06, "loss": 0.6836659908294678, "memory(GiB)": 47.44, "step": 15955, "token_acc": 0.8332400223946252, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.7421982857591285, "grad_norm": 9.401704788208008, "learning_rate": 1.709454354347969e-06, "loss": 0.6634652137756347, "memory(GiB)": 47.44, "step": 15960, "token_acc": 0.8399353274050121, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.7424308040190781, "grad_norm": 8.098258972167969, "learning_rate": 1.7065604139780712e-06, "loss": 0.7540879726409913, "memory(GiB)": 47.44, "step": 15965, "token_acc": 0.827323717948718, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.7426633222790278, "grad_norm": 10.010939598083496, "learning_rate": 1.7036684211206817e-06, "loss": 0.6124141216278076, "memory(GiB)": 47.44, "step": 15970, "token_acc": 0.8495206335973322, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.7428958405389773, "grad_norm": 10.145440101623535, "learning_rate": 1.700778377485927e-06, "loss": 0.713443660736084, "memory(GiB)": 47.44, "step": 15975, "token_acc": 0.830102622576967, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.7431283587989269, "grad_norm": 8.140713691711426, "learning_rate": 1.6978902847827793e-06, "loss": 0.6635231018066406, "memory(GiB)": 47.44, "step": 15980, "token_acc": 0.8262042389210019, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.7433608770588765, "grad_norm": 6.786279678344727, "learning_rate": 1.6950041447190584e-06, "loss": 0.6087878227233887, "memory(GiB)": 47.44, "step": 15985, "token_acc": 0.8394011568560735, "train_speed(iter/s)": 0.095894 }, { "epoch": 0.7435933953188262, "grad_norm": 9.08250904083252, "learning_rate": 1.6921199590014253e-06, "loss": 0.6415596008300781, "memory(GiB)": 47.44, "step": 15990, "token_acc": 0.8438988640527666, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.7438259135787757, "grad_norm": 7.580020427703857, "learning_rate": 1.6892377293353906e-06, "loss": 0.6963030815124511, "memory(GiB)": 47.44, "step": 15995, "token_acc": 0.8396860986547086, "train_speed(iter/s)": 0.095929 }, { "epoch": 0.7440584318387253, "grad_norm": 7.410154819488525, "learning_rate": 1.6863574574253033e-06, "loss": 0.635734224319458, "memory(GiB)": 47.44, "step": 16000, "token_acc": 0.8359079986268452, "train_speed(iter/s)": 0.095946 }, { "epoch": 0.7440584318387253, "eval_loss": 0.5647992491722107, "eval_runtime": 291.7146, "eval_samples_per_second": 11.912, "eval_steps_per_second": 11.912, "step": 16000 }, { "epoch": 0.744290950098675, "grad_norm": 7.568226337432861, "learning_rate": 1.6834791449743594e-06, "loss": 0.5388147354125976, "memory(GiB)": 47.44, "step": 16005, "token_acc": 0.8337101532935668, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.7445234683586245, "grad_norm": 8.07730484008789, "learning_rate": 1.6806027936845908e-06, "loss": 0.5757100582122803, "memory(GiB)": 47.44, "step": 16010, "token_acc": 0.8572752548656163, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.7447559866185741, "grad_norm": 8.20462417602539, "learning_rate": 1.6777284052568755e-06, "loss": 0.6828523635864258, "memory(GiB)": 47.44, "step": 16015, "token_acc": 0.8220823798627003, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.7449885048785237, "grad_norm": 7.580663681030273, "learning_rate": 1.6748559813909266e-06, "loss": 0.7655567646026611, "memory(GiB)": 47.44, "step": 16020, "token_acc": 0.8057692307692308, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.7452210231384734, "grad_norm": 7.321385860443115, "learning_rate": 1.6719855237853e-06, "loss": 0.7181625843048096, "memory(GiB)": 47.44, "step": 16025, "token_acc": 0.8347318496898942, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.7454535413984229, "grad_norm": 10.682913780212402, "learning_rate": 1.669117034137382e-06, "loss": 0.6672065734863282, "memory(GiB)": 47.44, "step": 16030, "token_acc": 0.837620578778135, "train_speed(iter/s)": 0.09588 }, { "epoch": 0.7456860596583725, "grad_norm": 8.959480285644531, "learning_rate": 1.6662505141434004e-06, "loss": 0.6392136573791504, "memory(GiB)": 47.44, "step": 16035, "token_acc": 0.845925925925926, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.7459185779183222, "grad_norm": 8.299612998962402, "learning_rate": 1.6633859654984192e-06, "loss": 0.5677808284759521, "memory(GiB)": 47.44, "step": 16040, "token_acc": 0.8607932875667429, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.7461510961782718, "grad_norm": 7.424012660980225, "learning_rate": 1.6605233898963307e-06, "loss": 0.5513256072998047, "memory(GiB)": 47.44, "step": 16045, "token_acc": 0.8574193548387097, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.7463836144382213, "grad_norm": 7.407084941864014, "learning_rate": 1.6576627890298685e-06, "loss": 0.7417648315429688, "memory(GiB)": 47.44, "step": 16050, "token_acc": 0.8085351787773933, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.7463836144382213, "eval_loss": 0.5642947554588318, "eval_runtime": 292.8114, "eval_samples_per_second": 11.868, "eval_steps_per_second": 11.868, "step": 16050 }, { "epoch": 0.746616132698171, "grad_norm": 7.718512535095215, "learning_rate": 1.6548041645905894e-06, "loss": 0.579425048828125, "memory(GiB)": 47.44, "step": 16055, "token_acc": 0.8340791050739113, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.7468486509581206, "grad_norm": 7.850195407867432, "learning_rate": 1.6519475182688894e-06, "loss": 0.5587710857391357, "memory(GiB)": 47.44, "step": 16060, "token_acc": 0.8575553416746872, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.7470811692180701, "grad_norm": 9.208138465881348, "learning_rate": 1.6490928517539906e-06, "loss": 0.8099372863769532, "memory(GiB)": 47.44, "step": 16065, "token_acc": 0.8002898550724638, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.7473136874780197, "grad_norm": 10.444913864135742, "learning_rate": 1.6462401667339477e-06, "loss": 0.6440964221954346, "memory(GiB)": 47.44, "step": 16070, "token_acc": 0.8335183129855716, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.7475462057379694, "grad_norm": 9.752091407775879, "learning_rate": 1.6433894648956377e-06, "loss": 0.6518924236297607, "memory(GiB)": 47.44, "step": 16075, "token_acc": 0.8316729646169702, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.747778723997919, "grad_norm": 8.187569618225098, "learning_rate": 1.6405407479247727e-06, "loss": 0.6321462631225586, "memory(GiB)": 47.44, "step": 16080, "token_acc": 0.8360964581763376, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.7480112422578685, "grad_norm": 9.651229858398438, "learning_rate": 1.6376940175058826e-06, "loss": 0.6507625102996826, "memory(GiB)": 47.44, "step": 16085, "token_acc": 0.8332066869300911, "train_speed(iter/s)": 0.095896 }, { "epoch": 0.7482437605178182, "grad_norm": 8.283926010131836, "learning_rate": 1.634849275322331e-06, "loss": 0.6909523487091065, "memory(GiB)": 47.44, "step": 16090, "token_acc": 0.8298148749594024, "train_speed(iter/s)": 0.095913 }, { "epoch": 0.7484762787777678, "grad_norm": 8.173871040344238, "learning_rate": 1.632006523056298e-06, "loss": 0.6695907592773438, "memory(GiB)": 47.44, "step": 16095, "token_acc": 0.8302583025830258, "train_speed(iter/s)": 0.09593 }, { "epoch": 0.7487087970377174, "grad_norm": 6.885328769683838, "learning_rate": 1.6291657623887935e-06, "loss": 0.6500693321228027, "memory(GiB)": 47.44, "step": 16100, "token_acc": 0.8388214904679376, "train_speed(iter/s)": 0.095947 }, { "epoch": 0.7487087970377174, "eval_loss": 0.5655313730239868, "eval_runtime": 295.4483, "eval_samples_per_second": 11.762, "eval_steps_per_second": 11.762, "step": 16100 }, { "epoch": 0.7489413152976669, "grad_norm": 8.974815368652344, "learning_rate": 1.6263269949996457e-06, "loss": 0.6070386886596679, "memory(GiB)": 47.44, "step": 16105, "token_acc": 0.8336296794656158, "train_speed(iter/s)": 0.095794 }, { "epoch": 0.7491738335576166, "grad_norm": 6.081818103790283, "learning_rate": 1.6234902225675075e-06, "loss": 0.8032929420471191, "memory(GiB)": 47.44, "step": 16110, "token_acc": 0.7996732026143791, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.7494063518175662, "grad_norm": 11.375482559204102, "learning_rate": 1.620655446769847e-06, "loss": 0.5640523433685303, "memory(GiB)": 47.44, "step": 16115, "token_acc": 0.8554140127388535, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.7496388700775157, "grad_norm": 8.1347017288208, "learning_rate": 1.6178226692829579e-06, "loss": 0.649559736251831, "memory(GiB)": 47.44, "step": 16120, "token_acc": 0.8394160583941606, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.7498713883374654, "grad_norm": 10.170631408691406, "learning_rate": 1.6149918917819452e-06, "loss": 0.6405446052551269, "memory(GiB)": 47.44, "step": 16125, "token_acc": 0.8422807602534178, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.750103906597415, "grad_norm": 6.244584083557129, "learning_rate": 1.612163115940739e-06, "loss": 0.6530537605285645, "memory(GiB)": 47.44, "step": 16130, "token_acc": 0.8325503355704698, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.7503364248573646, "grad_norm": 7.818486213684082, "learning_rate": 1.609336343432078e-06, "loss": 0.5714409828186036, "memory(GiB)": 47.44, "step": 16135, "token_acc": 0.8491704374057315, "train_speed(iter/s)": 0.095894 }, { "epoch": 0.7505689431173141, "grad_norm": 9.446545600891113, "learning_rate": 1.6065115759275224e-06, "loss": 0.6604638576507569, "memory(GiB)": 47.44, "step": 16140, "token_acc": 0.8426255436931593, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.7508014613772638, "grad_norm": 7.6879987716674805, "learning_rate": 1.6036888150974433e-06, "loss": 0.6549732685089111, "memory(GiB)": 47.44, "step": 16145, "token_acc": 0.8540145985401459, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.7510339796372134, "grad_norm": 8.272690773010254, "learning_rate": 1.600868062611029e-06, "loss": 0.6290201187133789, "memory(GiB)": 47.44, "step": 16150, "token_acc": 0.840620592383639, "train_speed(iter/s)": 0.095944 }, { "epoch": 0.7510339796372134, "eval_loss": 0.5634961128234863, "eval_runtime": 292.755, "eval_samples_per_second": 11.87, "eval_steps_per_second": 11.87, "step": 16150 }, { "epoch": 0.7512664978971629, "grad_norm": 8.650757789611816, "learning_rate": 1.5980493201362734e-06, "loss": 0.6302085876464844, "memory(GiB)": 47.44, "step": 16155, "token_acc": 0.8328231617881513, "train_speed(iter/s)": 0.095794 }, { "epoch": 0.7514990161571126, "grad_norm": 9.49666690826416, "learning_rate": 1.59523258933999e-06, "loss": 0.8467613220214844, "memory(GiB)": 47.44, "step": 16160, "token_acc": 0.7879041248606466, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.7517315344170622, "grad_norm": 11.6074800491333, "learning_rate": 1.5924178718877953e-06, "loss": 0.6553841590881347, "memory(GiB)": 47.44, "step": 16165, "token_acc": 0.8311300639658848, "train_speed(iter/s)": 0.095828 }, { "epoch": 0.7519640526770118, "grad_norm": 7.853658199310303, "learning_rate": 1.5896051694441195e-06, "loss": 0.6012135982513428, "memory(GiB)": 47.44, "step": 16170, "token_acc": 0.843441466854725, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.7521965709369614, "grad_norm": 8.311904907226562, "learning_rate": 1.5867944836722015e-06, "loss": 0.565187931060791, "memory(GiB)": 47.44, "step": 16175, "token_acc": 0.8583779333058872, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.752429089196911, "grad_norm": 7.729232311248779, "learning_rate": 1.5839858162340854e-06, "loss": 0.7426953792572022, "memory(GiB)": 47.44, "step": 16180, "token_acc": 0.8138832997987927, "train_speed(iter/s)": 0.095877 }, { "epoch": 0.7526616074568606, "grad_norm": 7.0569281578063965, "learning_rate": 1.5811791687906259e-06, "loss": 0.6274663925170898, "memory(GiB)": 47.44, "step": 16185, "token_acc": 0.8426877470355731, "train_speed(iter/s)": 0.095894 }, { "epoch": 0.7528941257168102, "grad_norm": 7.353418827056885, "learning_rate": 1.5783745430014763e-06, "loss": 0.7416880607604981, "memory(GiB)": 47.44, "step": 16190, "token_acc": 0.8261287223823247, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.7531266439767598, "grad_norm": 10.384500503540039, "learning_rate": 1.5755719405251014e-06, "loss": 0.5416950225830078, "memory(GiB)": 47.44, "step": 16195, "token_acc": 0.861673672143676, "train_speed(iter/s)": 0.095926 }, { "epoch": 0.7533591622367094, "grad_norm": 8.491443634033203, "learning_rate": 1.5727713630187635e-06, "loss": 0.6371305465698243, "memory(GiB)": 47.44, "step": 16200, "token_acc": 0.842686002522068, "train_speed(iter/s)": 0.095943 }, { "epoch": 0.7533591622367094, "eval_loss": 0.5626077651977539, "eval_runtime": 294.2238, "eval_samples_per_second": 11.811, "eval_steps_per_second": 11.811, "step": 16200 }, { "epoch": 0.753591680496659, "grad_norm": 8.919591903686523, "learning_rate": 1.5699728121385344e-06, "loss": 0.5992330551147461, "memory(GiB)": 47.44, "step": 16205, "token_acc": 0.8337118001695105, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.7538241987566086, "grad_norm": 8.25503158569336, "learning_rate": 1.5671762895392801e-06, "loss": 0.6749239921569824, "memory(GiB)": 47.44, "step": 16210, "token_acc": 0.842032967032967, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.7540567170165582, "grad_norm": 8.522311210632324, "learning_rate": 1.5643817968746717e-06, "loss": 0.6455776214599609, "memory(GiB)": 47.44, "step": 16215, "token_acc": 0.8392246294184721, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.7542892352765078, "grad_norm": 8.548661231994629, "learning_rate": 1.5615893357971795e-06, "loss": 0.6254148960113526, "memory(GiB)": 47.44, "step": 16220, "token_acc": 0.8373618784530387, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.7545217535364575, "grad_norm": 7.641754627227783, "learning_rate": 1.558798907958074e-06, "loss": 0.6256637096405029, "memory(GiB)": 47.44, "step": 16225, "token_acc": 0.8421729347476695, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.754754271796407, "grad_norm": 7.494331359863281, "learning_rate": 1.5560105150074172e-06, "loss": 0.7765919208526612, "memory(GiB)": 47.44, "step": 16230, "token_acc": 0.8147335423197493, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.7549867900563566, "grad_norm": 6.190079689025879, "learning_rate": 1.553224158594076e-06, "loss": 0.7715739250183106, "memory(GiB)": 47.44, "step": 16235, "token_acc": 0.8063669182802757, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.7552193083163062, "grad_norm": 9.882500648498535, "learning_rate": 1.5504398403657055e-06, "loss": 0.6255253791809082, "memory(GiB)": 47.44, "step": 16240, "token_acc": 0.8476679503637141, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.7554518265762559, "grad_norm": 8.659635543823242, "learning_rate": 1.5476575619687617e-06, "loss": 0.6734414577484131, "memory(GiB)": 47.44, "step": 16245, "token_acc": 0.8355778264954589, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.7556843448362054, "grad_norm": 8.26206111907959, "learning_rate": 1.5448773250484895e-06, "loss": 0.669712495803833, "memory(GiB)": 47.44, "step": 16250, "token_acc": 0.8396972824217406, "train_speed(iter/s)": 0.095941 }, { "epoch": 0.7556843448362054, "eval_loss": 0.563765287399292, "eval_runtime": 293.9564, "eval_samples_per_second": 11.821, "eval_steps_per_second": 11.821, "step": 16250 }, { "epoch": 0.755916863096155, "grad_norm": 7.431678295135498, "learning_rate": 1.5420991312489298e-06, "loss": 0.7340573787689209, "memory(GiB)": 47.44, "step": 16255, "token_acc": 0.8327177580829151, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.7561493813561047, "grad_norm": 8.096651077270508, "learning_rate": 1.5393229822129142e-06, "loss": 0.6868834018707275, "memory(GiB)": 47.44, "step": 16260, "token_acc": 0.8088379705400982, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.7563818996160542, "grad_norm": 8.451735496520996, "learning_rate": 1.536548879582067e-06, "loss": 0.5752121448516846, "memory(GiB)": 47.44, "step": 16265, "token_acc": 0.8594154642989271, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.7566144178760038, "grad_norm": 8.332307815551758, "learning_rate": 1.5337768249967984e-06, "loss": 0.5772712707519532, "memory(GiB)": 47.44, "step": 16270, "token_acc": 0.851790450928382, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.7568469361359534, "grad_norm": 7.697171688079834, "learning_rate": 1.5310068200963119e-06, "loss": 0.6366849422454834, "memory(GiB)": 47.44, "step": 16275, "token_acc": 0.843067143424712, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.7570794543959031, "grad_norm": 6.656437873840332, "learning_rate": 1.5282388665185942e-06, "loss": 0.5415359020233155, "memory(GiB)": 47.44, "step": 16280, "token_acc": 0.8706157443491817, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.7573119726558526, "grad_norm": 10.454804420471191, "learning_rate": 1.5254729659004252e-06, "loss": 0.6282804489135743, "memory(GiB)": 47.44, "step": 16285, "token_acc": 0.853583916083916, "train_speed(iter/s)": 0.09589 }, { "epoch": 0.7575444909158022, "grad_norm": 10.950794219970703, "learning_rate": 1.5227091198773641e-06, "loss": 0.6014307975769043, "memory(GiB)": 47.44, "step": 16290, "token_acc": 0.8438438438438438, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.7577770091757519, "grad_norm": 8.460285186767578, "learning_rate": 1.519947330083759e-06, "loss": 0.648493766784668, "memory(GiB)": 47.44, "step": 16295, "token_acc": 0.8322683706070287, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.7580095274357014, "grad_norm": 9.404953956604004, "learning_rate": 1.5171875981527429e-06, "loss": 0.7454773902893066, "memory(GiB)": 47.44, "step": 16300, "token_acc": 0.8106194690265487, "train_speed(iter/s)": 0.09594 }, { "epoch": 0.7580095274357014, "eval_loss": 0.5625870227813721, "eval_runtime": 293.7911, "eval_samples_per_second": 11.828, "eval_steps_per_second": 11.828, "step": 16300 }, { "epoch": 0.758242045695651, "grad_norm": 7.188810348510742, "learning_rate": 1.5144299257162293e-06, "loss": 0.6204580783843994, "memory(GiB)": 47.44, "step": 16305, "token_acc": 0.8339520296974254, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.7584745639556006, "grad_norm": 10.468605041503906, "learning_rate": 1.5116743144049174e-06, "loss": 0.6677374362945556, "memory(GiB)": 47.44, "step": 16310, "token_acc": 0.8274293142671433, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.7587070822155503, "grad_norm": 12.967514991760254, "learning_rate": 1.5089207658482818e-06, "loss": 0.613736343383789, "memory(GiB)": 47.44, "step": 16315, "token_acc": 0.848092404620231, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.7589396004754998, "grad_norm": 6.795639514923096, "learning_rate": 1.5061692816745844e-06, "loss": 0.7447350025177002, "memory(GiB)": 47.44, "step": 16320, "token_acc": 0.8165016501650165, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.7591721187354494, "grad_norm": 8.063556671142578, "learning_rate": 1.5034198635108583e-06, "loss": 0.6021570205688477, "memory(GiB)": 47.44, "step": 16325, "token_acc": 0.8490967056323061, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.7594046369953991, "grad_norm": 7.179757595062256, "learning_rate": 1.5006725129829243e-06, "loss": 0.5568655967712403, "memory(GiB)": 47.44, "step": 16330, "token_acc": 0.8635454181672669, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.7596371552553487, "grad_norm": 5.947718143463135, "learning_rate": 1.4979272317153704e-06, "loss": 0.5865846157073975, "memory(GiB)": 47.44, "step": 16335, "token_acc": 0.8524265434136679, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.7598696735152982, "grad_norm": 7.1008076667785645, "learning_rate": 1.4951840213315694e-06, "loss": 0.5202283382415771, "memory(GiB)": 47.44, "step": 16340, "token_acc": 0.8688915375446961, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.7601021917752478, "grad_norm": 11.718537330627441, "learning_rate": 1.4924428834536659e-06, "loss": 0.6101597785949707, "memory(GiB)": 47.44, "step": 16345, "token_acc": 0.8434628975265017, "train_speed(iter/s)": 0.095921 }, { "epoch": 0.7603347100351975, "grad_norm": 10.827213287353516, "learning_rate": 1.4897038197025805e-06, "loss": 0.7467214584350585, "memory(GiB)": 47.44, "step": 16350, "token_acc": 0.8111263736263736, "train_speed(iter/s)": 0.095937 }, { "epoch": 0.7603347100351975, "eval_loss": 0.5625221133232117, "eval_runtime": 293.4663, "eval_samples_per_second": 11.841, "eval_steps_per_second": 11.841, "step": 16350 }, { "epoch": 0.760567228295147, "grad_norm": 6.726182460784912, "learning_rate": 1.4869668316980034e-06, "loss": 0.6483430862426758, "memory(GiB)": 47.44, "step": 16355, "token_acc": 0.8333920112661631, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.7607997465550966, "grad_norm": 9.858255386352539, "learning_rate": 1.4842319210584033e-06, "loss": 0.5826794624328613, "memory(GiB)": 47.44, "step": 16360, "token_acc": 0.8565055762081785, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.7610322648150463, "grad_norm": 6.0644145011901855, "learning_rate": 1.4814990894010139e-06, "loss": 0.6393117427825927, "memory(GiB)": 47.44, "step": 16365, "token_acc": 0.8459736456808199, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.7612647830749959, "grad_norm": 8.901994705200195, "learning_rate": 1.478768338341846e-06, "loss": 0.552800464630127, "memory(GiB)": 47.44, "step": 16370, "token_acc": 0.8708815672306323, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.7614973013349454, "grad_norm": 10.548748016357422, "learning_rate": 1.476039669495674e-06, "loss": 0.5374550819396973, "memory(GiB)": 47.44, "step": 16375, "token_acc": 0.8676470588235294, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.761729819594895, "grad_norm": 8.564501762390137, "learning_rate": 1.4733130844760456e-06, "loss": 0.7531012535095215, "memory(GiB)": 47.44, "step": 16380, "token_acc": 0.8209969788519638, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.7619623378548447, "grad_norm": 7.566102504730225, "learning_rate": 1.470588584895275e-06, "loss": 0.6096216678619385, "memory(GiB)": 47.44, "step": 16385, "token_acc": 0.8416728902165795, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.7621948561147943, "grad_norm": 9.330343246459961, "learning_rate": 1.4678661723644445e-06, "loss": 0.6481287479400635, "memory(GiB)": 47.44, "step": 16390, "token_acc": 0.8470254957507082, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.7624273743747438, "grad_norm": 11.43049144744873, "learning_rate": 1.4651458484933967e-06, "loss": 0.6192949771881103, "memory(GiB)": 47.44, "step": 16395, "token_acc": 0.8459622909996443, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.7626598926346935, "grad_norm": 11.158167839050293, "learning_rate": 1.4624276148907468e-06, "loss": 0.7683715343475341, "memory(GiB)": 47.44, "step": 16400, "token_acc": 0.8229934924078091, "train_speed(iter/s)": 0.095936 }, { "epoch": 0.7626598926346935, "eval_loss": 0.5630708336830139, "eval_runtime": 293.6494, "eval_samples_per_second": 11.834, "eval_steps_per_second": 11.834, "step": 16400 }, { "epoch": 0.7628924108946431, "grad_norm": 7.216041564941406, "learning_rate": 1.4597114731638674e-06, "loss": 0.6736807823181152, "memory(GiB)": 47.44, "step": 16405, "token_acc": 0.8332021153922621, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.7631249291545926, "grad_norm": 9.324705123901367, "learning_rate": 1.4569974249189e-06, "loss": 0.6405007362365722, "memory(GiB)": 47.44, "step": 16410, "token_acc": 0.8385508265916286, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.7633574474145423, "grad_norm": 4.808244705200195, "learning_rate": 1.4542854717607414e-06, "loss": 0.7190701961517334, "memory(GiB)": 47.44, "step": 16415, "token_acc": 0.8179800221975583, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.7635899656744919, "grad_norm": 9.507863998413086, "learning_rate": 1.4515756152930556e-06, "loss": 0.546860933303833, "memory(GiB)": 47.44, "step": 16420, "token_acc": 0.8678556951763275, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.7638224839344415, "grad_norm": 8.980109214782715, "learning_rate": 1.448867857118264e-06, "loss": 0.7133777618408204, "memory(GiB)": 47.44, "step": 16425, "token_acc": 0.8264150943396227, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.764055002194391, "grad_norm": 7.252748966217041, "learning_rate": 1.4461621988375473e-06, "loss": 0.6807666301727295, "memory(GiB)": 47.44, "step": 16430, "token_acc": 0.8331565924354896, "train_speed(iter/s)": 0.095869 }, { "epoch": 0.7642875204543407, "grad_norm": 7.0165205001831055, "learning_rate": 1.4434586420508467e-06, "loss": 0.5628365993499755, "memory(GiB)": 47.44, "step": 16435, "token_acc": 0.8530997304582211, "train_speed(iter/s)": 0.095885 }, { "epoch": 0.7645200387142903, "grad_norm": 7.279083251953125, "learning_rate": 1.440757188356856e-06, "loss": 0.6637139797210694, "memory(GiB)": 47.44, "step": 16440, "token_acc": 0.8276481149012568, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.7647525569742399, "grad_norm": 8.89866828918457, "learning_rate": 1.4380578393530303e-06, "loss": 0.5501326560974121, "memory(GiB)": 47.44, "step": 16445, "token_acc": 0.8655049151027703, "train_speed(iter/s)": 0.095919 }, { "epoch": 0.7649850752341895, "grad_norm": 7.709764003753662, "learning_rate": 1.435360596635576e-06, "loss": 0.7855375289916993, "memory(GiB)": 47.44, "step": 16450, "token_acc": 0.8118338846012495, "train_speed(iter/s)": 0.095935 }, { "epoch": 0.7649850752341895, "eval_loss": 0.5636538863182068, "eval_runtime": 296.5811, "eval_samples_per_second": 11.717, "eval_steps_per_second": 11.717, "step": 16450 }, { "epoch": 0.7652175934941391, "grad_norm": 6.851712703704834, "learning_rate": 1.4326654617994585e-06, "loss": 0.7187223911285401, "memory(GiB)": 47.44, "step": 16455, "token_acc": 0.8327603672300612, "train_speed(iter/s)": 0.095786 }, { "epoch": 0.7654501117540887, "grad_norm": 6.9142889976501465, "learning_rate": 1.4299724364383915e-06, "loss": 0.5832521915435791, "memory(GiB)": 47.44, "step": 16460, "token_acc": 0.8634826711749789, "train_speed(iter/s)": 0.095802 }, { "epoch": 0.7656826300140382, "grad_norm": 7.56882381439209, "learning_rate": 1.427281522144845e-06, "loss": 0.7005406856536865, "memory(GiB)": 47.44, "step": 16465, "token_acc": 0.8402910762160092, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.7659151482739879, "grad_norm": 7.536351203918457, "learning_rate": 1.4245927205100402e-06, "loss": 0.6036080360412598, "memory(GiB)": 47.44, "step": 16470, "token_acc": 0.8438514244500541, "train_speed(iter/s)": 0.095835 }, { "epoch": 0.7661476665339375, "grad_norm": 7.149500370025635, "learning_rate": 1.4219060331239498e-06, "loss": 0.7330766677856445, "memory(GiB)": 47.44, "step": 16475, "token_acc": 0.8186915887850468, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.7663801847938871, "grad_norm": 11.843749046325684, "learning_rate": 1.419221461575292e-06, "loss": 0.7601501941680908, "memory(GiB)": 47.44, "step": 16480, "token_acc": 0.8212290502793296, "train_speed(iter/s)": 0.095867 }, { "epoch": 0.7666127030538367, "grad_norm": 8.46166706085205, "learning_rate": 1.41653900745154e-06, "loss": 0.6596882820129395, "memory(GiB)": 47.44, "step": 16485, "token_acc": 0.8379591836734694, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.7668452213137863, "grad_norm": 8.52076244354248, "learning_rate": 1.4138586723389092e-06, "loss": 0.7091259002685547, "memory(GiB)": 47.44, "step": 16490, "token_acc": 0.8447533929162528, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.7670777395737359, "grad_norm": 8.565044403076172, "learning_rate": 1.4111804578223649e-06, "loss": 0.5640076160430908, "memory(GiB)": 47.44, "step": 16495, "token_acc": 0.8522522522522522, "train_speed(iter/s)": 0.095917 }, { "epoch": 0.7673102578336854, "grad_norm": 7.396138668060303, "learning_rate": 1.4085043654856184e-06, "loss": 0.6888665199279785, "memory(GiB)": 47.44, "step": 16500, "token_acc": 0.8275146906325613, "train_speed(iter/s)": 0.095934 }, { "epoch": 0.7673102578336854, "eval_loss": 0.562106192111969, "eval_runtime": 291.388, "eval_samples_per_second": 11.926, "eval_steps_per_second": 11.926, "step": 16500 }, { "epoch": 0.7675427760936351, "grad_norm": 6.648313999176025, "learning_rate": 1.405830396911128e-06, "loss": 0.7077539443969727, "memory(GiB)": 47.44, "step": 16505, "token_acc": 0.8332853371730261, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.7677752943535847, "grad_norm": 8.782613754272461, "learning_rate": 1.4031585536800896e-06, "loss": 0.696587085723877, "memory(GiB)": 47.44, "step": 16510, "token_acc": 0.822529224229543, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.7680078126135343, "grad_norm": 7.743236541748047, "learning_rate": 1.4004888373724506e-06, "loss": 0.6986588954925537, "memory(GiB)": 47.44, "step": 16515, "token_acc": 0.8177627535341119, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.7682403308734839, "grad_norm": 6.123650074005127, "learning_rate": 1.3978212495668936e-06, "loss": 0.6276377201080322, "memory(GiB)": 47.44, "step": 16520, "token_acc": 0.8322475570032574, "train_speed(iter/s)": 0.095835 }, { "epoch": 0.7684728491334335, "grad_norm": 7.923332214355469, "learning_rate": 1.3951557918408482e-06, "loss": 0.5356187343597412, "memory(GiB)": 47.44, "step": 16525, "token_acc": 0.8737796373779637, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.7687053673933831, "grad_norm": 9.798731803894043, "learning_rate": 1.392492465770479e-06, "loss": 0.7026069641113282, "memory(GiB)": 47.44, "step": 16530, "token_acc": 0.8280542986425339, "train_speed(iter/s)": 0.095868 }, { "epoch": 0.7689378856533328, "grad_norm": 8.680355072021484, "learning_rate": 1.389831272930695e-06, "loss": 0.6817941188812255, "memory(GiB)": 47.44, "step": 16535, "token_acc": 0.8211987809007789, "train_speed(iter/s)": 0.095884 }, { "epoch": 0.7691704039132823, "grad_norm": 6.331023693084717, "learning_rate": 1.3871722148951404e-06, "loss": 0.6216944217681885, "memory(GiB)": 47.44, "step": 16540, "token_acc": 0.8433771795656164, "train_speed(iter/s)": 0.0959 }, { "epoch": 0.7694029221732319, "grad_norm": 9.733691215515137, "learning_rate": 1.384515293236201e-06, "loss": 0.8239374160766602, "memory(GiB)": 47.44, "step": 16545, "token_acc": 0.7935423781434338, "train_speed(iter/s)": 0.095916 }, { "epoch": 0.7696354404331816, "grad_norm": 7.86898946762085, "learning_rate": 1.3818605095249932e-06, "loss": 0.6025336742401123, "memory(GiB)": 47.44, "step": 16550, "token_acc": 0.84384, "train_speed(iter/s)": 0.095932 }, { "epoch": 0.7696354404331816, "eval_loss": 0.5639436841011047, "eval_runtime": 295.1839, "eval_samples_per_second": 11.772, "eval_steps_per_second": 11.772, "step": 16550 }, { "epoch": 0.7698679586931311, "grad_norm": 7.427320957183838, "learning_rate": 1.3792078653313757e-06, "loss": 0.6521914958953857, "memory(GiB)": 47.44, "step": 16555, "token_acc": 0.8331147645854657, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.7701004769530807, "grad_norm": 8.594679832458496, "learning_rate": 1.3765573622239354e-06, "loss": 0.5646349430084229, "memory(GiB)": 47.44, "step": 16560, "token_acc": 0.8466364586964099, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.7703329952130303, "grad_norm": 8.932754516601562, "learning_rate": 1.3739090017699985e-06, "loss": 0.6921500205993653, "memory(GiB)": 47.44, "step": 16565, "token_acc": 0.8274748923959828, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.77056551347298, "grad_norm": 6.5633463859558105, "learning_rate": 1.3712627855356241e-06, "loss": 0.8388358116149902, "memory(GiB)": 47.44, "step": 16570, "token_acc": 0.7907249779864984, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.7707980317329295, "grad_norm": 9.961206436157227, "learning_rate": 1.368618715085598e-06, "loss": 0.6288596153259277, "memory(GiB)": 47.44, "step": 16575, "token_acc": 0.8423252279635258, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.7710305499928791, "grad_norm": 5.8794636726379395, "learning_rate": 1.3659767919834426e-06, "loss": 0.7108618736267089, "memory(GiB)": 47.44, "step": 16580, "token_acc": 0.826055575604475, "train_speed(iter/s)": 0.095863 }, { "epoch": 0.7712630682528288, "grad_norm": 7.274999141693115, "learning_rate": 1.3633370177914086e-06, "loss": 0.6645435333251953, "memory(GiB)": 47.44, "step": 16585, "token_acc": 0.836635843240863, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.7714955865127784, "grad_norm": 8.009333610534668, "learning_rate": 1.3606993940704766e-06, "loss": 0.58541841506958, "memory(GiB)": 47.44, "step": 16590, "token_acc": 0.8406515580736544, "train_speed(iter/s)": 0.095895 }, { "epoch": 0.7717281047727279, "grad_norm": 9.380660057067871, "learning_rate": 1.3580639223803533e-06, "loss": 0.6907434940338135, "memory(GiB)": 47.44, "step": 16595, "token_acc": 0.8184210526315789, "train_speed(iter/s)": 0.095912 }, { "epoch": 0.7719606230326775, "grad_norm": 7.912655353546143, "learning_rate": 1.3554306042794769e-06, "loss": 0.605219030380249, "memory(GiB)": 47.44, "step": 16600, "token_acc": 0.84609375, "train_speed(iter/s)": 0.095928 }, { "epoch": 0.7719606230326775, "eval_loss": 0.5613829493522644, "eval_runtime": 292.7757, "eval_samples_per_second": 11.869, "eval_steps_per_second": 11.869, "step": 16600 }, { "epoch": 0.7721931412926272, "grad_norm": 8.54576587677002, "learning_rate": 1.352799441325006e-06, "loss": 0.8587137222290039, "memory(GiB)": 47.44, "step": 16605, "token_acc": 0.8318083553371571, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.7724256595525767, "grad_norm": 9.212173461914062, "learning_rate": 1.3501704350728328e-06, "loss": 0.6808501720428467, "memory(GiB)": 47.44, "step": 16610, "token_acc": 0.824523396880416, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.7726581778125263, "grad_norm": 7.881503582000732, "learning_rate": 1.347543587077566e-06, "loss": 0.6166458606719971, "memory(GiB)": 47.44, "step": 16615, "token_acc": 0.8433734939759037, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.772890696072476, "grad_norm": 9.114757537841797, "learning_rate": 1.3449188988925438e-06, "loss": 0.6046999454498291, "memory(GiB)": 47.44, "step": 16620, "token_acc": 0.8512783579402232, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.7731232143324256, "grad_norm": 8.172106742858887, "learning_rate": 1.3422963720698252e-06, "loss": 0.5814279556274414, "memory(GiB)": 47.44, "step": 16625, "token_acc": 0.8441821247892074, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.7733557325923751, "grad_norm": 6.119772911071777, "learning_rate": 1.3396760081601922e-06, "loss": 0.5838455200195313, "memory(GiB)": 47.44, "step": 16630, "token_acc": 0.845360824742268, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.7735882508523247, "grad_norm": 9.693753242492676, "learning_rate": 1.3370578087131447e-06, "loss": 0.667085599899292, "memory(GiB)": 47.44, "step": 16635, "token_acc": 0.8405443126308444, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.7738207691122744, "grad_norm": 10.949572563171387, "learning_rate": 1.3344417752769079e-06, "loss": 0.695890235900879, "memory(GiB)": 47.44, "step": 16640, "token_acc": 0.8172002978406553, "train_speed(iter/s)": 0.095895 }, { "epoch": 0.7740532873722239, "grad_norm": 6.82157039642334, "learning_rate": 1.33182790939842e-06, "loss": 0.5298008441925048, "memory(GiB)": 47.44, "step": 16645, "token_acc": 0.8507552870090634, "train_speed(iter/s)": 0.095911 }, { "epoch": 0.7742858056321735, "grad_norm": 8.834753036499023, "learning_rate": 1.3292162126233426e-06, "loss": 0.6341611385345459, "memory(GiB)": 47.44, "step": 16650, "token_acc": 0.8390536487837388, "train_speed(iter/s)": 0.095927 }, { "epoch": 0.7742858056321735, "eval_loss": 0.5621234178543091, "eval_runtime": 295.7835, "eval_samples_per_second": 11.748, "eval_steps_per_second": 11.748, "step": 16650 }, { "epoch": 0.7745183238921232, "grad_norm": 6.885671615600586, "learning_rate": 1.326606686496051e-06, "loss": 0.6353133678436279, "memory(GiB)": 47.44, "step": 16655, "token_acc": 0.8338329112300836, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.7747508421520728, "grad_norm": 6.530189037322998, "learning_rate": 1.3239993325596396e-06, "loss": 0.8055611610412597, "memory(GiB)": 47.44, "step": 16660, "token_acc": 0.7988453357642054, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.7749833604120223, "grad_norm": 8.837427139282227, "learning_rate": 1.321394152355917e-06, "loss": 0.6655847549438476, "memory(GiB)": 47.44, "step": 16665, "token_acc": 0.8508305647840532, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.775215878671972, "grad_norm": 6.300332069396973, "learning_rate": 1.318791147425409e-06, "loss": 0.523322868347168, "memory(GiB)": 47.44, "step": 16670, "token_acc": 0.872, "train_speed(iter/s)": 0.095828 }, { "epoch": 0.7754483969319216, "grad_norm": 6.994781494140625, "learning_rate": 1.3161903193073484e-06, "loss": 0.8038483619689941, "memory(GiB)": 47.44, "step": 16675, "token_acc": 0.796137339055794, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.7756809151918712, "grad_norm": 7.429974555969238, "learning_rate": 1.3135916695396893e-06, "loss": 0.6116120338439941, "memory(GiB)": 47.44, "step": 16680, "token_acc": 0.8417356408327062, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.7759134334518207, "grad_norm": 9.190269470214844, "learning_rate": 1.3109951996590902e-06, "loss": 0.5953320503234864, "memory(GiB)": 47.44, "step": 16685, "token_acc": 0.8514371033967898, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.7761459517117704, "grad_norm": 8.540185928344727, "learning_rate": 1.308400911200927e-06, "loss": 0.6601593971252442, "memory(GiB)": 47.44, "step": 16690, "token_acc": 0.8291079812206573, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.77637846997172, "grad_norm": 7.5788350105285645, "learning_rate": 1.3058088056992796e-06, "loss": 0.594563627243042, "memory(GiB)": 47.44, "step": 16695, "token_acc": 0.8420711974110032, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.7766109882316695, "grad_norm": 6.727237224578857, "learning_rate": 1.3032188846869416e-06, "loss": 0.7008658885955811, "memory(GiB)": 47.44, "step": 16700, "token_acc": 0.8154761904761905, "train_speed(iter/s)": 0.095924 }, { "epoch": 0.7766109882316695, "eval_loss": 0.5621960163116455, "eval_runtime": 295.9438, "eval_samples_per_second": 11.742, "eval_steps_per_second": 11.742, "step": 16700 }, { "epoch": 0.7768435064916192, "grad_norm": 7.424905300140381, "learning_rate": 1.3006311496954123e-06, "loss": 0.5942497730255127, "memory(GiB)": 47.44, "step": 16705, "token_acc": 0.8338894565425634, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.7770760247515688, "grad_norm": 8.602036476135254, "learning_rate": 1.2980456022549003e-06, "loss": 0.774842643737793, "memory(GiB)": 47.44, "step": 16710, "token_acc": 0.8064624705486368, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.7773085430115184, "grad_norm": 10.397388458251953, "learning_rate": 1.295462243894321e-06, "loss": 0.645169734954834, "memory(GiB)": 47.44, "step": 16715, "token_acc": 0.8327734229189996, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.7775410612714679, "grad_norm": 9.090619087219238, "learning_rate": 1.2928810761412907e-06, "loss": 0.6278375625610352, "memory(GiB)": 47.44, "step": 16720, "token_acc": 0.8567083474146672, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.7777735795314176, "grad_norm": 6.70664119720459, "learning_rate": 1.2903021005221367e-06, "loss": 0.62248854637146, "memory(GiB)": 47.44, "step": 16725, "token_acc": 0.8399031811894883, "train_speed(iter/s)": 0.095841 }, { "epoch": 0.7780060977913672, "grad_norm": 8.718195915222168, "learning_rate": 1.2877253185618843e-06, "loss": 0.6643401145935058, "memory(GiB)": 47.44, "step": 16730, "token_acc": 0.8326480263157895, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.7782386160513168, "grad_norm": 8.241601943969727, "learning_rate": 1.2851507317842664e-06, "loss": 0.7027626037597656, "memory(GiB)": 47.44, "step": 16735, "token_acc": 0.8238440962934658, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.7784711343112664, "grad_norm": 6.704036712646484, "learning_rate": 1.2825783417117132e-06, "loss": 0.6183983325958252, "memory(GiB)": 47.44, "step": 16740, "token_acc": 0.8387401574803149, "train_speed(iter/s)": 0.095889 }, { "epoch": 0.778703652571216, "grad_norm": 7.664031505584717, "learning_rate": 1.2800081498653598e-06, "loss": 0.6994392395019531, "memory(GiB)": 47.44, "step": 16745, "token_acc": 0.8196496049467537, "train_speed(iter/s)": 0.095904 }, { "epoch": 0.7789361708311656, "grad_norm": 10.418838500976562, "learning_rate": 1.2774401577650403e-06, "loss": 0.7204785346984863, "memory(GiB)": 47.44, "step": 16750, "token_acc": 0.8209718670076727, "train_speed(iter/s)": 0.09592 }, { "epoch": 0.7789361708311656, "eval_loss": 0.5633994340896606, "eval_runtime": 293.6917, "eval_samples_per_second": 11.832, "eval_steps_per_second": 11.832, "step": 16750 }, { "epoch": 0.7791686890911151, "grad_norm": 7.190987586975098, "learning_rate": 1.2748743669292884e-06, "loss": 0.6094263076782227, "memory(GiB)": 47.44, "step": 16755, "token_acc": 0.8336955915956084, "train_speed(iter/s)": 0.095774 }, { "epoch": 0.7794012073510648, "grad_norm": 9.657785415649414, "learning_rate": 1.272310778875333e-06, "loss": 0.6889170646667481, "memory(GiB)": 47.44, "step": 16760, "token_acc": 0.8287827076222981, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.7796337256110144, "grad_norm": 10.186052322387695, "learning_rate": 1.269749395119106e-06, "loss": 0.6745296478271484, "memory(GiB)": 47.44, "step": 16765, "token_acc": 0.8335174953959484, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.779866243870964, "grad_norm": 9.893561363220215, "learning_rate": 1.2671902171752292e-06, "loss": 0.6191123008728028, "memory(GiB)": 47.44, "step": 16770, "token_acc": 0.8578020134228188, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.7800987621309136, "grad_norm": 11.516806602478027, "learning_rate": 1.2646332465570271e-06, "loss": 0.6638372898101806, "memory(GiB)": 47.44, "step": 16775, "token_acc": 0.8319570602807597, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.7803312803908632, "grad_norm": 10.242958068847656, "learning_rate": 1.2620784847765122e-06, "loss": 0.648340892791748, "memory(GiB)": 47.44, "step": 16780, "token_acc": 0.8328214129003949, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.7805637986508128, "grad_norm": 11.26280403137207, "learning_rate": 1.2595259333443948e-06, "loss": 0.6338286399841309, "memory(GiB)": 47.44, "step": 16785, "token_acc": 0.8478260869565217, "train_speed(iter/s)": 0.095871 }, { "epoch": 0.7807963169107623, "grad_norm": 6.660550117492676, "learning_rate": 1.2569755937700784e-06, "loss": 0.6081973552703858, "memory(GiB)": 47.44, "step": 16790, "token_acc": 0.8490693739424704, "train_speed(iter/s)": 0.095887 }, { "epoch": 0.781028835170712, "grad_norm": 9.49515151977539, "learning_rate": 1.2544274675616587e-06, "loss": 0.6035889148712158, "memory(GiB)": 47.44, "step": 16795, "token_acc": 0.8607979184735473, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.7812613534306616, "grad_norm": 8.45023250579834, "learning_rate": 1.251881556225918e-06, "loss": 0.5794333457946778, "memory(GiB)": 47.44, "step": 16800, "token_acc": 0.8581162324649299, "train_speed(iter/s)": 0.095918 }, { "epoch": 0.7812613534306616, "eval_loss": 0.5614963173866272, "eval_runtime": 296.7538, "eval_samples_per_second": 11.71, "eval_steps_per_second": 11.71, "step": 16800 }, { "epoch": 0.7814938716906112, "grad_norm": 8.028955459594727, "learning_rate": 1.2493378612683354e-06, "loss": 0.6352114677429199, "memory(GiB)": 47.44, "step": 16805, "token_acc": 0.833584542612042, "train_speed(iter/s)": 0.095771 }, { "epoch": 0.7817263899505608, "grad_norm": 10.286711692810059, "learning_rate": 1.2467963841930736e-06, "loss": 0.8744414329528809, "memory(GiB)": 47.44, "step": 16810, "token_acc": 0.8024602026049205, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.7819589082105104, "grad_norm": 10.463303565979004, "learning_rate": 1.2442571265029896e-06, "loss": 0.5919531345367431, "memory(GiB)": 47.44, "step": 16815, "token_acc": 0.8532846715328467, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.78219142647046, "grad_norm": 12.795990943908691, "learning_rate": 1.24172008969962e-06, "loss": 0.6375868797302247, "memory(GiB)": 47.44, "step": 16820, "token_acc": 0.8484609313338595, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.7824239447304097, "grad_norm": 7.519901275634766, "learning_rate": 1.2391852752831989e-06, "loss": 0.6498225688934326, "memory(GiB)": 47.44, "step": 16825, "token_acc": 0.8424710424710424, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.7826564629903592, "grad_norm": 10.070960998535156, "learning_rate": 1.236652684752636e-06, "loss": 0.6001636981964111, "memory(GiB)": 47.44, "step": 16830, "token_acc": 0.8563958165728077, "train_speed(iter/s)": 0.09585 }, { "epoch": 0.7828889812503088, "grad_norm": 8.31798267364502, "learning_rate": 1.234122319605532e-06, "loss": 0.608649730682373, "memory(GiB)": 47.44, "step": 16835, "token_acc": 0.8568680261639092, "train_speed(iter/s)": 0.095866 }, { "epoch": 0.7831214995102584, "grad_norm": 8.985321998596191, "learning_rate": 1.2315941813381704e-06, "loss": 0.6267675399780274, "memory(GiB)": 47.44, "step": 16840, "token_acc": 0.8328173374613003, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.783354017770208, "grad_norm": 6.239924907684326, "learning_rate": 1.229068271445516e-06, "loss": 0.8521398544311524, "memory(GiB)": 47.44, "step": 16845, "token_acc": 0.7998193859121011, "train_speed(iter/s)": 0.095898 }, { "epoch": 0.7835865360301576, "grad_norm": 7.774771213531494, "learning_rate": 1.2265445914212192e-06, "loss": 0.6899847507476806, "memory(GiB)": 47.44, "step": 16850, "token_acc": 0.8074837310195228, "train_speed(iter/s)": 0.095914 }, { "epoch": 0.7835865360301576, "eval_loss": 0.5608976483345032, "eval_runtime": 297.5458, "eval_samples_per_second": 11.679, "eval_steps_per_second": 11.679, "step": 16850 }, { "epoch": 0.7838190542901072, "grad_norm": 10.392410278320312, "learning_rate": 1.2240231427576072e-06, "loss": 0.5573481559753418, "memory(GiB)": 47.44, "step": 16855, "token_acc": 0.8341591699940668, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.7840515725500569, "grad_norm": 7.2723565101623535, "learning_rate": 1.2215039269456919e-06, "loss": 0.6622334957122803, "memory(GiB)": 47.44, "step": 16860, "token_acc": 0.8323529411764706, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.7842840908100064, "grad_norm": 8.384167671203613, "learning_rate": 1.218986945475164e-06, "loss": 0.7650614738464355, "memory(GiB)": 47.44, "step": 16865, "token_acc": 0.8271112722000725, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.784516609069956, "grad_norm": 7.8060126304626465, "learning_rate": 1.2164721998343926e-06, "loss": 0.5492019653320312, "memory(GiB)": 47.44, "step": 16870, "token_acc": 0.8539993014320643, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.7847491273299056, "grad_norm": 9.077139854431152, "learning_rate": 1.2139596915104224e-06, "loss": 0.6087624549865722, "memory(GiB)": 47.44, "step": 16875, "token_acc": 0.8356775300171526, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.7849816455898553, "grad_norm": 9.597736358642578, "learning_rate": 1.21144942198898e-06, "loss": 0.5296175479888916, "memory(GiB)": 47.44, "step": 16880, "token_acc": 0.8635584504628042, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.7852141638498048, "grad_norm": 8.164989471435547, "learning_rate": 1.2089413927544624e-06, "loss": 0.6467774868011474, "memory(GiB)": 47.44, "step": 16885, "token_acc": 0.8486714193130266, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.7854466821097544, "grad_norm": 7.90884256362915, "learning_rate": 1.2064356052899473e-06, "loss": 0.6473551273345948, "memory(GiB)": 47.44, "step": 16890, "token_acc": 0.8361884368308351, "train_speed(iter/s)": 0.095878 }, { "epoch": 0.7856792003697041, "grad_norm": 10.231267929077148, "learning_rate": 1.203932061077181e-06, "loss": 0.533270263671875, "memory(GiB)": 47.44, "step": 16895, "token_acc": 0.8644901610017889, "train_speed(iter/s)": 0.095894 }, { "epoch": 0.7859117186296536, "grad_norm": 8.054085731506348, "learning_rate": 1.2014307615965887e-06, "loss": 0.7031919002532959, "memory(GiB)": 47.44, "step": 16900, "token_acc": 0.8313442211055276, "train_speed(iter/s)": 0.09591 }, { "epoch": 0.7859117186296536, "eval_loss": 0.5615659952163696, "eval_runtime": 293.9684, "eval_samples_per_second": 11.821, "eval_steps_per_second": 11.821, "step": 16900 }, { "epoch": 0.7861442368896032, "grad_norm": 6.312916278839111, "learning_rate": 1.1989317083272655e-06, "loss": 0.6901503562927246, "memory(GiB)": 47.44, "step": 16905, "token_acc": 0.833696296177681, "train_speed(iter/s)": 0.095766 }, { "epoch": 0.7863767551495529, "grad_norm": 9.281437873840332, "learning_rate": 1.1964349027469806e-06, "loss": 0.5565787315368652, "memory(GiB)": 47.44, "step": 16910, "token_acc": 0.8596059113300493, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.7866092734095025, "grad_norm": 7.677142143249512, "learning_rate": 1.1939403463321692e-06, "loss": 0.6321005821228027, "memory(GiB)": 47.44, "step": 16915, "token_acc": 0.8306896551724138, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.786841791669452, "grad_norm": 8.295937538146973, "learning_rate": 1.191448040557942e-06, "loss": 0.6121166706085205, "memory(GiB)": 47.44, "step": 16920, "token_acc": 0.8432967810399717, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.7870743099294016, "grad_norm": 8.459718704223633, "learning_rate": 1.188957986898074e-06, "loss": 0.6047228813171387, "memory(GiB)": 47.44, "step": 16925, "token_acc": 0.8478260869565217, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.7873068281893513, "grad_norm": 12.994466781616211, "learning_rate": 1.1864701868250133e-06, "loss": 0.6566365242004395, "memory(GiB)": 47.44, "step": 16930, "token_acc": 0.8429752066115702, "train_speed(iter/s)": 0.095845 }, { "epoch": 0.7875393464493008, "grad_norm": 8.316268920898438, "learning_rate": 1.1839846418098705e-06, "loss": 0.6249664783477783, "memory(GiB)": 47.44, "step": 16935, "token_acc": 0.8341105929380414, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.7877718647092504, "grad_norm": 6.81507682800293, "learning_rate": 1.1815013533224262e-06, "loss": 0.5734968185424805, "memory(GiB)": 47.44, "step": 16940, "token_acc": 0.8526754690757471, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.7880043829692001, "grad_norm": 12.082489013671875, "learning_rate": 1.1790203228311253e-06, "loss": 0.6177358627319336, "memory(GiB)": 47.44, "step": 16945, "token_acc": 0.8375468164794008, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.7882369012291497, "grad_norm": 8.138704299926758, "learning_rate": 1.17654155180308e-06, "loss": 0.6662531852722168, "memory(GiB)": 47.44, "step": 16950, "token_acc": 0.8362164151637835, "train_speed(iter/s)": 0.095908 }, { "epoch": 0.7882369012291497, "eval_loss": 0.5608460307121277, "eval_runtime": 290.8127, "eval_samples_per_second": 11.949, "eval_steps_per_second": 11.949, "step": 16950 }, { "epoch": 0.7884694194890992, "grad_norm": 8.260912895202637, "learning_rate": 1.17406504170406e-06, "loss": 0.5241977214813233, "memory(GiB)": 47.44, "step": 16955, "token_acc": 0.8349334059910567, "train_speed(iter/s)": 0.095766 }, { "epoch": 0.7887019377490488, "grad_norm": 11.177227020263672, "learning_rate": 1.171590793998505e-06, "loss": 0.6105329036712647, "memory(GiB)": 47.44, "step": 16960, "token_acc": 0.8349753694581281, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.7889344560089985, "grad_norm": 8.79315185546875, "learning_rate": 1.1691188101495142e-06, "loss": 0.5571176528930664, "memory(GiB)": 47.44, "step": 16965, "token_acc": 0.8521072796934865, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.7891669742689481, "grad_norm": 9.868995666503906, "learning_rate": 1.1666490916188462e-06, "loss": 0.659188985824585, "memory(GiB)": 47.44, "step": 16970, "token_acc": 0.8380414312617702, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.7893994925288976, "grad_norm": 8.86392593383789, "learning_rate": 1.1641816398669242e-06, "loss": 0.6795865535736084, "memory(GiB)": 47.44, "step": 16975, "token_acc": 0.8313253012048193, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.7896320107888473, "grad_norm": 11.150150299072266, "learning_rate": 1.161716456352826e-06, "loss": 0.6437868595123291, "memory(GiB)": 47.44, "step": 16980, "token_acc": 0.8402964959568733, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.7898645290487969, "grad_norm": 8.442800521850586, "learning_rate": 1.1592535425342915e-06, "loss": 0.5874933719635009, "memory(GiB)": 47.44, "step": 16985, "token_acc": 0.8525849335302806, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.7900970473087464, "grad_norm": 8.539682388305664, "learning_rate": 1.156792899867718e-06, "loss": 0.6538908004760742, "memory(GiB)": 47.44, "step": 16990, "token_acc": 0.8395200599925009, "train_speed(iter/s)": 0.095878 }, { "epoch": 0.790329565568696, "grad_norm": 8.99543285369873, "learning_rate": 1.1543345298081614e-06, "loss": 0.7801726818084717, "memory(GiB)": 47.44, "step": 16995, "token_acc": 0.8106287425149701, "train_speed(iter/s)": 0.095893 }, { "epoch": 0.7905620838286457, "grad_norm": 9.519553184509277, "learning_rate": 1.1518784338093287e-06, "loss": 0.6260163307189941, "memory(GiB)": 47.44, "step": 17000, "token_acc": 0.8426924507251766, "train_speed(iter/s)": 0.095909 }, { "epoch": 0.7905620838286457, "eval_loss": 0.5613806247711182, "eval_runtime": 293.023, "eval_samples_per_second": 11.859, "eval_steps_per_second": 11.859, "step": 17000 }, { "epoch": 0.7907946020885953, "grad_norm": 8.278639793395996, "learning_rate": 1.1494246133235875e-06, "loss": 0.6730622768402099, "memory(GiB)": 47.44, "step": 17005, "token_acc": 0.8339751955611424, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.7910271203485448, "grad_norm": 8.14084529876709, "learning_rate": 1.1469730698019554e-06, "loss": 0.7279993057250976, "memory(GiB)": 47.44, "step": 17010, "token_acc": 0.8286266924564797, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.7912596386084945, "grad_norm": 5.9442243576049805, "learning_rate": 1.1445238046941087e-06, "loss": 0.7142318725585938, "memory(GiB)": 47.44, "step": 17015, "token_acc": 0.8208279430789134, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.7914921568684441, "grad_norm": 7.770115375518799, "learning_rate": 1.1420768194483707e-06, "loss": 0.6370692729949952, "memory(GiB)": 47.44, "step": 17020, "token_acc": 0.8414426675740048, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.7917246751283937, "grad_norm": 8.381105422973633, "learning_rate": 1.1396321155117197e-06, "loss": 0.6671901226043702, "memory(GiB)": 47.44, "step": 17025, "token_acc": 0.8299632352941176, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.7919571933883433, "grad_norm": 7.974504470825195, "learning_rate": 1.1371896943297861e-06, "loss": 0.6833327770233154, "memory(GiB)": 47.44, "step": 17030, "token_acc": 0.8307752853207399, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.7921897116482929, "grad_norm": 8.342890739440918, "learning_rate": 1.1347495573468492e-06, "loss": 0.678836441040039, "memory(GiB)": 47.44, "step": 17035, "token_acc": 0.8252461322081576, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.7924222299082425, "grad_norm": 10.222599983215332, "learning_rate": 1.1323117060058353e-06, "loss": 0.5971874713897705, "memory(GiB)": 47.44, "step": 17040, "token_acc": 0.8501472754050073, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.792654748168192, "grad_norm": 9.33965015411377, "learning_rate": 1.1298761417483235e-06, "loss": 0.6354103565216065, "memory(GiB)": 47.44, "step": 17045, "token_acc": 0.8423252279635258, "train_speed(iter/s)": 0.095892 }, { "epoch": 0.7928872664281417, "grad_norm": 10.085137367248535, "learning_rate": 1.127442866014536e-06, "loss": 0.5293305397033692, "memory(GiB)": 47.44, "step": 17050, "token_acc": 0.8703703703703703, "train_speed(iter/s)": 0.095907 }, { "epoch": 0.7928872664281417, "eval_loss": 0.5618590712547302, "eval_runtime": 295.7006, "eval_samples_per_second": 11.752, "eval_steps_per_second": 11.752, "step": 17050 }, { "epoch": 0.7931197846880913, "grad_norm": 8.592312812805176, "learning_rate": 1.125011880243345e-06, "loss": 0.5516678333282471, "memory(GiB)": 47.44, "step": 17055, "token_acc": 0.8342294409410331, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.7933523029480409, "grad_norm": 9.838059425354004, "learning_rate": 1.1225831858722668e-06, "loss": 0.6128196239471435, "memory(GiB)": 47.44, "step": 17060, "token_acc": 0.8487261146496815, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.7935848212079905, "grad_norm": 8.308734893798828, "learning_rate": 1.1201567843374639e-06, "loss": 0.7210927486419678, "memory(GiB)": 47.44, "step": 17065, "token_acc": 0.8256519102486355, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.7938173394679401, "grad_norm": 9.248513221740723, "learning_rate": 1.1177326770737418e-06, "loss": 0.685799503326416, "memory(GiB)": 47.44, "step": 17070, "token_acc": 0.8226449831236575, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.7940498577278897, "grad_norm": 9.848244667053223, "learning_rate": 1.1153108655145516e-06, "loss": 0.6207716464996338, "memory(GiB)": 47.44, "step": 17075, "token_acc": 0.8317911434236616, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.7942823759878392, "grad_norm": 8.39963436126709, "learning_rate": 1.1128913510919836e-06, "loss": 0.556895112991333, "memory(GiB)": 47.44, "step": 17080, "token_acc": 0.8656422379826635, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.7945148942477889, "grad_norm": 9.106492042541504, "learning_rate": 1.1104741352367727e-06, "loss": 0.6832783222198486, "memory(GiB)": 47.44, "step": 17085, "token_acc": 0.8177522780965238, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.7947474125077385, "grad_norm": 7.3956499099731445, "learning_rate": 1.1080592193782913e-06, "loss": 0.6365145683288574, "memory(GiB)": 47.44, "step": 17090, "token_acc": 0.8236943568173852, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.7949799307676881, "grad_norm": 6.6494927406311035, "learning_rate": 1.1056466049445547e-06, "loss": 0.5309269428253174, "memory(GiB)": 47.44, "step": 17095, "token_acc": 0.8577352472089315, "train_speed(iter/s)": 0.09589 }, { "epoch": 0.7952124490276377, "grad_norm": 8.483128547668457, "learning_rate": 1.103236293362218e-06, "loss": 0.7713404655456543, "memory(GiB)": 47.44, "step": 17100, "token_acc": 0.7899129172714079, "train_speed(iter/s)": 0.095905 }, { "epoch": 0.7952124490276377, "eval_loss": 0.5605780482292175, "eval_runtime": 293.2171, "eval_samples_per_second": 11.851, "eval_steps_per_second": 11.851, "step": 17100 }, { "epoch": 0.7954449672875873, "grad_norm": 8.216812133789062, "learning_rate": 1.10082828605657e-06, "loss": 0.6681423187255859, "memory(GiB)": 47.44, "step": 17105, "token_acc": 0.8337287972552666, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.7956774855475369, "grad_norm": 10.678299903869629, "learning_rate": 1.098422584451541e-06, "loss": 0.6339476108551025, "memory(GiB)": 47.44, "step": 17110, "token_acc": 0.8391070053887606, "train_speed(iter/s)": 0.095779 }, { "epoch": 0.7959100038074866, "grad_norm": 7.154067516326904, "learning_rate": 1.0960191899696965e-06, "loss": 0.6219954013824462, "memory(GiB)": 47.44, "step": 17115, "token_acc": 0.8290206354405503, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.7961425220674361, "grad_norm": 9.737374305725098, "learning_rate": 1.0936181040322402e-06, "loss": 0.6709455013275146, "memory(GiB)": 47.44, "step": 17120, "token_acc": 0.8210654737698251, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.7963750403273857, "grad_norm": 6.7336745262146, "learning_rate": 1.0912193280590044e-06, "loss": 0.6627488136291504, "memory(GiB)": 47.44, "step": 17125, "token_acc": 0.8298080052066384, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.7966075585873353, "grad_norm": 7.747184753417969, "learning_rate": 1.0888228634684623e-06, "loss": 0.6206040382385254, "memory(GiB)": 47.44, "step": 17130, "token_acc": 0.8367875647668394, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.7968400768472849, "grad_norm": 10.590611457824707, "learning_rate": 1.0864287116777157e-06, "loss": 0.6220110893249512, "memory(GiB)": 47.44, "step": 17135, "token_acc": 0.8553191489361702, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.7970725951072345, "grad_norm": 8.546687126159668, "learning_rate": 1.084036874102502e-06, "loss": 0.59197998046875, "memory(GiB)": 47.44, "step": 17140, "token_acc": 0.8423146473779385, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.7973051133671841, "grad_norm": 7.821950435638428, "learning_rate": 1.0816473521571862e-06, "loss": 0.5261586666107178, "memory(GiB)": 47.44, "step": 17145, "token_acc": 0.8734987990392313, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.7975376316271338, "grad_norm": 10.940779685974121, "learning_rate": 1.0792601472547682e-06, "loss": 0.6301316738128662, "memory(GiB)": 47.44, "step": 17150, "token_acc": 0.8311926605504587, "train_speed(iter/s)": 0.095903 }, { "epoch": 0.7975376316271338, "eval_loss": 0.5604414343833923, "eval_runtime": 296.127, "eval_samples_per_second": 11.735, "eval_steps_per_second": 11.735, "step": 17150 }, { "epoch": 0.7977701498870833, "grad_norm": 9.065601348876953, "learning_rate": 1.0768752608068756e-06, "loss": 0.6107148170471192, "memory(GiB)": 47.44, "step": 17155, "token_acc": 0.8342620960284338, "train_speed(iter/s)": 0.09576 }, { "epoch": 0.7980026681470329, "grad_norm": 8.328161239624023, "learning_rate": 1.074492694223767e-06, "loss": 0.5185272693634033, "memory(GiB)": 47.44, "step": 17160, "token_acc": 0.8681526256139025, "train_speed(iter/s)": 0.095776 }, { "epoch": 0.7982351864069825, "grad_norm": 8.856505393981934, "learning_rate": 1.0721124489143248e-06, "loss": 0.6319143772125244, "memory(GiB)": 47.44, "step": 17165, "token_acc": 0.8422652983656405, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.7984677046669322, "grad_norm": 9.311681747436523, "learning_rate": 1.0697345262860638e-06, "loss": 0.777432918548584, "memory(GiB)": 47.44, "step": 17170, "token_acc": 0.8133465477370334, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.7987002229268817, "grad_norm": 10.043967247009277, "learning_rate": 1.0673589277451208e-06, "loss": 0.6721518039703369, "memory(GiB)": 47.44, "step": 17175, "token_acc": 0.8283671036948749, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.7989327411868313, "grad_norm": 7.669497013092041, "learning_rate": 1.0649856546962617e-06, "loss": 0.7782588958740234, "memory(GiB)": 47.44, "step": 17180, "token_acc": 0.8111563044741429, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.799165259446781, "grad_norm": 9.350188255310059, "learning_rate": 1.0626147085428761e-06, "loss": 0.5789762973785401, "memory(GiB)": 47.44, "step": 17185, "token_acc": 0.8528351360209767, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.7993977777067305, "grad_norm": 8.215911865234375, "learning_rate": 1.0602460906869799e-06, "loss": 0.8086822509765625, "memory(GiB)": 47.44, "step": 17190, "token_acc": 0.8049076037564374, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.7996302959666801, "grad_norm": 7.175421714782715, "learning_rate": 1.057879802529206e-06, "loss": 0.6093656539916992, "memory(GiB)": 47.44, "step": 17195, "token_acc": 0.8360609480812641, "train_speed(iter/s)": 0.095886 }, { "epoch": 0.7998628142266297, "grad_norm": 8.85153579711914, "learning_rate": 1.055515845468817e-06, "loss": 0.6791608810424805, "memory(GiB)": 47.44, "step": 17200, "token_acc": 0.8277543061423465, "train_speed(iter/s)": 0.095902 }, { "epoch": 0.7998628142266297, "eval_loss": 0.5614746809005737, "eval_runtime": 297.9952, "eval_samples_per_second": 11.661, "eval_steps_per_second": 11.661, "step": 17200 }, { "epoch": 0.8000953324865794, "grad_norm": 9.05666446685791, "learning_rate": 1.053154220903691e-06, "loss": 0.6790387630462646, "memory(GiB)": 47.44, "step": 17205, "token_acc": 0.8340710792930423, "train_speed(iter/s)": 0.095758 }, { "epoch": 0.8003278507465289, "grad_norm": 7.279318809509277, "learning_rate": 1.0507949302303315e-06, "loss": 0.6103257656097412, "memory(GiB)": 47.44, "step": 17210, "token_acc": 0.848814862267777, "train_speed(iter/s)": 0.095773 }, { "epoch": 0.8005603690064785, "grad_norm": 8.824692726135254, "learning_rate": 1.0484379748438584e-06, "loss": 0.5345361232757568, "memory(GiB)": 47.44, "step": 17215, "token_acc": 0.8665330661322646, "train_speed(iter/s)": 0.095789 }, { "epoch": 0.8007928872664282, "grad_norm": 10.482314109802246, "learning_rate": 1.046083356138013e-06, "loss": 0.6422648429870605, "memory(GiB)": 47.44, "step": 17220, "token_acc": 0.8412544455221468, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.8010254055263778, "grad_norm": 10.245368957519531, "learning_rate": 1.0437310755051533e-06, "loss": 0.6183192253112793, "memory(GiB)": 47.44, "step": 17225, "token_acc": 0.8408354339214981, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.8012579237863273, "grad_norm": 8.228830337524414, "learning_rate": 1.0413811343362567e-06, "loss": 0.7094565868377686, "memory(GiB)": 47.44, "step": 17230, "token_acc": 0.8140299598100109, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.801490442046277, "grad_norm": 8.723837852478027, "learning_rate": 1.0390335340209169e-06, "loss": 0.6350565910339355, "memory(GiB)": 47.44, "step": 17235, "token_acc": 0.8355555555555556, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.8017229603062266, "grad_norm": 9.939167976379395, "learning_rate": 1.0366882759473391e-06, "loss": 0.750972843170166, "memory(GiB)": 47.44, "step": 17240, "token_acc": 0.7975287840494243, "train_speed(iter/s)": 0.095865 }, { "epoch": 0.8019554785661761, "grad_norm": 8.15424633026123, "learning_rate": 1.0343453615023501e-06, "loss": 0.6216236114501953, "memory(GiB)": 47.44, "step": 17245, "token_acc": 0.8429609131788308, "train_speed(iter/s)": 0.095881 }, { "epoch": 0.8021879968261257, "grad_norm": 12.222821235656738, "learning_rate": 1.0320047920713854e-06, "loss": 0.5298105239868164, "memory(GiB)": 47.44, "step": 17250, "token_acc": 0.8692610406646262, "train_speed(iter/s)": 0.095897 }, { "epoch": 0.8021879968261257, "eval_loss": 0.5603924989700317, "eval_runtime": 296.949, "eval_samples_per_second": 11.702, "eval_steps_per_second": 11.702, "step": 17250 }, { "epoch": 0.8024205150860754, "grad_norm": 11.590433120727539, "learning_rate": 1.0296665690384977e-06, "loss": 0.721394157409668, "memory(GiB)": 47.44, "step": 17255, "token_acc": 0.8327286584155664, "train_speed(iter/s)": 0.095754 }, { "epoch": 0.802653033346025, "grad_norm": 8.545548439025879, "learning_rate": 1.0273306937863474e-06, "loss": 0.6448258399963379, "memory(GiB)": 47.44, "step": 17260, "token_acc": 0.8342391304347826, "train_speed(iter/s)": 0.095769 }, { "epoch": 0.8028855516059745, "grad_norm": 9.636141777038574, "learning_rate": 1.0249971676962127e-06, "loss": 0.5769176006317138, "memory(GiB)": 47.44, "step": 17265, "token_acc": 0.8622431795217245, "train_speed(iter/s)": 0.095785 }, { "epoch": 0.8031180698659242, "grad_norm": 6.957939147949219, "learning_rate": 1.0226659921479782e-06, "loss": 0.689917802810669, "memory(GiB)": 47.44, "step": 17270, "token_acc": 0.8231624627068077, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.8033505881258738, "grad_norm": 8.156017303466797, "learning_rate": 1.020337168520142e-06, "loss": 0.647607421875, "memory(GiB)": 47.44, "step": 17275, "token_acc": 0.8418691588785047, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.8035831063858233, "grad_norm": 8.871214866638184, "learning_rate": 1.0180106981898058e-06, "loss": 0.6655109882354736, "memory(GiB)": 47.44, "step": 17280, "token_acc": 0.8353528153955809, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.8038156246457729, "grad_norm": 8.65011215209961, "learning_rate": 1.0156865825326873e-06, "loss": 0.7450146198272705, "memory(GiB)": 47.44, "step": 17285, "token_acc": 0.8207972270363951, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.8040481429057226, "grad_norm": 11.104325294494629, "learning_rate": 1.0133648229231047e-06, "loss": 0.5896714687347412, "memory(GiB)": 47.44, "step": 17290, "token_acc": 0.8539137714524906, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.8042806611656722, "grad_norm": 8.021060943603516, "learning_rate": 1.011045420733988e-06, "loss": 0.6420575618743897, "memory(GiB)": 47.44, "step": 17295, "token_acc": 0.8414845646895595, "train_speed(iter/s)": 0.095878 }, { "epoch": 0.8045131794256217, "grad_norm": 9.281312942504883, "learning_rate": 1.00872837733687e-06, "loss": 0.5397995948791504, "memory(GiB)": 47.44, "step": 17300, "token_acc": 0.8661740558292282, "train_speed(iter/s)": 0.095893 }, { "epoch": 0.8045131794256217, "eval_loss": 0.560265064239502, "eval_runtime": 294.7526, "eval_samples_per_second": 11.79, "eval_steps_per_second": 11.79, "step": 17300 }, { "epoch": 0.8047456976855714, "grad_norm": 10.986329078674316, "learning_rate": 1.0064136941018904e-06, "loss": 0.5388300895690918, "memory(GiB)": 47.44, "step": 17305, "token_acc": 0.8344242987509314, "train_speed(iter/s)": 0.095752 }, { "epoch": 0.804978215945521, "grad_norm": 12.374367713928223, "learning_rate": 1.0041013723977933e-06, "loss": 0.6608255863189697, "memory(GiB)": 47.44, "step": 17310, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.8052107342054706, "grad_norm": 8.375582695007324, "learning_rate": 1.0017914135919265e-06, "loss": 0.5581004619598389, "memory(GiB)": 47.44, "step": 17315, "token_acc": 0.8572536850271528, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.8054432524654201, "grad_norm": 5.624876499176025, "learning_rate": 9.994838190502381e-07, "loss": 0.7049301624298095, "memory(GiB)": 47.44, "step": 17320, "token_acc": 0.8228299643281808, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.8056757707253698, "grad_norm": 9.356409072875977, "learning_rate": 9.971785901372827e-07, "loss": 0.6245266437530518, "memory(GiB)": 47.44, "step": 17325, "token_acc": 0.8592896174863388, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.8059082889853194, "grad_norm": 7.530612945556641, "learning_rate": 9.948757282162103e-07, "loss": 0.6261490821838379, "memory(GiB)": 47.44, "step": 17330, "token_acc": 0.8362235067437379, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.8061408072452689, "grad_norm": 8.12159252166748, "learning_rate": 9.925752346487772e-07, "loss": 0.6676755428314209, "memory(GiB)": 47.44, "step": 17335, "token_acc": 0.8142649199417759, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.8063733255052186, "grad_norm": 7.375349998474121, "learning_rate": 9.902771107953329e-07, "loss": 0.6126779556274414, "memory(GiB)": 47.44, "step": 17340, "token_acc": 0.8516806722689075, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.8066058437651682, "grad_norm": 10.835264205932617, "learning_rate": 9.879813580148312e-07, "loss": 0.686242151260376, "memory(GiB)": 47.44, "step": 17345, "token_acc": 0.8460502692998204, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.8068383620251178, "grad_norm": 6.949465274810791, "learning_rate": 9.856879776648214e-07, "loss": 0.7318108558654786, "memory(GiB)": 47.44, "step": 17350, "token_acc": 0.8146841936518114, "train_speed(iter/s)": 0.095891 }, { "epoch": 0.8068383620251178, "eval_loss": 0.5608267784118652, "eval_runtime": 295.2159, "eval_samples_per_second": 11.771, "eval_steps_per_second": 11.771, "step": 17350 }, { "epoch": 0.8070708802850673, "grad_norm": 7.345584392547607, "learning_rate": 9.833969711014497e-07, "loss": 0.6408491611480713, "memory(GiB)": 47.44, "step": 17355, "token_acc": 0.8342026527575559, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.807303398545017, "grad_norm": 10.600166320800781, "learning_rate": 9.811083396794607e-07, "loss": 0.6612047672271728, "memory(GiB)": 47.44, "step": 17360, "token_acc": 0.8144783118405627, "train_speed(iter/s)": 0.095765 }, { "epoch": 0.8075359168049666, "grad_norm": 7.029186248779297, "learning_rate": 9.788220847521895e-07, "loss": 0.6378894805908203, "memory(GiB)": 47.44, "step": 17365, "token_acc": 0.8553054662379421, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.8077684350649162, "grad_norm": 6.6874470710754395, "learning_rate": 9.76538207671573e-07, "loss": 0.6131662368774414, "memory(GiB)": 47.44, "step": 17370, "token_acc": 0.841715976331361, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.8080009533248658, "grad_norm": 6.854613304138184, "learning_rate": 9.74256709788135e-07, "loss": 0.6927377700805664, "memory(GiB)": 47.44, "step": 17375, "token_acc": 0.8233151183970856, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.8082334715848154, "grad_norm": 9.138595581054688, "learning_rate": 9.719775924509982e-07, "loss": 0.6806143283843994, "memory(GiB)": 47.44, "step": 17380, "token_acc": 0.8281972265023112, "train_speed(iter/s)": 0.095828 }, { "epoch": 0.808465989844765, "grad_norm": 10.136089324951172, "learning_rate": 9.697008570078726e-07, "loss": 0.6174187183380127, "memory(GiB)": 47.44, "step": 17385, "token_acc": 0.8403677392394484, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.8086985081047146, "grad_norm": 9.113638877868652, "learning_rate": 9.674265048050636e-07, "loss": 0.7248986721038818, "memory(GiB)": 47.44, "step": 17390, "token_acc": 0.8193146417445483, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.8089310263646642, "grad_norm": 6.724001407623291, "learning_rate": 9.65154537187465e-07, "loss": 0.5666268825531006, "memory(GiB)": 47.44, "step": 17395, "token_acc": 0.8590038314176245, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.8091635446246138, "grad_norm": 6.4625091552734375, "learning_rate": 9.628849554985637e-07, "loss": 0.5667214393615723, "memory(GiB)": 47.44, "step": 17400, "token_acc": 0.8561014263074485, "train_speed(iter/s)": 0.09589 }, { "epoch": 0.8091635446246138, "eval_loss": 0.5599117279052734, "eval_runtime": 294.7125, "eval_samples_per_second": 11.791, "eval_steps_per_second": 11.791, "step": 17400 }, { "epoch": 0.8093960628845634, "grad_norm": 11.611597061157227, "learning_rate": 9.606177610804308e-07, "loss": 0.6227489948272705, "memory(GiB)": 47.44, "step": 17405, "token_acc": 0.8339909898396064, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.809628581144513, "grad_norm": 7.669189453125, "learning_rate": 9.583529552737303e-07, "loss": 0.6733208656311035, "memory(GiB)": 47.44, "step": 17410, "token_acc": 0.8414746543778802, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.8098610994044626, "grad_norm": 7.762040615081787, "learning_rate": 9.560905394177096e-07, "loss": 0.7308223247528076, "memory(GiB)": 47.44, "step": 17415, "token_acc": 0.8231552162849872, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.8100936176644122, "grad_norm": 9.89555549621582, "learning_rate": 9.538305148502074e-07, "loss": 0.5911648750305176, "memory(GiB)": 47.44, "step": 17420, "token_acc": 0.8471774193548387, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.8103261359243618, "grad_norm": 8.816936492919922, "learning_rate": 9.515728829076437e-07, "loss": 0.5312886714935303, "memory(GiB)": 47.44, "step": 17425, "token_acc": 0.8606158833063209, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.8105586541843114, "grad_norm": 8.899998664855957, "learning_rate": 9.493176449250274e-07, "loss": 0.6454525947570801, "memory(GiB)": 47.44, "step": 17430, "token_acc": 0.8451676528599605, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.810791172444261, "grad_norm": 9.899276733398438, "learning_rate": 9.470648022359496e-07, "loss": 0.6515559673309326, "memory(GiB)": 47.44, "step": 17435, "token_acc": 0.8539670371789958, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.8110236907042107, "grad_norm": 9.048864364624023, "learning_rate": 9.448143561725881e-07, "loss": 0.6078864574432373, "memory(GiB)": 47.44, "step": 17440, "token_acc": 0.8472925594078691, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.8112562089641602, "grad_norm": 7.783231735229492, "learning_rate": 9.425663080656977e-07, "loss": 0.6650321960449219, "memory(GiB)": 47.44, "step": 17445, "token_acc": 0.8310636731689364, "train_speed(iter/s)": 0.095873 }, { "epoch": 0.8114887272241098, "grad_norm": 7.394621849060059, "learning_rate": 9.403206592446217e-07, "loss": 0.6150126934051514, "memory(GiB)": 47.44, "step": 17450, "token_acc": 0.8462922966162707, "train_speed(iter/s)": 0.095888 }, { "epoch": 0.8114887272241098, "eval_loss": 0.5597295165061951, "eval_runtime": 291.778, "eval_samples_per_second": 11.91, "eval_steps_per_second": 11.91, "step": 17450 }, { "epoch": 0.8117212454840594, "grad_norm": 8.46907901763916, "learning_rate": 9.380774110372786e-07, "loss": 0.6139643669128418, "memory(GiB)": 47.44, "step": 17455, "token_acc": 0.834572073875923, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.8119537637440091, "grad_norm": 7.3746747970581055, "learning_rate": 9.358365647701734e-07, "loss": 0.6090128421783447, "memory(GiB)": 47.44, "step": 17460, "token_acc": 0.8393951777686963, "train_speed(iter/s)": 0.095765 }, { "epoch": 0.8121862820039586, "grad_norm": 10.859003067016602, "learning_rate": 9.335981217683848e-07, "loss": 0.6623462200164795, "memory(GiB)": 47.44, "step": 17465, "token_acc": 0.831107903284013, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.8124188002639082, "grad_norm": 6.167802810668945, "learning_rate": 9.313620833555742e-07, "loss": 0.7212738990783691, "memory(GiB)": 47.44, "step": 17470, "token_acc": 0.8365045806906272, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.8126513185238579, "grad_norm": 9.171575546264648, "learning_rate": 9.29128450853981e-07, "loss": 0.6452064037322998, "memory(GiB)": 47.44, "step": 17475, "token_acc": 0.8430935709739019, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.8128838367838074, "grad_norm": 7.961929798126221, "learning_rate": 9.268972255844217e-07, "loss": 0.5232569217681885, "memory(GiB)": 47.44, "step": 17480, "token_acc": 0.8704663212435233, "train_speed(iter/s)": 0.095828 }, { "epoch": 0.813116355043757, "grad_norm": 7.137867450714111, "learning_rate": 9.246684088662861e-07, "loss": 0.5751725673675537, "memory(GiB)": 47.44, "step": 17485, "token_acc": 0.8549962434259955, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.8133488733037066, "grad_norm": 10.85989761352539, "learning_rate": 9.22442002017544e-07, "loss": 0.5673013210296631, "memory(GiB)": 47.44, "step": 17490, "token_acc": 0.8622823984526112, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.8135813915636563, "grad_norm": 9.529802322387695, "learning_rate": 9.202180063547395e-07, "loss": 0.7671789169311524, "memory(GiB)": 47.44, "step": 17495, "token_acc": 0.8173950670705322, "train_speed(iter/s)": 0.095875 }, { "epoch": 0.8138139098236058, "grad_norm": 8.780619621276855, "learning_rate": 9.179964231929878e-07, "loss": 0.6349299907684326, "memory(GiB)": 47.44, "step": 17500, "token_acc": 0.8404255319148937, "train_speed(iter/s)": 0.09589 }, { "epoch": 0.8138139098236058, "eval_loss": 0.5591466426849365, "eval_runtime": 295.2355, "eval_samples_per_second": 11.77, "eval_steps_per_second": 11.77, "step": 17500 }, { "epoch": 0.8140464280835554, "grad_norm": 7.224985599517822, "learning_rate": 9.157772538459802e-07, "loss": 0.6598941802978515, "memory(GiB)": 47.44, "step": 17505, "token_acc": 0.8337386284812196, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.8142789463435051, "grad_norm": 11.278789520263672, "learning_rate": 9.135604996259806e-07, "loss": 0.7177794933319092, "memory(GiB)": 47.44, "step": 17510, "token_acc": 0.8180579216354344, "train_speed(iter/s)": 0.095765 }, { "epoch": 0.8145114646034547, "grad_norm": 11.014581680297852, "learning_rate": 9.113461618438251e-07, "loss": 0.5065845012664795, "memory(GiB)": 47.44, "step": 17515, "token_acc": 0.8631626642501132, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.8147439828634042, "grad_norm": 7.524247646331787, "learning_rate": 9.091342418089178e-07, "loss": 0.7104159355163574, "memory(GiB)": 47.44, "step": 17520, "token_acc": 0.8215931533903884, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.8149765011233538, "grad_norm": 8.67385196685791, "learning_rate": 9.069247408292375e-07, "loss": 0.587507963180542, "memory(GiB)": 47.44, "step": 17525, "token_acc": 0.8514241724403387, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.8152090193833035, "grad_norm": 11.212056159973145, "learning_rate": 9.047176602113278e-07, "loss": 0.6599641799926758, "memory(GiB)": 47.44, "step": 17530, "token_acc": 0.8254988163679405, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.815441537643253, "grad_norm": 10.266169548034668, "learning_rate": 9.025130012603068e-07, "loss": 0.5785196781158447, "memory(GiB)": 47.44, "step": 17535, "token_acc": 0.8489553924336533, "train_speed(iter/s)": 0.095841 }, { "epoch": 0.8156740559032026, "grad_norm": 10.367372512817383, "learning_rate": 9.003107652798542e-07, "loss": 0.699970293045044, "memory(GiB)": 47.44, "step": 17540, "token_acc": 0.8289133247089263, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.8159065741631523, "grad_norm": 8.538458824157715, "learning_rate": 8.981109535722215e-07, "loss": 0.7431015491485595, "memory(GiB)": 47.44, "step": 17545, "token_acc": 0.8228647391159853, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.8161390924231019, "grad_norm": 8.287872314453125, "learning_rate": 8.959135674382258e-07, "loss": 0.5843753337860107, "memory(GiB)": 47.44, "step": 17550, "token_acc": 0.8404864091559371, "train_speed(iter/s)": 0.095887 }, { "epoch": 0.8161390924231019, "eval_loss": 0.5593584775924683, "eval_runtime": 297.4667, "eval_samples_per_second": 11.682, "eval_steps_per_second": 11.682, "step": 17550 }, { "epoch": 0.8163716106830514, "grad_norm": 7.792442321777344, "learning_rate": 8.937186081772498e-07, "loss": 0.5183809757232666, "memory(GiB)": 47.44, "step": 17555, "token_acc": 0.8351830272833206, "train_speed(iter/s)": 0.095747 }, { "epoch": 0.816604128943001, "grad_norm": 9.457621574401855, "learning_rate": 8.915260770872386e-07, "loss": 0.5537750720977783, "memory(GiB)": 47.44, "step": 17560, "token_acc": 0.8717538953256093, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.8168366472029507, "grad_norm": 9.286090850830078, "learning_rate": 8.893359754647063e-07, "loss": 0.6299626350402832, "memory(GiB)": 47.44, "step": 17565, "token_acc": 0.8482068390325271, "train_speed(iter/s)": 0.095776 }, { "epoch": 0.8170691654629002, "grad_norm": 10.740191459655762, "learning_rate": 8.871483046047247e-07, "loss": 0.7528077125549316, "memory(GiB)": 47.44, "step": 17570, "token_acc": 0.8146027201145312, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.8173016837228498, "grad_norm": 8.127224922180176, "learning_rate": 8.849630658009333e-07, "loss": 0.6412830829620362, "memory(GiB)": 47.44, "step": 17575, "token_acc": 0.8376005852231163, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.8175342019827995, "grad_norm": 9.804791450500488, "learning_rate": 8.827802603455293e-07, "loss": 0.6093309402465821, "memory(GiB)": 47.44, "step": 17580, "token_acc": 0.8454398708635997, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.8177667202427491, "grad_norm": 9.034673690795898, "learning_rate": 8.805998895292745e-07, "loss": 0.6556478023529053, "memory(GiB)": 47.44, "step": 17585, "token_acc": 0.8411873840445269, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.8179992385026986, "grad_norm": 6.9533491134643555, "learning_rate": 8.78421954641489e-07, "loss": 0.6458122253417968, "memory(GiB)": 47.44, "step": 17590, "token_acc": 0.8348040945993647, "train_speed(iter/s)": 0.095853 }, { "epoch": 0.8182317567626483, "grad_norm": 6.9696478843688965, "learning_rate": 8.76246456970054e-07, "loss": 0.5575369834899903, "memory(GiB)": 47.44, "step": 17595, "token_acc": 0.8447592067988668, "train_speed(iter/s)": 0.095867 }, { "epoch": 0.8184642750225979, "grad_norm": 9.378483772277832, "learning_rate": 8.740733978014065e-07, "loss": 0.5304455757141113, "memory(GiB)": 47.44, "step": 17600, "token_acc": 0.8685944363103953, "train_speed(iter/s)": 0.095882 }, { "epoch": 0.8184642750225979, "eval_loss": 0.5591949820518494, "eval_runtime": 297.3176, "eval_samples_per_second": 11.688, "eval_steps_per_second": 11.688, "step": 17600 }, { "epoch": 0.8186967932825475, "grad_norm": 7.968575954437256, "learning_rate": 8.719027784205458e-07, "loss": 0.5970022678375244, "memory(GiB)": 47.44, "step": 17605, "token_acc": 0.8342140503091197, "train_speed(iter/s)": 0.095742 }, { "epoch": 0.818929311542497, "grad_norm": 8.821393013000488, "learning_rate": 8.697346001110235e-07, "loss": 0.5827204704284668, "memory(GiB)": 47.44, "step": 17610, "token_acc": 0.8548732050106935, "train_speed(iter/s)": 0.095757 }, { "epoch": 0.8191618298024467, "grad_norm": 9.860644340515137, "learning_rate": 8.675688641549529e-07, "loss": 0.6527186393737793, "memory(GiB)": 47.44, "step": 17615, "token_acc": 0.8406456953642384, "train_speed(iter/s)": 0.095772 }, { "epoch": 0.8193943480623963, "grad_norm": 10.435001373291016, "learning_rate": 8.654055718329979e-07, "loss": 0.7885581493377686, "memory(GiB)": 47.44, "step": 17620, "token_acc": 0.7981525829627095, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.8196268663223458, "grad_norm": 7.4888386726379395, "learning_rate": 8.632447244243814e-07, "loss": 0.689478588104248, "memory(GiB)": 47.44, "step": 17625, "token_acc": 0.8370837083708371, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.8198593845822955, "grad_norm": 10.054991722106934, "learning_rate": 8.610863232068795e-07, "loss": 0.5457469940185546, "memory(GiB)": 47.44, "step": 17630, "token_acc": 0.8628948281846581, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.8200919028422451, "grad_norm": 7.449556827545166, "learning_rate": 8.589303694568213e-07, "loss": 0.6463999271392822, "memory(GiB)": 47.44, "step": 17635, "token_acc": 0.8249056603773585, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.8203244211021947, "grad_norm": 9.351618766784668, "learning_rate": 8.567768644490898e-07, "loss": 0.5305264472961426, "memory(GiB)": 47.44, "step": 17640, "token_acc": 0.8620582765034098, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.8205569393621442, "grad_norm": 7.768716335296631, "learning_rate": 8.54625809457117e-07, "loss": 0.6108527183532715, "memory(GiB)": 47.44, "step": 17645, "token_acc": 0.8543071161048689, "train_speed(iter/s)": 0.095864 }, { "epoch": 0.8207894576220939, "grad_norm": 7.626466274261475, "learning_rate": 8.524772057528902e-07, "loss": 0.618977403640747, "memory(GiB)": 47.44, "step": 17650, "token_acc": 0.8455143747835123, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.8207894576220939, "eval_loss": 0.5600150227546692, "eval_runtime": 292.3615, "eval_samples_per_second": 11.886, "eval_steps_per_second": 11.886, "step": 17650 }, { "epoch": 0.8210219758820435, "grad_norm": 13.730313301086426, "learning_rate": 8.503310546069421e-07, "loss": 0.600614595413208, "memory(GiB)": 47.44, "step": 17655, "token_acc": 0.8345608248743155, "train_speed(iter/s)": 0.095742 }, { "epoch": 0.8212544941419931, "grad_norm": 7.256465911865234, "learning_rate": 8.48187357288362e-07, "loss": 0.7289290428161621, "memory(GiB)": 47.44, "step": 17660, "token_acc": 0.8085688240656336, "train_speed(iter/s)": 0.095758 }, { "epoch": 0.8214870124019427, "grad_norm": 7.9318437576293945, "learning_rate": 8.460461150647809e-07, "loss": 0.6045058250427247, "memory(GiB)": 47.44, "step": 17665, "token_acc": 0.8450986952157913, "train_speed(iter/s)": 0.095773 }, { "epoch": 0.8217195306618923, "grad_norm": 6.51909875869751, "learning_rate": 8.439073292023831e-07, "loss": 0.671275281906128, "memory(GiB)": 47.44, "step": 17670, "token_acc": 0.8170532505717086, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.8219520489218419, "grad_norm": 8.830435752868652, "learning_rate": 8.41771000965898e-07, "loss": 0.6234053611755371, "memory(GiB)": 47.44, "step": 17675, "token_acc": 0.8314095837705492, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.8221845671817914, "grad_norm": 10.897567749023438, "learning_rate": 8.396371316186041e-07, "loss": 0.594688892364502, "memory(GiB)": 47.44, "step": 17680, "token_acc": 0.8461187214611872, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.8224170854417411, "grad_norm": 8.042645454406738, "learning_rate": 8.375057224223221e-07, "loss": 0.7245099067687988, "memory(GiB)": 47.44, "step": 17685, "token_acc": 0.833642089662838, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.8226496037016907, "grad_norm": 10.030806541442871, "learning_rate": 8.353767746374225e-07, "loss": 0.6409588813781738, "memory(GiB)": 47.44, "step": 17690, "token_acc": 0.8301170515659602, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.8228821219616403, "grad_norm": 8.641294479370117, "learning_rate": 8.332502895228145e-07, "loss": 0.771269416809082, "memory(GiB)": 47.44, "step": 17695, "token_acc": 0.804416403785489, "train_speed(iter/s)": 0.095864 }, { "epoch": 0.8231146402215899, "grad_norm": 8.572598457336426, "learning_rate": 8.311262683359583e-07, "loss": 0.6635285377502441, "memory(GiB)": 47.44, "step": 17700, "token_acc": 0.8299155609167672, "train_speed(iter/s)": 0.095879 }, { "epoch": 0.8231146402215899, "eval_loss": 0.5600427985191345, "eval_runtime": 295.028, "eval_samples_per_second": 11.779, "eval_steps_per_second": 11.779, "step": 17700 }, { "epoch": 0.8233471584815395, "grad_norm": 7.2865777015686035, "learning_rate": 8.290047123328493e-07, "loss": 0.6963011264801026, "memory(GiB)": 47.44, "step": 17705, "token_acc": 0.8337467003740241, "train_speed(iter/s)": 0.09574 }, { "epoch": 0.8235796767414891, "grad_norm": 8.243517875671387, "learning_rate": 8.26885622768031e-07, "loss": 0.7035470962524414, "memory(GiB)": 47.44, "step": 17710, "token_acc": 0.829205807002562, "train_speed(iter/s)": 0.095755 }, { "epoch": 0.8238121950014387, "grad_norm": 7.2582173347473145, "learning_rate": 8.247690008945869e-07, "loss": 0.6932243347167969, "memory(GiB)": 47.44, "step": 17715, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.09577 }, { "epoch": 0.8240447132613883, "grad_norm": 6.59385871887207, "learning_rate": 8.226548479641411e-07, "loss": 0.6206794738769531, "memory(GiB)": 47.44, "step": 17720, "token_acc": 0.8364427860696517, "train_speed(iter/s)": 0.095786 }, { "epoch": 0.8242772315213379, "grad_norm": 9.237839698791504, "learning_rate": 8.205431652268559e-07, "loss": 0.5992973804473877, "memory(GiB)": 47.44, "step": 17725, "token_acc": 0.8515418502202643, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.8245097497812875, "grad_norm": 8.55127239227295, "learning_rate": 8.184339539314362e-07, "loss": 0.5941375255584717, "memory(GiB)": 47.44, "step": 17730, "token_acc": 0.8611875737318129, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.8247422680412371, "grad_norm": 8.08411979675293, "learning_rate": 8.163272153251222e-07, "loss": 0.5708928108215332, "memory(GiB)": 47.44, "step": 17735, "token_acc": 0.8621107966033158, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.8249747863011867, "grad_norm": 8.919515609741211, "learning_rate": 8.142229506536952e-07, "loss": 0.5553316593170166, "memory(GiB)": 47.44, "step": 17740, "token_acc": 0.8557428459427675, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.8252073045611363, "grad_norm": 8.339323043823242, "learning_rate": 8.121211611614699e-07, "loss": 0.6758771896362304, "memory(GiB)": 47.44, "step": 17745, "token_acc": 0.8269445478228508, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.825439822821086, "grad_norm": 9.175220489501953, "learning_rate": 8.100218480913002e-07, "loss": 0.7087527751922608, "memory(GiB)": 47.44, "step": 17750, "token_acc": 0.8446563369090186, "train_speed(iter/s)": 0.095876 }, { "epoch": 0.825439822821086, "eval_loss": 0.5590119957923889, "eval_runtime": 297.7387, "eval_samples_per_second": 11.671, "eval_steps_per_second": 11.671, "step": 17750 }, { "epoch": 0.8256723410810355, "grad_norm": 6.341367721557617, "learning_rate": 8.079250126845745e-07, "loss": 0.5751936435699463, "memory(GiB)": 47.44, "step": 17755, "token_acc": 0.8348185055654035, "train_speed(iter/s)": 0.095737 }, { "epoch": 0.8259048593409851, "grad_norm": 10.396602630615234, "learning_rate": 8.058306561812168e-07, "loss": 0.6061834335327149, "memory(GiB)": 47.44, "step": 17760, "token_acc": 0.8371559633027523, "train_speed(iter/s)": 0.095752 }, { "epoch": 0.8261373776009348, "grad_norm": 12.562670707702637, "learning_rate": 8.03738779819686e-07, "loss": 0.610113000869751, "memory(GiB)": 47.44, "step": 17765, "token_acc": 0.8281904761904761, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.8263698958608843, "grad_norm": 11.741593360900879, "learning_rate": 8.016493848369711e-07, "loss": 0.7463398456573487, "memory(GiB)": 47.44, "step": 17770, "token_acc": 0.8205345778532033, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.8266024141208339, "grad_norm": 8.214224815368652, "learning_rate": 7.995624724685969e-07, "loss": 0.6975039958953857, "memory(GiB)": 47.44, "step": 17775, "token_acc": 0.8409549428379287, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.8268349323807835, "grad_norm": 10.039314270019531, "learning_rate": 7.97478043948618e-07, "loss": 0.649553918838501, "memory(GiB)": 47.44, "step": 17780, "token_acc": 0.8373221216041398, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.8270674506407332, "grad_norm": 7.765139102935791, "learning_rate": 7.953961005096234e-07, "loss": 0.795719051361084, "memory(GiB)": 47.44, "step": 17785, "token_acc": 0.7985458951832778, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.8272999689006827, "grad_norm": 7.198999881744385, "learning_rate": 7.933166433827277e-07, "loss": 0.5839630603790283, "memory(GiB)": 47.44, "step": 17790, "token_acc": 0.8532955350815025, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.8275324871606323, "grad_norm": 9.437348365783691, "learning_rate": 7.912396737975803e-07, "loss": 0.5713480472564697, "memory(GiB)": 47.44, "step": 17795, "token_acc": 0.8559150657229525, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.827765005420582, "grad_norm": 9.003602027893066, "learning_rate": 7.891651929823562e-07, "loss": 0.6855646133422851, "memory(GiB)": 47.44, "step": 17800, "token_acc": 0.8108952116585705, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.827765005420582, "eval_loss": 0.5586514472961426, "eval_runtime": 295.621, "eval_samples_per_second": 11.755, "eval_steps_per_second": 11.755, "step": 17800 }, { "epoch": 0.8279975236805316, "grad_norm": 10.380149841308594, "learning_rate": 7.870932021637622e-07, "loss": 0.6497800350189209, "memory(GiB)": 47.44, "step": 17805, "token_acc": 0.8343071185029853, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.8282300419404811, "grad_norm": 8.715871810913086, "learning_rate": 7.85023702567027e-07, "loss": 0.6714212417602539, "memory(GiB)": 47.44, "step": 17810, "token_acc": 0.8406572411157814, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.8284625602004307, "grad_norm": 7.516015529632568, "learning_rate": 7.829566954159135e-07, "loss": 0.5611215114593506, "memory(GiB)": 47.44, "step": 17815, "token_acc": 0.8698140200286123, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.8286950784603804, "grad_norm": 11.052634239196777, "learning_rate": 7.808921819327025e-07, "loss": 0.6710268497467041, "memory(GiB)": 47.44, "step": 17820, "token_acc": 0.8282070517629407, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.8289275967203299, "grad_norm": 7.307531356811523, "learning_rate": 7.788301633382089e-07, "loss": 0.6433767795562744, "memory(GiB)": 47.44, "step": 17825, "token_acc": 0.8330897398421514, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.8291601149802795, "grad_norm": 13.037498474121094, "learning_rate": 7.767706408517628e-07, "loss": 0.6921501159667969, "memory(GiB)": 47.44, "step": 17830, "token_acc": 0.8359046283309958, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.8293926332402292, "grad_norm": 8.326820373535156, "learning_rate": 7.747136156912294e-07, "loss": 0.7372483253479004, "memory(GiB)": 47.44, "step": 17835, "token_acc": 0.8138500635324015, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.8296251515001788, "grad_norm": 8.070684432983398, "learning_rate": 7.726590890729868e-07, "loss": 0.5706949234008789, "memory(GiB)": 47.44, "step": 17840, "token_acc": 0.8526561977948547, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.8298576697601283, "grad_norm": 9.806997299194336, "learning_rate": 7.706070622119433e-07, "loss": 0.5731672286987305, "memory(GiB)": 47.44, "step": 17845, "token_acc": 0.8651063829787234, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.830090188020078, "grad_norm": 8.055378913879395, "learning_rate": 7.68557536321522e-07, "loss": 0.7135499477386474, "memory(GiB)": 47.44, "step": 17850, "token_acc": 0.8112104539202201, "train_speed(iter/s)": 0.095872 }, { "epoch": 0.830090188020078, "eval_loss": 0.5592818856239319, "eval_runtime": 293.2615, "eval_samples_per_second": 11.849, "eval_steps_per_second": 11.849, "step": 17850 }, { "epoch": 0.8303227062800276, "grad_norm": 9.667156219482422, "learning_rate": 7.66510512613674e-07, "loss": 0.5140660285949707, "memory(GiB)": 47.44, "step": 17855, "token_acc": 0.8352633724136278, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.8305552245399772, "grad_norm": 7.986255645751953, "learning_rate": 7.644659922988657e-07, "loss": 0.5835517406463623, "memory(GiB)": 47.44, "step": 17860, "token_acc": 0.8530421216848674, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.8307877427999267, "grad_norm": 8.109107971191406, "learning_rate": 7.624239765860858e-07, "loss": 0.5663367748260498, "memory(GiB)": 47.44, "step": 17865, "token_acc": 0.8564120054570259, "train_speed(iter/s)": 0.095766 }, { "epoch": 0.8310202610598764, "grad_norm": 10.03013801574707, "learning_rate": 7.603844666828408e-07, "loss": 0.6867617130279541, "memory(GiB)": 47.44, "step": 17870, "token_acc": 0.839731643682445, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.831252779319826, "grad_norm": 8.078845977783203, "learning_rate": 7.583474637951577e-07, "loss": 0.6239857196807861, "memory(GiB)": 47.44, "step": 17875, "token_acc": 0.8506363027461487, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.8314852975797755, "grad_norm": 8.256733894348145, "learning_rate": 7.563129691275767e-07, "loss": 0.6100050449371338, "memory(GiB)": 47.44, "step": 17880, "token_acc": 0.8483475479744137, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.8317178158397251, "grad_norm": 7.669572830200195, "learning_rate": 7.542809838831583e-07, "loss": 0.7314640045166015, "memory(GiB)": 47.44, "step": 17885, "token_acc": 0.8145631067961165, "train_speed(iter/s)": 0.095828 }, { "epoch": 0.8319503340996748, "grad_norm": 7.9997968673706055, "learning_rate": 7.522515092634791e-07, "loss": 0.7675912380218506, "memory(GiB)": 47.44, "step": 17890, "token_acc": 0.8113207547169812, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.8321828523596244, "grad_norm": 7.206613063812256, "learning_rate": 7.502245464686286e-07, "loss": 0.5973569869995117, "memory(GiB)": 47.44, "step": 17895, "token_acc": 0.8514957264957265, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.8324153706195739, "grad_norm": 8.209953308105469, "learning_rate": 7.482000966972141e-07, "loss": 0.5934009075164794, "memory(GiB)": 47.44, "step": 17900, "token_acc": 0.8530659467797918, "train_speed(iter/s)": 0.095874 }, { "epoch": 0.8324153706195739, "eval_loss": 0.5590567588806152, "eval_runtime": 296.3204, "eval_samples_per_second": 11.727, "eval_steps_per_second": 11.727, "step": 17900 }, { "epoch": 0.8326478888795236, "grad_norm": 6.786571502685547, "learning_rate": 7.461781611463531e-07, "loss": 0.6626224994659424, "memory(GiB)": 47.44, "step": 17905, "token_acc": 0.8339872506028507, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.8328804071394732, "grad_norm": 9.307394981384277, "learning_rate": 7.441587410116796e-07, "loss": 0.696648359298706, "memory(GiB)": 47.44, "step": 17910, "token_acc": 0.8244766505636071, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.8331129253994227, "grad_norm": 7.47057580947876, "learning_rate": 7.42141837487339e-07, "loss": 0.48722333908081056, "memory(GiB)": 47.44, "step": 17915, "token_acc": 0.8612959719789842, "train_speed(iter/s)": 0.095766 }, { "epoch": 0.8333454436593724, "grad_norm": 11.0787935256958, "learning_rate": 7.401274517659901e-07, "loss": 0.672635269165039, "memory(GiB)": 47.44, "step": 17920, "token_acc": 0.8394547519878833, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.833577961919322, "grad_norm": 8.495865821838379, "learning_rate": 7.381155850387988e-07, "loss": 0.6448636531829834, "memory(GiB)": 47.44, "step": 17925, "token_acc": 0.8262056414922657, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.8338104801792716, "grad_norm": 6.004539966583252, "learning_rate": 7.36106238495447e-07, "loss": 0.7003479480743409, "memory(GiB)": 47.44, "step": 17930, "token_acc": 0.818907697221335, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.8340429984392211, "grad_norm": 10.699225425720215, "learning_rate": 7.340994133241197e-07, "loss": 0.6735119819641113, "memory(GiB)": 47.44, "step": 17935, "token_acc": 0.8234676007005254, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.8342755166991708, "grad_norm": 13.282435417175293, "learning_rate": 7.320951107115182e-07, "loss": 0.7957521438598633, "memory(GiB)": 47.44, "step": 17940, "token_acc": 0.8079071766222604, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.8345080349591204, "grad_norm": 7.707286357879639, "learning_rate": 7.30093331842845e-07, "loss": 0.673992919921875, "memory(GiB)": 47.44, "step": 17945, "token_acc": 0.839835728952772, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.83474055321907, "grad_norm": 12.175082206726074, "learning_rate": 7.280940779018158e-07, "loss": 0.5859807968139649, "memory(GiB)": 47.44, "step": 17950, "token_acc": 0.8603807796917498, "train_speed(iter/s)": 0.09587 }, { "epoch": 0.83474055321907, "eval_loss": 0.5583511590957642, "eval_runtime": 291.943, "eval_samples_per_second": 11.903, "eval_steps_per_second": 11.903, "step": 17950 }, { "epoch": 0.8349730714790196, "grad_norm": 8.121362686157227, "learning_rate": 7.260973500706514e-07, "loss": 0.602592658996582, "memory(GiB)": 47.44, "step": 17955, "token_acc": 0.8347663491657611, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.8352055897389692, "grad_norm": 13.328322410583496, "learning_rate": 7.241031495300788e-07, "loss": 0.7775434970855712, "memory(GiB)": 47.44, "step": 17960, "token_acc": 0.8044280442804428, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.8354381079989188, "grad_norm": 8.016280174255371, "learning_rate": 7.221114774593291e-07, "loss": 0.6770434379577637, "memory(GiB)": 47.44, "step": 17965, "token_acc": 0.8331099195710456, "train_speed(iter/s)": 0.095766 }, { "epoch": 0.8356706262588683, "grad_norm": 9.899717330932617, "learning_rate": 7.201223350361408e-07, "loss": 0.572743272781372, "memory(GiB)": 47.44, "step": 17970, "token_acc": 0.8537030280919372, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.835903144518818, "grad_norm": 9.240289688110352, "learning_rate": 7.181357234367531e-07, "loss": 0.628160047531128, "memory(GiB)": 47.44, "step": 17975, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.8361356627787676, "grad_norm": 9.355088233947754, "learning_rate": 7.16151643835914e-07, "loss": 0.8679290771484375, "memory(GiB)": 47.44, "step": 17980, "token_acc": 0.7922360248447204, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.8363681810387172, "grad_norm": 9.195143699645996, "learning_rate": 7.141700974068678e-07, "loss": 0.6345166206359864, "memory(GiB)": 47.44, "step": 17985, "token_acc": 0.8389312977099237, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.8366006992986668, "grad_norm": 7.399878025054932, "learning_rate": 7.121910853213654e-07, "loss": 0.6902605533599854, "memory(GiB)": 47.44, "step": 17990, "token_acc": 0.8341933264533884, "train_speed(iter/s)": 0.095841 }, { "epoch": 0.8368332175586164, "grad_norm": 9.452262878417969, "learning_rate": 7.102146087496576e-07, "loss": 0.6338638305664063, "memory(GiB)": 47.44, "step": 17995, "token_acc": 0.8394355453852022, "train_speed(iter/s)": 0.095856 }, { "epoch": 0.837065735818566, "grad_norm": 9.153210639953613, "learning_rate": 7.082406688604981e-07, "loss": 0.612544584274292, "memory(GiB)": 47.44, "step": 18000, "token_acc": 0.8403565640194489, "train_speed(iter/s)": 0.095871 }, { "epoch": 0.837065735818566, "eval_loss": 0.5581134557723999, "eval_runtime": 293.9767, "eval_samples_per_second": 11.821, "eval_steps_per_second": 11.821, "step": 18000 }, { "epoch": 0.8372982540785157, "grad_norm": 9.343351364135742, "learning_rate": 7.062692668211351e-07, "loss": 0.7623269557952881, "memory(GiB)": 47.44, "step": 18005, "token_acc": 0.8332722258532465, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8375307723384652, "grad_norm": 9.574581146240234, "learning_rate": 7.043004037973222e-07, "loss": 0.627281904220581, "memory(GiB)": 47.44, "step": 18010, "token_acc": 0.831611174894757, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.8377632905984148, "grad_norm": 8.998746871948242, "learning_rate": 7.023340809533064e-07, "loss": 0.6361284255981445, "memory(GiB)": 47.44, "step": 18015, "token_acc": 0.8403451995685005, "train_speed(iter/s)": 0.095765 }, { "epoch": 0.8379958088583644, "grad_norm": 8.098225593566895, "learning_rate": 7.003702994518369e-07, "loss": 0.5176102161407471, "memory(GiB)": 47.44, "step": 18020, "token_acc": 0.8788109756097561, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.838228327118314, "grad_norm": 6.4128007888793945, "learning_rate": 6.984090604541588e-07, "loss": 0.6634963512420654, "memory(GiB)": 47.44, "step": 18025, "token_acc": 0.8323624595469256, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.8384608453782636, "grad_norm": 10.500700950622559, "learning_rate": 6.964503651200111e-07, "loss": 0.6115920543670654, "memory(GiB)": 47.44, "step": 18030, "token_acc": 0.8481152993348116, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.8386933636382132, "grad_norm": 9.047125816345215, "learning_rate": 6.944942146076323e-07, "loss": 0.5878973484039307, "memory(GiB)": 47.44, "step": 18035, "token_acc": 0.8544600938967136, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.8389258818981629, "grad_norm": 8.661657333374023, "learning_rate": 6.925406100737542e-07, "loss": 0.5965959548950195, "memory(GiB)": 47.44, "step": 18040, "token_acc": 0.8525943396226415, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.8391584001581124, "grad_norm": 8.694890975952148, "learning_rate": 6.905895526736051e-07, "loss": 0.6183953285217285, "memory(GiB)": 47.44, "step": 18045, "token_acc": 0.8443489755452743, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.839390918418062, "grad_norm": 8.679917335510254, "learning_rate": 6.886410435609025e-07, "loss": 0.5601376056671142, "memory(GiB)": 47.44, "step": 18050, "token_acc": 0.8597122302158273, "train_speed(iter/s)": 0.095869 }, { "epoch": 0.839390918418062, "eval_loss": 0.5584567785263062, "eval_runtime": 292.551, "eval_samples_per_second": 11.878, "eval_steps_per_second": 11.878, "step": 18050 }, { "epoch": 0.8396234366780116, "grad_norm": 8.077510833740234, "learning_rate": 6.866950838878628e-07, "loss": 0.5764139652252197, "memory(GiB)": 47.44, "step": 18055, "token_acc": 0.8349615180680303, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.8398559549379612, "grad_norm": 10.266379356384277, "learning_rate": 6.847516748051897e-07, "loss": 0.6612685680389404, "memory(GiB)": 47.44, "step": 18060, "token_acc": 0.8329888383629599, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.8400884731979108, "grad_norm": 9.361281394958496, "learning_rate": 6.828108174620835e-07, "loss": 0.655527400970459, "memory(GiB)": 47.44, "step": 18065, "token_acc": 0.8432756794917049, "train_speed(iter/s)": 0.095766 }, { "epoch": 0.8403209914578604, "grad_norm": 7.153183937072754, "learning_rate": 6.808725130062299e-07, "loss": 0.603009843826294, "memory(GiB)": 47.44, "step": 18070, "token_acc": 0.8513646826701743, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.8405535097178101, "grad_norm": 8.630191802978516, "learning_rate": 6.789367625838106e-07, "loss": 0.6014632225036621, "memory(GiB)": 47.44, "step": 18075, "token_acc": 0.8404099560761347, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.8407860279777596, "grad_norm": 6.671989440917969, "learning_rate": 6.770035673394931e-07, "loss": 0.5919332504272461, "memory(GiB)": 47.44, "step": 18080, "token_acc": 0.8455786736020806, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.8410185462377092, "grad_norm": 8.644976615905762, "learning_rate": 6.750729284164381e-07, "loss": 0.6521486282348633, "memory(GiB)": 47.44, "step": 18085, "token_acc": 0.8404094010614102, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.8412510644976589, "grad_norm": 8.989290237426758, "learning_rate": 6.731448469562885e-07, "loss": 0.5394969463348389, "memory(GiB)": 47.44, "step": 18090, "token_acc": 0.8701393983859135, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.8414835827576085, "grad_norm": 14.609663963317871, "learning_rate": 6.71219324099181e-07, "loss": 0.6068337917327881, "memory(GiB)": 47.44, "step": 18095, "token_acc": 0.8634204275534442, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.841716101017558, "grad_norm": 8.150311470031738, "learning_rate": 6.69296360983735e-07, "loss": 0.6084172248840332, "memory(GiB)": 47.44, "step": 18100, "token_acc": 0.8488570453770045, "train_speed(iter/s)": 0.095869 }, { "epoch": 0.841716101017558, "eval_loss": 0.5587973594665527, "eval_runtime": 292.6113, "eval_samples_per_second": 11.876, "eval_steps_per_second": 11.876, "step": 18100 }, { "epoch": 0.8419486192775076, "grad_norm": 9.314252853393555, "learning_rate": 6.673759587470596e-07, "loss": 0.7102108001708984, "memory(GiB)": 47.44, "step": 18105, "token_acc": 0.8340893328337625, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8421811375374573, "grad_norm": 10.721854209899902, "learning_rate": 6.65458118524746e-07, "loss": 0.5791438579559326, "memory(GiB)": 47.44, "step": 18110, "token_acc": 0.8577524893314367, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.8424136557974068, "grad_norm": 9.090928077697754, "learning_rate": 6.635428414508738e-07, "loss": 0.789949607849121, "memory(GiB)": 47.44, "step": 18115, "token_acc": 0.7949775112443778, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.8426461740573564, "grad_norm": 5.7705583572387695, "learning_rate": 6.616301286580046e-07, "loss": 0.7491995811462402, "memory(GiB)": 47.44, "step": 18120, "token_acc": 0.8068391866913124, "train_speed(iter/s)": 0.095778 }, { "epoch": 0.842878692317306, "grad_norm": 7.779153347015381, "learning_rate": 6.59719981277186e-07, "loss": 0.7720121383666992, "memory(GiB)": 47.44, "step": 18125, "token_acc": 0.8114950393431406, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.8431112105772557, "grad_norm": 10.119519233703613, "learning_rate": 6.578124004379449e-07, "loss": 0.6325667381286622, "memory(GiB)": 47.44, "step": 18130, "token_acc": 0.8481435127242386, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.8433437288372052, "grad_norm": 9.072452545166016, "learning_rate": 6.559073872682953e-07, "loss": 0.7809335231781006, "memory(GiB)": 47.44, "step": 18135, "token_acc": 0.8059299191374663, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.8435762470971548, "grad_norm": 8.513379096984863, "learning_rate": 6.540049428947276e-07, "loss": 0.6669524669647217, "memory(GiB)": 47.44, "step": 18140, "token_acc": 0.835621521335807, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.8438087653571045, "grad_norm": 7.087943077087402, "learning_rate": 6.521050684422187e-07, "loss": 0.5620086193084717, "memory(GiB)": 47.44, "step": 18145, "token_acc": 0.8636363636363636, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.8440412836170541, "grad_norm": 7.02882719039917, "learning_rate": 6.502077650342204e-07, "loss": 0.6077769279479981, "memory(GiB)": 47.44, "step": 18150, "token_acc": 0.8466188137514167, "train_speed(iter/s)": 0.095867 }, { "epoch": 0.8440412836170541, "eval_loss": 0.5583994388580322, "eval_runtime": 296.5266, "eval_samples_per_second": 11.719, "eval_steps_per_second": 11.719, "step": 18150 }, { "epoch": 0.8442738018770036, "grad_norm": 9.41649341583252, "learning_rate": 6.483130337926675e-07, "loss": 0.5184135913848877, "memory(GiB)": 47.44, "step": 18155, "token_acc": 0.83554784055548, "train_speed(iter/s)": 0.095731 }, { "epoch": 0.8445063201369533, "grad_norm": 9.200589179992676, "learning_rate": 6.464208758379736e-07, "loss": 0.6405242443084717, "memory(GiB)": 47.44, "step": 18160, "token_acc": 0.8416289592760181, "train_speed(iter/s)": 0.095746 }, { "epoch": 0.8447388383969029, "grad_norm": 6.7900872230529785, "learning_rate": 6.445312922890301e-07, "loss": 0.5888498306274415, "memory(GiB)": 47.44, "step": 18165, "token_acc": 0.8449640287769784, "train_speed(iter/s)": 0.09576 }, { "epoch": 0.8449713566568524, "grad_norm": 8.711414337158203, "learning_rate": 6.426442842632075e-07, "loss": 0.5557902336120606, "memory(GiB)": 47.44, "step": 18170, "token_acc": 0.8608058608058609, "train_speed(iter/s)": 0.095775 }, { "epoch": 0.845203874916802, "grad_norm": 7.435390949249268, "learning_rate": 6.407598528763492e-07, "loss": 0.6613565444946289, "memory(GiB)": 47.44, "step": 18175, "token_acc": 0.8336182336182336, "train_speed(iter/s)": 0.095789 }, { "epoch": 0.8454363931767517, "grad_norm": 8.605823516845703, "learning_rate": 6.388779992427796e-07, "loss": 0.6953454971313476, "memory(GiB)": 47.44, "step": 18180, "token_acc": 0.8309314586994727, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.8456689114367013, "grad_norm": 6.164673328399658, "learning_rate": 6.369987244752951e-07, "loss": 0.7036227226257324, "memory(GiB)": 47.44, "step": 18185, "token_acc": 0.8248962655601659, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.8459014296966508, "grad_norm": 11.071667671203613, "learning_rate": 6.351220296851701e-07, "loss": 0.5725512027740478, "memory(GiB)": 47.44, "step": 18190, "token_acc": 0.8512280701754386, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.8461339479566005, "grad_norm": 13.574658393859863, "learning_rate": 6.33247915982152e-07, "loss": 0.7959741115570068, "memory(GiB)": 47.44, "step": 18195, "token_acc": 0.7978000647039792, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.8463664662165501, "grad_norm": 6.559995174407959, "learning_rate": 6.313763844744636e-07, "loss": 0.6718171119689942, "memory(GiB)": 47.44, "step": 18200, "token_acc": 0.8321273516642547, "train_speed(iter/s)": 0.095862 }, { "epoch": 0.8463664662165501, "eval_loss": 0.5578708648681641, "eval_runtime": 290.8952, "eval_samples_per_second": 11.946, "eval_steps_per_second": 11.946, "step": 18200 }, { "epoch": 0.8465989844764996, "grad_norm": 8.805997848510742, "learning_rate": 6.295074362687959e-07, "loss": 0.6185959815979004, "memory(GiB)": 47.44, "step": 18205, "token_acc": 0.8348420808731294, "train_speed(iter/s)": 0.09573 }, { "epoch": 0.8468315027364492, "grad_norm": 9.289185523986816, "learning_rate": 6.276410724703191e-07, "loss": 0.6430187225341797, "memory(GiB)": 47.44, "step": 18210, "token_acc": 0.8297029702970297, "train_speed(iter/s)": 0.095745 }, { "epoch": 0.8470640209963989, "grad_norm": 8.73018741607666, "learning_rate": 6.25777294182669e-07, "loss": 0.7390905857086182, "memory(GiB)": 47.44, "step": 18215, "token_acc": 0.8018839258584017, "train_speed(iter/s)": 0.095759 }, { "epoch": 0.8472965392563485, "grad_norm": 7.182153224945068, "learning_rate": 6.239161025079577e-07, "loss": 0.7059009075164795, "memory(GiB)": 47.44, "step": 18220, "token_acc": 0.8162042875157629, "train_speed(iter/s)": 0.095774 }, { "epoch": 0.847529057516298, "grad_norm": 8.976435661315918, "learning_rate": 6.220574985467625e-07, "loss": 0.6841076850891114, "memory(GiB)": 47.44, "step": 18225, "token_acc": 0.8229736449527598, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.8477615757762477, "grad_norm": 7.973471164703369, "learning_rate": 6.202014833981351e-07, "loss": 0.5512941837310791, "memory(GiB)": 47.44, "step": 18230, "token_acc": 0.8602552131963896, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.8479940940361973, "grad_norm": 8.258837699890137, "learning_rate": 6.183480581595941e-07, "loss": 0.6314374923706054, "memory(GiB)": 47.44, "step": 18235, "token_acc": 0.8396159317211949, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.8482266122961469, "grad_norm": 9.106829643249512, "learning_rate": 6.164972239271288e-07, "loss": 0.6298882484436035, "memory(GiB)": 47.44, "step": 18240, "token_acc": 0.8451190065539841, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.8484591305560965, "grad_norm": 9.711416244506836, "learning_rate": 6.146489817951917e-07, "loss": 0.5575997829437256, "memory(GiB)": 47.44, "step": 18245, "token_acc": 0.8538461538461538, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.8486916488160461, "grad_norm": 7.332555294036865, "learning_rate": 6.128033328567079e-07, "loss": 0.5691449165344238, "memory(GiB)": 47.44, "step": 18250, "token_acc": 0.8534361851332398, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.8486916488160461, "eval_loss": 0.5585102438926697, "eval_runtime": 291.6188, "eval_samples_per_second": 11.916, "eval_steps_per_second": 11.916, "step": 18250 }, { "epoch": 0.8489241670759957, "grad_norm": 7.50723123550415, "learning_rate": 6.109602782030644e-07, "loss": 0.6675292015075683, "memory(GiB)": 47.44, "step": 18255, "token_acc": 0.8349338967316495, "train_speed(iter/s)": 0.095729 }, { "epoch": 0.8491566853359452, "grad_norm": 7.4508233070373535, "learning_rate": 6.091198189241182e-07, "loss": 0.6501538276672363, "memory(GiB)": 47.44, "step": 18260, "token_acc": 0.8348740835192859, "train_speed(iter/s)": 0.095744 }, { "epoch": 0.8493892035958949, "grad_norm": 11.345195770263672, "learning_rate": 6.072819561081883e-07, "loss": 0.7528467655181885, "memory(GiB)": 47.44, "step": 18265, "token_acc": 0.8289156626506025, "train_speed(iter/s)": 0.095758 }, { "epoch": 0.8496217218558445, "grad_norm": 9.892457962036133, "learning_rate": 6.054466908420604e-07, "loss": 0.7120511054992675, "memory(GiB)": 47.44, "step": 18270, "token_acc": 0.8325581395348837, "train_speed(iter/s)": 0.095773 }, { "epoch": 0.8498542401157941, "grad_norm": 9.037054061889648, "learning_rate": 6.036140242109834e-07, "loss": 0.6773629188537598, "memory(GiB)": 47.44, "step": 18275, "token_acc": 0.8391859537110934, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.8500867583757437, "grad_norm": 9.72851848602295, "learning_rate": 6.017839572986695e-07, "loss": 0.48967576026916504, "memory(GiB)": 47.44, "step": 18280, "token_acc": 0.8733153638814016, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.8503192766356933, "grad_norm": 11.348685264587402, "learning_rate": 5.999564911872952e-07, "loss": 0.6709898948669434, "memory(GiB)": 47.44, "step": 18285, "token_acc": 0.8287549054584374, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.8505517948956429, "grad_norm": 11.713817596435547, "learning_rate": 5.981316269574955e-07, "loss": 0.5737239360809326, "memory(GiB)": 47.44, "step": 18290, "token_acc": 0.8550358196375896, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.8507843131555926, "grad_norm": 8.777615547180176, "learning_rate": 5.963093656883706e-07, "loss": 0.5813944816589356, "memory(GiB)": 47.44, "step": 18295, "token_acc": 0.8532901833872708, "train_speed(iter/s)": 0.095847 }, { "epoch": 0.8510168314155421, "grad_norm": 9.691421508789062, "learning_rate": 5.944897084574786e-07, "loss": 0.5308315277099609, "memory(GiB)": 47.44, "step": 18300, "token_acc": 0.8565149136577708, "train_speed(iter/s)": 0.095861 }, { "epoch": 0.8510168314155421, "eval_loss": 0.5576656460762024, "eval_runtime": 292.1527, "eval_samples_per_second": 11.894, "eval_steps_per_second": 11.894, "step": 18300 }, { "epoch": 0.8512493496754917, "grad_norm": 9.326038360595703, "learning_rate": 5.926726563408402e-07, "loss": 0.6760049343109131, "memory(GiB)": 47.44, "step": 18305, "token_acc": 0.8342113442732699, "train_speed(iter/s)": 0.095729 }, { "epoch": 0.8514818679354413, "grad_norm": 5.767177104949951, "learning_rate": 5.908582104129329e-07, "loss": 0.7487131595611572, "memory(GiB)": 47.44, "step": 18310, "token_acc": 0.8074925816023739, "train_speed(iter/s)": 0.095744 }, { "epoch": 0.8517143861953909, "grad_norm": 7.889694690704346, "learning_rate": 5.890463717466954e-07, "loss": 0.7449358940124512, "memory(GiB)": 47.44, "step": 18315, "token_acc": 0.8118628359592215, "train_speed(iter/s)": 0.095758 }, { "epoch": 0.8519469044553405, "grad_norm": 7.965071678161621, "learning_rate": 5.872371414135241e-07, "loss": 0.6570825576782227, "memory(GiB)": 47.44, "step": 18320, "token_acc": 0.8311509303928325, "train_speed(iter/s)": 0.095773 }, { "epoch": 0.8521794227152901, "grad_norm": 10.874944686889648, "learning_rate": 5.854305204832733e-07, "loss": 0.7097513675689697, "memory(GiB)": 47.44, "step": 18325, "token_acc": 0.8337330135891287, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.8524119409752398, "grad_norm": 8.72624397277832, "learning_rate": 5.836265100242522e-07, "loss": 0.7030339241027832, "memory(GiB)": 47.44, "step": 18330, "token_acc": 0.8376623376623377, "train_speed(iter/s)": 0.095802 }, { "epoch": 0.8526444592351893, "grad_norm": 10.563741683959961, "learning_rate": 5.818251111032297e-07, "loss": 0.5798059463500976, "memory(GiB)": 47.44, "step": 18335, "token_acc": 0.8491039426523298, "train_speed(iter/s)": 0.095817 }, { "epoch": 0.8528769774951389, "grad_norm": 8.762702941894531, "learning_rate": 5.800263247854265e-07, "loss": 0.7150753021240235, "memory(GiB)": 47.44, "step": 18340, "token_acc": 0.8208005985783764, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.8531094957550885, "grad_norm": 8.711050987243652, "learning_rate": 5.782301521345224e-07, "loss": 0.5809697151184082, "memory(GiB)": 47.44, "step": 18345, "token_acc": 0.8659476117103235, "train_speed(iter/s)": 0.095845 }, { "epoch": 0.8533420140150381, "grad_norm": 9.37763500213623, "learning_rate": 5.764365942126482e-07, "loss": 0.6817828178405761, "memory(GiB)": 47.44, "step": 18350, "token_acc": 0.8369325694138386, "train_speed(iter/s)": 0.09586 }, { "epoch": 0.8533420140150381, "eval_loss": 0.5577992796897888, "eval_runtime": 294.47, "eval_samples_per_second": 11.801, "eval_steps_per_second": 11.801, "step": 18350 }, { "epoch": 0.8535745322749877, "grad_norm": 10.525474548339844, "learning_rate": 5.746456520803906e-07, "loss": 0.6407979488372803, "memory(GiB)": 47.44, "step": 18355, "token_acc": 0.83432, "train_speed(iter/s)": 0.095727 }, { "epoch": 0.8538070505349373, "grad_norm": 8.024861335754395, "learning_rate": 5.728573267967891e-07, "loss": 0.6994614601135254, "memory(GiB)": 47.44, "step": 18360, "token_acc": 0.8156079854809437, "train_speed(iter/s)": 0.095742 }, { "epoch": 0.854039568794887, "grad_norm": 7.524238109588623, "learning_rate": 5.710716194193367e-07, "loss": 0.6358412265777588, "memory(GiB)": 47.44, "step": 18365, "token_acc": 0.8227394807520143, "train_speed(iter/s)": 0.095756 }, { "epoch": 0.8542720870548365, "grad_norm": 8.940537452697754, "learning_rate": 5.692885310039753e-07, "loss": 0.5769914150238037, "memory(GiB)": 47.44, "step": 18370, "token_acc": 0.8603263032232391, "train_speed(iter/s)": 0.095771 }, { "epoch": 0.8545046053147861, "grad_norm": 9.157532691955566, "learning_rate": 5.675080626051021e-07, "loss": 0.805474853515625, "memory(GiB)": 47.44, "step": 18375, "token_acc": 0.7917448405253283, "train_speed(iter/s)": 0.095786 }, { "epoch": 0.8547371235747357, "grad_norm": 10.06224536895752, "learning_rate": 5.657302152755612e-07, "loss": 0.6651059627532959, "memory(GiB)": 47.44, "step": 18380, "token_acc": 0.8368121442125237, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.8549696418346854, "grad_norm": 6.025383949279785, "learning_rate": 5.639549900666508e-07, "loss": 0.6011961936950684, "memory(GiB)": 47.44, "step": 18385, "token_acc": 0.8556067588325653, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.8552021600946349, "grad_norm": 7.602214336395264, "learning_rate": 5.621823880281135e-07, "loss": 0.5666591167449951, "memory(GiB)": 47.44, "step": 18390, "token_acc": 0.8518024032042724, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.8554346783545845, "grad_norm": 7.266246318817139, "learning_rate": 5.604124102081454e-07, "loss": 0.48615288734436035, "memory(GiB)": 47.44, "step": 18395, "token_acc": 0.8771049802938015, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.8556671966145342, "grad_norm": 9.022258758544922, "learning_rate": 5.586450576533892e-07, "loss": 0.7559492588043213, "memory(GiB)": 47.44, "step": 18400, "token_acc": 0.8222374742621825, "train_speed(iter/s)": 0.095859 }, { "epoch": 0.8556671966145342, "eval_loss": 0.5577629804611206, "eval_runtime": 292.6222, "eval_samples_per_second": 11.875, "eval_steps_per_second": 11.875, "step": 18400 }, { "epoch": 0.8558997148744837, "grad_norm": 9.033079147338867, "learning_rate": 5.568803314089349e-07, "loss": 0.6726770401000977, "memory(GiB)": 47.44, "step": 18405, "token_acc": 0.8342267810279715, "train_speed(iter/s)": 0.095727 }, { "epoch": 0.8561322331344333, "grad_norm": 9.249860763549805, "learning_rate": 5.551182325183191e-07, "loss": 0.6182631969451904, "memory(GiB)": 47.44, "step": 18410, "token_acc": 0.8520777735417461, "train_speed(iter/s)": 0.095742 }, { "epoch": 0.856364751394383, "grad_norm": 8.269856452941895, "learning_rate": 5.533587620235254e-07, "loss": 0.6515116691589355, "memory(GiB)": 47.44, "step": 18415, "token_acc": 0.8340821566110398, "train_speed(iter/s)": 0.095756 }, { "epoch": 0.8565972696543326, "grad_norm": 9.464628219604492, "learning_rate": 5.516019209649837e-07, "loss": 0.6144441127777099, "memory(GiB)": 47.44, "step": 18420, "token_acc": 0.8478441127694859, "train_speed(iter/s)": 0.095771 }, { "epoch": 0.8568297879142821, "grad_norm": 10.576383590698242, "learning_rate": 5.498477103815669e-07, "loss": 0.6464351654052735, "memory(GiB)": 47.44, "step": 18425, "token_acc": 0.8510998307952623, "train_speed(iter/s)": 0.095785 }, { "epoch": 0.8570623061742317, "grad_norm": 7.952290058135986, "learning_rate": 5.480961313105964e-07, "loss": 0.6013254642486572, "memory(GiB)": 47.44, "step": 18430, "token_acc": 0.8617727450214759, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.8572948244341814, "grad_norm": 8.244209289550781, "learning_rate": 5.463471847878321e-07, "loss": 0.7223697662353515, "memory(GiB)": 47.44, "step": 18435, "token_acc": 0.8237899398924391, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.857527342694131, "grad_norm": 7.9289350509643555, "learning_rate": 5.446008718474811e-07, "loss": 0.7440857410430908, "memory(GiB)": 47.44, "step": 18440, "token_acc": 0.8179434896591902, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.8577598609540805, "grad_norm": 8.39036750793457, "learning_rate": 5.428571935221927e-07, "loss": 0.6755226612091064, "memory(GiB)": 47.44, "step": 18445, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.095844 }, { "epoch": 0.8579923792140302, "grad_norm": 7.686507701873779, "learning_rate": 5.411161508430585e-07, "loss": 0.5707077503204345, "memory(GiB)": 47.44, "step": 18450, "token_acc": 0.8539792387543252, "train_speed(iter/s)": 0.095858 }, { "epoch": 0.8579923792140302, "eval_loss": 0.5588163733482361, "eval_runtime": 295.8192, "eval_samples_per_second": 11.747, "eval_steps_per_second": 11.747, "step": 18450 }, { "epoch": 0.8582248974739798, "grad_norm": 7.943417072296143, "learning_rate": 5.393777448396081e-07, "loss": 0.8202121734619141, "memory(GiB)": 47.44, "step": 18455, "token_acc": 0.833792784753649, "train_speed(iter/s)": 0.095725 }, { "epoch": 0.8584574157339293, "grad_norm": 7.825821399688721, "learning_rate": 5.376419765398183e-07, "loss": 0.7452343463897705, "memory(GiB)": 47.44, "step": 18460, "token_acc": 0.8056155507559395, "train_speed(iter/s)": 0.095739 }, { "epoch": 0.8586899339938789, "grad_norm": 9.753142356872559, "learning_rate": 5.359088469700985e-07, "loss": 0.6943498611450195, "memory(GiB)": 47.44, "step": 18465, "token_acc": 0.8412244897959184, "train_speed(iter/s)": 0.095754 }, { "epoch": 0.8589224522538286, "grad_norm": 7.731417179107666, "learning_rate": 5.341783571553056e-07, "loss": 0.6089068412780761, "memory(GiB)": 47.44, "step": 18470, "token_acc": 0.8480913026367571, "train_speed(iter/s)": 0.095769 }, { "epoch": 0.8591549705137782, "grad_norm": 10.879091262817383, "learning_rate": 5.324505081187281e-07, "loss": 0.657605504989624, "memory(GiB)": 47.44, "step": 18475, "token_acc": 0.834061135371179, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.8593874887737277, "grad_norm": 9.33606243133545, "learning_rate": 5.307253008820984e-07, "loss": 0.6699440002441406, "memory(GiB)": 47.44, "step": 18480, "token_acc": 0.8354124748490945, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.8596200070336774, "grad_norm": 9.070079803466797, "learning_rate": 5.290027364655842e-07, "loss": 0.7180463314056397, "memory(GiB)": 47.44, "step": 18485, "token_acc": 0.8165910563836681, "train_speed(iter/s)": 0.095813 }, { "epoch": 0.859852525293627, "grad_norm": 8.304587364196777, "learning_rate": 5.27282815887793e-07, "loss": 0.5282202243804932, "memory(GiB)": 47.44, "step": 18490, "token_acc": 0.8615094339622642, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.8600850435535765, "grad_norm": 7.470762729644775, "learning_rate": 5.255655401657639e-07, "loss": 0.5523840427398682, "memory(GiB)": 47.44, "step": 18495, "token_acc": 0.8608035431825372, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.8603175618135261, "grad_norm": 8.002354621887207, "learning_rate": 5.238509103149774e-07, "loss": 0.7330649852752685, "memory(GiB)": 47.44, "step": 18500, "token_acc": 0.8097764304660856, "train_speed(iter/s)": 0.095857 }, { "epoch": 0.8603175618135261, "eval_loss": 0.5571265816688538, "eval_runtime": 296.8144, "eval_samples_per_second": 11.708, "eval_steps_per_second": 11.708, "step": 18500 }, { "epoch": 0.8605500800734758, "grad_norm": 6.639530181884766, "learning_rate": 5.221389273493449e-07, "loss": 0.6615922451019287, "memory(GiB)": 47.44, "step": 18505, "token_acc": 0.8347044071370382, "train_speed(iter/s)": 0.095724 }, { "epoch": 0.8607825983334254, "grad_norm": 8.321513175964355, "learning_rate": 5.204295922812175e-07, "loss": 0.7088462829589843, "memory(GiB)": 47.44, "step": 18510, "token_acc": 0.821732283464567, "train_speed(iter/s)": 0.095738 }, { "epoch": 0.8610151165933749, "grad_norm": 10.790754318237305, "learning_rate": 5.187229061213739e-07, "loss": 0.6691200256347656, "memory(GiB)": 47.44, "step": 18515, "token_acc": 0.819551282051282, "train_speed(iter/s)": 0.095753 }, { "epoch": 0.8612476348533246, "grad_norm": 9.01181411743164, "learning_rate": 5.170188698790352e-07, "loss": 0.7157214641571045, "memory(GiB)": 47.44, "step": 18520, "token_acc": 0.8334043459735833, "train_speed(iter/s)": 0.095768 }, { "epoch": 0.8614801531132742, "grad_norm": 9.205448150634766, "learning_rate": 5.15317484561847e-07, "loss": 0.7251205444335938, "memory(GiB)": 47.44, "step": 18525, "token_acc": 0.8173913043478261, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.8617126713732238, "grad_norm": 9.783936500549316, "learning_rate": 5.136187511758927e-07, "loss": 0.7142038345336914, "memory(GiB)": 47.44, "step": 18530, "token_acc": 0.830752990851513, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.8619451896331733, "grad_norm": 8.41704273223877, "learning_rate": 5.119226707256847e-07, "loss": 0.5512996673583984, "memory(GiB)": 47.44, "step": 18535, "token_acc": 0.8576122672508215, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.862177707893123, "grad_norm": 8.004241943359375, "learning_rate": 5.102292442141693e-07, "loss": 0.5882785320281982, "memory(GiB)": 47.44, "step": 18540, "token_acc": 0.851418439716312, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.8624102261530726, "grad_norm": 7.506921768188477, "learning_rate": 5.085384726427195e-07, "loss": 0.7991962432861328, "memory(GiB)": 47.44, "step": 18545, "token_acc": 0.7994314592545799, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.8626427444130221, "grad_norm": 8.778756141662598, "learning_rate": 5.068503570111422e-07, "loss": 0.6244667053222657, "memory(GiB)": 47.44, "step": 18550, "token_acc": 0.8412140575079872, "train_speed(iter/s)": 0.095855 }, { "epoch": 0.8626427444130221, "eval_loss": 0.5570151209831238, "eval_runtime": 294.7339, "eval_samples_per_second": 11.79, "eval_steps_per_second": 11.79, "step": 18550 }, { "epoch": 0.8628752626729718, "grad_norm": 8.074554443359375, "learning_rate": 5.051648983176722e-07, "loss": 0.6556201457977295, "memory(GiB)": 47.44, "step": 18555, "token_acc": 0.834621143575495, "train_speed(iter/s)": 0.095724 }, { "epoch": 0.8631077809329214, "grad_norm": 8.432696342468262, "learning_rate": 5.034820975589732e-07, "loss": 0.59825758934021, "memory(GiB)": 47.44, "step": 18560, "token_acc": 0.8419255718675316, "train_speed(iter/s)": 0.095738 }, { "epoch": 0.863340299192871, "grad_norm": 8.92758560180664, "learning_rate": 5.018019557301385e-07, "loss": 0.6601286888122558, "memory(GiB)": 47.44, "step": 18565, "token_acc": 0.837074583635047, "train_speed(iter/s)": 0.095752 }, { "epoch": 0.8635728174528206, "grad_norm": 7.215491771697998, "learning_rate": 5.001244738246852e-07, "loss": 0.5904222965240479, "memory(GiB)": 47.44, "step": 18570, "token_acc": 0.844404973357016, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.8638053357127702, "grad_norm": 10.66939926147461, "learning_rate": 4.984496528345628e-07, "loss": 0.5614672660827636, "memory(GiB)": 47.44, "step": 18575, "token_acc": 0.863036303630363, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.8640378539727198, "grad_norm": 9.135615348815918, "learning_rate": 4.967774937501424e-07, "loss": 0.679977560043335, "memory(GiB)": 47.44, "step": 18580, "token_acc": 0.837968561064087, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.8642703722326694, "grad_norm": 9.716190338134766, "learning_rate": 4.951079975602257e-07, "loss": 0.6977860927581787, "memory(GiB)": 47.44, "step": 18585, "token_acc": 0.8257796257796258, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.864502890492619, "grad_norm": 8.211316108703613, "learning_rate": 4.934411652520344e-07, "loss": 0.6847392559051514, "memory(GiB)": 47.44, "step": 18590, "token_acc": 0.834858734580183, "train_speed(iter/s)": 0.095825 }, { "epoch": 0.8647354087525686, "grad_norm": 6.788343906402588, "learning_rate": 4.917769978112196e-07, "loss": 0.5711318969726562, "memory(GiB)": 47.44, "step": 18595, "token_acc": 0.8474124809741248, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.8649679270125182, "grad_norm": 8.990870475769043, "learning_rate": 4.901154962218552e-07, "loss": 0.6872212409973144, "memory(GiB)": 47.44, "step": 18600, "token_acc": 0.8271604938271605, "train_speed(iter/s)": 0.095854 }, { "epoch": 0.8649679270125182, "eval_loss": 0.5573961734771729, "eval_runtime": 295.4738, "eval_samples_per_second": 11.761, "eval_steps_per_second": 11.761, "step": 18600 }, { "epoch": 0.8652004452724678, "grad_norm": 11.367814064025879, "learning_rate": 4.884566614664383e-07, "loss": 0.6198303699493408, "memory(GiB)": 47.44, "step": 18605, "token_acc": 0.8351723307787987, "train_speed(iter/s)": 0.095722 }, { "epoch": 0.8654329635324174, "grad_norm": 10.426321983337402, "learning_rate": 4.868004945258881e-07, "loss": 0.649804162979126, "memory(GiB)": 47.44, "step": 18610, "token_acc": 0.8430807248764415, "train_speed(iter/s)": 0.095737 }, { "epoch": 0.865665481792367, "grad_norm": 7.450739860534668, "learning_rate": 4.851469963795485e-07, "loss": 0.6020886898040771, "memory(GiB)": 47.44, "step": 18615, "token_acc": 0.8388082505729565, "train_speed(iter/s)": 0.095752 }, { "epoch": 0.8658980000523167, "grad_norm": 7.820985317230225, "learning_rate": 4.834961680051825e-07, "loss": 0.6482667446136474, "memory(GiB)": 47.44, "step": 18620, "token_acc": 0.8368228647391159, "train_speed(iter/s)": 0.095766 }, { "epoch": 0.8661305183122662, "grad_norm": 9.831792831420898, "learning_rate": 4.81848010378978e-07, "loss": 0.7055737972259521, "memory(GiB)": 47.44, "step": 18625, "token_acc": 0.8175155734701356, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.8663630365722158, "grad_norm": 7.0892109870910645, "learning_rate": 4.802025244755392e-07, "loss": 0.5333632469177246, "memory(GiB)": 47.44, "step": 18630, "token_acc": 0.8736027515047291, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.8665955548321654, "grad_norm": 10.063704490661621, "learning_rate": 4.785597112678941e-07, "loss": 0.6076772689819336, "memory(GiB)": 47.44, "step": 18635, "token_acc": 0.8538344722854974, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.8668280730921151, "grad_norm": 8.082415580749512, "learning_rate": 4.769195717274882e-07, "loss": 0.5907391548156739, "memory(GiB)": 47.44, "step": 18640, "token_acc": 0.8345757898473554, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.8670605913520646, "grad_norm": 9.68175220489502, "learning_rate": 4.7528210682418807e-07, "loss": 0.6756155014038085, "memory(GiB)": 47.44, "step": 18645, "token_acc": 0.8227746591820368, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.8672931096120142, "grad_norm": 6.71742057800293, "learning_rate": 4.7364731752627514e-07, "loss": 0.625584888458252, "memory(GiB)": 47.44, "step": 18650, "token_acc": 0.8328366296396641, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.8672931096120142, "eval_loss": 0.5572828650474548, "eval_runtime": 293.8674, "eval_samples_per_second": 11.825, "eval_steps_per_second": 11.825, "step": 18650 }, { "epoch": 0.8675256278719639, "grad_norm": 6.967619895935059, "learning_rate": 4.7201520480045284e-07, "loss": 0.5962310314178467, "memory(GiB)": 47.44, "step": 18655, "token_acc": 0.8348475509697664, "train_speed(iter/s)": 0.095722 }, { "epoch": 0.8677581461319134, "grad_norm": 8.604484558105469, "learning_rate": 4.70385769611838e-07, "loss": 0.6598502159118652, "memory(GiB)": 47.44, "step": 18660, "token_acc": 0.8352527191298784, "train_speed(iter/s)": 0.095737 }, { "epoch": 0.867990664391863, "grad_norm": 10.318936347961426, "learning_rate": 4.687590129239672e-07, "loss": 0.7001275062561035, "memory(GiB)": 47.44, "step": 18665, "token_acc": 0.8137119113573407, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.8682231826518126, "grad_norm": 10.031474113464355, "learning_rate": 4.671349356987909e-07, "loss": 0.7455158233642578, "memory(GiB)": 47.44, "step": 18670, "token_acc": 0.8162195497995683, "train_speed(iter/s)": 0.095765 }, { "epoch": 0.8684557009117623, "grad_norm": 8.311787605285645, "learning_rate": 4.6551353889667693e-07, "loss": 0.7342134952545166, "memory(GiB)": 47.44, "step": 18675, "token_acc": 0.8178947368421052, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.8686882191717118, "grad_norm": 8.125550270080566, "learning_rate": 4.638948234764068e-07, "loss": 0.6627881526947021, "memory(GiB)": 47.44, "step": 18680, "token_acc": 0.8397740784780023, "train_speed(iter/s)": 0.095794 }, { "epoch": 0.8689207374316614, "grad_norm": 10.395947456359863, "learning_rate": 4.6227879039517754e-07, "loss": 0.540507698059082, "memory(GiB)": 47.44, "step": 18685, "token_acc": 0.8612343686970553, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.8691532556916111, "grad_norm": 7.6285200119018555, "learning_rate": 4.606654406085992e-07, "loss": 0.5860920429229737, "memory(GiB)": 47.44, "step": 18690, "token_acc": 0.8527542372881356, "train_speed(iter/s)": 0.095823 }, { "epoch": 0.8693857739515606, "grad_norm": 9.4238862991333, "learning_rate": 4.5905477507069473e-07, "loss": 0.6630524158477783, "memory(GiB)": 47.44, "step": 18695, "token_acc": 0.828101644245142, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.8696182922115102, "grad_norm": 8.019683837890625, "learning_rate": 4.574467947339017e-07, "loss": 0.616099214553833, "memory(GiB)": 47.44, "step": 18700, "token_acc": 0.841552142279709, "train_speed(iter/s)": 0.095852 }, { "epoch": 0.8696182922115102, "eval_loss": 0.557950496673584, "eval_runtime": 294.649, "eval_samples_per_second": 11.794, "eval_steps_per_second": 11.794, "step": 18700 }, { "epoch": 0.8698508104714598, "grad_norm": 7.529056549072266, "learning_rate": 4.5584150054906626e-07, "loss": 0.5641360759735108, "memory(GiB)": 47.44, "step": 18705, "token_acc": 0.8352028927280032, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.8700833287314095, "grad_norm": 7.795188903808594, "learning_rate": 4.5423889346545125e-07, "loss": 0.6588688850402832, "memory(GiB)": 47.44, "step": 18710, "token_acc": 0.8301944106925881, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.870315846991359, "grad_norm": 7.797845363616943, "learning_rate": 4.5263897443072525e-07, "loss": 0.7449404239654541, "memory(GiB)": 47.44, "step": 18715, "token_acc": 0.8107714701601164, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.8705483652513086, "grad_norm": 9.954667091369629, "learning_rate": 4.5104174439097037e-07, "loss": 0.5302114963531495, "memory(GiB)": 47.44, "step": 18720, "token_acc": 0.8644132168628941, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.8707808835112583, "grad_norm": 9.294275283813477, "learning_rate": 4.494472042906789e-07, "loss": 0.6280638694763183, "memory(GiB)": 47.44, "step": 18725, "token_acc": 0.8386212299255777, "train_speed(iter/s)": 0.095779 }, { "epoch": 0.8710134017712079, "grad_norm": 8.05164623260498, "learning_rate": 4.478553550727521e-07, "loss": 0.7135934352874755, "memory(GiB)": 47.44, "step": 18730, "token_acc": 0.8139735480161012, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.8712459200311574, "grad_norm": 7.834764003753662, "learning_rate": 4.4626619767849764e-07, "loss": 0.6254189968109131, "memory(GiB)": 47.44, "step": 18735, "token_acc": 0.8413329257107918, "train_speed(iter/s)": 0.095807 }, { "epoch": 0.871478438291107, "grad_norm": 10.09204387664795, "learning_rate": 4.446797330476349e-07, "loss": 0.7326918601989746, "memory(GiB)": 47.44, "step": 18740, "token_acc": 0.8233387358184765, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.8717109565510567, "grad_norm": 9.581809043884277, "learning_rate": 4.430959621182884e-07, "loss": 0.5726360321044922, "memory(GiB)": 47.44, "step": 18745, "token_acc": 0.8482712319570326, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.8719434748110062, "grad_norm": 7.136640548706055, "learning_rate": 4.4151488582699186e-07, "loss": 0.5546000480651856, "memory(GiB)": 47.44, "step": 18750, "token_acc": 0.8588807785888077, "train_speed(iter/s)": 0.095851 }, { "epoch": 0.8719434748110062, "eval_loss": 0.5570844411849976, "eval_runtime": 295.122, "eval_samples_per_second": 11.775, "eval_steps_per_second": 11.775, "step": 18750 }, { "epoch": 0.8721759930709558, "grad_norm": 9.215892791748047, "learning_rate": 4.3993650510868347e-07, "loss": 0.6979795455932617, "memory(GiB)": 47.44, "step": 18755, "token_acc": 0.8343617539294181, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.8724085113309055, "grad_norm": 7.706457138061523, "learning_rate": 4.383608208967083e-07, "loss": 0.5588757514953613, "memory(GiB)": 47.44, "step": 18760, "token_acc": 0.8442001516300227, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8726410295908551, "grad_norm": 10.519378662109375, "learning_rate": 4.367878341228182e-07, "loss": 0.7292253017425537, "memory(GiB)": 47.44, "step": 18765, "token_acc": 0.8270793036750483, "train_speed(iter/s)": 0.095748 }, { "epoch": 0.8728735478508046, "grad_norm": 8.943109512329102, "learning_rate": 4.3521754571716865e-07, "loss": 0.6495208263397216, "memory(GiB)": 47.44, "step": 18770, "token_acc": 0.8444714459295262, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.8731060661107543, "grad_norm": 9.06867504119873, "learning_rate": 4.336499566083191e-07, "loss": 0.6166380882263184, "memory(GiB)": 47.44, "step": 18775, "token_acc": 0.8551223241590215, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.8733385843707039, "grad_norm": 8.140872955322266, "learning_rate": 4.320850677232341e-07, "loss": 0.43561468124389646, "memory(GiB)": 47.44, "step": 18780, "token_acc": 0.9012189995796553, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.8735711026306535, "grad_norm": 9.459227561950684, "learning_rate": 4.305228799872796e-07, "loss": 0.5819206237792969, "memory(GiB)": 47.44, "step": 18785, "token_acc": 0.8629981024667932, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.873803620890603, "grad_norm": 6.878607273101807, "learning_rate": 4.289633943242266e-07, "loss": 0.5773335456848144, "memory(GiB)": 47.44, "step": 18790, "token_acc": 0.8589874277947672, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.8740361391505527, "grad_norm": 10.950090408325195, "learning_rate": 4.2740661165624585e-07, "loss": 0.6097473621368408, "memory(GiB)": 47.44, "step": 18795, "token_acc": 0.8536082474226804, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.8742686574105023, "grad_norm": 10.073168754577637, "learning_rate": 4.2585253290391205e-07, "loss": 0.7397896766662597, "memory(GiB)": 47.44, "step": 18800, "token_acc": 0.8026701400195376, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.8742686574105023, "eval_loss": 0.5571832060813904, "eval_runtime": 291.4443, "eval_samples_per_second": 11.923, "eval_steps_per_second": 11.923, "step": 18800 }, { "epoch": 0.8745011756704518, "grad_norm": 8.847938537597656, "learning_rate": 4.243011589861995e-07, "loss": 0.5326892852783203, "memory(GiB)": 47.44, "step": 18805, "token_acc": 0.8355497269157877, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.8747336939304015, "grad_norm": 7.569638729095459, "learning_rate": 4.227524908204833e-07, "loss": 0.6248403549194336, "memory(GiB)": 47.44, "step": 18810, "token_acc": 0.846929422548121, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8749662121903511, "grad_norm": 7.655401229858398, "learning_rate": 4.2120652932254036e-07, "loss": 0.5118704795837402, "memory(GiB)": 47.44, "step": 18815, "token_acc": 0.8706986444212722, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.8751987304503007, "grad_norm": 9.32550048828125, "learning_rate": 4.1966327540654327e-07, "loss": 0.539171838760376, "memory(GiB)": 47.44, "step": 18820, "token_acc": 0.8633844147375286, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.8754312487102502, "grad_norm": 7.188518524169922, "learning_rate": 4.1812272998506765e-07, "loss": 0.6495114803314209, "memory(GiB)": 47.44, "step": 18825, "token_acc": 0.8364995328558081, "train_speed(iter/s)": 0.095778 }, { "epoch": 0.8756637669701999, "grad_norm": 10.491996765136719, "learning_rate": 4.165848939690836e-07, "loss": 0.7552758693695069, "memory(GiB)": 47.44, "step": 18830, "token_acc": 0.8204585537918871, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.8758962852301495, "grad_norm": 11.448698997497559, "learning_rate": 4.1504976826796327e-07, "loss": 0.6858109474182129, "memory(GiB)": 47.44, "step": 18835, "token_acc": 0.8332103321033211, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.876128803490099, "grad_norm": 9.617608070373535, "learning_rate": 4.1351735378947043e-07, "loss": 0.6972774028778076, "memory(GiB)": 47.44, "step": 18840, "token_acc": 0.831145584725537, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.8763613217500487, "grad_norm": 8.686561584472656, "learning_rate": 4.11987651439773e-07, "loss": 0.7151837825775147, "memory(GiB)": 47.44, "step": 18845, "token_acc": 0.8249015449863678, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.8765938400099983, "grad_norm": 9.857751846313477, "learning_rate": 4.104606621234286e-07, "loss": 0.5853623390197754, "memory(GiB)": 47.44, "step": 18850, "token_acc": 0.8560057887120116, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.8765938400099983, "eval_loss": 0.557105541229248, "eval_runtime": 289.6076, "eval_samples_per_second": 11.999, "eval_steps_per_second": 11.999, "step": 18850 }, { "epoch": 0.8768263582699479, "grad_norm": 9.501344680786133, "learning_rate": 4.089363867433954e-07, "loss": 0.7283553123474121, "memory(GiB)": 47.44, "step": 18855, "token_acc": 0.8342097049148839, "train_speed(iter/s)": 0.09572 }, { "epoch": 0.8770588765298974, "grad_norm": 9.011695861816406, "learning_rate": 4.074148262010219e-07, "loss": 0.6737423419952393, "memory(GiB)": 47.44, "step": 18860, "token_acc": 0.8236682400539447, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8772913947898471, "grad_norm": 9.307239532470703, "learning_rate": 4.058959813960556e-07, "loss": 0.5290046215057373, "memory(GiB)": 47.44, "step": 18865, "token_acc": 0.8745748299319728, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.8775239130497967, "grad_norm": 6.860555648803711, "learning_rate": 4.043798532266352e-07, "loss": 0.6110732555389404, "memory(GiB)": 47.44, "step": 18870, "token_acc": 0.8466997870830376, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.8777564313097463, "grad_norm": 7.625909328460693, "learning_rate": 4.0286644258929476e-07, "loss": 0.6237162590026856, "memory(GiB)": 47.44, "step": 18875, "token_acc": 0.8529635258358662, "train_speed(iter/s)": 0.095778 }, { "epoch": 0.8779889495696959, "grad_norm": 9.900529861450195, "learning_rate": 4.0135575037896056e-07, "loss": 0.6516123294830323, "memory(GiB)": 47.44, "step": 18880, "token_acc": 0.8463067240808214, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.8782214678296455, "grad_norm": 11.614465713500977, "learning_rate": 3.9984777748895253e-07, "loss": 0.6818142414093018, "memory(GiB)": 47.44, "step": 18885, "token_acc": 0.8380439659039928, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.8784539860895951, "grad_norm": 9.174266815185547, "learning_rate": 3.983425248109796e-07, "loss": 0.5818466663360595, "memory(GiB)": 47.44, "step": 18890, "token_acc": 0.8540983606557377, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.8786865043495447, "grad_norm": 7.933727741241455, "learning_rate": 3.968399932351463e-07, "loss": 0.6590875148773193, "memory(GiB)": 47.44, "step": 18895, "token_acc": 0.8458536585365853, "train_speed(iter/s)": 0.095835 }, { "epoch": 0.8789190226094943, "grad_norm": 9.447802543640137, "learning_rate": 3.953401836499443e-07, "loss": 0.8476919174194336, "memory(GiB)": 47.44, "step": 18900, "token_acc": 0.8137555328566565, "train_speed(iter/s)": 0.095849 }, { "epoch": 0.8789190226094943, "eval_loss": 0.556678295135498, "eval_runtime": 292.5011, "eval_samples_per_second": 11.88, "eval_steps_per_second": 11.88, "step": 18900 }, { "epoch": 0.8791515408694439, "grad_norm": 7.718998432159424, "learning_rate": 3.9384309694225855e-07, "loss": 0.48586311340332033, "memory(GiB)": 47.44, "step": 18905, "token_acc": 0.8355619891552968, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.8793840591293935, "grad_norm": 8.555072784423828, "learning_rate": 3.9234873399736086e-07, "loss": 0.6869643211364747, "memory(GiB)": 47.44, "step": 18910, "token_acc": 0.8362164151637835, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8796165773893431, "grad_norm": 13.64516830444336, "learning_rate": 3.908570956989155e-07, "loss": 0.832066535949707, "memory(GiB)": 47.44, "step": 18915, "token_acc": 0.807448159119763, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.8798490956492927, "grad_norm": 8.657330513000488, "learning_rate": 3.893681829289736e-07, "loss": 0.6564640998840332, "memory(GiB)": 47.44, "step": 18920, "token_acc": 0.8369565217391305, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.8800816139092423, "grad_norm": 8.098555564880371, "learning_rate": 3.87881996567977e-07, "loss": 0.6175037860870362, "memory(GiB)": 47.44, "step": 18925, "token_acc": 0.8464987926871335, "train_speed(iter/s)": 0.095778 }, { "epoch": 0.880314132169192, "grad_norm": 10.006739616394043, "learning_rate": 3.8639853749475153e-07, "loss": 0.5666879653930664, "memory(GiB)": 47.44, "step": 18930, "token_acc": 0.8589272593681117, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.8805466504291415, "grad_norm": 9.115056037902832, "learning_rate": 3.849178065865139e-07, "loss": 0.6132634162902832, "memory(GiB)": 47.44, "step": 18935, "token_acc": 0.8485299590621511, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.8807791686890911, "grad_norm": 6.633612632751465, "learning_rate": 3.8343980471886424e-07, "loss": 0.6955962657928467, "memory(GiB)": 47.44, "step": 18940, "token_acc": 0.8203592814371258, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.8810116869490408, "grad_norm": 7.961598873138428, "learning_rate": 3.8196453276579173e-07, "loss": 0.7367173194885254, "memory(GiB)": 47.44, "step": 18945, "token_acc": 0.8152793614595211, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.8812442052089903, "grad_norm": 8.601881980895996, "learning_rate": 3.8049199159967075e-07, "loss": 0.6443628787994384, "memory(GiB)": 47.44, "step": 18950, "token_acc": 0.8405551794177386, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.8812442052089903, "eval_loss": 0.5564034581184387, "eval_runtime": 291.7319, "eval_samples_per_second": 11.912, "eval_steps_per_second": 11.912, "step": 18950 }, { "epoch": 0.8814767234689399, "grad_norm": 9.404342651367188, "learning_rate": 3.790221820912593e-07, "loss": 0.6218993186950683, "memory(GiB)": 47.44, "step": 18955, "token_acc": 0.8352454482045362, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.8817092417288895, "grad_norm": 8.895606994628906, "learning_rate": 3.775551051097015e-07, "loss": 0.6595926761627198, "memory(GiB)": 47.44, "step": 18960, "token_acc": 0.8220284237726099, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8819417599888392, "grad_norm": 9.177611351013184, "learning_rate": 3.7609076152252513e-07, "loss": 0.7080647945404053, "memory(GiB)": 47.44, "step": 18965, "token_acc": 0.8249733191035219, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.8821742782487887, "grad_norm": 9.349613189697266, "learning_rate": 3.7462915219564244e-07, "loss": 0.6600799083709716, "memory(GiB)": 47.44, "step": 18970, "token_acc": 0.829050279329609, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.8824067965087383, "grad_norm": 8.506677627563477, "learning_rate": 3.731702779933477e-07, "loss": 0.6581337451934814, "memory(GiB)": 47.44, "step": 18975, "token_acc": 0.8370720188902007, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.882639314768688, "grad_norm": 8.334615707397461, "learning_rate": 3.717141397783186e-07, "loss": 0.6230666160583496, "memory(GiB)": 47.44, "step": 18980, "token_acc": 0.8457613814756672, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.8828718330286375, "grad_norm": 7.229907035827637, "learning_rate": 3.702607384116136e-07, "loss": 0.687413501739502, "memory(GiB)": 47.44, "step": 18985, "token_acc": 0.8236486486486486, "train_speed(iter/s)": 0.095805 }, { "epoch": 0.8831043512885871, "grad_norm": 8.757658004760742, "learning_rate": 3.6881007475267515e-07, "loss": 0.7368191242218017, "memory(GiB)": 47.44, "step": 18990, "token_acc": 0.8106930693069307, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.8833368695485367, "grad_norm": 7.911934852600098, "learning_rate": 3.673621496593238e-07, "loss": 0.542356300354004, "memory(GiB)": 47.44, "step": 18995, "token_acc": 0.8660508083140878, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.8835693878084864, "grad_norm": 9.330195426940918, "learning_rate": 3.6591696398776353e-07, "loss": 0.5444394111633301, "memory(GiB)": 47.44, "step": 19000, "token_acc": 0.8651315789473685, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.8835693878084864, "eval_loss": 0.5561711192131042, "eval_runtime": 292.1787, "eval_samples_per_second": 11.893, "eval_steps_per_second": 11.893, "step": 19000 }, { "epoch": 0.8838019060684359, "grad_norm": 10.371062278747559, "learning_rate": 3.6447451859257685e-07, "loss": 0.6457038879394531, "memory(GiB)": 47.44, "step": 19005, "token_acc": 0.8348006338738335, "train_speed(iter/s)": 0.09572 }, { "epoch": 0.8840344243283855, "grad_norm": 10.796051979064941, "learning_rate": 3.630348143267276e-07, "loss": 0.6592034816741943, "memory(GiB)": 47.44, "step": 19010, "token_acc": 0.837495475931958, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8842669425883352, "grad_norm": 8.91750717163086, "learning_rate": 3.615978520415553e-07, "loss": 0.6092522144317627, "memory(GiB)": 47.44, "step": 19015, "token_acc": 0.8453038674033149, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.8844994608482848, "grad_norm": 9.327253341674805, "learning_rate": 3.6016363258678187e-07, "loss": 0.6067311763763428, "memory(GiB)": 47.44, "step": 19020, "token_acc": 0.8439821693907875, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.8847319791082343, "grad_norm": 10.384136199951172, "learning_rate": 3.58732156810504e-07, "loss": 0.6553625106811524, "memory(GiB)": 47.44, "step": 19025, "token_acc": 0.8404518178609248, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.8849644973681839, "grad_norm": 10.37766170501709, "learning_rate": 3.5730342555919896e-07, "loss": 0.6201920986175538, "memory(GiB)": 47.44, "step": 19030, "token_acc": 0.8405219282348677, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.8851970156281336, "grad_norm": 8.837468147277832, "learning_rate": 3.5587743967771816e-07, "loss": 0.6653841018676758, "memory(GiB)": 47.44, "step": 19035, "token_acc": 0.8508612873980055, "train_speed(iter/s)": 0.095805 }, { "epoch": 0.8854295338880831, "grad_norm": 10.655569076538086, "learning_rate": 3.544542000092921e-07, "loss": 0.57546067237854, "memory(GiB)": 47.44, "step": 19040, "token_acc": 0.8496751329001772, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.8856620521480327, "grad_norm": 8.41292667388916, "learning_rate": 3.530337073955259e-07, "loss": 0.6289780139923096, "memory(GiB)": 47.44, "step": 19045, "token_acc": 0.8403242862178357, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.8858945704079824, "grad_norm": 7.690162658691406, "learning_rate": 3.5161596267640153e-07, "loss": 0.564287519454956, "memory(GiB)": 47.44, "step": 19050, "token_acc": 0.8630756578947368, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.8858945704079824, "eval_loss": 0.556488037109375, "eval_runtime": 291.1297, "eval_samples_per_second": 11.936, "eval_steps_per_second": 11.936, "step": 19050 }, { "epoch": 0.886127088667932, "grad_norm": 12.681654930114746, "learning_rate": 3.5020096669027395e-07, "loss": 0.6333631038665771, "memory(GiB)": 47.44, "step": 19055, "token_acc": 0.8349946314962902, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.8863596069278815, "grad_norm": 6.69144344329834, "learning_rate": 3.4878872027387545e-07, "loss": 0.7488809585571289, "memory(GiB)": 47.44, "step": 19060, "token_acc": 0.8173852085524009, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8865921251878311, "grad_norm": 8.09365463256836, "learning_rate": 3.473792242623092e-07, "loss": 0.6257463455200195, "memory(GiB)": 47.44, "step": 19065, "token_acc": 0.8425925925925926, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.8868246434477808, "grad_norm": 6.565597057342529, "learning_rate": 3.459724794890551e-07, "loss": 0.7603954792022705, "memory(GiB)": 47.44, "step": 19070, "token_acc": 0.8020090732339599, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.8870571617077304, "grad_norm": 8.496455192565918, "learning_rate": 3.4456848678596387e-07, "loss": 0.5489050865173339, "memory(GiB)": 47.44, "step": 19075, "token_acc": 0.8712174524982407, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.8872896799676799, "grad_norm": 6.798948287963867, "learning_rate": 3.4316724698325976e-07, "loss": 0.7015841007232666, "memory(GiB)": 47.44, "step": 19080, "token_acc": 0.8179470626210459, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.8875221982276296, "grad_norm": 8.605868339538574, "learning_rate": 3.4176876090953993e-07, "loss": 0.629077959060669, "memory(GiB)": 47.44, "step": 19085, "token_acc": 0.8442776735459663, "train_speed(iter/s)": 0.095805 }, { "epoch": 0.8877547164875792, "grad_norm": 7.39762020111084, "learning_rate": 3.4037302939177174e-07, "loss": 0.7615146160125732, "memory(GiB)": 47.44, "step": 19090, "token_acc": 0.8089795918367347, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.8879872347475287, "grad_norm": 8.316370010375977, "learning_rate": 3.3898005325529506e-07, "loss": 0.6413861274719238, "memory(GiB)": 47.44, "step": 19095, "token_acc": 0.8319242475481907, "train_speed(iter/s)": 0.095834 }, { "epoch": 0.8882197530074784, "grad_norm": 9.32655143737793, "learning_rate": 3.3758983332381865e-07, "loss": 0.6109539031982422, "memory(GiB)": 47.44, "step": 19100, "token_acc": 0.8414420721036052, "train_speed(iter/s)": 0.095848 }, { "epoch": 0.8882197530074784, "eval_loss": 0.5563390851020813, "eval_runtime": 293.6699, "eval_samples_per_second": 11.833, "eval_steps_per_second": 11.833, "step": 19100 }, { "epoch": 0.888452271267428, "grad_norm": 8.048409461975098, "learning_rate": 3.3620237041942396e-07, "loss": 0.7133908748626709, "memory(GiB)": 47.44, "step": 19105, "token_acc": 0.8342901337925975, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.8886847895273776, "grad_norm": 8.017596244812012, "learning_rate": 3.3481766536255845e-07, "loss": 0.6549528121948243, "memory(GiB)": 47.44, "step": 19110, "token_acc": 0.8401264933239635, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.8889173077873271, "grad_norm": 7.829741954803467, "learning_rate": 3.334357189720433e-07, "loss": 0.5810675144195556, "memory(GiB)": 47.44, "step": 19115, "token_acc": 0.8514382402707276, "train_speed(iter/s)": 0.095748 }, { "epoch": 0.8891498260472768, "grad_norm": 6.798005104064941, "learning_rate": 3.320565320650637e-07, "loss": 0.6954770565032959, "memory(GiB)": 47.44, "step": 19120, "token_acc": 0.8231875191189967, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.8893823443072264, "grad_norm": 7.311758995056152, "learning_rate": 3.306801054571773e-07, "loss": 0.6007462501525879, "memory(GiB)": 47.44, "step": 19125, "token_acc": 0.8505388542257516, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.8896148625671759, "grad_norm": 8.575642585754395, "learning_rate": 3.2930643996230696e-07, "loss": 0.5701055526733398, "memory(GiB)": 47.44, "step": 19130, "token_acc": 0.8495435945860875, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.8898473808271256, "grad_norm": 8.63494873046875, "learning_rate": 3.2793553639274447e-07, "loss": 0.7098838329315186, "memory(GiB)": 47.44, "step": 19135, "token_acc": 0.8214892893573614, "train_speed(iter/s)": 0.095805 }, { "epoch": 0.8900798990870752, "grad_norm": 7.499673843383789, "learning_rate": 3.265673955591453e-07, "loss": 0.6947129249572754, "memory(GiB)": 47.44, "step": 19140, "token_acc": 0.8269592476489028, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.8903124173470248, "grad_norm": 7.8279709815979, "learning_rate": 3.2520201827053564e-07, "loss": 0.6508459568023681, "memory(GiB)": 47.44, "step": 19145, "token_acc": 0.8448480063166206, "train_speed(iter/s)": 0.095832 }, { "epoch": 0.8905449356069743, "grad_norm": 10.999360084533691, "learning_rate": 3.2383940533430355e-07, "loss": 0.6100882053375244, "memory(GiB)": 47.44, "step": 19150, "token_acc": 0.848823294774631, "train_speed(iter/s)": 0.095846 }, { "epoch": 0.8905449356069743, "eval_loss": 0.5568610429763794, "eval_runtime": 293.2618, "eval_samples_per_second": 11.849, "eval_steps_per_second": 11.849, "step": 19150 }, { "epoch": 0.890777453866924, "grad_norm": 7.643911838531494, "learning_rate": 3.2247955755620566e-07, "loss": 0.5627256870269776, "memory(GiB)": 47.44, "step": 19155, "token_acc": 0.835324033594571, "train_speed(iter/s)": 0.095719 }, { "epoch": 0.8910099721268736, "grad_norm": 8.361198425292969, "learning_rate": 3.2112247574036025e-07, "loss": 0.6052700996398925, "memory(GiB)": 47.44, "step": 19160, "token_acc": 0.8570234113712375, "train_speed(iter/s)": 0.095733 }, { "epoch": 0.8912424903868232, "grad_norm": 12.711145401000977, "learning_rate": 3.1976816068925274e-07, "loss": 0.5738406658172608, "memory(GiB)": 47.44, "step": 19165, "token_acc": 0.8722176422093982, "train_speed(iter/s)": 0.095747 }, { "epoch": 0.8914750086467728, "grad_norm": 10.067715644836426, "learning_rate": 3.1841661320373086e-07, "loss": 0.6505169868469238, "memory(GiB)": 47.44, "step": 19170, "token_acc": 0.8445133772780148, "train_speed(iter/s)": 0.09576 }, { "epoch": 0.8917075269067224, "grad_norm": 8.287736892700195, "learning_rate": 3.17067834083008e-07, "loss": 0.5217358112335205, "memory(GiB)": 47.44, "step": 19175, "token_acc": 0.8596858638743455, "train_speed(iter/s)": 0.095774 }, { "epoch": 0.891940045166672, "grad_norm": 8.864876747131348, "learning_rate": 3.157218241246562e-07, "loss": 0.6427960395812988, "memory(GiB)": 47.44, "step": 19180, "token_acc": 0.844559585492228, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.8921725634266215, "grad_norm": 9.918158531188965, "learning_rate": 3.143785841246155e-07, "loss": 0.5956603527069092, "memory(GiB)": 47.44, "step": 19185, "token_acc": 0.8509154315605929, "train_speed(iter/s)": 0.095802 }, { "epoch": 0.8924050816865712, "grad_norm": 12.312911033630371, "learning_rate": 3.130381148771827e-07, "loss": 0.700811243057251, "memory(GiB)": 47.44, "step": 19190, "token_acc": 0.820032310177706, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.8926375999465208, "grad_norm": 9.34288215637207, "learning_rate": 3.11700417175021e-07, "loss": 0.7641796112060547, "memory(GiB)": 47.44, "step": 19195, "token_acc": 0.7960568842921784, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.8928701182064704, "grad_norm": 6.648347854614258, "learning_rate": 3.1036549180914923e-07, "loss": 0.6445430278778076, "memory(GiB)": 47.44, "step": 19200, "token_acc": 0.8562091503267973, "train_speed(iter/s)": 0.095843 }, { "epoch": 0.8928701182064704, "eval_loss": 0.556313157081604, "eval_runtime": 294.234, "eval_samples_per_second": 11.81, "eval_steps_per_second": 11.81, "step": 19200 }, { "epoch": 0.89310263646642, "grad_norm": 9.468454360961914, "learning_rate": 3.0903333956895266e-07, "loss": 0.5887749195098877, "memory(GiB)": 47.44, "step": 19205, "token_acc": 0.8352915735601528, "train_speed(iter/s)": 0.095716 }, { "epoch": 0.8933351547263696, "grad_norm": 9.351832389831543, "learning_rate": 3.077039612421745e-07, "loss": 0.5907858371734619, "memory(GiB)": 47.44, "step": 19210, "token_acc": 0.850547195622435, "train_speed(iter/s)": 0.09573 }, { "epoch": 0.8935676729863192, "grad_norm": 11.74577522277832, "learning_rate": 3.063773576149143e-07, "loss": 0.7539110660552979, "memory(GiB)": 47.44, "step": 19215, "token_acc": 0.8197879858657244, "train_speed(iter/s)": 0.095744 }, { "epoch": 0.8938001912462689, "grad_norm": 10.45272159576416, "learning_rate": 3.0505352947163667e-07, "loss": 0.5318979263305664, "memory(GiB)": 47.44, "step": 19220, "token_acc": 0.8765799256505576, "train_speed(iter/s)": 0.095758 }, { "epoch": 0.8940327095062184, "grad_norm": 8.622464179992676, "learning_rate": 3.0373247759516e-07, "loss": 0.7953807353973389, "memory(GiB)": 47.44, "step": 19225, "token_acc": 0.7910621009866512, "train_speed(iter/s)": 0.095772 }, { "epoch": 0.894265227766168, "grad_norm": 8.542391777038574, "learning_rate": 3.024142027666649e-07, "loss": 0.652153205871582, "memory(GiB)": 47.44, "step": 19230, "token_acc": 0.8414383561643836, "train_speed(iter/s)": 0.095786 }, { "epoch": 0.8944977460261176, "grad_norm": 9.397589683532715, "learning_rate": 3.010987057656861e-07, "loss": 0.5987738132476806, "memory(GiB)": 47.44, "step": 19235, "token_acc": 0.8518365662401981, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.8947302642860672, "grad_norm": 9.284369468688965, "learning_rate": 2.997859873701181e-07, "loss": 0.6676845550537109, "memory(GiB)": 47.44, "step": 19240, "token_acc": 0.8496299181924425, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.8949627825460168, "grad_norm": 9.94536018371582, "learning_rate": 2.9847604835621216e-07, "loss": 0.5450184345245361, "memory(GiB)": 47.44, "step": 19245, "token_acc": 0.8580777096114519, "train_speed(iter/s)": 0.095828 }, { "epoch": 0.8951953008059664, "grad_norm": 6.833744049072266, "learning_rate": 2.9716888949857635e-07, "loss": 0.5578155517578125, "memory(GiB)": 47.44, "step": 19250, "token_acc": 0.8539007092198582, "train_speed(iter/s)": 0.095842 }, { "epoch": 0.8951953008059664, "eval_loss": 0.5563004016876221, "eval_runtime": 292.9507, "eval_samples_per_second": 11.862, "eval_steps_per_second": 11.862, "step": 19250 }, { "epoch": 0.8954278190659161, "grad_norm": 8.075855255126953, "learning_rate": 2.9586451157017304e-07, "loss": 0.6162880420684814, "memory(GiB)": 47.44, "step": 19255, "token_acc": 0.8352875220246676, "train_speed(iter/s)": 0.095716 }, { "epoch": 0.8956603373258656, "grad_norm": 7.645998001098633, "learning_rate": 2.9456291534232185e-07, "loss": 0.6005894660949707, "memory(GiB)": 47.44, "step": 19260, "token_acc": 0.8496168582375478, "train_speed(iter/s)": 0.09573 }, { "epoch": 0.8958928555858152, "grad_norm": 9.750205993652344, "learning_rate": 2.9326410158469543e-07, "loss": 0.5538561344146729, "memory(GiB)": 47.44, "step": 19265, "token_acc": 0.84688995215311, "train_speed(iter/s)": 0.095744 }, { "epoch": 0.8961253738457648, "grad_norm": 7.047579288482666, "learning_rate": 2.9196807106532443e-07, "loss": 0.561616325378418, "memory(GiB)": 47.44, "step": 19270, "token_acc": 0.8524930747922438, "train_speed(iter/s)": 0.095758 }, { "epoch": 0.8963578921057145, "grad_norm": 6.872752666473389, "learning_rate": 2.906748245505903e-07, "loss": 0.6426856517791748, "memory(GiB)": 47.44, "step": 19275, "token_acc": 0.8539862645565841, "train_speed(iter/s)": 0.095771 }, { "epoch": 0.896590410365664, "grad_norm": 6.302699565887451, "learning_rate": 2.893843628052301e-07, "loss": 0.6119589805603027, "memory(GiB)": 47.44, "step": 19280, "token_acc": 0.8436103663985701, "train_speed(iter/s)": 0.095785 }, { "epoch": 0.8968229286256136, "grad_norm": 8.906804084777832, "learning_rate": 2.8809668659233346e-07, "loss": 0.6653795719146729, "memory(GiB)": 47.44, "step": 19285, "token_acc": 0.8278719397363465, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.8970554468855633, "grad_norm": 7.2945709228515625, "learning_rate": 2.8681179667334356e-07, "loss": 0.6008943080902099, "memory(GiB)": 47.44, "step": 19290, "token_acc": 0.8466141732283464, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.8972879651455128, "grad_norm": 7.817905902862549, "learning_rate": 2.855296938080554e-07, "loss": 0.6331915855407715, "memory(GiB)": 47.44, "step": 19295, "token_acc": 0.8435329143235197, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.8975204834054624, "grad_norm": 8.829483032226562, "learning_rate": 2.842503787546158e-07, "loss": 0.521914291381836, "memory(GiB)": 47.44, "step": 19300, "token_acc": 0.8705882352941177, "train_speed(iter/s)": 0.09584 }, { "epoch": 0.8975204834054624, "eval_loss": 0.5560697913169861, "eval_runtime": 291.6114, "eval_samples_per_second": 11.917, "eval_steps_per_second": 11.917, "step": 19300 }, { "epoch": 0.897753001665412, "grad_norm": 7.60385799407959, "learning_rate": 2.82973852269523e-07, "loss": 0.5124347686767579, "memory(GiB)": 47.44, "step": 19305, "token_acc": 0.8356065526448726, "train_speed(iter/s)": 0.095714 }, { "epoch": 0.8979855199253617, "grad_norm": 7.674799919128418, "learning_rate": 2.8170011510762706e-07, "loss": 0.7254348754882812, "memory(GiB)": 47.44, "step": 19310, "token_acc": 0.8155107618722242, "train_speed(iter/s)": 0.095728 }, { "epoch": 0.8982180381853112, "grad_norm": 8.559470176696777, "learning_rate": 2.804291680221277e-07, "loss": 0.6556037902832031, "memory(GiB)": 47.44, "step": 19315, "token_acc": 0.8498039215686275, "train_speed(iter/s)": 0.095742 }, { "epoch": 0.8984505564452608, "grad_norm": 9.701505661010742, "learning_rate": 2.791610117645749e-07, "loss": 0.5915635108947754, "memory(GiB)": 47.44, "step": 19320, "token_acc": 0.8635002139495079, "train_speed(iter/s)": 0.095756 }, { "epoch": 0.8986830747052105, "grad_norm": 7.899413585662842, "learning_rate": 2.7789564708486874e-07, "loss": 0.6606907844543457, "memory(GiB)": 47.44, "step": 19325, "token_acc": 0.8327725437415882, "train_speed(iter/s)": 0.095769 }, { "epoch": 0.89891559296516, "grad_norm": 9.57420539855957, "learning_rate": 2.766330747312601e-07, "loss": 0.5400222301483154, "memory(GiB)": 47.44, "step": 19330, "token_acc": 0.8736942070275404, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.8991481112251096, "grad_norm": 9.367339134216309, "learning_rate": 2.7537329545034407e-07, "loss": 0.6600838661193847, "memory(GiB)": 47.44, "step": 19335, "token_acc": 0.8266504657756176, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.8993806294850593, "grad_norm": 8.384690284729004, "learning_rate": 2.7411630998706917e-07, "loss": 0.5784670352935791, "memory(GiB)": 47.44, "step": 19340, "token_acc": 0.8525245187068652, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.8996131477450089, "grad_norm": 9.686371803283691, "learning_rate": 2.7286211908472916e-07, "loss": 0.6274217128753662, "memory(GiB)": 47.44, "step": 19345, "token_acc": 0.8458904109589042, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.8998456660049584, "grad_norm": 9.73272705078125, "learning_rate": 2.7161072348496576e-07, "loss": 0.6328916549682617, "memory(GiB)": 47.44, "step": 19350, "token_acc": 0.8475350963108064, "train_speed(iter/s)": 0.095839 }, { "epoch": 0.8998456660049584, "eval_loss": 0.5558338165283203, "eval_runtime": 293.162, "eval_samples_per_second": 11.854, "eval_steps_per_second": 11.854, "step": 19350 }, { "epoch": 0.900078184264908, "grad_norm": 9.117722511291504, "learning_rate": 2.703621239277682e-07, "loss": 0.6286519527435303, "memory(GiB)": 47.44, "step": 19355, "token_acc": 0.8351062975044592, "train_speed(iter/s)": 0.095713 }, { "epoch": 0.9003107025248577, "grad_norm": 8.885711669921875, "learning_rate": 2.691163211514708e-07, "loss": 0.7022455215454102, "memory(GiB)": 47.44, "step": 19360, "token_acc": 0.8235294117647058, "train_speed(iter/s)": 0.095727 }, { "epoch": 0.9005432207848073, "grad_norm": 12.919280052185059, "learning_rate": 2.67873315892756e-07, "loss": 0.7374226570129394, "memory(GiB)": 47.44, "step": 19365, "token_acc": 0.8167853128512551, "train_speed(iter/s)": 0.09574 }, { "epoch": 0.9007757390447568, "grad_norm": 8.281429290771484, "learning_rate": 2.6663310888665085e-07, "loss": 0.6170114994049072, "memory(GiB)": 47.44, "step": 19370, "token_acc": 0.846723044397463, "train_speed(iter/s)": 0.095754 }, { "epoch": 0.9010082573047065, "grad_norm": 8.904351234436035, "learning_rate": 2.653957008665298e-07, "loss": 0.6403151988983155, "memory(GiB)": 47.44, "step": 19375, "token_acc": 0.8420386198163976, "train_speed(iter/s)": 0.095768 }, { "epoch": 0.9012407755646561, "grad_norm": 10.579533576965332, "learning_rate": 2.641610925641075e-07, "loss": 0.6673800468444824, "memory(GiB)": 47.44, "step": 19380, "token_acc": 0.8332737030411449, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.9014732938246056, "grad_norm": 8.038885116577148, "learning_rate": 2.629292847094489e-07, "loss": 0.661448621749878, "memory(GiB)": 47.44, "step": 19385, "token_acc": 0.8331445828614572, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.9017058120845552, "grad_norm": 10.71098518371582, "learning_rate": 2.6170027803095685e-07, "loss": 0.623293113708496, "memory(GiB)": 47.44, "step": 19390, "token_acc": 0.8488517745302714, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.9019383303445049, "grad_norm": 11.028908729553223, "learning_rate": 2.60474073255384e-07, "loss": 0.6097889900207519, "memory(GiB)": 47.44, "step": 19395, "token_acc": 0.8461862621154657, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.9021708486044545, "grad_norm": 9.047528266906738, "learning_rate": 2.592506711078213e-07, "loss": 0.6516981601715088, "memory(GiB)": 47.44, "step": 19400, "token_acc": 0.8423571642237512, "train_speed(iter/s)": 0.095838 }, { "epoch": 0.9021708486044545, "eval_loss": 0.5555245876312256, "eval_runtime": 291.9705, "eval_samples_per_second": 11.902, "eval_steps_per_second": 11.902, "step": 19400 }, { "epoch": 0.902403366864404, "grad_norm": 8.492449760437012, "learning_rate": 2.580300723117041e-07, "loss": 0.5659789085388184, "memory(GiB)": 47.44, "step": 19405, "token_acc": 0.8352052408057097, "train_speed(iter/s)": 0.095714 }, { "epoch": 0.9026358851243537, "grad_norm": 8.339861869812012, "learning_rate": 2.56812277588811e-07, "loss": 0.7192893028259277, "memory(GiB)": 47.44, "step": 19410, "token_acc": 0.8159884767734966, "train_speed(iter/s)": 0.095727 }, { "epoch": 0.9028684033843033, "grad_norm": 10.295209884643555, "learning_rate": 2.555972876592616e-07, "loss": 0.6294547080993652, "memory(GiB)": 47.44, "step": 19415, "token_acc": 0.8360107095046854, "train_speed(iter/s)": 0.09574 }, { "epoch": 0.9031009216442529, "grad_norm": 10.894436836242676, "learning_rate": 2.543851032415162e-07, "loss": 0.6193259239196778, "memory(GiB)": 47.44, "step": 19420, "token_acc": 0.8415937803692906, "train_speed(iter/s)": 0.095754 }, { "epoch": 0.9033334399042025, "grad_norm": 8.60874080657959, "learning_rate": 2.531757250523781e-07, "loss": 0.6329953670501709, "memory(GiB)": 47.44, "step": 19425, "token_acc": 0.8524132429198245, "train_speed(iter/s)": 0.095768 }, { "epoch": 0.9035659581641521, "grad_norm": 9.994233131408691, "learning_rate": 2.519691538069885e-07, "loss": 0.6359256744384766, "memory(GiB)": 47.44, "step": 19430, "token_acc": 0.8427230046948356, "train_speed(iter/s)": 0.095782 }, { "epoch": 0.9037984764241017, "grad_norm": 7.38197135925293, "learning_rate": 2.507653902188317e-07, "loss": 0.5808037757873535, "memory(GiB)": 47.44, "step": 19435, "token_acc": 0.8666881859264042, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.9040309946840512, "grad_norm": 7.369668006896973, "learning_rate": 2.495644349997289e-07, "loss": 0.6205277919769288, "memory(GiB)": 47.44, "step": 19440, "token_acc": 0.8471252907942838, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.9042635129440009, "grad_norm": 7.311388969421387, "learning_rate": 2.483662888598426e-07, "loss": 0.7141438484191894, "memory(GiB)": 47.44, "step": 19445, "token_acc": 0.8237831176833025, "train_speed(iter/s)": 0.095823 }, { "epoch": 0.9044960312039505, "grad_norm": 7.697142124176025, "learning_rate": 2.471709525076732e-07, "loss": 0.8054816246032714, "memory(GiB)": 47.44, "step": 19450, "token_acc": 0.7954380883417813, "train_speed(iter/s)": 0.095837 }, { "epoch": 0.9044960312039505, "eval_loss": 0.5556977391242981, "eval_runtime": 292.9688, "eval_samples_per_second": 11.861, "eval_steps_per_second": 11.861, "step": 19450 }, { "epoch": 0.9047285494639001, "grad_norm": 8.828365325927734, "learning_rate": 2.4597842665006146e-07, "loss": 0.601045799255371, "memory(GiB)": 47.44, "step": 19455, "token_acc": 0.8351867989542776, "train_speed(iter/s)": 0.095712 }, { "epoch": 0.9049610677238497, "grad_norm": 6.522864818572998, "learning_rate": 2.447887119921827e-07, "loss": 0.6024109840393066, "memory(GiB)": 47.44, "step": 19460, "token_acc": 0.8365089121081746, "train_speed(iter/s)": 0.095726 }, { "epoch": 0.9051935859837993, "grad_norm": 10.061827659606934, "learning_rate": 2.436018092375542e-07, "loss": 0.7021986961364746, "memory(GiB)": 47.44, "step": 19465, "token_acc": 0.8153013910355487, "train_speed(iter/s)": 0.09574 }, { "epoch": 0.9054261042437489, "grad_norm": 12.953351974487305, "learning_rate": 2.424177190880256e-07, "loss": 0.6061929225921631, "memory(GiB)": 47.44, "step": 19470, "token_acc": 0.8400309119010819, "train_speed(iter/s)": 0.095753 }, { "epoch": 0.9056586225036984, "grad_norm": 11.2939453125, "learning_rate": 2.412364422437874e-07, "loss": 0.6210571765899658, "memory(GiB)": 47.44, "step": 19475, "token_acc": 0.8498431196772748, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.9058911407636481, "grad_norm": 7.610166549682617, "learning_rate": 2.4005797940336585e-07, "loss": 0.5850472450256348, "memory(GiB)": 47.44, "step": 19480, "token_acc": 0.848943661971831, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.9061236590235977, "grad_norm": 11.881199836730957, "learning_rate": 2.388823312636207e-07, "loss": 0.6663021087646485, "memory(GiB)": 47.44, "step": 19485, "token_acc": 0.839766081871345, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.9063561772835473, "grad_norm": 11.320914268493652, "learning_rate": 2.3770949851974977e-07, "loss": 0.7544370174407959, "memory(GiB)": 47.44, "step": 19490, "token_acc": 0.8081852004960728, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.9065886955434969, "grad_norm": 11.551216125488281, "learning_rate": 2.3653948186528552e-07, "loss": 0.6839058876037598, "memory(GiB)": 47.44, "step": 19495, "token_acc": 0.8361934477379095, "train_speed(iter/s)": 0.095822 }, { "epoch": 0.9068212138034465, "grad_norm": 10.214497566223145, "learning_rate": 2.3537228199209505e-07, "loss": 0.640674352645874, "memory(GiB)": 47.44, "step": 19500, "token_acc": 0.8455008488964346, "train_speed(iter/s)": 0.095836 }, { "epoch": 0.9068212138034465, "eval_loss": 0.5556568503379822, "eval_runtime": 295.0302, "eval_samples_per_second": 11.778, "eval_steps_per_second": 11.778, "step": 19500 }, { "epoch": 0.9070537320633961, "grad_norm": 7.023237705230713, "learning_rate": 2.3420789959037903e-07, "loss": 0.8054847717285156, "memory(GiB)": 47.44, "step": 19505, "token_acc": 0.8330771668515291, "train_speed(iter/s)": 0.09571 }, { "epoch": 0.9072862503233458, "grad_norm": 9.729205131530762, "learning_rate": 2.330463353486734e-07, "loss": 0.5784210205078125, "memory(GiB)": 47.44, "step": 19510, "token_acc": 0.8446054750402576, "train_speed(iter/s)": 0.095724 }, { "epoch": 0.9075187685832953, "grad_norm": 11.617768287658691, "learning_rate": 2.3188758995384585e-07, "loss": 0.6469026565551758, "memory(GiB)": 47.44, "step": 19515, "token_acc": 0.8218425869432581, "train_speed(iter/s)": 0.095738 }, { "epoch": 0.9077512868432449, "grad_norm": 9.739715576171875, "learning_rate": 2.3073166409110004e-07, "loss": 0.6391276836395263, "memory(GiB)": 47.44, "step": 19520, "token_acc": 0.8408128704487722, "train_speed(iter/s)": 0.095752 }, { "epoch": 0.9079838051031945, "grad_norm": 8.337206840515137, "learning_rate": 2.2957855844396804e-07, "loss": 0.5951900959014893, "memory(GiB)": 47.44, "step": 19525, "token_acc": 0.8453547046601365, "train_speed(iter/s)": 0.095765 }, { "epoch": 0.9082163233631441, "grad_norm": 8.902191162109375, "learning_rate": 2.2842827369431997e-07, "loss": 0.5920657157897949, "memory(GiB)": 47.44, "step": 19530, "token_acc": 0.8576858108108109, "train_speed(iter/s)": 0.095778 }, { "epoch": 0.9084488416230937, "grad_norm": 7.1681904792785645, "learning_rate": 2.2728081052235228e-07, "loss": 0.6802764892578125, "memory(GiB)": 47.44, "step": 19535, "token_acc": 0.8195467422096318, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.9086813598830433, "grad_norm": 8.618856430053711, "learning_rate": 2.2613616960659723e-07, "loss": 0.6406535148620606, "memory(GiB)": 47.44, "step": 19540, "token_acc": 0.8352027610008628, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.908913878142993, "grad_norm": 8.298696517944336, "learning_rate": 2.2499435162391448e-07, "loss": 0.6611350536346435, "memory(GiB)": 47.44, "step": 19545, "token_acc": 0.8341480446927374, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.9091463964029425, "grad_norm": 8.729609489440918, "learning_rate": 2.238553572494978e-07, "loss": 0.706944465637207, "memory(GiB)": 47.44, "step": 19550, "token_acc": 0.8442392613408269, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.9091463964029425, "eval_loss": 0.5558927059173584, "eval_runtime": 292.7905, "eval_samples_per_second": 11.869, "eval_steps_per_second": 11.869, "step": 19550 }, { "epoch": 0.9093789146628921, "grad_norm": 8.666666984558105, "learning_rate": 2.2271918715686792e-07, "loss": 0.7575130939483643, "memory(GiB)": 47.44, "step": 19555, "token_acc": 0.8341863254698121, "train_speed(iter/s)": 0.095709 }, { "epoch": 0.9096114329228417, "grad_norm": 7.633399486541748, "learning_rate": 2.2158584201787903e-07, "loss": 0.6025700569152832, "memory(GiB)": 47.44, "step": 19560, "token_acc": 0.8566856330014224, "train_speed(iter/s)": 0.095723 }, { "epoch": 0.9098439511827914, "grad_norm": 6.445369720458984, "learning_rate": 2.2045532250271228e-07, "loss": 0.6307368278503418, "memory(GiB)": 47.44, "step": 19565, "token_acc": 0.8414866581956798, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.9100764694427409, "grad_norm": 10.30349063873291, "learning_rate": 2.193276292798796e-07, "loss": 0.5543890953063965, "memory(GiB)": 47.44, "step": 19570, "token_acc": 0.8581584292484766, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.9103089877026905, "grad_norm": 6.994936466217041, "learning_rate": 2.1820276301621978e-07, "loss": 0.6006917953491211, "memory(GiB)": 47.44, "step": 19575, "token_acc": 0.846836191602602, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.9105415059626402, "grad_norm": 8.902931213378906, "learning_rate": 2.1708072437690186e-07, "loss": 0.8253030776977539, "memory(GiB)": 47.44, "step": 19580, "token_acc": 0.8120567375886525, "train_speed(iter/s)": 0.095778 }, { "epoch": 0.9107740242225897, "grad_norm": 11.879782676696777, "learning_rate": 2.1596151402542065e-07, "loss": 0.6510292053222656, "memory(GiB)": 47.44, "step": 19585, "token_acc": 0.8594360086767896, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.9110065424825393, "grad_norm": 8.595463752746582, "learning_rate": 2.1484513262360174e-07, "loss": 0.629924726486206, "memory(GiB)": 47.44, "step": 19590, "token_acc": 0.8402530644523527, "train_speed(iter/s)": 0.095805 }, { "epoch": 0.911239060742489, "grad_norm": 8.18715763092041, "learning_rate": 2.1373158083159374e-07, "loss": 0.6486124038696289, "memory(GiB)": 47.44, "step": 19595, "token_acc": 0.849500998003992, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.9114715790024386, "grad_norm": 7.39414119720459, "learning_rate": 2.1262085930787546e-07, "loss": 0.5645887851715088, "memory(GiB)": 47.44, "step": 19600, "token_acc": 0.8555596601403768, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.9114715790024386, "eval_loss": 0.5564916729927063, "eval_runtime": 291.8498, "eval_samples_per_second": 11.907, "eval_steps_per_second": 11.907, "step": 19600 }, { "epoch": 0.9117040972623881, "grad_norm": 9.974652290344238, "learning_rate": 2.1151296870925042e-07, "loss": 0.655756664276123, "memory(GiB)": 47.44, "step": 19605, "token_acc": 0.8347886103174857, "train_speed(iter/s)": 0.09571 }, { "epoch": 0.9119366155223377, "grad_norm": 9.164752960205078, "learning_rate": 2.1040790969084846e-07, "loss": 0.6285743713378906, "memory(GiB)": 47.44, "step": 19610, "token_acc": 0.8381852551984877, "train_speed(iter/s)": 0.095724 }, { "epoch": 0.9121691337822874, "grad_norm": 7.622852325439453, "learning_rate": 2.093056829061263e-07, "loss": 0.5900651454925537, "memory(GiB)": 47.44, "step": 19615, "token_acc": 0.8457163170991824, "train_speed(iter/s)": 0.095738 }, { "epoch": 0.9124016520422369, "grad_norm": 7.594034671783447, "learning_rate": 2.0820628900686313e-07, "loss": 0.6741018295288086, "memory(GiB)": 47.44, "step": 19620, "token_acc": 0.83187190240183, "train_speed(iter/s)": 0.095752 }, { "epoch": 0.9126341703021865, "grad_norm": 7.638304710388184, "learning_rate": 2.071097286431656e-07, "loss": 0.571320915222168, "memory(GiB)": 47.44, "step": 19625, "token_acc": 0.8505747126436781, "train_speed(iter/s)": 0.095765 }, { "epoch": 0.9128666885621362, "grad_norm": 7.910111904144287, "learning_rate": 2.0601600246346275e-07, "loss": 0.641974401473999, "memory(GiB)": 47.44, "step": 19630, "token_acc": 0.8341346153846154, "train_speed(iter/s)": 0.095779 }, { "epoch": 0.9130992068220858, "grad_norm": 10.283453941345215, "learning_rate": 2.0492511111450953e-07, "loss": 0.5953716278076172, "memory(GiB)": 47.44, "step": 19635, "token_acc": 0.8551294657489348, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.9133317250820353, "grad_norm": 10.159235000610352, "learning_rate": 2.038370552413832e-07, "loss": 0.5992490291595459, "memory(GiB)": 47.44, "step": 19640, "token_acc": 0.8570114942528736, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.9135642433419849, "grad_norm": 7.889383316040039, "learning_rate": 2.027518354874841e-07, "loss": 0.5519330024719238, "memory(GiB)": 47.44, "step": 19645, "token_acc": 0.8550350424197714, "train_speed(iter/s)": 0.09582 }, { "epoch": 0.9137967616019346, "grad_norm": 8.540481567382812, "learning_rate": 2.0166945249453728e-07, "loss": 0.5851781368255615, "memory(GiB)": 47.44, "step": 19650, "token_acc": 0.8482142857142857, "train_speed(iter/s)": 0.095833 }, { "epoch": 0.9137967616019346, "eval_loss": 0.556018054485321, "eval_runtime": 293.7189, "eval_samples_per_second": 11.831, "eval_steps_per_second": 11.831, "step": 19650 }, { "epoch": 0.9140292798618842, "grad_norm": 8.106054306030273, "learning_rate": 2.0058990690258963e-07, "loss": 0.6603738784790039, "memory(GiB)": 47.44, "step": 19655, "token_acc": 0.8345389579901481, "train_speed(iter/s)": 0.09571 }, { "epoch": 0.9142617981218337, "grad_norm": 7.776497840881348, "learning_rate": 1.9951319935000767e-07, "loss": 0.8171131134033203, "memory(GiB)": 47.44, "step": 19660, "token_acc": 0.8034281546316557, "train_speed(iter/s)": 0.095724 }, { "epoch": 0.9144943163817834, "grad_norm": 9.407666206359863, "learning_rate": 1.984393304734844e-07, "loss": 0.5196086883544921, "memory(GiB)": 47.44, "step": 19665, "token_acc": 0.8706225680933852, "train_speed(iter/s)": 0.095737 }, { "epoch": 0.914726834641733, "grad_norm": 9.289840698242188, "learning_rate": 1.9736830090802962e-07, "loss": 0.6363831520080566, "memory(GiB)": 47.44, "step": 19670, "token_acc": 0.8487730061349693, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.9149593529016825, "grad_norm": 8.724823951721191, "learning_rate": 1.963001112869778e-07, "loss": 0.624758243560791, "memory(GiB)": 47.44, "step": 19675, "token_acc": 0.8428571428571429, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.9151918711616321, "grad_norm": 9.431694984436035, "learning_rate": 1.952347622419809e-07, "loss": 0.5540366172790527, "memory(GiB)": 47.44, "step": 19680, "token_acc": 0.8601036269430051, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.9154243894215818, "grad_norm": 9.550405502319336, "learning_rate": 1.9417225440301335e-07, "loss": 0.6447968006134033, "memory(GiB)": 47.44, "step": 19685, "token_acc": 0.8426339285714286, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.9156569076815314, "grad_norm": 9.518305778503418, "learning_rate": 1.9311258839836865e-07, "loss": 0.5623124122619629, "memory(GiB)": 47.44, "step": 19690, "token_acc": 0.8611342785654712, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.9158894259414809, "grad_norm": 7.255014896392822, "learning_rate": 1.9205576485466114e-07, "loss": 0.6957176685333252, "memory(GiB)": 47.44, "step": 19695, "token_acc": 0.8264137437365784, "train_speed(iter/s)": 0.095818 }, { "epoch": 0.9161219442014306, "grad_norm": 10.805427551269531, "learning_rate": 1.9100178439682148e-07, "loss": 0.8065167427062988, "memory(GiB)": 47.44, "step": 19700, "token_acc": 0.7643504531722054, "train_speed(iter/s)": 0.095831 }, { "epoch": 0.9161219442014306, "eval_loss": 0.5560776591300964, "eval_runtime": 294.8996, "eval_samples_per_second": 11.784, "eval_steps_per_second": 11.784, "step": 19700 }, { "epoch": 0.9163544624613802, "grad_norm": 6.1116719245910645, "learning_rate": 1.8995064764810278e-07, "loss": 0.6948729515075683, "memory(GiB)": 47.44, "step": 19705, "token_acc": 0.8349877016883314, "train_speed(iter/s)": 0.095707 }, { "epoch": 0.9165869807213298, "grad_norm": 8.436870574951172, "learning_rate": 1.8890235523007283e-07, "loss": 0.6319416999816895, "memory(GiB)": 47.44, "step": 19710, "token_acc": 0.8398151713515595, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.9168194989812793, "grad_norm": 9.345169067382812, "learning_rate": 1.8785690776262023e-07, "loss": 0.6956872463226318, "memory(GiB)": 47.44, "step": 19715, "token_acc": 0.8270460358056266, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.917052017241229, "grad_norm": 7.356499195098877, "learning_rate": 1.8681430586394988e-07, "loss": 0.5555019855499268, "memory(GiB)": 47.44, "step": 19720, "token_acc": 0.8647566235366605, "train_speed(iter/s)": 0.095748 }, { "epoch": 0.9172845355011786, "grad_norm": 6.536581039428711, "learning_rate": 1.8577455015058477e-07, "loss": 0.9057181358337403, "memory(GiB)": 47.44, "step": 19725, "token_acc": 0.7832127351664254, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.9175170537611281, "grad_norm": 10.982614517211914, "learning_rate": 1.847376412373647e-07, "loss": 0.5344199657440185, "memory(GiB)": 47.44, "step": 19730, "token_acc": 0.8694404591104734, "train_speed(iter/s)": 0.095775 }, { "epoch": 0.9177495720210778, "grad_norm": 9.253430366516113, "learning_rate": 1.837035797374459e-07, "loss": 0.7098407745361328, "memory(GiB)": 47.44, "step": 19735, "token_acc": 0.8177966101694916, "train_speed(iter/s)": 0.095789 }, { "epoch": 0.9179820902810274, "grad_norm": 8.579312324523926, "learning_rate": 1.826723662623009e-07, "loss": 0.6048952102661133, "memory(GiB)": 47.44, "step": 19740, "token_acc": 0.8423377505946313, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.918214608540977, "grad_norm": 8.829182624816895, "learning_rate": 1.8164400142171744e-07, "loss": 0.5817587852478028, "memory(GiB)": 47.44, "step": 19745, "token_acc": 0.8563344860106885, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.9184471268009265, "grad_norm": 11.861217498779297, "learning_rate": 1.8061848582380081e-07, "loss": 0.6502750396728516, "memory(GiB)": 47.44, "step": 19750, "token_acc": 0.8443877551020408, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.9184471268009265, "eval_loss": 0.5561826229095459, "eval_runtime": 293.7773, "eval_samples_per_second": 11.829, "eval_steps_per_second": 11.829, "step": 19750 }, { "epoch": 0.9186796450608762, "grad_norm": 8.321170806884766, "learning_rate": 1.7959582007496813e-07, "loss": 0.6396979331970215, "memory(GiB)": 47.44, "step": 19755, "token_acc": 0.8350863981927277, "train_speed(iter/s)": 0.095707 }, { "epoch": 0.9189121633208258, "grad_norm": 6.915319442749023, "learning_rate": 1.7857600477995507e-07, "loss": 0.5526315212249756, "memory(GiB)": 47.44, "step": 19760, "token_acc": 0.8581314878892734, "train_speed(iter/s)": 0.09572 }, { "epoch": 0.9191446815807753, "grad_norm": 8.197853088378906, "learning_rate": 1.7755904054180817e-07, "loss": 0.6716443061828613, "memory(GiB)": 47.44, "step": 19765, "token_acc": 0.8348387096774194, "train_speed(iter/s)": 0.095734 }, { "epoch": 0.919377199840725, "grad_norm": 8.290502548217773, "learning_rate": 1.7654492796189082e-07, "loss": 0.7327411651611329, "memory(GiB)": 47.44, "step": 19770, "token_acc": 0.8147554129911788, "train_speed(iter/s)": 0.095748 }, { "epoch": 0.9196097181006746, "grad_norm": 7.795845985412598, "learning_rate": 1.755336676398789e-07, "loss": 0.5660604000091553, "memory(GiB)": 47.44, "step": 19775, "token_acc": 0.8619830592924763, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.9198422363606242, "grad_norm": 7.266628742218018, "learning_rate": 1.7452526017376238e-07, "loss": 0.6886507034301758, "memory(GiB)": 47.44, "step": 19780, "token_acc": 0.8298239558163617, "train_speed(iter/s)": 0.095776 }, { "epoch": 0.9200747546205738, "grad_norm": 7.9375691413879395, "learning_rate": 1.7351970615984258e-07, "loss": 0.6057173728942871, "memory(GiB)": 47.44, "step": 19785, "token_acc": 0.8486120077469335, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.9203072728805234, "grad_norm": 8.474169731140137, "learning_rate": 1.7251700619273616e-07, "loss": 0.6554636478424072, "memory(GiB)": 47.44, "step": 19790, "token_acc": 0.8410298324478954, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.920539791140473, "grad_norm": 10.37956714630127, "learning_rate": 1.7151716086536873e-07, "loss": 0.6464309215545654, "memory(GiB)": 47.44, "step": 19795, "token_acc": 0.8384670487106017, "train_speed(iter/s)": 0.095817 }, { "epoch": 0.9207723094004227, "grad_norm": 7.706953525543213, "learning_rate": 1.705201707689813e-07, "loss": 0.617960786819458, "memory(GiB)": 47.44, "step": 19800, "token_acc": 0.8595564941921858, "train_speed(iter/s)": 0.09583 }, { "epoch": 0.9207723094004227, "eval_loss": 0.5560248494148254, "eval_runtime": 293.0751, "eval_samples_per_second": 11.857, "eval_steps_per_second": 11.857, "step": 19800 }, { "epoch": 0.9210048276603722, "grad_norm": 10.098922729492188, "learning_rate": 1.6952603649312392e-07, "loss": 0.6413202285766602, "memory(GiB)": 47.44, "step": 19805, "token_acc": 0.8346519114043657, "train_speed(iter/s)": 0.095708 }, { "epoch": 0.9212373459203218, "grad_norm": 10.483457565307617, "learning_rate": 1.6853475862565916e-07, "loss": 0.7492653369903565, "memory(GiB)": 47.44, "step": 19810, "token_acc": 0.8183652875882946, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.9214698641802714, "grad_norm": 8.322340965270996, "learning_rate": 1.675463377527603e-07, "loss": 0.5719735145568847, "memory(GiB)": 47.44, "step": 19815, "token_acc": 0.8572583906186818, "train_speed(iter/s)": 0.095735 }, { "epoch": 0.921702382440221, "grad_norm": 11.469117164611816, "learning_rate": 1.665607744589115e-07, "loss": 0.6902899742126465, "memory(GiB)": 47.44, "step": 19820, "token_acc": 0.8308492201039861, "train_speed(iter/s)": 0.095748 }, { "epoch": 0.9219349007001706, "grad_norm": 7.651955604553223, "learning_rate": 1.65578069326906e-07, "loss": 0.7511250495910644, "memory(GiB)": 47.44, "step": 19825, "token_acc": 0.8129323308270676, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.9221674189601202, "grad_norm": 7.940315246582031, "learning_rate": 1.64598222937849e-07, "loss": 0.49587244987487794, "memory(GiB)": 47.44, "step": 19830, "token_acc": 0.8828061638280617, "train_speed(iter/s)": 0.095775 }, { "epoch": 0.9223999372200699, "grad_norm": 8.776915550231934, "learning_rate": 1.6362123587115198e-07, "loss": 0.6938657283782959, "memory(GiB)": 47.44, "step": 19835, "token_acc": 0.8298710601719198, "train_speed(iter/s)": 0.095789 }, { "epoch": 0.9226324554800194, "grad_norm": 10.612738609313965, "learning_rate": 1.6264710870453893e-07, "loss": 0.560774564743042, "memory(GiB)": 47.44, "step": 19840, "token_acc": 0.8556254917387883, "train_speed(iter/s)": 0.095802 }, { "epoch": 0.922864973739969, "grad_norm": 7.882064342498779, "learning_rate": 1.6167584201404074e-07, "loss": 0.598447322845459, "memory(GiB)": 47.44, "step": 19845, "token_acc": 0.844487552537989, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.9230974919999186, "grad_norm": 11.58521842956543, "learning_rate": 1.6070743637399745e-07, "loss": 0.7264031887054443, "memory(GiB)": 47.44, "step": 19850, "token_acc": 0.8180930686625447, "train_speed(iter/s)": 0.095829 }, { "epoch": 0.9230974919999186, "eval_loss": 0.555980384349823, "eval_runtime": 294.0643, "eval_samples_per_second": 11.817, "eval_steps_per_second": 11.817, "step": 19850 }, { "epoch": 0.9233300102598683, "grad_norm": 6.906254768371582, "learning_rate": 1.5974189235705761e-07, "loss": 0.570603322982788, "memory(GiB)": 47.44, "step": 19855, "token_acc": 0.8350892057807904, "train_speed(iter/s)": 0.095706 }, { "epoch": 0.9235625285198178, "grad_norm": 6.352476119995117, "learning_rate": 1.5877921053417732e-07, "loss": 0.5671680450439454, "memory(GiB)": 47.44, "step": 19860, "token_acc": 0.853103448275862, "train_speed(iter/s)": 0.09572 }, { "epoch": 0.9237950467797674, "grad_norm": 8.458511352539062, "learning_rate": 1.57819391474619e-07, "loss": 0.7611868858337403, "memory(GiB)": 47.44, "step": 19865, "token_acc": 0.8183606557377049, "train_speed(iter/s)": 0.095733 }, { "epoch": 0.9240275650397171, "grad_norm": 11.120638847351074, "learning_rate": 1.5686243574595416e-07, "loss": 0.5099010467529297, "memory(GiB)": 47.44, "step": 19870, "token_acc": 0.8711592178770949, "train_speed(iter/s)": 0.095747 }, { "epoch": 0.9242600832996666, "grad_norm": 9.012869834899902, "learning_rate": 1.5590834391406072e-07, "loss": 0.6600170612335206, "memory(GiB)": 47.44, "step": 19875, "token_acc": 0.8373001776198934, "train_speed(iter/s)": 0.09576 }, { "epoch": 0.9244926015596162, "grad_norm": 7.926135063171387, "learning_rate": 1.5495711654312128e-07, "loss": 0.6755642890930176, "memory(GiB)": 47.44, "step": 19880, "token_acc": 0.8345549738219895, "train_speed(iter/s)": 0.095774 }, { "epoch": 0.9247251198195658, "grad_norm": 12.261335372924805, "learning_rate": 1.54008754195627e-07, "loss": 0.5872795581817627, "memory(GiB)": 47.44, "step": 19885, "token_acc": 0.8427717200140696, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.9249576380795155, "grad_norm": 7.186399459838867, "learning_rate": 1.5306325743237316e-07, "loss": 0.666871976852417, "memory(GiB)": 47.44, "step": 19890, "token_acc": 0.8331595411887383, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.925190156339465, "grad_norm": 8.706633567810059, "learning_rate": 1.5212062681246252e-07, "loss": 0.6248205661773681, "memory(GiB)": 47.44, "step": 19895, "token_acc": 0.8479046242774566, "train_speed(iter/s)": 0.095815 }, { "epoch": 0.9254226745994146, "grad_norm": 7.308403491973877, "learning_rate": 1.511808628932998e-07, "loss": 0.6236941814422607, "memory(GiB)": 47.44, "step": 19900, "token_acc": 0.8420720151610865, "train_speed(iter/s)": 0.095828 }, { "epoch": 0.9254226745994146, "eval_loss": 0.5559277534484863, "eval_runtime": 293.4352, "eval_samples_per_second": 11.842, "eval_steps_per_second": 11.842, "step": 19900 }, { "epoch": 0.9256551928593643, "grad_norm": 8.53604793548584, "learning_rate": 1.5024396623059767e-07, "loss": 0.7425576210021972, "memory(GiB)": 47.44, "step": 19905, "token_acc": 0.834554529892652, "train_speed(iter/s)": 0.095706 }, { "epoch": 0.9258877111193138, "grad_norm": 8.350555419921875, "learning_rate": 1.493099373783713e-07, "loss": 0.6046389102935791, "memory(GiB)": 47.44, "step": 19910, "token_acc": 0.8460446967009578, "train_speed(iter/s)": 0.09572 }, { "epoch": 0.9261202293792634, "grad_norm": 9.9799222946167, "learning_rate": 1.483787768889422e-07, "loss": 0.6118096828460693, "memory(GiB)": 47.44, "step": 19915, "token_acc": 0.8490630323679728, "train_speed(iter/s)": 0.095733 }, { "epoch": 0.926352747639213, "grad_norm": 7.102067947387695, "learning_rate": 1.4745048531293217e-07, "loss": 0.5712886810302734, "memory(GiB)": 47.44, "step": 19920, "token_acc": 0.8495081967213115, "train_speed(iter/s)": 0.095747 }, { "epoch": 0.9265852658991627, "grad_norm": 10.693116188049316, "learning_rate": 1.465250631992704e-07, "loss": 0.5446754455566406, "memory(GiB)": 47.44, "step": 19925, "token_acc": 0.8637228778073849, "train_speed(iter/s)": 0.09576 }, { "epoch": 0.9268177841591122, "grad_norm": 9.30135726928711, "learning_rate": 1.4560251109518642e-07, "loss": 0.566897201538086, "memory(GiB)": 47.44, "step": 19930, "token_acc": 0.8544303797468354, "train_speed(iter/s)": 0.095773 }, { "epoch": 0.9270503024190618, "grad_norm": 12.602972030639648, "learning_rate": 1.4468282954621493e-07, "loss": 0.7747231960296631, "memory(GiB)": 47.44, "step": 19935, "token_acc": 0.8014571948998178, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.9272828206790115, "grad_norm": 10.188447952270508, "learning_rate": 1.4376601909619092e-07, "loss": 0.6704387664794922, "memory(GiB)": 47.44, "step": 19940, "token_acc": 0.8310099573257468, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.9275153389389611, "grad_norm": 11.103850364685059, "learning_rate": 1.4285208028725406e-07, "loss": 0.6308998107910156, "memory(GiB)": 47.44, "step": 19945, "token_acc": 0.8469015795868773, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.9277478571989106, "grad_norm": 8.788395881652832, "learning_rate": 1.419410136598426e-07, "loss": 0.5965378284454346, "memory(GiB)": 47.44, "step": 19950, "token_acc": 0.8495362418412916, "train_speed(iter/s)": 0.095827 }, { "epoch": 0.9277478571989106, "eval_loss": 0.55594402551651, "eval_runtime": 295.1143, "eval_samples_per_second": 11.775, "eval_steps_per_second": 11.775, "step": 19950 }, { "epoch": 0.9279803754588603, "grad_norm": 9.410270690917969, "learning_rate": 1.4103281975270055e-07, "loss": 0.600709056854248, "memory(GiB)": 47.44, "step": 19955, "token_acc": 0.8349722480669897, "train_speed(iter/s)": 0.095705 }, { "epoch": 0.9282128937188099, "grad_norm": 8.508853912353516, "learning_rate": 1.4012749910286948e-07, "loss": 0.6112794399261474, "memory(GiB)": 47.44, "step": 19960, "token_acc": 0.8422978412001464, "train_speed(iter/s)": 0.095718 }, { "epoch": 0.9284454119787594, "grad_norm": 10.257251739501953, "learning_rate": 1.3922505224569338e-07, "loss": 0.6842128276824951, "memory(GiB)": 47.44, "step": 19965, "token_acc": 0.8187641296156745, "train_speed(iter/s)": 0.095732 }, { "epoch": 0.928677930238709, "grad_norm": 8.22706127166748, "learning_rate": 1.3832547971481813e-07, "loss": 0.715480899810791, "memory(GiB)": 47.44, "step": 19970, "token_acc": 0.8302420622445772, "train_speed(iter/s)": 0.095745 }, { "epoch": 0.9289104484986587, "grad_norm": 6.012380123138428, "learning_rate": 1.3742878204218823e-07, "loss": 0.6090039730072021, "memory(GiB)": 47.44, "step": 19975, "token_acc": 0.832556471853711, "train_speed(iter/s)": 0.095759 }, { "epoch": 0.9291429667586083, "grad_norm": 7.759617328643799, "learning_rate": 1.3653495975804786e-07, "loss": 0.6397760391235352, "memory(GiB)": 47.44, "step": 19980, "token_acc": 0.8429378531073446, "train_speed(iter/s)": 0.095772 }, { "epoch": 0.9293754850185578, "grad_norm": 7.183525562286377, "learning_rate": 1.3564401339094312e-07, "loss": 0.6927905559539795, "memory(GiB)": 47.44, "step": 19985, "token_acc": 0.8360916613621897, "train_speed(iter/s)": 0.095785 }, { "epoch": 0.9296080032785075, "grad_norm": 5.6819329261779785, "learning_rate": 1.3475594346771703e-07, "loss": 0.5243013381958008, "memory(GiB)": 47.44, "step": 19990, "token_acc": 0.8552537526804861, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.9298405215384571, "grad_norm": 12.953128814697266, "learning_rate": 1.338707505135134e-07, "loss": 0.6449570178985595, "memory(GiB)": 47.44, "step": 19995, "token_acc": 0.8425643262102049, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.9300730397984067, "grad_norm": 8.761154174804688, "learning_rate": 1.329884350517735e-07, "loss": 0.5618272304534913, "memory(GiB)": 47.44, "step": 20000, "token_acc": 0.8585131894484412, "train_speed(iter/s)": 0.095826 }, { "epoch": 0.9300730397984067, "eval_loss": 0.5558544397354126, "eval_runtime": 294.6168, "eval_samples_per_second": 11.795, "eval_steps_per_second": 11.795, "step": 20000 }, { "epoch": 0.9303055580583562, "grad_norm": 8.427507400512695, "learning_rate": 1.3210899760423834e-07, "loss": 0.6308176517486572, "memory(GiB)": 47.44, "step": 20005, "token_acc": 0.8346826826826826, "train_speed(iter/s)": 0.095704 }, { "epoch": 0.9305380763183059, "grad_norm": 8.652649879455566, "learning_rate": 1.312324386909458e-07, "loss": 0.6576950550079346, "memory(GiB)": 47.44, "step": 20010, "token_acc": 0.8384502923976608, "train_speed(iter/s)": 0.095717 }, { "epoch": 0.9307705945782555, "grad_norm": 7.4929304122924805, "learning_rate": 1.3035875883023298e-07, "loss": 0.6439097404479981, "memory(GiB)": 47.44, "step": 20015, "token_acc": 0.8359495229301324, "train_speed(iter/s)": 0.095731 }, { "epoch": 0.931003112838205, "grad_norm": 8.559335708618164, "learning_rate": 1.2948795853873374e-07, "loss": 0.5946200370788575, "memory(GiB)": 47.44, "step": 20020, "token_acc": 0.8393613554903877, "train_speed(iter/s)": 0.095744 }, { "epoch": 0.9312356310981547, "grad_norm": 7.127233982086182, "learning_rate": 1.2862003833137848e-07, "loss": 0.7438712596893311, "memory(GiB)": 47.44, "step": 20025, "token_acc": 0.8122285332442366, "train_speed(iter/s)": 0.095757 }, { "epoch": 0.9314681493581043, "grad_norm": 9.78085708618164, "learning_rate": 1.2775499872139553e-07, "loss": 0.7858536720275879, "memory(GiB)": 47.44, "step": 20030, "token_acc": 0.7917398945518453, "train_speed(iter/s)": 0.095771 }, { "epoch": 0.9317006676180539, "grad_norm": 10.972649574279785, "learning_rate": 1.2689284022030956e-07, "loss": 0.581892728805542, "memory(GiB)": 47.44, "step": 20035, "token_acc": 0.8570875290472502, "train_speed(iter/s)": 0.095785 }, { "epoch": 0.9319331858780034, "grad_norm": 7.946092128753662, "learning_rate": 1.260335633379417e-07, "loss": 0.6852340698242188, "memory(GiB)": 47.44, "step": 20040, "token_acc": 0.8281821203057494, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.9321657041379531, "grad_norm": 10.450540542602539, "learning_rate": 1.2517716858240824e-07, "loss": 0.6736576080322265, "memory(GiB)": 47.44, "step": 20045, "token_acc": 0.8368560105680317, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.9323982223979027, "grad_norm": 8.191557884216309, "learning_rate": 1.2432365646012245e-07, "loss": 0.6913845062255859, "memory(GiB)": 47.44, "step": 20050, "token_acc": 0.8282527881040892, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.9323982223979027, "eval_loss": 0.5560406446456909, "eval_runtime": 293.3635, "eval_samples_per_second": 11.845, "eval_steps_per_second": 11.845, "step": 20050 }, { "epoch": 0.9326307406578523, "grad_norm": 6.211060047149658, "learning_rate": 1.2347302747579225e-07, "loss": 0.7071500778198242, "memory(GiB)": 47.44, "step": 20055, "token_acc": 0.8342621275508575, "train_speed(iter/s)": 0.095703 }, { "epoch": 0.9328632589178019, "grad_norm": 8.84935474395752, "learning_rate": 1.2262528213242142e-07, "loss": 0.6677957057952881, "memory(GiB)": 47.44, "step": 20060, "token_acc": 0.8243108601793424, "train_speed(iter/s)": 0.095717 }, { "epoch": 0.9330957771777515, "grad_norm": 9.605239868164062, "learning_rate": 1.217804209313067e-07, "loss": 0.569607925415039, "memory(GiB)": 47.44, "step": 20065, "token_acc": 0.8461019237259534, "train_speed(iter/s)": 0.09573 }, { "epoch": 0.9333282954377011, "grad_norm": 8.329422950744629, "learning_rate": 1.2093844437204182e-07, "loss": 0.6272897720336914, "memory(GiB)": 47.44, "step": 20070, "token_acc": 0.8355795148247979, "train_speed(iter/s)": 0.095743 }, { "epoch": 0.9335608136976506, "grad_norm": 11.321281433105469, "learning_rate": 1.200993529525124e-07, "loss": 0.695442008972168, "memory(GiB)": 47.44, "step": 20075, "token_acc": 0.8292591199699135, "train_speed(iter/s)": 0.095757 }, { "epoch": 0.9337933319576003, "grad_norm": 9.605953216552734, "learning_rate": 1.192631471689004e-07, "loss": 0.5914440631866456, "memory(GiB)": 47.44, "step": 20080, "token_acc": 0.8651951123374064, "train_speed(iter/s)": 0.09577 }, { "epoch": 0.9340258502175499, "grad_norm": 8.124171257019043, "learning_rate": 1.1842982751567866e-07, "loss": 0.6247981548309326, "memory(GiB)": 47.44, "step": 20085, "token_acc": 0.8412457273072541, "train_speed(iter/s)": 0.095784 }, { "epoch": 0.9342583684774995, "grad_norm": 8.131589889526367, "learning_rate": 1.1759939448561575e-07, "loss": 0.5428094387054443, "memory(GiB)": 47.44, "step": 20090, "token_acc": 0.8647786198643798, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.9344908867374491, "grad_norm": 10.977629661560059, "learning_rate": 1.167718485697722e-07, "loss": 0.6608882427215577, "memory(GiB)": 47.44, "step": 20095, "token_acc": 0.8396728016359918, "train_speed(iter/s)": 0.095811 }, { "epoch": 0.9347234049973987, "grad_norm": 9.835230827331543, "learning_rate": 1.1594719025750156e-07, "loss": 0.6709197521209717, "memory(GiB)": 47.44, "step": 20100, "token_acc": 0.831494184312556, "train_speed(iter/s)": 0.095824 }, { "epoch": 0.9347234049973987, "eval_loss": 0.556010901927948, "eval_runtime": 294.2096, "eval_samples_per_second": 11.811, "eval_steps_per_second": 11.811, "step": 20100 }, { "epoch": 0.9349559232573483, "grad_norm": 8.349122047424316, "learning_rate": 1.1512542003644933e-07, "loss": 0.6077350616455078, "memory(GiB)": 47.44, "step": 20105, "token_acc": 0.834884296794498, "train_speed(iter/s)": 0.095703 }, { "epoch": 0.9351884415172979, "grad_norm": 9.29889965057373, "learning_rate": 1.1430653839255402e-07, "loss": 0.675541353225708, "memory(GiB)": 47.44, "step": 20110, "token_acc": 0.8397224058162591, "train_speed(iter/s)": 0.095716 }, { "epoch": 0.9354209597772475, "grad_norm": 8.753215789794922, "learning_rate": 1.1349054581004548e-07, "loss": 0.5333482265472412, "memory(GiB)": 47.44, "step": 20115, "token_acc": 0.866890756302521, "train_speed(iter/s)": 0.09573 }, { "epoch": 0.9356534780371971, "grad_norm": 7.72360372543335, "learning_rate": 1.1267744277144554e-07, "loss": 0.5998239994049073, "memory(GiB)": 47.44, "step": 20120, "token_acc": 0.844138303619665, "train_speed(iter/s)": 0.095743 }, { "epoch": 0.9358859962971467, "grad_norm": 12.028573989868164, "learning_rate": 1.1186722975756626e-07, "loss": 0.656134843826294, "memory(GiB)": 47.44, "step": 20125, "token_acc": 0.8383878691141261, "train_speed(iter/s)": 0.095756 }, { "epoch": 0.9361185145570963, "grad_norm": 8.484295845031738, "learning_rate": 1.110599072475127e-07, "loss": 0.5953815460205079, "memory(GiB)": 47.44, "step": 20130, "token_acc": 0.8507862161257945, "train_speed(iter/s)": 0.095769 }, { "epoch": 0.9363510328170459, "grad_norm": 9.174735069274902, "learning_rate": 1.1025547571867856e-07, "loss": 0.6517370700836181, "memory(GiB)": 47.44, "step": 20135, "token_acc": 0.8399729912221472, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.9365835510769955, "grad_norm": 8.110981941223145, "learning_rate": 1.0945393564675055e-07, "loss": 0.6983220100402832, "memory(GiB)": 47.44, "step": 20140, "token_acc": 0.8345406023637056, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.9368160693369452, "grad_norm": 7.590916633605957, "learning_rate": 1.0865528750570286e-07, "loss": 0.5777697563171387, "memory(GiB)": 47.44, "step": 20145, "token_acc": 0.8619783108774236, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.9370485875968947, "grad_norm": 8.5171480178833, "learning_rate": 1.0785953176780107e-07, "loss": 0.5863373756408692, "memory(GiB)": 47.44, "step": 20150, "token_acc": 0.8566164154103852, "train_speed(iter/s)": 0.095823 }, { "epoch": 0.9370485875968947, "eval_loss": 0.5557586550712585, "eval_runtime": 294.9179, "eval_samples_per_second": 11.783, "eval_steps_per_second": 11.783, "step": 20150 }, { "epoch": 0.9372811058568443, "grad_norm": 9.852570533752441, "learning_rate": 1.0706666890359985e-07, "loss": 0.5910586357116699, "memory(GiB)": 47.44, "step": 20155, "token_acc": 0.835281461738544, "train_speed(iter/s)": 0.095702 }, { "epoch": 0.937513624116794, "grad_norm": 7.415332794189453, "learning_rate": 1.0627669938194418e-07, "loss": 0.5437919616699218, "memory(GiB)": 47.44, "step": 20160, "token_acc": 0.8614267676767676, "train_speed(iter/s)": 0.095715 }, { "epoch": 0.9377461423767435, "grad_norm": 12.079816818237305, "learning_rate": 1.0548962366996707e-07, "loss": 0.766736364364624, "memory(GiB)": 47.44, "step": 20165, "token_acc": 0.7975959674292361, "train_speed(iter/s)": 0.095729 }, { "epoch": 0.9379786606366931, "grad_norm": 8.337579727172852, "learning_rate": 1.047054422330901e-07, "loss": 0.625864315032959, "memory(GiB)": 47.44, "step": 20170, "token_acc": 0.837708066581306, "train_speed(iter/s)": 0.095742 }, { "epoch": 0.9382111788966427, "grad_norm": 8.03079605102539, "learning_rate": 1.0392415553502455e-07, "loss": 0.5572741985321045, "memory(GiB)": 47.44, "step": 20175, "token_acc": 0.8698746187732972, "train_speed(iter/s)": 0.095755 }, { "epoch": 0.9384436971565924, "grad_norm": 7.4281182289123535, "learning_rate": 1.0314576403776977e-07, "loss": 0.5888198375701904, "memory(GiB)": 47.44, "step": 20180, "token_acc": 0.8555512869765655, "train_speed(iter/s)": 0.095768 }, { "epoch": 0.9386762154165419, "grad_norm": 8.945042610168457, "learning_rate": 1.02370268201612e-07, "loss": 0.6920148372650147, "memory(GiB)": 47.44, "step": 20185, "token_acc": 0.828472755180353, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.9389087336764915, "grad_norm": 7.534639358520508, "learning_rate": 1.0159766848512608e-07, "loss": 0.6225242137908935, "memory(GiB)": 47.44, "step": 20190, "token_acc": 0.8326589595375723, "train_speed(iter/s)": 0.095795 }, { "epoch": 0.9391412519364412, "grad_norm": 8.891571044921875, "learning_rate": 1.0082796534517436e-07, "loss": 0.5637755393981934, "memory(GiB)": 47.44, "step": 20195, "token_acc": 0.8522033898305085, "train_speed(iter/s)": 0.095808 }, { "epoch": 0.9393737701963908, "grad_norm": 9.549708366394043, "learning_rate": 1.0006115923690551e-07, "loss": 0.7433938026428223, "memory(GiB)": 47.44, "step": 20200, "token_acc": 0.831905344757242, "train_speed(iter/s)": 0.095821 }, { "epoch": 0.9393737701963908, "eval_loss": 0.5558546185493469, "eval_runtime": 294.94, "eval_samples_per_second": 11.782, "eval_steps_per_second": 11.782, "step": 20200 }, { "epoch": 0.9396062884563403, "grad_norm": 7.152247905731201, "learning_rate": 9.929725061375627e-08, "loss": 0.6077425479888916, "memory(GiB)": 47.44, "step": 20205, "token_acc": 0.835157187425006, "train_speed(iter/s)": 0.0957 }, { "epoch": 0.9398388067162899, "grad_norm": 10.471779823303223, "learning_rate": 9.85362399274481e-08, "loss": 0.5760631084442138, "memory(GiB)": 47.44, "step": 20210, "token_acc": 0.868144690781797, "train_speed(iter/s)": 0.095713 }, { "epoch": 0.9400713249762396, "grad_norm": 8.634541511535645, "learning_rate": 9.777812762799211e-08, "loss": 0.6034083843231202, "memory(GiB)": 47.44, "step": 20215, "token_acc": 0.8574080950612699, "train_speed(iter/s)": 0.095727 }, { "epoch": 0.9403038432361891, "grad_norm": 7.538180828094482, "learning_rate": 9.702291416368193e-08, "loss": 0.6724761486053467, "memory(GiB)": 47.44, "step": 20220, "token_acc": 0.8304682868998222, "train_speed(iter/s)": 0.09574 }, { "epoch": 0.9405363614961387, "grad_norm": 10.402557373046875, "learning_rate": 9.627059998109978e-08, "loss": 0.7383048057556152, "memory(GiB)": 47.44, "step": 20225, "token_acc": 0.8202247191011236, "train_speed(iter/s)": 0.095753 }, { "epoch": 0.9407688797560884, "grad_norm": 8.302450180053711, "learning_rate": 9.552118552511147e-08, "loss": 0.6783876895904541, "memory(GiB)": 47.44, "step": 20230, "token_acc": 0.8281358281358281, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.941001398016038, "grad_norm": 8.443572998046875, "learning_rate": 9.477467123886919e-08, "loss": 0.540089750289917, "memory(GiB)": 47.44, "step": 20235, "token_acc": 0.8699566522174058, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.9412339162759875, "grad_norm": 11.566424369812012, "learning_rate": 9.403105756380926e-08, "loss": 0.7472939968109131, "memory(GiB)": 47.44, "step": 20240, "token_acc": 0.8125202724618877, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.9414664345359371, "grad_norm": 10.177603721618652, "learning_rate": 9.329034493965383e-08, "loss": 0.7187521457672119, "memory(GiB)": 47.44, "step": 20245, "token_acc": 0.8234134804887663, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.9416989527958868, "grad_norm": 10.402929306030273, "learning_rate": 9.255253380440921e-08, "loss": 0.6262013912200928, "memory(GiB)": 47.44, "step": 20250, "token_acc": 0.8423997513211067, "train_speed(iter/s)": 0.095819 }, { "epoch": 0.9416989527958868, "eval_loss": 0.5554139018058777, "eval_runtime": 294.8259, "eval_samples_per_second": 11.787, "eval_steps_per_second": 11.787, "step": 20250 }, { "epoch": 0.9419314710558363, "grad_norm": 8.747036933898926, "learning_rate": 9.181762459436694e-08, "loss": 0.5820370674133301, "memory(GiB)": 47.44, "step": 20255, "token_acc": 0.8354214893126188, "train_speed(iter/s)": 0.095699 }, { "epoch": 0.9421639893157859, "grad_norm": 9.884243965148926, "learning_rate": 9.108561774409941e-08, "loss": 0.6614628314971924, "memory(GiB)": 47.44, "step": 20260, "token_acc": 0.832986832986833, "train_speed(iter/s)": 0.095712 }, { "epoch": 0.9423965075757356, "grad_norm": 7.948294162750244, "learning_rate": 9.035651368646647e-08, "loss": 0.6848249912261963, "memory(GiB)": 47.44, "step": 20265, "token_acc": 0.8344437041972018, "train_speed(iter/s)": 0.095725 }, { "epoch": 0.9426290258356852, "grad_norm": 6.9303693771362305, "learning_rate": 8.963031285260937e-08, "loss": 0.541897201538086, "memory(GiB)": 47.44, "step": 20270, "token_acc": 0.8590909090909091, "train_speed(iter/s)": 0.095738 }, { "epoch": 0.9428615440956347, "grad_norm": 6.2741241455078125, "learning_rate": 8.890701567195292e-08, "loss": 0.7256217002868652, "memory(GiB)": 47.44, "step": 20275, "token_acc": 0.8132061260356516, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.9430940623555844, "grad_norm": 9.887829780578613, "learning_rate": 8.818662257220556e-08, "loss": 0.6676210880279541, "memory(GiB)": 47.44, "step": 20280, "token_acc": 0.8412804268089363, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.943326580615534, "grad_norm": 11.911397933959961, "learning_rate": 8.746913397935708e-08, "loss": 0.5624986171722413, "memory(GiB)": 47.44, "step": 20285, "token_acc": 0.8604972375690608, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.9435590988754836, "grad_norm": 13.02669906616211, "learning_rate": 8.675455031768143e-08, "loss": 0.6928697109222413, "memory(GiB)": 47.44, "step": 20290, "token_acc": 0.8278443113772455, "train_speed(iter/s)": 0.095791 }, { "epoch": 0.9437916171354331, "grad_norm": 9.055578231811523, "learning_rate": 8.604287200973394e-08, "loss": 0.58877592086792, "memory(GiB)": 47.44, "step": 20295, "token_acc": 0.8360365673842525, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.9440241353953828, "grad_norm": 9.44039535522461, "learning_rate": 8.533409947635185e-08, "loss": 0.6449069023132324, "memory(GiB)": 47.44, "step": 20300, "token_acc": 0.8507737656595431, "train_speed(iter/s)": 0.095817 }, { "epoch": 0.9440241353953828, "eval_loss": 0.5554980635643005, "eval_runtime": 295.2399, "eval_samples_per_second": 11.77, "eval_steps_per_second": 11.77, "step": 20300 }, { "epoch": 0.9442566536553324, "grad_norm": 10.0753755569458, "learning_rate": 8.46282331366538e-08, "loss": 0.6532928943634033, "memory(GiB)": 47.44, "step": 20305, "token_acc": 0.8349886082854667, "train_speed(iter/s)": 0.095697 }, { "epoch": 0.9444891719152819, "grad_norm": 8.442564010620117, "learning_rate": 8.392527340804146e-08, "loss": 0.5748683452606201, "memory(GiB)": 47.44, "step": 20310, "token_acc": 0.8664459161147903, "train_speed(iter/s)": 0.09571 }, { "epoch": 0.9447216901752316, "grad_norm": 7.529089450836182, "learning_rate": 8.32252207061951e-08, "loss": 0.6530350685119629, "memory(GiB)": 47.44, "step": 20315, "token_acc": 0.8451369216241738, "train_speed(iter/s)": 0.095723 }, { "epoch": 0.9449542084351812, "grad_norm": 8.566681861877441, "learning_rate": 8.252807544507913e-08, "loss": 0.6495450496673584, "memory(GiB)": 47.44, "step": 20320, "token_acc": 0.8388909704008992, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.9451867266951308, "grad_norm": 6.657361030578613, "learning_rate": 8.183383803693545e-08, "loss": 0.586691427230835, "memory(GiB)": 47.44, "step": 20325, "token_acc": 0.8495850622406639, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.9454192449550803, "grad_norm": 7.880705833435059, "learning_rate": 8.114250889228848e-08, "loss": 0.5361662864685058, "memory(GiB)": 47.44, "step": 20330, "token_acc": 0.8539553752535497, "train_speed(iter/s)": 0.095763 }, { "epoch": 0.94565176321503, "grad_norm": 7.656473636627197, "learning_rate": 8.04540884199434e-08, "loss": 0.6294970512390137, "memory(GiB)": 47.44, "step": 20335, "token_acc": 0.8405292479108635, "train_speed(iter/s)": 0.095776 }, { "epoch": 0.9458842814749796, "grad_norm": 6.87350606918335, "learning_rate": 7.97685770269846e-08, "loss": 0.6477347373962402, "memory(GiB)": 47.44, "step": 20340, "token_acc": 0.8395100502512562, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.9461167997349292, "grad_norm": 10.930252075195312, "learning_rate": 7.908597511877447e-08, "loss": 0.5632180213928223, "memory(GiB)": 47.44, "step": 20345, "token_acc": 0.8588528678304239, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.9463493179948788, "grad_norm": 7.769409656524658, "learning_rate": 7.840628309895848e-08, "loss": 0.6253656864166259, "memory(GiB)": 47.44, "step": 20350, "token_acc": 0.8249168430601754, "train_speed(iter/s)": 0.095816 }, { "epoch": 0.9463493179948788, "eval_loss": 0.5555700659751892, "eval_runtime": 296.4564, "eval_samples_per_second": 11.722, "eval_steps_per_second": 11.722, "step": 20350 }, { "epoch": 0.9465818362548284, "grad_norm": 10.457148551940918, "learning_rate": 7.772950136945789e-08, "loss": 0.74940505027771, "memory(GiB)": 47.44, "step": 20355, "token_acc": 0.8343391286300429, "train_speed(iter/s)": 0.095695 }, { "epoch": 0.946814354514778, "grad_norm": 8.147481918334961, "learning_rate": 7.705563033047592e-08, "loss": 0.6334987163543702, "memory(GiB)": 47.44, "step": 20360, "token_acc": 0.8368891947694426, "train_speed(iter/s)": 0.095708 }, { "epoch": 0.9470468727747275, "grad_norm": 10.947805404663086, "learning_rate": 7.638467038049214e-08, "loss": 0.5573566913604736, "memory(GiB)": 47.44, "step": 20365, "token_acc": 0.8695842450765864, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.9472793910346772, "grad_norm": 8.894417762756348, "learning_rate": 7.571662191626694e-08, "loss": 0.60349440574646, "memory(GiB)": 47.44, "step": 20370, "token_acc": 0.8537735849056604, "train_speed(iter/s)": 0.095734 }, { "epoch": 0.9475119092946268, "grad_norm": 9.178512573242188, "learning_rate": 7.505148533283712e-08, "loss": 0.5638972282409668, "memory(GiB)": 47.44, "step": 20375, "token_acc": 0.853031465848043, "train_speed(iter/s)": 0.095748 }, { "epoch": 0.9477444275545764, "grad_norm": 7.611563682556152, "learning_rate": 7.438926102351973e-08, "loss": 0.6906840324401855, "memory(GiB)": 47.44, "step": 20380, "token_acc": 0.8227810650887574, "train_speed(iter/s)": 0.095761 }, { "epoch": 0.947976945814526, "grad_norm": 9.940587997436523, "learning_rate": 7.372994937990707e-08, "loss": 0.6728063106536866, "memory(GiB)": 47.44, "step": 20385, "token_acc": 0.8305246422893482, "train_speed(iter/s)": 0.095774 }, { "epoch": 0.9482094640744756, "grad_norm": 8.792899131774902, "learning_rate": 7.307355079187118e-08, "loss": 0.7505878448486328, "memory(GiB)": 47.44, "step": 20390, "token_acc": 0.82015065913371, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.9484419823344252, "grad_norm": 7.857633113861084, "learning_rate": 7.242006564756043e-08, "loss": 0.7554164409637452, "memory(GiB)": 47.44, "step": 20395, "token_acc": 0.8082595870206489, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.9486745005943747, "grad_norm": 10.25812816619873, "learning_rate": 7.17694943334013e-08, "loss": 0.678457498550415, "memory(GiB)": 47.44, "step": 20400, "token_acc": 0.8347305389221557, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.9486745005943747, "eval_loss": 0.5555104613304138, "eval_runtime": 292.2526, "eval_samples_per_second": 11.89, "eval_steps_per_second": 11.89, "step": 20400 }, { "epoch": 0.9489070188543244, "grad_norm": 10.560473442077637, "learning_rate": 7.11218372340966e-08, "loss": 0.7394631385803223, "memory(GiB)": 47.44, "step": 20405, "token_acc": 0.8341056755207338, "train_speed(iter/s)": 0.095695 }, { "epoch": 0.949139537114274, "grad_norm": 11.776728630065918, "learning_rate": 7.047709473262554e-08, "loss": 0.5568135261535645, "memory(GiB)": 47.44, "step": 20410, "token_acc": 0.8602316602316602, "train_speed(iter/s)": 0.095709 }, { "epoch": 0.9493720553742236, "grad_norm": 10.002482414245605, "learning_rate": 6.983526721024425e-08, "loss": 0.7164381504058838, "memory(GiB)": 47.44, "step": 20415, "token_acc": 0.8240400667779633, "train_speed(iter/s)": 0.095722 }, { "epoch": 0.9496045736341732, "grad_norm": 11.498485565185547, "learning_rate": 6.919635504648581e-08, "loss": 0.6015612125396729, "memory(GiB)": 47.44, "step": 20420, "token_acc": 0.8594515181194907, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.9498370918941228, "grad_norm": 7.4594902992248535, "learning_rate": 6.856035861915855e-08, "loss": 0.6615743637084961, "memory(GiB)": 47.44, "step": 20425, "token_acc": 0.8398541114058355, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.9500696101540724, "grad_norm": 8.488970756530762, "learning_rate": 6.792727830434608e-08, "loss": 0.6310092449188233, "memory(GiB)": 47.44, "step": 20430, "token_acc": 0.8501026694045175, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.9503021284140221, "grad_norm": 6.498387336730957, "learning_rate": 6.729711447640897e-08, "loss": 0.6058262348175049, "memory(GiB)": 47.44, "step": 20435, "token_acc": 0.8434754311747478, "train_speed(iter/s)": 0.095774 }, { "epoch": 0.9505346466739716, "grad_norm": 10.151055335998535, "learning_rate": 6.666986750798244e-08, "loss": 0.7225804805755616, "memory(GiB)": 47.44, "step": 20440, "token_acc": 0.8169456066945606, "train_speed(iter/s)": 0.095788 }, { "epoch": 0.9507671649339212, "grad_norm": 7.998047351837158, "learning_rate": 6.604553776997702e-08, "loss": 0.6553145408630371, "memory(GiB)": 47.44, "step": 20445, "token_acc": 0.8338098641887062, "train_speed(iter/s)": 0.095801 }, { "epoch": 0.9509996831938708, "grad_norm": 9.505949974060059, "learning_rate": 6.542412563157796e-08, "loss": 0.5239490509033203, "memory(GiB)": 47.44, "step": 20450, "token_acc": 0.8669275929549902, "train_speed(iter/s)": 0.095814 }, { "epoch": 0.9509996831938708, "eval_loss": 0.5556869506835938, "eval_runtime": 296.071, "eval_samples_per_second": 11.737, "eval_steps_per_second": 11.737, "step": 20450 }, { "epoch": 0.9512322014538204, "grad_norm": 8.004267692565918, "learning_rate": 6.480563146024521e-08, "loss": 0.7079615116119384, "memory(GiB)": 47.44, "step": 20455, "token_acc": 0.834247492705327, "train_speed(iter/s)": 0.095695 }, { "epoch": 0.95146471971377, "grad_norm": 8.829005241394043, "learning_rate": 6.419005562171399e-08, "loss": 0.6616856575012207, "memory(GiB)": 47.44, "step": 20460, "token_acc": 0.8278097982708934, "train_speed(iter/s)": 0.095708 }, { "epoch": 0.9516972379737196, "grad_norm": 10.461058616638184, "learning_rate": 6.357739847999367e-08, "loss": 0.571187686920166, "memory(GiB)": 47.44, "step": 20465, "token_acc": 0.8574660633484162, "train_speed(iter/s)": 0.095721 }, { "epoch": 0.9519297562336693, "grad_norm": 6.821761131286621, "learning_rate": 6.296766039736613e-08, "loss": 0.5766401767730713, "memory(GiB)": 47.44, "step": 20470, "token_acc": 0.8541473943879124, "train_speed(iter/s)": 0.095734 }, { "epoch": 0.9521622744936188, "grad_norm": 5.286739349365234, "learning_rate": 6.236084173438961e-08, "loss": 0.6574409961700439, "memory(GiB)": 47.44, "step": 20475, "token_acc": 0.8148148148148148, "train_speed(iter/s)": 0.095747 }, { "epoch": 0.9523947927535684, "grad_norm": 8.371713638305664, "learning_rate": 6.175694284989375e-08, "loss": 0.5862714767456054, "memory(GiB)": 47.44, "step": 20480, "token_acc": 0.8502436863092601, "train_speed(iter/s)": 0.09576 }, { "epoch": 0.952627311013518, "grad_norm": 7.867913722991943, "learning_rate": 6.11559641009829e-08, "loss": 0.760695219039917, "memory(GiB)": 47.44, "step": 20485, "token_acc": 0.8002534854245881, "train_speed(iter/s)": 0.095773 }, { "epoch": 0.9528598292734677, "grad_norm": 8.1065034866333, "learning_rate": 6.055790584303445e-08, "loss": 0.6531691074371337, "memory(GiB)": 47.44, "step": 20490, "token_acc": 0.8368972746331237, "train_speed(iter/s)": 0.095786 }, { "epoch": 0.9530923475334172, "grad_norm": 12.392067909240723, "learning_rate": 5.996276842969828e-08, "loss": 0.6888431549072266, "memory(GiB)": 47.44, "step": 20495, "token_acc": 0.8311335403726708, "train_speed(iter/s)": 0.095799 }, { "epoch": 0.9533248657933668, "grad_norm": 7.902614593505859, "learning_rate": 5.937055221289845e-08, "loss": 0.5516024589538574, "memory(GiB)": 47.44, "step": 20500, "token_acc": 0.8628333910633876, "train_speed(iter/s)": 0.095812 }, { "epoch": 0.9533248657933668, "eval_loss": 0.5554400086402893, "eval_runtime": 293.2617, "eval_samples_per_second": 11.849, "eval_steps_per_second": 11.849, "step": 20500 }, { "epoch": 0.9535573840533165, "grad_norm": 8.178373336791992, "learning_rate": 5.878125754283037e-08, "loss": 0.5706425666809082, "memory(GiB)": 47.44, "step": 20505, "token_acc": 0.8351757426336981, "train_speed(iter/s)": 0.095693 }, { "epoch": 0.953789902313266, "grad_norm": 6.950657367706299, "learning_rate": 5.8194884767961424e-08, "loss": 0.7144189834594726, "memory(GiB)": 47.44, "step": 20510, "token_acc": 0.8302277432712215, "train_speed(iter/s)": 0.095706 }, { "epoch": 0.9540224205732156, "grad_norm": 7.927690505981445, "learning_rate": 5.761143423503257e-08, "loss": 0.663585901260376, "memory(GiB)": 47.44, "step": 20515, "token_acc": 0.8332219251336899, "train_speed(iter/s)": 0.095719 }, { "epoch": 0.9542549388331653, "grad_norm": 14.15100383758545, "learning_rate": 5.703090628905617e-08, "loss": 0.679119062423706, "memory(GiB)": 47.44, "step": 20520, "token_acc": 0.8324175824175825, "train_speed(iter/s)": 0.095732 }, { "epoch": 0.9544874570931149, "grad_norm": 8.4072847366333, "learning_rate": 5.645330127331594e-08, "loss": 0.5841912269592285, "memory(GiB)": 47.44, "step": 20525, "token_acc": 0.8558432470258922, "train_speed(iter/s)": 0.095746 }, { "epoch": 0.9547199753530644, "grad_norm": 8.089521408081055, "learning_rate": 5.587861952936813e-08, "loss": 0.6344331741333008, "memory(GiB)": 47.44, "step": 20530, "token_acc": 0.8492678725236865, "train_speed(iter/s)": 0.095758 }, { "epoch": 0.954952493613014, "grad_norm": 9.23490047454834, "learning_rate": 5.5306861397038666e-08, "loss": 0.6122418403625488, "memory(GiB)": 47.44, "step": 20535, "token_acc": 0.8401052323881906, "train_speed(iter/s)": 0.095771 }, { "epoch": 0.9551850118729637, "grad_norm": 11.101957321166992, "learning_rate": 5.4738027214427114e-08, "loss": 0.5774817943572998, "memory(GiB)": 47.44, "step": 20540, "token_acc": 0.8593969144460029, "train_speed(iter/s)": 0.095784 }, { "epoch": 0.9554175301329132, "grad_norm": 8.920694351196289, "learning_rate": 5.417211731790217e-08, "loss": 0.6202130794525147, "memory(GiB)": 47.44, "step": 20545, "token_acc": 0.8487694300518135, "train_speed(iter/s)": 0.095797 }, { "epoch": 0.9556500483928628, "grad_norm": 7.193459510803223, "learning_rate": 5.360913204210394e-08, "loss": 0.6225608825683594, "memory(GiB)": 47.44, "step": 20550, "token_acc": 0.8276497695852535, "train_speed(iter/s)": 0.09581 }, { "epoch": 0.9556500483928628, "eval_loss": 0.5555076599121094, "eval_runtime": 292.2433, "eval_samples_per_second": 11.891, "eval_steps_per_second": 11.891, "step": 20550 }, { "epoch": 0.9558825666528125, "grad_norm": 10.845667839050293, "learning_rate": 5.304907171994278e-08, "loss": 0.64171142578125, "memory(GiB)": 47.44, "step": 20555, "token_acc": 0.834728835767473, "train_speed(iter/s)": 0.095693 }, { "epoch": 0.9561150849127621, "grad_norm": 7.467350482940674, "learning_rate": 5.249193668259989e-08, "loss": 0.7335173130035401, "memory(GiB)": 47.44, "step": 20560, "token_acc": 0.8039755351681958, "train_speed(iter/s)": 0.095705 }, { "epoch": 0.9563476031727116, "grad_norm": 7.151541233062744, "learning_rate": 5.1937727259525615e-08, "loss": 0.5967055320739746, "memory(GiB)": 47.44, "step": 20565, "token_acc": 0.8547701815372731, "train_speed(iter/s)": 0.095718 }, { "epoch": 0.9565801214326612, "grad_norm": 7.254678249359131, "learning_rate": 5.1386443778442264e-08, "loss": 0.6981623649597168, "memory(GiB)": 47.44, "step": 20570, "token_acc": 0.80312415836251, "train_speed(iter/s)": 0.095731 }, { "epoch": 0.9568126396926109, "grad_norm": 7.200235366821289, "learning_rate": 5.083808656534017e-08, "loss": 0.7622573852539063, "memory(GiB)": 47.44, "step": 20575, "token_acc": 0.8102981029810298, "train_speed(iter/s)": 0.095744 }, { "epoch": 0.9570451579525605, "grad_norm": 11.583197593688965, "learning_rate": 5.0292655944479963e-08, "loss": 0.7474843978881835, "memory(GiB)": 47.44, "step": 20580, "token_acc": 0.8325881768504719, "train_speed(iter/s)": 0.095757 }, { "epoch": 0.95727767621251, "grad_norm": 10.911505699157715, "learning_rate": 4.975015223839197e-08, "loss": 0.6490874767303467, "memory(GiB)": 47.44, "step": 20585, "token_acc": 0.8389866895663375, "train_speed(iter/s)": 0.09577 }, { "epoch": 0.9575101944724597, "grad_norm": 9.79134750366211, "learning_rate": 4.921057576787458e-08, "loss": 0.6671240329742432, "memory(GiB)": 47.44, "step": 20590, "token_acc": 0.8255982596084119, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.9577427127324093, "grad_norm": 7.199132442474365, "learning_rate": 4.8673926851996454e-08, "loss": 0.7733827590942383, "memory(GiB)": 47.44, "step": 20595, "token_acc": 0.8150917743031951, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.9579752309923588, "grad_norm": 8.266303062438965, "learning_rate": 4.8140205808094865e-08, "loss": 0.6054050922393799, "memory(GiB)": 47.44, "step": 20600, "token_acc": 0.8480492813141683, "train_speed(iter/s)": 0.095809 }, { "epoch": 0.9579752309923588, "eval_loss": 0.5556042194366455, "eval_runtime": 294.0624, "eval_samples_per_second": 11.817, "eval_steps_per_second": 11.817, "step": 20600 }, { "epoch": 0.9582077492523084, "grad_norm": 7.836030960083008, "learning_rate": 4.760941295177568e-08, "loss": 0.6465342044830322, "memory(GiB)": 47.44, "step": 20605, "token_acc": 0.8347251800626929, "train_speed(iter/s)": 0.095691 }, { "epoch": 0.9584402675122581, "grad_norm": 7.720570087432861, "learning_rate": 4.7081548596912276e-08, "loss": 0.6976634502410889, "memory(GiB)": 47.44, "step": 20610, "token_acc": 0.8268502581755593, "train_speed(iter/s)": 0.095704 }, { "epoch": 0.9586727857722077, "grad_norm": 10.879193305969238, "learning_rate": 4.655661305564774e-08, "loss": 0.6851535320281983, "memory(GiB)": 47.44, "step": 20615, "token_acc": 0.8421271292064811, "train_speed(iter/s)": 0.095717 }, { "epoch": 0.9589053040321572, "grad_norm": 8.683979988098145, "learning_rate": 4.6034606638392654e-08, "loss": 0.697068738937378, "memory(GiB)": 47.44, "step": 20620, "token_acc": 0.8208109719737626, "train_speed(iter/s)": 0.09573 }, { "epoch": 0.9591378222921069, "grad_norm": 8.056221961975098, "learning_rate": 4.551552965382511e-08, "loss": 0.6238224029541015, "memory(GiB)": 47.44, "step": 20625, "token_acc": 0.8367801463569837, "train_speed(iter/s)": 0.095743 }, { "epoch": 0.9593703405520565, "grad_norm": 11.859223365783691, "learning_rate": 4.4999382408892345e-08, "loss": 0.5613128185272217, "memory(GiB)": 47.44, "step": 20630, "token_acc": 0.8601036269430051, "train_speed(iter/s)": 0.095756 }, { "epoch": 0.9596028588120061, "grad_norm": 7.768198490142822, "learning_rate": 4.4486165208806885e-08, "loss": 0.6417368412017822, "memory(GiB)": 47.44, "step": 20635, "token_acc": 0.8403361344537815, "train_speed(iter/s)": 0.095768 }, { "epoch": 0.9598353770719557, "grad_norm": 9.254332542419434, "learning_rate": 4.397587835705097e-08, "loss": 0.6034894466400147, "memory(GiB)": 47.44, "step": 20640, "token_acc": 0.8504885993485342, "train_speed(iter/s)": 0.095781 }, { "epoch": 0.9600678953319053, "grad_norm": 6.1051926612854, "learning_rate": 4.346852215537267e-08, "loss": 0.6086849212646485, "memory(GiB)": 47.44, "step": 20645, "token_acc": 0.8502259522272434, "train_speed(iter/s)": 0.095794 }, { "epoch": 0.9603004135918549, "grad_norm": 8.756834983825684, "learning_rate": 4.296409690378644e-08, "loss": 0.5037965297698974, "memory(GiB)": 47.44, "step": 20650, "token_acc": 0.8769622401357658, "train_speed(iter/s)": 0.095806 }, { "epoch": 0.9603004135918549, "eval_loss": 0.5555453896522522, "eval_runtime": 292.1663, "eval_samples_per_second": 11.894, "eval_steps_per_second": 11.894, "step": 20650 }, { "epoch": 0.9605329318518044, "grad_norm": 8.111796379089355, "learning_rate": 4.246260290057591e-08, "loss": 0.8531887054443359, "memory(GiB)": 47.44, "step": 20655, "token_acc": 0.8338415120845439, "train_speed(iter/s)": 0.095689 }, { "epoch": 0.9607654501117541, "grad_norm": 10.552480697631836, "learning_rate": 4.196404044228941e-08, "loss": 0.6422392368316651, "memory(GiB)": 47.44, "step": 20660, "token_acc": 0.83687374749499, "train_speed(iter/s)": 0.095702 }, { "epoch": 0.9609979683717037, "grad_norm": 8.833476066589355, "learning_rate": 4.146840982374223e-08, "loss": 0.6003233909606933, "memory(GiB)": 47.44, "step": 20665, "token_acc": 0.8502259522272434, "train_speed(iter/s)": 0.095715 }, { "epoch": 0.9612304866316533, "grad_norm": 9.322022438049316, "learning_rate": 4.097571133801548e-08, "loss": 0.541776466369629, "memory(GiB)": 47.44, "step": 20670, "token_acc": 0.8672470076169749, "train_speed(iter/s)": 0.095728 }, { "epoch": 0.9614630048916029, "grad_norm": 8.25760269165039, "learning_rate": 4.048594527645833e-08, "loss": 0.6594733238220215, "memory(GiB)": 47.44, "step": 20675, "token_acc": 0.8320663441603318, "train_speed(iter/s)": 0.095741 }, { "epoch": 0.9616955231515525, "grad_norm": 9.2846040725708, "learning_rate": 3.9999111928683554e-08, "loss": 0.5512380123138427, "memory(GiB)": 47.44, "step": 20680, "token_acc": 0.8586731167933656, "train_speed(iter/s)": 0.095754 }, { "epoch": 0.9619280414115021, "grad_norm": 8.768184661865234, "learning_rate": 3.951521158257143e-08, "loss": 0.6587899208068848, "memory(GiB)": 47.44, "step": 20685, "token_acc": 0.83515731874145, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.9621605596714518, "grad_norm": 6.437694072723389, "learning_rate": 3.9034244524266406e-08, "loss": 0.6556053638458252, "memory(GiB)": 47.44, "step": 20690, "token_acc": 0.8247656915648963, "train_speed(iter/s)": 0.095779 }, { "epoch": 0.9623930779314013, "grad_norm": 9.403351783752441, "learning_rate": 3.8556211038179304e-08, "loss": 0.682905912399292, "memory(GiB)": 47.44, "step": 20695, "token_acc": 0.8239623196938475, "train_speed(iter/s)": 0.095792 }, { "epoch": 0.9626255961913509, "grad_norm": 8.33271312713623, "learning_rate": 3.808111140698623e-08, "loss": 0.5829206466674804, "memory(GiB)": 47.44, "step": 20700, "token_acc": 0.8585293019783501, "train_speed(iter/s)": 0.095805 }, { "epoch": 0.9626255961913509, "eval_loss": 0.5555470585823059, "eval_runtime": 294.213, "eval_samples_per_second": 11.811, "eval_steps_per_second": 11.811, "step": 20700 }, { "epoch": 0.9628581144513005, "grad_norm": 8.44229793548584, "learning_rate": 3.760894591162911e-08, "loss": 0.7375020503997802, "memory(GiB)": 47.44, "step": 20705, "token_acc": 0.834630381412365, "train_speed(iter/s)": 0.095687 }, { "epoch": 0.9630906327112501, "grad_norm": 8.542226791381836, "learning_rate": 3.71397148313124e-08, "loss": 0.7269775867462158, "memory(GiB)": 47.44, "step": 20710, "token_acc": 0.8133986928104575, "train_speed(iter/s)": 0.0957 }, { "epoch": 0.9633231509711997, "grad_norm": 8.211197853088379, "learning_rate": 3.667341844350803e-08, "loss": 0.5958492755889893, "memory(GiB)": 47.44, "step": 20715, "token_acc": 0.8416696653472472, "train_speed(iter/s)": 0.095713 }, { "epoch": 0.9635556692311493, "grad_norm": 9.82761287689209, "learning_rate": 3.621005702395153e-08, "loss": 0.6712447166442871, "memory(GiB)": 47.44, "step": 20720, "token_acc": 0.8185196655311242, "train_speed(iter/s)": 0.095726 }, { "epoch": 0.963788187491099, "grad_norm": 11.418320655822754, "learning_rate": 3.574963084664207e-08, "loss": 0.6232163429260253, "memory(GiB)": 47.44, "step": 20725, "token_acc": 0.8422480620155038, "train_speed(iter/s)": 0.095739 }, { "epoch": 0.9640207057510485, "grad_norm": 8.077664375305176, "learning_rate": 3.529214018384408e-08, "loss": 0.7407678127288818, "memory(GiB)": 47.44, "step": 20730, "token_acc": 0.8097859327217125, "train_speed(iter/s)": 0.095752 }, { "epoch": 0.9642532240109981, "grad_norm": 7.364772319793701, "learning_rate": 3.483758530608616e-08, "loss": 0.5969498634338379, "memory(GiB)": 47.44, "step": 20735, "token_acc": 0.84688995215311, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.9644857422709477, "grad_norm": 8.380683898925781, "learning_rate": 3.4385966482160525e-08, "loss": 0.6390859603881835, "memory(GiB)": 47.44, "step": 20740, "token_acc": 0.8565385971096229, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.9647182605308973, "grad_norm": 8.919743537902832, "learning_rate": 3.393728397912355e-08, "loss": 0.6278938770294189, "memory(GiB)": 47.44, "step": 20745, "token_acc": 0.8445541064504759, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.9649507787908469, "grad_norm": 8.967925071716309, "learning_rate": 3.34915380622941e-08, "loss": 0.7339792728424073, "memory(GiB)": 47.44, "step": 20750, "token_acc": 0.8144448713023434, "train_speed(iter/s)": 0.095804 }, { "epoch": 0.9649507787908469, "eval_loss": 0.5556846261024475, "eval_runtime": 295.7774, "eval_samples_per_second": 11.749, "eval_steps_per_second": 11.749, "step": 20750 }, { "epoch": 0.9651832970507965, "grad_norm": 8.481380462646484, "learning_rate": 3.304872899525691e-08, "loss": 0.6525393962860108, "memory(GiB)": 47.44, "step": 20755, "token_acc": 0.8348432948106412, "train_speed(iter/s)": 0.095686 }, { "epoch": 0.9654158153107462, "grad_norm": 13.112621307373047, "learning_rate": 3.260885703985806e-08, "loss": 0.6459598541259766, "memory(GiB)": 47.44, "step": 20760, "token_acc": 0.8370715192551841, "train_speed(iter/s)": 0.095699 }, { "epoch": 0.9656483335706957, "grad_norm": 9.978148460388184, "learning_rate": 3.217192245620726e-08, "loss": 0.63280029296875, "memory(GiB)": 47.44, "step": 20765, "token_acc": 0.8340011448196909, "train_speed(iter/s)": 0.095712 }, { "epoch": 0.9658808518306453, "grad_norm": 8.677961349487305, "learning_rate": 3.173792550267729e-08, "loss": 0.749193525314331, "memory(GiB)": 47.44, "step": 20770, "token_acc": 0.790771484375, "train_speed(iter/s)": 0.095725 }, { "epoch": 0.966113370090595, "grad_norm": 8.47712230682373, "learning_rate": 3.1306866435903974e-08, "loss": 0.5897305488586426, "memory(GiB)": 47.44, "step": 20775, "token_acc": 0.843000773395205, "train_speed(iter/s)": 0.095738 }, { "epoch": 0.9663458883505446, "grad_norm": 8.103322982788086, "learning_rate": 3.087874551078673e-08, "loss": 0.6301196575164795, "memory(GiB)": 47.44, "step": 20780, "token_acc": 0.8404820985466147, "train_speed(iter/s)": 0.095751 }, { "epoch": 0.9665784066104941, "grad_norm": 10.327425956726074, "learning_rate": 3.045356298048529e-08, "loss": 0.5524001121520996, "memory(GiB)": 47.44, "step": 20785, "token_acc": 0.8594551914725622, "train_speed(iter/s)": 0.095764 }, { "epoch": 0.9668109248704437, "grad_norm": 9.891942024230957, "learning_rate": 3.003131909642409e-08, "loss": 0.6475227355957032, "memory(GiB)": 47.44, "step": 20790, "token_acc": 0.8433402346445825, "train_speed(iter/s)": 0.095777 }, { "epoch": 0.9670434431303934, "grad_norm": 7.341713905334473, "learning_rate": 2.9612014108288955e-08, "loss": 0.5846897602081299, "memory(GiB)": 47.44, "step": 20795, "token_acc": 0.8507265521796565, "train_speed(iter/s)": 0.09579 }, { "epoch": 0.9672759613903429, "grad_norm": 8.49909782409668, "learning_rate": 2.9195648264027653e-08, "loss": 0.7312897682189942, "memory(GiB)": 47.44, "step": 20800, "token_acc": 0.8227060653188181, "train_speed(iter/s)": 0.095803 }, { "epoch": 0.9672759613903429, "eval_loss": 0.5553971529006958, "eval_runtime": 295.0521, "eval_samples_per_second": 11.778, "eval_steps_per_second": 11.778, "step": 20800 }, { "epoch": 0.9675084796502925, "grad_norm": 7.44534158706665, "learning_rate": 2.8782221809850464e-08, "loss": 0.6029882907867432, "memory(GiB)": 47.44, "step": 20805, "token_acc": 0.8352805884091072, "train_speed(iter/s)": 0.095686 }, { "epoch": 0.9677409979102422, "grad_norm": 8.211577415466309, "learning_rate": 2.837173499022905e-08, "loss": 0.5521889209747315, "memory(GiB)": 47.44, "step": 20810, "token_acc": 0.8668499607227023, "train_speed(iter/s)": 0.095699 }, { "epoch": 0.9679735161701918, "grad_norm": 9.065325736999512, "learning_rate": 2.7964188047895913e-08, "loss": 0.6344101428985596, "memory(GiB)": 47.44, "step": 20815, "token_acc": 0.8490566037735849, "train_speed(iter/s)": 0.095712 }, { "epoch": 0.9682060344301413, "grad_norm": 8.109495162963867, "learning_rate": 2.755958122384772e-08, "loss": 0.7012411117553711, "memory(GiB)": 47.44, "step": 20820, "token_acc": 0.8378076062639821, "train_speed(iter/s)": 0.095725 }, { "epoch": 0.9684385526900909, "grad_norm": 7.52706241607666, "learning_rate": 2.715791475734031e-08, "loss": 0.6769590854644776, "memory(GiB)": 47.44, "step": 20825, "token_acc": 0.8325183374083129, "train_speed(iter/s)": 0.095738 }, { "epoch": 0.9686710709500406, "grad_norm": 7.486845970153809, "learning_rate": 2.6759188885891462e-08, "loss": 0.7046610832214355, "memory(GiB)": 47.44, "step": 20830, "token_acc": 0.8220979020979021, "train_speed(iter/s)": 0.09575 }, { "epoch": 0.9689035892099902, "grad_norm": 10.00309944152832, "learning_rate": 2.6363403845280355e-08, "loss": 0.686539888381958, "memory(GiB)": 47.44, "step": 20835, "token_acc": 0.809255784865541, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.9691361074699397, "grad_norm": 9.596781730651855, "learning_rate": 2.597055986954644e-08, "loss": 0.7272031784057618, "memory(GiB)": 47.44, "step": 20840, "token_acc": 0.8149327671620665, "train_speed(iter/s)": 0.095775 }, { "epoch": 0.9693686257298894, "grad_norm": 5.8002142906188965, "learning_rate": 2.5580657190991122e-08, "loss": 0.650816535949707, "memory(GiB)": 47.44, "step": 20845, "token_acc": 0.840092317837125, "train_speed(iter/s)": 0.095789 }, { "epoch": 0.969601143989839, "grad_norm": 9.024754524230957, "learning_rate": 2.5193696040174964e-08, "loss": 0.6397728443145752, "memory(GiB)": 47.44, "step": 20850, "token_acc": 0.8487957181088314, "train_speed(iter/s)": 0.095802 }, { "epoch": 0.969601143989839, "eval_loss": 0.5555434823036194, "eval_runtime": 294.1615, "eval_samples_per_second": 11.813, "eval_steps_per_second": 11.813, "step": 20850 }, { "epoch": 0.9698336622497885, "grad_norm": 7.542553424835205, "learning_rate": 2.4809676645921042e-08, "loss": 0.6364368915557861, "memory(GiB)": 47.44, "step": 20855, "token_acc": 0.8351431391905232, "train_speed(iter/s)": 0.095685 }, { "epoch": 0.9700661805097381, "grad_norm": 9.540937423706055, "learning_rate": 2.4428599235311045e-08, "loss": 0.6640225410461426, "memory(GiB)": 47.44, "step": 20860, "token_acc": 0.8276329156789706, "train_speed(iter/s)": 0.095698 }, { "epoch": 0.9702986987696878, "grad_norm": 5.910898685455322, "learning_rate": 2.4050464033688048e-08, "loss": 0.6791872978210449, "memory(GiB)": 47.44, "step": 20865, "token_acc": 0.8174807197943444, "train_speed(iter/s)": 0.095711 }, { "epoch": 0.9705312170296374, "grad_norm": 5.9184370040893555, "learning_rate": 2.3675271264655407e-08, "loss": 0.6619283199310303, "memory(GiB)": 47.44, "step": 20870, "token_acc": 0.8163421153111875, "train_speed(iter/s)": 0.095724 }, { "epoch": 0.9707637352895869, "grad_norm": 9.528105735778809, "learning_rate": 2.330302115007621e-08, "loss": 0.5713191986083984, "memory(GiB)": 47.44, "step": 20875, "token_acc": 0.8521199586349535, "train_speed(iter/s)": 0.095736 }, { "epoch": 0.9709962535495366, "grad_norm": 11.298874855041504, "learning_rate": 2.2933713910073262e-08, "loss": 0.5619585990905762, "memory(GiB)": 47.44, "step": 20880, "token_acc": 0.8574626865671642, "train_speed(iter/s)": 0.095749 }, { "epoch": 0.9712287718094862, "grad_norm": 10.252700805664062, "learning_rate": 2.25673497630291e-08, "loss": 0.54613356590271, "memory(GiB)": 47.44, "step": 20885, "token_acc": 0.8730092204526404, "train_speed(iter/s)": 0.095762 }, { "epoch": 0.9714612900694357, "grad_norm": 5.918463230133057, "learning_rate": 2.2203928925585984e-08, "loss": 0.5828773498535156, "memory(GiB)": 47.44, "step": 20890, "token_acc": 0.846976401179941, "train_speed(iter/s)": 0.095775 }, { "epoch": 0.9716938083293853, "grad_norm": 10.07883358001709, "learning_rate": 2.1843451612646448e-08, "loss": 0.6059688091278076, "memory(GiB)": 47.44, "step": 20895, "token_acc": 0.8476046774801962, "train_speed(iter/s)": 0.095787 }, { "epoch": 0.971926326589335, "grad_norm": 9.270331382751465, "learning_rate": 2.148591803737221e-08, "loss": 0.541387939453125, "memory(GiB)": 47.44, "step": 20900, "token_acc": 0.8590014064697609, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.971926326589335, "eval_loss": 0.5555252432823181, "eval_runtime": 295.9192, "eval_samples_per_second": 11.743, "eval_steps_per_second": 11.743, "step": 20900 }, { "epoch": 0.9721588448492846, "grad_norm": 8.093807220458984, "learning_rate": 2.1131328411182484e-08, "loss": 0.6522313594818115, "memory(GiB)": 47.44, "step": 20905, "token_acc": 0.8349668238868015, "train_speed(iter/s)": 0.095683 }, { "epoch": 0.9723913631092341, "grad_norm": 9.61971664428711, "learning_rate": 2.0779682943758428e-08, "loss": 0.5763451099395752, "memory(GiB)": 47.44, "step": 20910, "token_acc": 0.8618947368421053, "train_speed(iter/s)": 0.095696 }, { "epoch": 0.9726238813691838, "grad_norm": 9.078413963317871, "learning_rate": 2.043098184303871e-08, "loss": 0.5425615787506104, "memory(GiB)": 47.44, "step": 20915, "token_acc": 0.8592896174863388, "train_speed(iter/s)": 0.095709 }, { "epoch": 0.9728563996291334, "grad_norm": 12.81851863861084, "learning_rate": 2.008522531522006e-08, "loss": 0.6862669467926026, "memory(GiB)": 47.44, "step": 20920, "token_acc": 0.832632464255677, "train_speed(iter/s)": 0.095722 }, { "epoch": 0.973088917889083, "grad_norm": 10.916457176208496, "learning_rate": 1.9742413564760033e-08, "loss": 0.6592337608337402, "memory(GiB)": 47.44, "step": 20925, "token_acc": 0.8250401284109149, "train_speed(iter/s)": 0.095734 }, { "epoch": 0.9733214361490325, "grad_norm": 8.57726764678955, "learning_rate": 1.9402546794373146e-08, "loss": 0.7035943508148194, "memory(GiB)": 47.44, "step": 20930, "token_acc": 0.825030012004802, "train_speed(iter/s)": 0.095747 }, { "epoch": 0.9735539544089822, "grad_norm": 7.987157344818115, "learning_rate": 1.9065625205033632e-08, "loss": 0.6475473403930664, "memory(GiB)": 47.44, "step": 20935, "token_acc": 0.828650711064863, "train_speed(iter/s)": 0.095759 }, { "epoch": 0.9737864726689318, "grad_norm": 8.405952453613281, "learning_rate": 1.8731648995972685e-08, "loss": 0.5332788467407227, "memory(GiB)": 47.44, "step": 20940, "token_acc": 0.8770635756937126, "train_speed(iter/s)": 0.095772 }, { "epoch": 0.9740189909288813, "grad_norm": 8.0729398727417, "learning_rate": 1.840061836468232e-08, "loss": 0.6800761699676514, "memory(GiB)": 47.44, "step": 20945, "token_acc": 0.826472675656494, "train_speed(iter/s)": 0.095785 }, { "epoch": 0.974251509188831, "grad_norm": 9.692399978637695, "learning_rate": 1.807253350690985e-08, "loss": 0.6131781101226806, "memory(GiB)": 47.44, "step": 20950, "token_acc": 0.8350717079530638, "train_speed(iter/s)": 0.095798 }, { "epoch": 0.974251509188831, "eval_loss": 0.555717408657074, "eval_runtime": 294.7045, "eval_samples_per_second": 11.791, "eval_steps_per_second": 11.791, "step": 20950 }, { "epoch": 0.9744840274487806, "grad_norm": 8.899210929870605, "learning_rate": 1.7747394616662862e-08, "loss": 0.7808675765991211, "memory(GiB)": 47.44, "step": 20955, "token_acc": 0.834443311452425, "train_speed(iter/s)": 0.095681 }, { "epoch": 0.9747165457087302, "grad_norm": 6.883942604064941, "learning_rate": 1.7425201886205333e-08, "loss": 0.6193971633911133, "memory(GiB)": 47.44, "step": 20960, "token_acc": 0.8425504229017566, "train_speed(iter/s)": 0.095693 }, { "epoch": 0.9749490639686798, "grad_norm": 7.886797904968262, "learning_rate": 1.7105955506059867e-08, "loss": 0.6988693714141846, "memory(GiB)": 47.44, "step": 20965, "token_acc": 0.8223776223776224, "train_speed(iter/s)": 0.095706 }, { "epoch": 0.9751815822286294, "grad_norm": 11.55400562286377, "learning_rate": 1.6789655665006565e-08, "loss": 0.6574934005737305, "memory(GiB)": 47.44, "step": 20970, "token_acc": 0.8405327573794097, "train_speed(iter/s)": 0.095719 }, { "epoch": 0.975414100488579, "grad_norm": 6.187772750854492, "learning_rate": 1.6476302550084145e-08, "loss": 0.7456907272338867, "memory(GiB)": 47.44, "step": 20975, "token_acc": 0.8103341584158416, "train_speed(iter/s)": 0.095732 }, { "epoch": 0.9756466187485286, "grad_norm": 8.612384796142578, "learning_rate": 1.6165896346587162e-08, "loss": 0.6755566596984863, "memory(GiB)": 47.44, "step": 20980, "token_acc": 0.8424725822532403, "train_speed(iter/s)": 0.095744 }, { "epoch": 0.9758791370084782, "grad_norm": 6.694667339324951, "learning_rate": 1.585843723806879e-08, "loss": 0.7141738414764405, "memory(GiB)": 47.44, "step": 20985, "token_acc": 0.8280895231916964, "train_speed(iter/s)": 0.095757 }, { "epoch": 0.9761116552684278, "grad_norm": 8.811197280883789, "learning_rate": 1.55539254063386e-08, "loss": 0.6681375503540039, "memory(GiB)": 47.44, "step": 20990, "token_acc": 0.8291426840351409, "train_speed(iter/s)": 0.09577 }, { "epoch": 0.9763441735283774, "grad_norm": 8.207998275756836, "learning_rate": 1.525236103146477e-08, "loss": 0.661448621749878, "memory(GiB)": 47.44, "step": 20995, "token_acc": 0.8250607427976397, "train_speed(iter/s)": 0.095783 }, { "epoch": 0.976576691788327, "grad_norm": 7.307313919067383, "learning_rate": 1.4953744291770766e-08, "loss": 0.663054084777832, "memory(GiB)": 47.44, "step": 21000, "token_acc": 0.839825263924281, "train_speed(iter/s)": 0.095796 }, { "epoch": 0.976576691788327, "eval_loss": 0.5556771159172058, "eval_runtime": 295.8602, "eval_samples_per_second": 11.745, "eval_steps_per_second": 11.745, "step": 21000 }, { "epoch": 0.9768092100482766, "grad_norm": 8.349247932434082, "learning_rate": 1.4658075363838121e-08, "loss": 0.5215034484863281, "memory(GiB)": 47.44, "step": 21005, "token_acc": 0.8354958257626087, "train_speed(iter/s)": 0.095679 }, { "epoch": 0.9770417283082262, "grad_norm": 8.144487380981445, "learning_rate": 1.4365354422504751e-08, "loss": 0.707400131225586, "memory(GiB)": 47.44, "step": 21010, "token_acc": 0.8157653528872594, "train_speed(iter/s)": 0.095691 }, { "epoch": 0.9772742465681759, "grad_norm": 10.015386581420898, "learning_rate": 1.4075581640866088e-08, "loss": 0.6027235507965087, "memory(GiB)": 47.44, "step": 21015, "token_acc": 0.8569099062372605, "train_speed(iter/s)": 0.095704 }, { "epoch": 0.9775067648281254, "grad_norm": 6.5005412101745605, "learning_rate": 1.3788757190273394e-08, "loss": 0.7321415901184082, "memory(GiB)": 47.44, "step": 21020, "token_acc": 0.808440366972477, "train_speed(iter/s)": 0.095717 }, { "epoch": 0.977739283088075, "grad_norm": 6.8954548835754395, "learning_rate": 1.3504881240334888e-08, "loss": 0.5201037406921387, "memory(GiB)": 47.44, "step": 21025, "token_acc": 0.858516909711449, "train_speed(iter/s)": 0.095729 }, { "epoch": 0.9779718013480246, "grad_norm": 7.919386863708496, "learning_rate": 1.3223953958915736e-08, "loss": 0.7142354011535644, "memory(GiB)": 47.44, "step": 21030, "token_acc": 0.8170170827858082, "train_speed(iter/s)": 0.095742 }, { "epoch": 0.9782043196079742, "grad_norm": 8.009725570678711, "learning_rate": 1.2945975512135833e-08, "loss": 0.665521764755249, "memory(GiB)": 47.44, "step": 21035, "token_acc": 0.8361138370951914, "train_speed(iter/s)": 0.095755 }, { "epoch": 0.9784368378679238, "grad_norm": 10.154017448425293, "learning_rate": 1.2670946064373135e-08, "loss": 0.5645842552185059, "memory(GiB)": 47.44, "step": 21040, "token_acc": 0.8581791802684077, "train_speed(iter/s)": 0.095767 }, { "epoch": 0.9786693561278734, "grad_norm": 8.321249961853027, "learning_rate": 1.2398865778261438e-08, "loss": 0.6524802207946777, "memory(GiB)": 47.44, "step": 21045, "token_acc": 0.843369300382875, "train_speed(iter/s)": 0.09578 }, { "epoch": 0.9789018743878231, "grad_norm": 8.507335662841797, "learning_rate": 1.2129734814689265e-08, "loss": 0.5944690227508544, "memory(GiB)": 47.44, "step": 21050, "token_acc": 0.8567393058918482, "train_speed(iter/s)": 0.095793 }, { "epoch": 0.9789018743878231, "eval_loss": 0.5553185343742371, "eval_runtime": 296.9087, "eval_samples_per_second": 11.704, "eval_steps_per_second": 11.704, "step": 21050 } ], "logging_steps": 5, "max_steps": 21503, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1497138693949542e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }