diff --git "a/output/checkpoint-21503/trainer_state.json" "b/output/checkpoint-21503/trainer_state.json" new file mode 100644--- /dev/null +++ "b/output/checkpoint-21503/trainer_state.json" @@ -0,0 +1,46491 @@ +{ + "best_metric": 0.55531853, + "best_model_checkpoint": "/root/workspace/myPharmHGT/KV_PLM/output/checkpoint-21050", + "epoch": 0.9999680287392569, + "eval_steps": 50, + "global_step": 21503, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.650365198992033e-05, + "grad_norm": 11.30196762084961, + "learning_rate": 9.29368029739777e-09, + "loss": 5.595128536224365, + "memory(GiB)": 16.79, + "step": 1, + "token_acc": 0.35786802030456855, + "train_speed(iter/s)": 0.181377 + }, + { + "epoch": 0.00023251825994960167, + "grad_norm": 14.642141342163086, + "learning_rate": 4.6468401486988856e-08, + "loss": 5.128164291381836, + "memory(GiB)": 16.8, + "step": 5, + "token_acc": 0.4304812834224599, + "train_speed(iter/s)": 0.21625 + }, + { + "epoch": 0.00046503651989920333, + "grad_norm": 7.316074848175049, + "learning_rate": 9.293680297397771e-08, + "loss": 4.300411987304687, + "memory(GiB)": 17.84, + "step": 10, + "token_acc": 0.4681737061273052, + "train_speed(iter/s)": 0.219536 + }, + { + "epoch": 0.000697554779848805, + "grad_norm": 11.100884437561035, + "learning_rate": 1.3940520446096655e-07, + "loss": 4.639457321166992, + "memory(GiB)": 19.03, + "step": 15, + "token_acc": 0.43668770887486075, + "train_speed(iter/s)": 0.220308 + }, + { + "epoch": 0.0009300730397984067, + "grad_norm": 6.971445560455322, + "learning_rate": 1.8587360594795542e-07, + "loss": 5.257072448730469, + "memory(GiB)": 19.04, + "step": 20, + "token_acc": 0.4142085583714167, + "train_speed(iter/s)": 0.222155 + }, + { + "epoch": 0.0011625912997480082, + "grad_norm": 7.333343505859375, + "learning_rate": 2.3234200743494425e-07, + "loss": 4.158517074584961, + "memory(GiB)": 19.04, + "step": 25, + "token_acc": 0.4433541480820696, + "train_speed(iter/s)": 0.221422 + }, + { + "epoch": 0.00139510955969761, + "grad_norm": 7.327789783477783, + "learning_rate": 2.788104089219331e-07, + "loss": 3.946809768676758, + "memory(GiB)": 20.29, + "step": 30, + "token_acc": 0.4627841793833697, + "train_speed(iter/s)": 0.220406 + }, + { + "epoch": 0.0016276278196472116, + "grad_norm": 9.05079174041748, + "learning_rate": 3.2527881040892197e-07, + "loss": 4.110320281982422, + "memory(GiB)": 22.09, + "step": 35, + "token_acc": 0.4564489112227806, + "train_speed(iter/s)": 0.21968 + }, + { + "epoch": 0.0018601460795968133, + "grad_norm": 10.026455879211426, + "learning_rate": 3.7174721189591085e-07, + "loss": 4.707549285888672, + "memory(GiB)": 22.09, + "step": 40, + "token_acc": 0.41740674955595025, + "train_speed(iter/s)": 0.218784 + }, + { + "epoch": 0.002092664339546415, + "grad_norm": 7.746947288513184, + "learning_rate": 4.1821561338289967e-07, + "loss": 4.500521087646485, + "memory(GiB)": 22.09, + "step": 45, + "token_acc": 0.4180354267310789, + "train_speed(iter/s)": 0.219268 + }, + { + "epoch": 0.0023251825994960165, + "grad_norm": 11.605240821838379, + "learning_rate": 4.646840148698885e-07, + "loss": 5.464200210571289, + "memory(GiB)": 22.09, + "step": 50, + "token_acc": 0.37989080982711554, + "train_speed(iter/s)": 0.219704 + }, + { + "epoch": 0.0023251825994960165, + "eval_loss": 6.238847732543945, + "eval_runtime": 290.2527, + "eval_samples_per_second": 11.972, + "eval_steps_per_second": 11.972, + "step": 50 + }, + { + "epoch": 0.0025577008594456184, + "grad_norm": 10.126920700073242, + "learning_rate": 5.111524163568774e-07, + "loss": 4.355241012573242, + "memory(GiB)": 22.09, + "step": 55, + "token_acc": 0.4303478219228375, + "train_speed(iter/s)": 0.101729 + }, + { + "epoch": 0.00279021911939522, + "grad_norm": 10.698599815368652, + "learning_rate": 5.576208178438662e-07, + "loss": 4.348709487915039, + "memory(GiB)": 22.09, + "step": 60, + "token_acc": 0.44151949350216596, + "train_speed(iter/s)": 0.106489 + }, + { + "epoch": 0.0030227373793448218, + "grad_norm": 10.698352813720703, + "learning_rate": 6.04089219330855e-07, + "loss": 4.434131240844726, + "memory(GiB)": 22.09, + "step": 65, + "token_acc": 0.4308273257809818, + "train_speed(iter/s)": 0.110843 + }, + { + "epoch": 0.0032552556392944233, + "grad_norm": 11.780677795410156, + "learning_rate": 6.505576208178439e-07, + "loss": 4.880805969238281, + "memory(GiB)": 22.09, + "step": 70, + "token_acc": 0.420174741858618, + "train_speed(iter/s)": 0.114884 + }, + { + "epoch": 0.003487773899244025, + "grad_norm": 10.340365409851074, + "learning_rate": 6.970260223048328e-07, + "loss": 4.451810836791992, + "memory(GiB)": 22.09, + "step": 75, + "token_acc": 0.4266284896206156, + "train_speed(iter/s)": 0.118675 + }, + { + "epoch": 0.0037202921591936266, + "grad_norm": 8.917567253112793, + "learning_rate": 7.434944237918217e-07, + "loss": 4.337361145019531, + "memory(GiB)": 22.09, + "step": 80, + "token_acc": 0.4407567208762031, + "train_speed(iter/s)": 0.122245 + }, + { + "epoch": 0.003952810419143228, + "grad_norm": 7.564225673675537, + "learning_rate": 7.899628252788105e-07, + "loss": 4.301780319213867, + "memory(GiB)": 22.09, + "step": 85, + "token_acc": 0.43113176236309325, + "train_speed(iter/s)": 0.125462 + }, + { + "epoch": 0.00418532867909283, + "grad_norm": 9.511303901672363, + "learning_rate": 8.364312267657993e-07, + "loss": 4.201332473754883, + "memory(GiB)": 22.09, + "step": 90, + "token_acc": 0.43982630272952855, + "train_speed(iter/s)": 0.128436 + }, + { + "epoch": 0.004417846939042432, + "grad_norm": 11.19315242767334, + "learning_rate": 8.828996282527883e-07, + "loss": 4.037698745727539, + "memory(GiB)": 22.09, + "step": 95, + "token_acc": 0.4582210242587601, + "train_speed(iter/s)": 0.131168 + }, + { + "epoch": 0.004650365198992033, + "grad_norm": 10.009793281555176, + "learning_rate": 9.29368029739777e-07, + "loss": 4.3285572052001955, + "memory(GiB)": 22.09, + "step": 100, + "token_acc": 0.44606819763395966, + "train_speed(iter/s)": 0.133872 + }, + { + "epoch": 0.004650365198992033, + "eval_loss": 6.00515079498291, + "eval_runtime": 293.5348, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 11.838, + "step": 100 + }, + { + "epoch": 0.004882883458941635, + "grad_norm": 10.538952827453613, + "learning_rate": 9.75836431226766e-07, + "loss": 4.578464508056641, + "memory(GiB)": 22.09, + "step": 105, + "token_acc": 0.4315021837560604, + "train_speed(iter/s)": 0.098632 + }, + { + "epoch": 0.005115401718891237, + "grad_norm": 8.359787940979004, + "learning_rate": 1.0223048327137547e-06, + "loss": 3.992523193359375, + "memory(GiB)": 22.09, + "step": 110, + "token_acc": 0.45879120879120877, + "train_speed(iter/s)": 0.10116 + }, + { + "epoch": 0.005347919978840839, + "grad_norm": 9.403013229370117, + "learning_rate": 1.0687732342007436e-06, + "loss": 4.678741836547852, + "memory(GiB)": 22.09, + "step": 115, + "token_acc": 0.4170796797560046, + "train_speed(iter/s)": 0.103556 + }, + { + "epoch": 0.00558043823879044, + "grad_norm": 12.418402671813965, + "learning_rate": 1.1152416356877324e-06, + "loss": 4.810413360595703, + "memory(GiB)": 22.09, + "step": 120, + "token_acc": 0.42337876910367617, + "train_speed(iter/s)": 0.105865 + }, + { + "epoch": 0.005812956498740042, + "grad_norm": 10.668410301208496, + "learning_rate": 1.1617100371747212e-06, + "loss": 4.338812637329101, + "memory(GiB)": 22.09, + "step": 125, + "token_acc": 0.444243301955105, + "train_speed(iter/s)": 0.108127 + }, + { + "epoch": 0.0060454747586896436, + "grad_norm": 9.335319519042969, + "learning_rate": 1.20817843866171e-06, + "loss": 4.58795051574707, + "memory(GiB)": 22.09, + "step": 130, + "token_acc": 0.42386831275720166, + "train_speed(iter/s)": 0.110288 + }, + { + "epoch": 0.006277993018639245, + "grad_norm": 7.6158881187438965, + "learning_rate": 1.2546468401486989e-06, + "loss": 4.130472183227539, + "memory(GiB)": 22.09, + "step": 135, + "token_acc": 0.4362486828240253, + "train_speed(iter/s)": 0.112356 + }, + { + "epoch": 0.0065105112785888465, + "grad_norm": 9.206437110900879, + "learning_rate": 1.3011152416356879e-06, + "loss": 4.286816787719727, + "memory(GiB)": 22.09, + "step": 140, + "token_acc": 0.42203258339798294, + "train_speed(iter/s)": 0.11431 + }, + { + "epoch": 0.006743029538538448, + "grad_norm": 8.033815383911133, + "learning_rate": 1.3475836431226765e-06, + "loss": 3.698823165893555, + "memory(GiB)": 22.09, + "step": 145, + "token_acc": 0.4792787092692186, + "train_speed(iter/s)": 0.116227 + }, + { + "epoch": 0.00697554779848805, + "grad_norm": 9.180941581726074, + "learning_rate": 1.3940520446096655e-06, + "loss": 3.7941364288330077, + "memory(GiB)": 22.09, + "step": 150, + "token_acc": 0.4449166394246486, + "train_speed(iter/s)": 0.118057 + }, + { + "epoch": 0.00697554779848805, + "eval_loss": 4.911423206329346, + "eval_runtime": 292.8063, + "eval_samples_per_second": 11.868, + "eval_steps_per_second": 11.868, + "step": 150 + }, + { + "epoch": 0.007208066058437651, + "grad_norm": 8.536662101745605, + "learning_rate": 1.4405204460966544e-06, + "loss": 3.2620067596435547, + "memory(GiB)": 22.09, + "step": 155, + "token_acc": 0.45824087550603265, + "train_speed(iter/s)": 0.097604 + }, + { + "epoch": 0.007440584318387253, + "grad_norm": 12.196755409240723, + "learning_rate": 1.4869888475836434e-06, + "loss": 3.9180809020996095, + "memory(GiB)": 22.09, + "step": 160, + "token_acc": 0.4449484536082474, + "train_speed(iter/s)": 0.09936 + }, + { + "epoch": 0.007673102578336855, + "grad_norm": 4.950088977813721, + "learning_rate": 1.533457249070632e-06, + "loss": 3.5914573669433594, + "memory(GiB)": 22.09, + "step": 165, + "token_acc": 0.4844632768361582, + "train_speed(iter/s)": 0.101032 + }, + { + "epoch": 0.007905620838286456, + "grad_norm": 6.352217197418213, + "learning_rate": 1.579925650557621e-06, + "loss": 3.5214935302734376, + "memory(GiB)": 22.09, + "step": 170, + "token_acc": 0.49245647969052225, + "train_speed(iter/s)": 0.102611 + }, + { + "epoch": 0.008138139098236059, + "grad_norm": 7.197726249694824, + "learning_rate": 1.6263940520446099e-06, + "loss": 3.174709701538086, + "memory(GiB)": 22.09, + "step": 175, + "token_acc": 0.5147719974309569, + "train_speed(iter/s)": 0.1042 + }, + { + "epoch": 0.00837065735818566, + "grad_norm": 6.746879577636719, + "learning_rate": 1.6728624535315987e-06, + "loss": 3.4479164123535155, + "memory(GiB)": 22.09, + "step": 180, + "token_acc": 0.47926447574334896, + "train_speed(iter/s)": 0.105701 + }, + { + "epoch": 0.008603175618135261, + "grad_norm": 5.178398132324219, + "learning_rate": 1.7193308550185875e-06, + "loss": 3.3821487426757812, + "memory(GiB)": 22.09, + "step": 185, + "token_acc": 0.489778534923339, + "train_speed(iter/s)": 0.107214 + }, + { + "epoch": 0.008835693878084864, + "grad_norm": 4.9685540199279785, + "learning_rate": 1.7657992565055765e-06, + "loss": 2.7297746658325197, + "memory(GiB)": 22.09, + "step": 190, + "token_acc": 0.5216201423097975, + "train_speed(iter/s)": 0.108622 + }, + { + "epoch": 0.009068212138034465, + "grad_norm": 3.6990842819213867, + "learning_rate": 1.8122676579925652e-06, + "loss": 2.962936782836914, + "memory(GiB)": 22.09, + "step": 195, + "token_acc": 0.5222845823704193, + "train_speed(iter/s)": 0.110023 + }, + { + "epoch": 0.009300730397984066, + "grad_norm": 4.623185634613037, + "learning_rate": 1.858736059479554e-06, + "loss": 2.6473554611206054, + "memory(GiB)": 22.09, + "step": 200, + "token_acc": 0.5438311688311688, + "train_speed(iter/s)": 0.111444 + }, + { + "epoch": 0.009300730397984066, + "eval_loss": 3.5655155181884766, + "eval_runtime": 293.694, + "eval_samples_per_second": 11.832, + "eval_steps_per_second": 11.832, + "step": 200 + }, + { + "epoch": 0.009533248657933669, + "grad_norm": 6.069931507110596, + "learning_rate": 1.905204460966543e-06, + "loss": 2.718964767456055, + "memory(GiB)": 22.09, + "step": 205, + "token_acc": 0.5408020470992764, + "train_speed(iter/s)": 0.097091 + }, + { + "epoch": 0.00976576691788327, + "grad_norm": 18.15644645690918, + "learning_rate": 1.951672862453532e-06, + "loss": 2.583760070800781, + "memory(GiB)": 22.09, + "step": 210, + "token_acc": 0.5504587155963303, + "train_speed(iter/s)": 0.098378 + }, + { + "epoch": 0.009998285177832873, + "grad_norm": 6.019149303436279, + "learning_rate": 1.9981412639405207e-06, + "loss": 2.5243818283081056, + "memory(GiB)": 22.09, + "step": 215, + "token_acc": 0.549553208773355, + "train_speed(iter/s)": 0.099675 + }, + { + "epoch": 0.010230803437782474, + "grad_norm": 8.37928581237793, + "learning_rate": 2.0446096654275095e-06, + "loss": 2.481988525390625, + "memory(GiB)": 22.09, + "step": 220, + "token_acc": 0.5509568313306631, + "train_speed(iter/s)": 0.100916 + }, + { + "epoch": 0.010463321697732075, + "grad_norm": 12.24028205871582, + "learning_rate": 2.0910780669144983e-06, + "loss": 2.3563426971435546, + "memory(GiB)": 22.09, + "step": 225, + "token_acc": 0.5385551948051948, + "train_speed(iter/s)": 0.102149 + }, + { + "epoch": 0.010695839957681677, + "grad_norm": 6.658745288848877, + "learning_rate": 2.137546468401487e-06, + "loss": 2.2650516510009764, + "memory(GiB)": 22.09, + "step": 230, + "token_acc": 0.538664323374341, + "train_speed(iter/s)": 0.10335 + }, + { + "epoch": 0.010928358217631278, + "grad_norm": 4.5154008865356445, + "learning_rate": 2.184014869888476e-06, + "loss": 2.082209587097168, + "memory(GiB)": 22.09, + "step": 235, + "token_acc": 0.5826681870011402, + "train_speed(iter/s)": 0.1045 + }, + { + "epoch": 0.01116087647758088, + "grad_norm": 3.9229602813720703, + "learning_rate": 2.2304832713754648e-06, + "loss": 2.032619857788086, + "memory(GiB)": 22.09, + "step": 240, + "token_acc": 0.5845122859270291, + "train_speed(iter/s)": 0.105659 + }, + { + "epoch": 0.011393394737530482, + "grad_norm": 4.006194591522217, + "learning_rate": 2.276951672862454e-06, + "loss": 2.0263885498046874, + "memory(GiB)": 22.09, + "step": 245, + "token_acc": 0.6218020022246941, + "train_speed(iter/s)": 0.106808 + }, + { + "epoch": 0.011625912997480083, + "grad_norm": 3.4309797286987305, + "learning_rate": 2.3234200743494424e-06, + "loss": 1.9709346771240235, + "memory(GiB)": 22.09, + "step": 250, + "token_acc": 0.5988117573483427, + "train_speed(iter/s)": 0.107935 + }, + { + "epoch": 0.011625912997480083, + "eval_loss": 1.9419108629226685, + "eval_runtime": 292.9372, + "eval_samples_per_second": 11.863, + "eval_steps_per_second": 11.863, + "step": 250 + }, + { + "epoch": 0.011858431257429684, + "grad_norm": 4.214391231536865, + "learning_rate": 2.3698884758364317e-06, + "loss": 1.9560165405273438, + "memory(GiB)": 22.09, + "step": 255, + "token_acc": 0.6185187256789907, + "train_speed(iter/s)": 0.096879 + }, + { + "epoch": 0.012090949517379287, + "grad_norm": 2.496126651763916, + "learning_rate": 2.41635687732342e-06, + "loss": 1.8916751861572265, + "memory(GiB)": 22.09, + "step": 260, + "token_acc": 0.6213592233009708, + "train_speed(iter/s)": 0.097938 + }, + { + "epoch": 0.012323467777328888, + "grad_norm": 2.3947010040283203, + "learning_rate": 2.462825278810409e-06, + "loss": 1.8349943161010742, + "memory(GiB)": 22.09, + "step": 265, + "token_acc": 0.6115827774408733, + "train_speed(iter/s)": 0.098953 + }, + { + "epoch": 0.01255598603727849, + "grad_norm": 2.534743070602417, + "learning_rate": 2.5092936802973977e-06, + "loss": 1.7406538009643555, + "memory(GiB)": 22.09, + "step": 270, + "token_acc": 0.6380839739798936, + "train_speed(iter/s)": 0.099965 + }, + { + "epoch": 0.012788504297228092, + "grad_norm": 3.6293578147888184, + "learning_rate": 2.555762081784387e-06, + "loss": 1.8015171051025392, + "memory(GiB)": 22.09, + "step": 275, + "token_acc": 0.6511627906976745, + "train_speed(iter/s)": 0.100974 + }, + { + "epoch": 0.013021022557177693, + "grad_norm": 4.162702560424805, + "learning_rate": 2.6022304832713758e-06, + "loss": 1.8374807357788085, + "memory(GiB)": 22.09, + "step": 280, + "token_acc": 0.6273263731275533, + "train_speed(iter/s)": 0.101975 + }, + { + "epoch": 0.013253540817127296, + "grad_norm": 4.197765827178955, + "learning_rate": 2.6486988847583646e-06, + "loss": 1.6645938873291015, + "memory(GiB)": 22.09, + "step": 285, + "token_acc": 0.6414484319430973, + "train_speed(iter/s)": 0.102915 + }, + { + "epoch": 0.013486059077076897, + "grad_norm": 2.419771671295166, + "learning_rate": 2.695167286245353e-06, + "loss": 1.7501996994018554, + "memory(GiB)": 22.09, + "step": 290, + "token_acc": 0.6264450867052023, + "train_speed(iter/s)": 0.103873 + }, + { + "epoch": 0.013718577337026498, + "grad_norm": 2.622260808944702, + "learning_rate": 2.7416356877323423e-06, + "loss": 1.5766103744506836, + "memory(GiB)": 22.09, + "step": 295, + "token_acc": 0.6576110392410521, + "train_speed(iter/s)": 0.104818 + }, + { + "epoch": 0.0139510955969761, + "grad_norm": 3.4051737785339355, + "learning_rate": 2.788104089219331e-06, + "loss": 1.6507225036621094, + "memory(GiB)": 22.09, + "step": 300, + "token_acc": 0.6394230769230769, + "train_speed(iter/s)": 0.105736 + }, + { + "epoch": 0.0139510955969761, + "eval_loss": 1.518836498260498, + "eval_runtime": 288.759, + "eval_samples_per_second": 12.034, + "eval_steps_per_second": 12.034, + "step": 300 + }, + { + "epoch": 0.014183613856925702, + "grad_norm": 2.1325795650482178, + "learning_rate": 2.83457249070632e-06, + "loss": 1.4976882934570312, + "memory(GiB)": 22.09, + "step": 305, + "token_acc": 0.6439182878445585, + "train_speed(iter/s)": 0.096836 + }, + { + "epoch": 0.014416132116875303, + "grad_norm": 2.5515503883361816, + "learning_rate": 2.8810408921933087e-06, + "loss": 1.582894992828369, + "memory(GiB)": 22.09, + "step": 310, + "token_acc": 0.6342662632375189, + "train_speed(iter/s)": 0.097711 + }, + { + "epoch": 0.014648650376824906, + "grad_norm": 2.6792547702789307, + "learning_rate": 2.927509293680298e-06, + "loss": 1.6397327423095702, + "memory(GiB)": 22.09, + "step": 315, + "token_acc": 0.6391833529642716, + "train_speed(iter/s)": 0.098569 + }, + { + "epoch": 0.014881168636774507, + "grad_norm": 3.859010934829712, + "learning_rate": 2.973977695167287e-06, + "loss": 1.3939047813415528, + "memory(GiB)": 22.09, + "step": 320, + "token_acc": 0.6882701962574167, + "train_speed(iter/s)": 0.099432 + }, + { + "epoch": 0.015113686896724108, + "grad_norm": 2.2595622539520264, + "learning_rate": 3.020446096654275e-06, + "loss": 1.4366521835327148, + "memory(GiB)": 22.09, + "step": 325, + "token_acc": 0.6654362416107382, + "train_speed(iter/s)": 0.100268 + }, + { + "epoch": 0.01534620515667371, + "grad_norm": 3.0457279682159424, + "learning_rate": 3.066914498141264e-06, + "loss": 1.526663589477539, + "memory(GiB)": 24.06, + "step": 330, + "token_acc": 0.6629547141796585, + "train_speed(iter/s)": 0.101072 + }, + { + "epoch": 0.015578723416623311, + "grad_norm": 9.594057083129883, + "learning_rate": 3.113382899628253e-06, + "loss": 1.3833000183105468, + "memory(GiB)": 24.06, + "step": 335, + "token_acc": 0.6841689696012633, + "train_speed(iter/s)": 0.101881 + }, + { + "epoch": 0.015811241676572912, + "grad_norm": 3.075023889541626, + "learning_rate": 3.159851301115242e-06, + "loss": 1.5897629737854004, + "memory(GiB)": 24.06, + "step": 340, + "token_acc": 0.66429418742586, + "train_speed(iter/s)": 0.102694 + }, + { + "epoch": 0.016043759936522514, + "grad_norm": 5.14961051940918, + "learning_rate": 3.206319702602231e-06, + "loss": 1.5434186935424805, + "memory(GiB)": 24.06, + "step": 345, + "token_acc": 0.6542893725992317, + "train_speed(iter/s)": 0.10349 + }, + { + "epoch": 0.016276278196472118, + "grad_norm": 2.473466157913208, + "learning_rate": 3.2527881040892197e-06, + "loss": 1.457532024383545, + "memory(GiB)": 24.06, + "step": 350, + "token_acc": 0.668398533007335, + "train_speed(iter/s)": 0.104276 + }, + { + "epoch": 0.016276278196472118, + "eval_loss": 1.3259131908416748, + "eval_runtime": 288.2494, + "eval_samples_per_second": 12.056, + "eval_steps_per_second": 12.056, + "step": 350 + }, + { + "epoch": 0.01650879645642172, + "grad_norm": 2.0314037799835205, + "learning_rate": 3.299256505576208e-06, + "loss": 1.3253539085388184, + "memory(GiB)": 24.06, + "step": 355, + "token_acc": 0.6744063535139881, + "train_speed(iter/s)": 0.096782 + }, + { + "epoch": 0.01674131471637132, + "grad_norm": 2.7340409755706787, + "learning_rate": 3.3457249070631974e-06, + "loss": 1.3673904418945313, + "memory(GiB)": 24.06, + "step": 360, + "token_acc": 0.7089552238805971, + "train_speed(iter/s)": 0.097525 + }, + { + "epoch": 0.01697383297632092, + "grad_norm": 1.6114614009857178, + "learning_rate": 3.392193308550186e-06, + "loss": 1.3874520301818847, + "memory(GiB)": 24.06, + "step": 365, + "token_acc": 0.6815522020326455, + "train_speed(iter/s)": 0.098261 + }, + { + "epoch": 0.017206351236270522, + "grad_norm": 3.4921016693115234, + "learning_rate": 3.438661710037175e-06, + "loss": 1.558394718170166, + "memory(GiB)": 24.06, + "step": 370, + "token_acc": 0.6628183361629881, + "train_speed(iter/s)": 0.098969 + }, + { + "epoch": 0.017438869496220127, + "grad_norm": 3.3175108432769775, + "learning_rate": 3.485130111524164e-06, + "loss": 1.4499250411987306, + "memory(GiB)": 24.06, + "step": 375, + "token_acc": 0.6738898756660746, + "train_speed(iter/s)": 0.09971 + }, + { + "epoch": 0.017671387756169728, + "grad_norm": 1.9265453815460205, + "learning_rate": 3.531598513011153e-06, + "loss": 1.455325222015381, + "memory(GiB)": 24.06, + "step": 380, + "token_acc": 0.6769176387416048, + "train_speed(iter/s)": 0.100433 + }, + { + "epoch": 0.01790390601611933, + "grad_norm": 2.1353354454040527, + "learning_rate": 3.5780669144981415e-06, + "loss": 1.4541678428649902, + "memory(GiB)": 26.73, + "step": 385, + "token_acc": 0.6730624529721595, + "train_speed(iter/s)": 0.101103 + }, + { + "epoch": 0.01813642427606893, + "grad_norm": 3.1384456157684326, + "learning_rate": 3.6245353159851303e-06, + "loss": 1.4149900436401368, + "memory(GiB)": 26.73, + "step": 390, + "token_acc": 0.6772802653399669, + "train_speed(iter/s)": 0.101775 + }, + { + "epoch": 0.01836894253601853, + "grad_norm": 2.3894238471984863, + "learning_rate": 3.671003717472119e-06, + "loss": 1.4391159057617187, + "memory(GiB)": 26.73, + "step": 395, + "token_acc": 0.6768361581920904, + "train_speed(iter/s)": 0.102484 + }, + { + "epoch": 0.018601460795968132, + "grad_norm": 2.095750093460083, + "learning_rate": 3.717472118959108e-06, + "loss": 1.2132759094238281, + "memory(GiB)": 26.73, + "step": 400, + "token_acc": 0.7314036725801432, + "train_speed(iter/s)": 0.103155 + }, + { + "epoch": 0.018601460795968132, + "eval_loss": 1.243235468864441, + "eval_runtime": 290.9778, + "eval_samples_per_second": 11.942, + "eval_steps_per_second": 11.942, + "step": 400 + }, + { + "epoch": 0.018833979055917736, + "grad_norm": 2.0978684425354004, + "learning_rate": 3.763940520446097e-06, + "loss": 1.314230728149414, + "memory(GiB)": 26.73, + "step": 405, + "token_acc": 0.6890068657193038, + "train_speed(iter/s)": 0.096617 + }, + { + "epoch": 0.019066497315867337, + "grad_norm": 2.425307273864746, + "learning_rate": 3.810408921933086e-06, + "loss": 1.3469379425048829, + "memory(GiB)": 26.73, + "step": 410, + "token_acc": 0.6947027901334412, + "train_speed(iter/s)": 0.09728 + }, + { + "epoch": 0.01929901557581694, + "grad_norm": 2.1733996868133545, + "learning_rate": 3.8568773234200744e-06, + "loss": 1.4006044387817382, + "memory(GiB)": 26.73, + "step": 415, + "token_acc": 0.6759921925829538, + "train_speed(iter/s)": 0.097936 + }, + { + "epoch": 0.01953153383576654, + "grad_norm": 2.709667444229126, + "learning_rate": 3.903345724907064e-06, + "loss": 1.3892633438110351, + "memory(GiB)": 26.73, + "step": 420, + "token_acc": 0.6919795221843004, + "train_speed(iter/s)": 0.098593 + }, + { + "epoch": 0.01976405209571614, + "grad_norm": 2.0785586833953857, + "learning_rate": 3.949814126394053e-06, + "loss": 1.3460134506225585, + "memory(GiB)": 26.73, + "step": 425, + "token_acc": 0.695500387897595, + "train_speed(iter/s)": 0.099222 + }, + { + "epoch": 0.019996570355665745, + "grad_norm": 2.9196395874023438, + "learning_rate": 3.996282527881041e-06, + "loss": 1.4188276290893556, + "memory(GiB)": 26.73, + "step": 430, + "token_acc": 0.6826769132244637, + "train_speed(iter/s)": 0.099863 + }, + { + "epoch": 0.020229088615615346, + "grad_norm": 2.977738618850708, + "learning_rate": 4.04275092936803e-06, + "loss": 1.3545875549316406, + "memory(GiB)": 26.73, + "step": 435, + "token_acc": 0.6918604651162791, + "train_speed(iter/s)": 0.100495 + }, + { + "epoch": 0.020461606875564947, + "grad_norm": 2.716374635696411, + "learning_rate": 4.089219330855019e-06, + "loss": 1.4137415885925293, + "memory(GiB)": 26.73, + "step": 440, + "token_acc": 0.6859414528370076, + "train_speed(iter/s)": 0.101102 + }, + { + "epoch": 0.020694125135514548, + "grad_norm": 2.3097028732299805, + "learning_rate": 4.135687732342008e-06, + "loss": 1.4122305870056153, + "memory(GiB)": 26.73, + "step": 445, + "token_acc": 0.6715374841168996, + "train_speed(iter/s)": 0.101712 + }, + { + "epoch": 0.02092664339546415, + "grad_norm": 2.968329429626465, + "learning_rate": 4.182156133828997e-06, + "loss": 1.3420489311218262, + "memory(GiB)": 26.73, + "step": 450, + "token_acc": 0.7025730484081989, + "train_speed(iter/s)": 0.102313 + }, + { + "epoch": 0.02092664339546415, + "eval_loss": 1.1884208917617798, + "eval_runtime": 290.2048, + "eval_samples_per_second": 11.974, + "eval_steps_per_second": 11.974, + "step": 450 + }, + { + "epoch": 0.02115916165541375, + "grad_norm": 2.2978663444519043, + "learning_rate": 4.228624535315986e-06, + "loss": 1.2818711280822754, + "memory(GiB)": 26.73, + "step": 455, + "token_acc": 0.6983738667434163, + "train_speed(iter/s)": 0.096555 + }, + { + "epoch": 0.021391679915363355, + "grad_norm": 3.826552629470825, + "learning_rate": 4.275092936802974e-06, + "loss": 1.373377227783203, + "memory(GiB)": 26.73, + "step": 460, + "token_acc": 0.6942446043165468, + "train_speed(iter/s)": 0.097138 + }, + { + "epoch": 0.021624198175312956, + "grad_norm": 2.246628999710083, + "learning_rate": 4.321561338289963e-06, + "loss": 1.3337231636047364, + "memory(GiB)": 26.73, + "step": 465, + "token_acc": 0.7166531932093775, + "train_speed(iter/s)": 0.097731 + }, + { + "epoch": 0.021856716435262557, + "grad_norm": 3.411140203475952, + "learning_rate": 4.368029739776952e-06, + "loss": 1.408590030670166, + "memory(GiB)": 26.73, + "step": 470, + "token_acc": 0.6903353057199211, + "train_speed(iter/s)": 0.098298 + }, + { + "epoch": 0.022089234695212158, + "grad_norm": 3.054403781890869, + "learning_rate": 4.414498141263941e-06, + "loss": 1.229485321044922, + "memory(GiB)": 26.73, + "step": 475, + "token_acc": 0.7275797373358349, + "train_speed(iter/s)": 0.09886 + }, + { + "epoch": 0.02232175295516176, + "grad_norm": 2.294562578201294, + "learning_rate": 4.4609665427509296e-06, + "loss": 1.3114431381225586, + "memory(GiB)": 26.73, + "step": 480, + "token_acc": 0.713089802130898, + "train_speed(iter/s)": 0.099429 + }, + { + "epoch": 0.02255427121511136, + "grad_norm": 2.637377977371216, + "learning_rate": 4.507434944237919e-06, + "loss": 1.3108051300048829, + "memory(GiB)": 26.73, + "step": 485, + "token_acc": 0.6991845363938387, + "train_speed(iter/s)": 0.099973 + }, + { + "epoch": 0.022786789475060965, + "grad_norm": 2.643446922302246, + "learning_rate": 4.553903345724908e-06, + "loss": 1.2051572799682617, + "memory(GiB)": 26.73, + "step": 490, + "token_acc": 0.7252704791344667, + "train_speed(iter/s)": 0.100528 + }, + { + "epoch": 0.023019307735010566, + "grad_norm": 2.522397994995117, + "learning_rate": 4.6003717472118964e-06, + "loss": 1.3973845481872558, + "memory(GiB)": 26.73, + "step": 495, + "token_acc": 0.6806872037914692, + "train_speed(iter/s)": 0.101073 + }, + { + "epoch": 0.023251825994960167, + "grad_norm": 2.2974302768707275, + "learning_rate": 4.646840148698885e-06, + "loss": 1.3299295425415039, + "memory(GiB)": 26.73, + "step": 500, + "token_acc": 0.7066365007541479, + "train_speed(iter/s)": 0.101625 + }, + { + "epoch": 0.023251825994960167, + "eval_loss": 1.1440743207931519, + "eval_runtime": 291.5453, + "eval_samples_per_second": 11.919, + "eval_steps_per_second": 11.919, + "step": 500 + }, + { + "epoch": 0.023484344254909768, + "grad_norm": 2.4379918575286865, + "learning_rate": 4.693308550185874e-06, + "loss": 1.2742385864257812, + "memory(GiB)": 26.73, + "step": 505, + "token_acc": 0.706128955128716, + "train_speed(iter/s)": 0.096465 + }, + { + "epoch": 0.02371686251485937, + "grad_norm": 3.101616621017456, + "learning_rate": 4.739776951672863e-06, + "loss": 1.1919514656066894, + "memory(GiB)": 26.73, + "step": 510, + "token_acc": 0.7258243793997777, + "train_speed(iter/s)": 0.096999 + }, + { + "epoch": 0.023949380774808973, + "grad_norm": 2.3769919872283936, + "learning_rate": 4.786245353159852e-06, + "loss": 1.3223759651184082, + "memory(GiB)": 26.73, + "step": 515, + "token_acc": 0.7008708822415751, + "train_speed(iter/s)": 0.097522 + }, + { + "epoch": 0.024181899034758574, + "grad_norm": 2.893718719482422, + "learning_rate": 4.83271375464684e-06, + "loss": 1.3019302368164063, + "memory(GiB)": 26.73, + "step": 520, + "token_acc": 0.6975822433610781, + "train_speed(iter/s)": 0.098045 + }, + { + "epoch": 0.024414417294708175, + "grad_norm": 3.2003087997436523, + "learning_rate": 4.879182156133829e-06, + "loss": 1.221653938293457, + "memory(GiB)": 26.73, + "step": 525, + "token_acc": 0.723404255319149, + "train_speed(iter/s)": 0.098566 + }, + { + "epoch": 0.024646935554657776, + "grad_norm": 3.078279495239258, + "learning_rate": 4.925650557620818e-06, + "loss": 1.1143360137939453, + "memory(GiB)": 26.73, + "step": 530, + "token_acc": 0.7328652624540287, + "train_speed(iter/s)": 0.099081 + }, + { + "epoch": 0.024879453814607377, + "grad_norm": 2.866652488708496, + "learning_rate": 4.972118959107807e-06, + "loss": 1.2642970085144043, + "memory(GiB)": 26.73, + "step": 535, + "token_acc": 0.7042682926829268, + "train_speed(iter/s)": 0.09959 + }, + { + "epoch": 0.02511197207455698, + "grad_norm": 2.440160036087036, + "learning_rate": 5.0185873605947954e-06, + "loss": 1.2780232429504395, + "memory(GiB)": 26.73, + "step": 540, + "token_acc": 0.6966232385003989, + "train_speed(iter/s)": 0.100089 + }, + { + "epoch": 0.025344490334506583, + "grad_norm": 2.839524030685425, + "learning_rate": 5.0650557620817855e-06, + "loss": 1.1742261886596679, + "memory(GiB)": 26.73, + "step": 545, + "token_acc": 0.7220434432823813, + "train_speed(iter/s)": 0.100589 + }, + { + "epoch": 0.025577008594456184, + "grad_norm": 4.023037910461426, + "learning_rate": 5.111524163568774e-06, + "loss": 1.1907401084899902, + "memory(GiB)": 26.73, + "step": 550, + "token_acc": 0.7297198981447799, + "train_speed(iter/s)": 0.101092 + }, + { + "epoch": 0.025577008594456184, + "eval_loss": 1.100506067276001, + "eval_runtime": 289.3509, + "eval_samples_per_second": 12.01, + "eval_steps_per_second": 12.01, + "step": 550 + }, + { + "epoch": 0.025809526854405785, + "grad_norm": 2.9395487308502197, + "learning_rate": 5.157992565055762e-06, + "loss": 1.2625597953796386, + "memory(GiB)": 26.73, + "step": 555, + "token_acc": 0.7145255684543798, + "train_speed(iter/s)": 0.096472 + }, + { + "epoch": 0.026042045114355386, + "grad_norm": 2.969301462173462, + "learning_rate": 5.2044609665427516e-06, + "loss": 1.2307548522949219, + "memory(GiB)": 26.73, + "step": 560, + "token_acc": 0.7089829250185598, + "train_speed(iter/s)": 0.09695 + }, + { + "epoch": 0.026274563374304987, + "grad_norm": 2.336460828781128, + "learning_rate": 5.25092936802974e-06, + "loss": 1.2285655975341796, + "memory(GiB)": 26.73, + "step": 565, + "token_acc": 0.7056101792943898, + "train_speed(iter/s)": 0.097424 + }, + { + "epoch": 0.02650708163425459, + "grad_norm": 2.536938428878784, + "learning_rate": 5.297397769516729e-06, + "loss": 1.2232088088989257, + "memory(GiB)": 26.73, + "step": 570, + "token_acc": 0.7129063405214033, + "train_speed(iter/s)": 0.09789 + }, + { + "epoch": 0.026739599894204193, + "grad_norm": 2.696225643157959, + "learning_rate": 5.343866171003718e-06, + "loss": 1.1694017410278321, + "memory(GiB)": 26.73, + "step": 575, + "token_acc": 0.7356643356643356, + "train_speed(iter/s)": 0.098353 + }, + { + "epoch": 0.026972118154153794, + "grad_norm": 2.478032350540161, + "learning_rate": 5.390334572490706e-06, + "loss": 1.1354408264160156, + "memory(GiB)": 26.73, + "step": 580, + "token_acc": 0.7360194511983328, + "train_speed(iter/s)": 0.098816 + }, + { + "epoch": 0.027204636414103395, + "grad_norm": 3.790090799331665, + "learning_rate": 5.436802973977695e-06, + "loss": 1.2879012107849122, + "memory(GiB)": 26.73, + "step": 585, + "token_acc": 0.7038508557457213, + "train_speed(iter/s)": 0.099279 + }, + { + "epoch": 0.027437154674052996, + "grad_norm": 3.797004461288452, + "learning_rate": 5.4832713754646845e-06, + "loss": 1.3572250366210938, + "memory(GiB)": 26.73, + "step": 590, + "token_acc": 0.6939364773820982, + "train_speed(iter/s)": 0.099741 + }, + { + "epoch": 0.027669672934002597, + "grad_norm": 3.488421678543091, + "learning_rate": 5.529739776951674e-06, + "loss": 1.2515945434570312, + "memory(GiB)": 26.73, + "step": 595, + "token_acc": 0.7290346626910175, + "train_speed(iter/s)": 0.1002 + }, + { + "epoch": 0.0279021911939522, + "grad_norm": 2.518224000930786, + "learning_rate": 5.576208178438662e-06, + "loss": 1.1993464469909667, + "memory(GiB)": 26.73, + "step": 600, + "token_acc": 0.7091660923501034, + "train_speed(iter/s)": 0.100654 + }, + { + "epoch": 0.0279021911939522, + "eval_loss": 1.0732550621032715, + "eval_runtime": 290.72, + "eval_samples_per_second": 11.953, + "eval_steps_per_second": 11.953, + "step": 600 + }, + { + "epoch": 0.028134709453901802, + "grad_norm": 2.8344390392303467, + "learning_rate": 5.622676579925651e-06, + "loss": 1.3822043418884278, + "memory(GiB)": 26.73, + "step": 605, + "token_acc": 0.7207555612375351, + "train_speed(iter/s)": 0.096411 + }, + { + "epoch": 0.028367227713851403, + "grad_norm": 2.5761542320251465, + "learning_rate": 5.66914498141264e-06, + "loss": 1.1615178108215332, + "memory(GiB)": 26.73, + "step": 610, + "token_acc": 0.7227501397428732, + "train_speed(iter/s)": 0.096857 + }, + { + "epoch": 0.028599745973801004, + "grad_norm": 3.259291172027588, + "learning_rate": 5.715613382899628e-06, + "loss": 1.300617504119873, + "memory(GiB)": 26.73, + "step": 615, + "token_acc": 0.6939717334871647, + "train_speed(iter/s)": 0.097287 + }, + { + "epoch": 0.028832264233750605, + "grad_norm": 2.453037738800049, + "learning_rate": 5.7620817843866174e-06, + "loss": 1.2526305198669434, + "memory(GiB)": 26.73, + "step": 620, + "token_acc": 0.7125262841694202, + "train_speed(iter/s)": 0.097716 + }, + { + "epoch": 0.02906478249370021, + "grad_norm": 4.138836860656738, + "learning_rate": 5.808550185873606e-06, + "loss": 1.1886103630065918, + "memory(GiB)": 26.73, + "step": 625, + "token_acc": 0.7297491039426524, + "train_speed(iter/s)": 0.098156 + }, + { + "epoch": 0.02929730075364981, + "grad_norm": 2.7090673446655273, + "learning_rate": 5.855018587360596e-06, + "loss": 1.2077580451965333, + "memory(GiB)": 26.73, + "step": 630, + "token_acc": 0.733251633986928, + "train_speed(iter/s)": 0.09859 + }, + { + "epoch": 0.029529819013599412, + "grad_norm": 3.340815544128418, + "learning_rate": 5.901486988847584e-06, + "loss": 1.2236101150512695, + "memory(GiB)": 26.73, + "step": 635, + "token_acc": 0.7258354293927416, + "train_speed(iter/s)": 0.099024 + }, + { + "epoch": 0.029762337273549013, + "grad_norm": 2.5718116760253906, + "learning_rate": 5.947955390334574e-06, + "loss": 1.2151712417602538, + "memory(GiB)": 26.73, + "step": 640, + "token_acc": 0.7069377990430622, + "train_speed(iter/s)": 0.099447 + }, + { + "epoch": 0.029994855533498614, + "grad_norm": 3.5712640285491943, + "learning_rate": 5.994423791821562e-06, + "loss": 1.1882932662963868, + "memory(GiB)": 26.73, + "step": 645, + "token_acc": 0.7349031522977592, + "train_speed(iter/s)": 0.099867 + }, + { + "epoch": 0.030227373793448215, + "grad_norm": 3.336963176727295, + "learning_rate": 6.04089219330855e-06, + "loss": 1.115128993988037, + "memory(GiB)": 26.73, + "step": 650, + "token_acc": 0.7469618055555556, + "train_speed(iter/s)": 0.100287 + }, + { + "epoch": 0.030227373793448215, + "eval_loss": 1.038861870765686, + "eval_runtime": 294.2553, + "eval_samples_per_second": 11.809, + "eval_steps_per_second": 11.809, + "step": 650 + }, + { + "epoch": 0.03045989205339782, + "grad_norm": 2.6979942321777344, + "learning_rate": 6.08736059479554e-06, + "loss": 1.1597721099853515, + "memory(GiB)": 26.73, + "step": 655, + "token_acc": 0.7270051037550198, + "train_speed(iter/s)": 0.09634 + }, + { + "epoch": 0.03069241031334742, + "grad_norm": 2.986121654510498, + "learning_rate": 6.133828996282528e-06, + "loss": 1.0470152854919434, + "memory(GiB)": 26.73, + "step": 660, + "token_acc": 0.7729346173340091, + "train_speed(iter/s)": 0.096749 + }, + { + "epoch": 0.030924928573297022, + "grad_norm": 2.93112850189209, + "learning_rate": 6.180297397769517e-06, + "loss": 1.149097728729248, + "memory(GiB)": 26.73, + "step": 665, + "token_acc": 0.7351664254703328, + "train_speed(iter/s)": 0.097163 + }, + { + "epoch": 0.031157446833246623, + "grad_norm": 3.3635780811309814, + "learning_rate": 6.226765799256506e-06, + "loss": 1.2041844367980956, + "memory(GiB)": 26.73, + "step": 670, + "token_acc": 0.7335233345208407, + "train_speed(iter/s)": 0.097577 + }, + { + "epoch": 0.03138996509319623, + "grad_norm": 3.214078664779663, + "learning_rate": 6.273234200743496e-06, + "loss": 1.1929821968078613, + "memory(GiB)": 26.73, + "step": 675, + "token_acc": 0.7315741583257507, + "train_speed(iter/s)": 0.097977 + }, + { + "epoch": 0.031622483353145825, + "grad_norm": 2.8574559688568115, + "learning_rate": 6.319702602230484e-06, + "loss": 1.0180715560913085, + "memory(GiB)": 26.73, + "step": 680, + "token_acc": 0.7521399330107927, + "train_speed(iter/s)": 0.098385 + }, + { + "epoch": 0.03185500161309543, + "grad_norm": 3.0687994956970215, + "learning_rate": 6.3661710037174726e-06, + "loss": 1.1894699096679688, + "memory(GiB)": 26.73, + "step": 685, + "token_acc": 0.7381535947712419, + "train_speed(iter/s)": 0.098779 + }, + { + "epoch": 0.03208751987304503, + "grad_norm": 2.5668647289276123, + "learning_rate": 6.412639405204462e-06, + "loss": 1.1381871223449707, + "memory(GiB)": 26.73, + "step": 690, + "token_acc": 0.7289256198347107, + "train_speed(iter/s)": 0.099181 + }, + { + "epoch": 0.03232003813299463, + "grad_norm": 3.334214687347412, + "learning_rate": 6.45910780669145e-06, + "loss": 1.1664478302001953, + "memory(GiB)": 26.73, + "step": 695, + "token_acc": 0.7353790613718412, + "train_speed(iter/s)": 0.099563 + }, + { + "epoch": 0.032552556392944236, + "grad_norm": 2.6217947006225586, + "learning_rate": 6.5055762081784395e-06, + "loss": 1.1068604469299317, + "memory(GiB)": 26.73, + "step": 700, + "token_acc": 0.7076585231951689, + "train_speed(iter/s)": 0.099951 + }, + { + "epoch": 0.032552556392944236, + "eval_loss": 1.0255147218704224, + "eval_runtime": 293.8133, + "eval_samples_per_second": 11.827, + "eval_steps_per_second": 11.827, + "step": 700 + }, + { + "epoch": 0.032785074652893834, + "grad_norm": 4.364160060882568, + "learning_rate": 6.552044609665428e-06, + "loss": 1.0353403091430664, + "memory(GiB)": 26.73, + "step": 705, + "token_acc": 0.7327724745283322, + "train_speed(iter/s)": 0.096298 + }, + { + "epoch": 0.03301759291284344, + "grad_norm": 3.8299734592437744, + "learning_rate": 6.598513011152416e-06, + "loss": 1.2472078323364257, + "memory(GiB)": 26.73, + "step": 710, + "token_acc": 0.7108608790410461, + "train_speed(iter/s)": 0.096683 + }, + { + "epoch": 0.033250111172793036, + "grad_norm": 2.9473071098327637, + "learning_rate": 6.6449814126394055e-06, + "loss": 1.0487863540649414, + "memory(GiB)": 26.73, + "step": 715, + "token_acc": 0.7527955271565495, + "train_speed(iter/s)": 0.097065 + }, + { + "epoch": 0.03348262943274264, + "grad_norm": 3.801494598388672, + "learning_rate": 6.691449814126395e-06, + "loss": 1.176010513305664, + "memory(GiB)": 26.73, + "step": 720, + "token_acc": 0.7328456983629398, + "train_speed(iter/s)": 0.097437 + }, + { + "epoch": 0.033715147692692245, + "grad_norm": 3.633549213409424, + "learning_rate": 6.737918215613384e-06, + "loss": 1.1107640266418457, + "memory(GiB)": 26.73, + "step": 725, + "token_acc": 0.7462328767123287, + "train_speed(iter/s)": 0.097817 + }, + { + "epoch": 0.03394766595264184, + "grad_norm": 2.499803304672241, + "learning_rate": 6.784386617100372e-06, + "loss": 1.1150272369384766, + "memory(GiB)": 26.73, + "step": 730, + "token_acc": 0.738086576937068, + "train_speed(iter/s)": 0.098192 + }, + { + "epoch": 0.03418018421259145, + "grad_norm": 4.188467979431152, + "learning_rate": 6.830855018587361e-06, + "loss": 1.223805046081543, + "memory(GiB)": 26.73, + "step": 735, + "token_acc": 0.7213947190250508, + "train_speed(iter/s)": 0.098545 + }, + { + "epoch": 0.034412702472541044, + "grad_norm": 3.4925193786621094, + "learning_rate": 6.87732342007435e-06, + "loss": 1.0611821174621583, + "memory(GiB)": 26.73, + "step": 740, + "token_acc": 0.7538409013315125, + "train_speed(iter/s)": 0.098917 + }, + { + "epoch": 0.03464522073249065, + "grad_norm": 2.734851121902466, + "learning_rate": 6.9237918215613384e-06, + "loss": 1.2451375007629395, + "memory(GiB)": 26.73, + "step": 745, + "token_acc": 0.7303664921465969, + "train_speed(iter/s)": 0.099281 + }, + { + "epoch": 0.03487773899244025, + "grad_norm": 3.9239959716796875, + "learning_rate": 6.970260223048328e-06, + "loss": 1.1344339370727539, + "memory(GiB)": 26.73, + "step": 750, + "token_acc": 0.726453488372093, + "train_speed(iter/s)": 0.099641 + }, + { + "epoch": 0.03487773899244025, + "eval_loss": 1.0062031745910645, + "eval_runtime": 293.4158, + "eval_samples_per_second": 11.843, + "eval_steps_per_second": 11.843, + "step": 750 + }, + { + "epoch": 0.03511025725238985, + "grad_norm": 2.1670944690704346, + "learning_rate": 7.016728624535316e-06, + "loss": 1.0993772506713868, + "memory(GiB)": 26.73, + "step": 755, + "token_acc": 0.7372170460730637, + "train_speed(iter/s)": 0.09626 + }, + { + "epoch": 0.035342775512339455, + "grad_norm": 3.950388193130493, + "learning_rate": 7.063197026022306e-06, + "loss": 1.2084269523620605, + "memory(GiB)": 26.73, + "step": 760, + "token_acc": 0.731457800511509, + "train_speed(iter/s)": 0.096612 + }, + { + "epoch": 0.03557529377228905, + "grad_norm": 3.085629940032959, + "learning_rate": 7.109665427509295e-06, + "loss": 1.115158462524414, + "memory(GiB)": 26.73, + "step": 765, + "token_acc": 0.7291159772911597, + "train_speed(iter/s)": 0.096965 + }, + { + "epoch": 0.03580781203223866, + "grad_norm": 3.8739664554595947, + "learning_rate": 7.156133828996283e-06, + "loss": 1.1317030906677246, + "memory(GiB)": 26.73, + "step": 770, + "token_acc": 0.7414814814814815, + "train_speed(iter/s)": 0.097318 + }, + { + "epoch": 0.036040330292188255, + "grad_norm": 3.862696409225464, + "learning_rate": 7.202602230483272e-06, + "loss": 1.1016357421875, + "memory(GiB)": 26.73, + "step": 775, + "token_acc": 0.7436731742588576, + "train_speed(iter/s)": 0.097674 + }, + { + "epoch": 0.03627284855213786, + "grad_norm": 3.5100576877593994, + "learning_rate": 7.249070631970261e-06, + "loss": 1.0728687286376952, + "memory(GiB)": 26.73, + "step": 780, + "token_acc": 0.7553956834532374, + "train_speed(iter/s)": 0.09803 + }, + { + "epoch": 0.036505366812087464, + "grad_norm": 3.369295358657837, + "learning_rate": 7.29553903345725e-06, + "loss": 1.1247928619384766, + "memory(GiB)": 26.73, + "step": 785, + "token_acc": 0.7476307476307477, + "train_speed(iter/s)": 0.098369 + }, + { + "epoch": 0.03673788507203706, + "grad_norm": 3.494028329849243, + "learning_rate": 7.342007434944238e-06, + "loss": 1.1848819732666016, + "memory(GiB)": 26.73, + "step": 790, + "token_acc": 0.7192463982268194, + "train_speed(iter/s)": 0.098712 + }, + { + "epoch": 0.036970403331986666, + "grad_norm": 4.516552925109863, + "learning_rate": 7.388475836431227e-06, + "loss": 1.0329771995544434, + "memory(GiB)": 26.73, + "step": 795, + "token_acc": 0.7662447257383966, + "train_speed(iter/s)": 0.099061 + }, + { + "epoch": 0.037202921591936264, + "grad_norm": 3.831563711166382, + "learning_rate": 7.434944237918216e-06, + "loss": 1.1165730476379394, + "memory(GiB)": 26.73, + "step": 800, + "token_acc": 0.7447425670775925, + "train_speed(iter/s)": 0.099387 + }, + { + "epoch": 0.037202921591936264, + "eval_loss": 0.9778481721878052, + "eval_runtime": 294.7837, + "eval_samples_per_second": 11.788, + "eval_steps_per_second": 11.788, + "step": 800 + }, + { + "epoch": 0.03743543985188587, + "grad_norm": 5.585468769073486, + "learning_rate": 7.481412639405205e-06, + "loss": 1.1400897979736329, + "memory(GiB)": 26.73, + "step": 805, + "token_acc": 0.7408928271258075, + "train_speed(iter/s)": 0.096205 + }, + { + "epoch": 0.03766795811183547, + "grad_norm": 3.9298033714294434, + "learning_rate": 7.527881040892194e-06, + "loss": 1.0923521995544434, + "memory(GiB)": 26.73, + "step": 810, + "token_acc": 0.7488021902806297, + "train_speed(iter/s)": 0.096544 + }, + { + "epoch": 0.03790047637178507, + "grad_norm": 3.8628060817718506, + "learning_rate": 7.574349442379183e-06, + "loss": 1.083217716217041, + "memory(GiB)": 26.73, + "step": 815, + "token_acc": 0.7465940054495913, + "train_speed(iter/s)": 0.096873 + }, + { + "epoch": 0.038132994631734675, + "grad_norm": 3.3381187915802, + "learning_rate": 7.620817843866172e-06, + "loss": 1.0980140686035156, + "memory(GiB)": 26.73, + "step": 820, + "token_acc": 0.7514580529385375, + "train_speed(iter/s)": 0.097186 + }, + { + "epoch": 0.03836551289168427, + "grad_norm": 2.6115517616271973, + "learning_rate": 7.667286245353161e-06, + "loss": 1.0684693336486817, + "memory(GiB)": 26.73, + "step": 825, + "token_acc": 0.75678391959799, + "train_speed(iter/s)": 0.097519 + }, + { + "epoch": 0.03859803115163388, + "grad_norm": 3.8252604007720947, + "learning_rate": 7.713754646840149e-06, + "loss": 1.1756773948669434, + "memory(GiB)": 26.73, + "step": 830, + "token_acc": 0.727756114852889, + "train_speed(iter/s)": 0.097846 + }, + { + "epoch": 0.03883054941158348, + "grad_norm": 3.1177918910980225, + "learning_rate": 7.760223048327138e-06, + "loss": 1.1260833740234375, + "memory(GiB)": 26.73, + "step": 835, + "token_acc": 0.7389162561576355, + "train_speed(iter/s)": 0.098165 + }, + { + "epoch": 0.03906306767153308, + "grad_norm": 3.329439878463745, + "learning_rate": 7.806691449814127e-06, + "loss": 1.0523783683776855, + "memory(GiB)": 26.73, + "step": 840, + "token_acc": 0.7564054957296695, + "train_speed(iter/s)": 0.098484 + }, + { + "epoch": 0.039295585931482684, + "grad_norm": 2.8450145721435547, + "learning_rate": 7.853159851301115e-06, + "loss": 1.1531224250793457, + "memory(GiB)": 26.73, + "step": 845, + "token_acc": 0.7297872340425532, + "train_speed(iter/s)": 0.098796 + }, + { + "epoch": 0.03952810419143228, + "grad_norm": 5.251894950866699, + "learning_rate": 7.899628252788106e-06, + "loss": 1.1336482048034668, + "memory(GiB)": 26.73, + "step": 850, + "token_acc": 0.7409217877094972, + "train_speed(iter/s)": 0.099117 + }, + { + "epoch": 0.03952810419143228, + "eval_loss": 0.9617792963981628, + "eval_runtime": 292.5541, + "eval_samples_per_second": 11.878, + "eval_steps_per_second": 11.878, + "step": 850 + }, + { + "epoch": 0.039760622451381886, + "grad_norm": 1.9178143739700317, + "learning_rate": 7.946096654275093e-06, + "loss": 1.1401193618774415, + "memory(GiB)": 26.73, + "step": 855, + "token_acc": 0.7444372143219908, + "train_speed(iter/s)": 0.09615 + }, + { + "epoch": 0.03999314071133149, + "grad_norm": 4.079286575317383, + "learning_rate": 7.992565055762083e-06, + "loss": 1.1457449913024902, + "memory(GiB)": 26.73, + "step": 860, + "token_acc": 0.7299968122409946, + "train_speed(iter/s)": 0.096459 + }, + { + "epoch": 0.04022565897128109, + "grad_norm": 3.180300712585449, + "learning_rate": 8.039033457249072e-06, + "loss": 1.1377723693847657, + "memory(GiB)": 26.73, + "step": 865, + "token_acc": 0.7370562130177515, + "train_speed(iter/s)": 0.096769 + }, + { + "epoch": 0.04045817723123069, + "grad_norm": 2.781759738922119, + "learning_rate": 8.08550185873606e-06, + "loss": 1.0756050109863282, + "memory(GiB)": 26.73, + "step": 870, + "token_acc": 0.7453805198872534, + "train_speed(iter/s)": 0.097086 + }, + { + "epoch": 0.04069069549118029, + "grad_norm": 3.273380994796753, + "learning_rate": 8.131970260223049e-06, + "loss": 1.1176755905151368, + "memory(GiB)": 26.73, + "step": 875, + "token_acc": 0.7340090877315624, + "train_speed(iter/s)": 0.097397 + }, + { + "epoch": 0.040923213751129894, + "grad_norm": 3.9136433601379395, + "learning_rate": 8.178438661710038e-06, + "loss": 1.1917973518371583, + "memory(GiB)": 26.73, + "step": 880, + "token_acc": 0.7312151137827394, + "train_speed(iter/s)": 0.097701 + }, + { + "epoch": 0.04115573201107949, + "grad_norm": 3.471822500228882, + "learning_rate": 8.224907063197025e-06, + "loss": 1.0876335144042968, + "memory(GiB)": 26.73, + "step": 885, + "token_acc": 0.7307692307692307, + "train_speed(iter/s)": 0.098007 + }, + { + "epoch": 0.041388250271029096, + "grad_norm": 3.936947822570801, + "learning_rate": 8.271375464684016e-06, + "loss": 1.0483864784240722, + "memory(GiB)": 26.73, + "step": 890, + "token_acc": 0.7621787495205217, + "train_speed(iter/s)": 0.098317 + }, + { + "epoch": 0.0416207685309787, + "grad_norm": 3.633143424987793, + "learning_rate": 8.317843866171004e-06, + "loss": 1.127341079711914, + "memory(GiB)": 26.73, + "step": 895, + "token_acc": 0.7553226334752702, + "train_speed(iter/s)": 0.098615 + }, + { + "epoch": 0.0418532867909283, + "grad_norm": 4.088837623596191, + "learning_rate": 8.364312267657993e-06, + "loss": 1.015018844604492, + "memory(GiB)": 26.73, + "step": 900, + "token_acc": 0.7831031681559708, + "train_speed(iter/s)": 0.098915 + }, + { + "epoch": 0.0418532867909283, + "eval_loss": 0.9545445442199707, + "eval_runtime": 294.6021, + "eval_samples_per_second": 11.796, + "eval_steps_per_second": 11.796, + "step": 900 + }, + { + "epoch": 0.0420858050508779, + "grad_norm": 3.2518317699432373, + "learning_rate": 8.410780669144982e-06, + "loss": 1.0016798019409179, + "memory(GiB)": 26.73, + "step": 905, + "token_acc": 0.7486964923067095, + "train_speed(iter/s)": 0.096115 + }, + { + "epoch": 0.0423183233108275, + "grad_norm": 3.467190980911255, + "learning_rate": 8.457249070631972e-06, + "loss": 1.0845005989074707, + "memory(GiB)": 26.73, + "step": 910, + "token_acc": 0.7519889311656867, + "train_speed(iter/s)": 0.096399 + }, + { + "epoch": 0.042550841570777105, + "grad_norm": 3.6865177154541016, + "learning_rate": 8.50371747211896e-06, + "loss": 1.2374433517456054, + "memory(GiB)": 26.73, + "step": 915, + "token_acc": 0.7189285714285715, + "train_speed(iter/s)": 0.096698 + }, + { + "epoch": 0.04278335983072671, + "grad_norm": 4.201014518737793, + "learning_rate": 8.550185873605949e-06, + "loss": 1.1373135566711425, + "memory(GiB)": 26.73, + "step": 920, + "token_acc": 0.7400126823081801, + "train_speed(iter/s)": 0.096992 + }, + { + "epoch": 0.04301587809067631, + "grad_norm": 2.818901300430298, + "learning_rate": 8.596654275092938e-06, + "loss": 1.017166519165039, + "memory(GiB)": 26.73, + "step": 925, + "token_acc": 0.7844311377245509, + "train_speed(iter/s)": 0.09729 + }, + { + "epoch": 0.04324839635062591, + "grad_norm": 4.298871040344238, + "learning_rate": 8.643122676579925e-06, + "loss": 0.9277063369750976, + "memory(GiB)": 26.73, + "step": 930, + "token_acc": 0.7862513426423201, + "train_speed(iter/s)": 0.097581 + }, + { + "epoch": 0.04348091461057551, + "grad_norm": 4.076528549194336, + "learning_rate": 8.689591078066916e-06, + "loss": 0.998668098449707, + "memory(GiB)": 26.73, + "step": 935, + "token_acc": 0.7610944277610945, + "train_speed(iter/s)": 0.097859 + }, + { + "epoch": 0.043713432870525114, + "grad_norm": 3.1914594173431396, + "learning_rate": 8.736059479553904e-06, + "loss": 1.0335095405578614, + "memory(GiB)": 26.73, + "step": 940, + "token_acc": 0.7718513420509291, + "train_speed(iter/s)": 0.098151 + }, + { + "epoch": 0.04394595113047472, + "grad_norm": 2.9357616901397705, + "learning_rate": 8.782527881040893e-06, + "loss": 1.0621576309204102, + "memory(GiB)": 26.73, + "step": 945, + "token_acc": 0.7534029756251979, + "train_speed(iter/s)": 0.098419 + }, + { + "epoch": 0.044178469390424316, + "grad_norm": 3.612863779067993, + "learning_rate": 8.828996282527882e-06, + "loss": 1.024215030670166, + "memory(GiB)": 26.73, + "step": 950, + "token_acc": 0.7534153005464481, + "train_speed(iter/s)": 0.098704 + }, + { + "epoch": 0.044178469390424316, + "eval_loss": 0.9331585764884949, + "eval_runtime": 296.5324, + "eval_samples_per_second": 11.719, + "eval_steps_per_second": 11.719, + "step": 950 + }, + { + "epoch": 0.04441098765037392, + "grad_norm": 3.4591856002807617, + "learning_rate": 8.87546468401487e-06, + "loss": 1.0047401428222655, + "memory(GiB)": 26.73, + "step": 955, + "token_acc": 0.751922091235264, + "train_speed(iter/s)": 0.096033 + }, + { + "epoch": 0.04464350591032352, + "grad_norm": 4.42740535736084, + "learning_rate": 8.921933085501859e-06, + "loss": 0.9573320388793946, + "memory(GiB)": 26.73, + "step": 960, + "token_acc": 0.7904052165812762, + "train_speed(iter/s)": 0.096312 + }, + { + "epoch": 0.04487602417027312, + "grad_norm": 4.089885234832764, + "learning_rate": 8.968401486988848e-06, + "loss": 1.1120384216308594, + "memory(GiB)": 26.73, + "step": 965, + "token_acc": 0.7395833333333334, + "train_speed(iter/s)": 0.096591 + }, + { + "epoch": 0.04510854243022272, + "grad_norm": 3.35774827003479, + "learning_rate": 9.014869888475838e-06, + "loss": 1.0453130722045898, + "memory(GiB)": 26.73, + "step": 970, + "token_acc": 0.7602854743912678, + "train_speed(iter/s)": 0.096867 + }, + { + "epoch": 0.045341060690172325, + "grad_norm": 3.1541173458099365, + "learning_rate": 9.061338289962825e-06, + "loss": 1.1856597900390624, + "memory(GiB)": 26.73, + "step": 975, + "token_acc": 0.7269852424455376, + "train_speed(iter/s)": 0.097145 + }, + { + "epoch": 0.04557357895012193, + "grad_norm": 3.100411891937256, + "learning_rate": 9.107806691449816e-06, + "loss": 0.913086986541748, + "memory(GiB)": 26.73, + "step": 980, + "token_acc": 0.7782732063234697, + "train_speed(iter/s)": 0.097422 + }, + { + "epoch": 0.04580609721007153, + "grad_norm": 3.1742074489593506, + "learning_rate": 9.154275092936804e-06, + "loss": 0.8818007469177246, + "memory(GiB)": 26.73, + "step": 985, + "token_acc": 0.8005411673753382, + "train_speed(iter/s)": 0.097697 + }, + { + "epoch": 0.04603861547002113, + "grad_norm": 4.51315450668335, + "learning_rate": 9.200743494423793e-06, + "loss": 0.9337103843688965, + "memory(GiB)": 26.73, + "step": 990, + "token_acc": 0.7855678556785568, + "train_speed(iter/s)": 0.097973 + }, + { + "epoch": 0.04627113372997073, + "grad_norm": 3.2544333934783936, + "learning_rate": 9.247211895910782e-06, + "loss": 1.1088324546813966, + "memory(GiB)": 26.73, + "step": 995, + "token_acc": 0.7601842012043925, + "train_speed(iter/s)": 0.098244 + }, + { + "epoch": 0.04650365198992033, + "grad_norm": 3.8300986289978027, + "learning_rate": 9.29368029739777e-06, + "loss": 1.0316854476928712, + "memory(GiB)": 26.73, + "step": 1000, + "token_acc": 0.7476149176062445, + "train_speed(iter/s)": 0.098513 + }, + { + "epoch": 0.04650365198992033, + "eval_loss": 0.9164021611213684, + "eval_runtime": 290.9576, + "eval_samples_per_second": 11.943, + "eval_steps_per_second": 11.943, + "step": 1000 + }, + { + "epoch": 0.04673617024986994, + "grad_norm": 4.002346038818359, + "learning_rate": 9.340148698884759e-06, + "loss": 1.068749237060547, + "memory(GiB)": 26.73, + "step": 1005, + "token_acc": 0.7531827778133197, + "train_speed(iter/s)": 0.096035 + }, + { + "epoch": 0.046968688509819535, + "grad_norm": 3.9563968181610107, + "learning_rate": 9.386617100371748e-06, + "loss": 0.9444809913635254, + "memory(GiB)": 26.73, + "step": 1010, + "token_acc": 0.7721238938053098, + "train_speed(iter/s)": 0.0963 + }, + { + "epoch": 0.04720120676976914, + "grad_norm": 4.915591716766357, + "learning_rate": 9.433085501858736e-06, + "loss": 0.9487569808959961, + "memory(GiB)": 26.73, + "step": 1015, + "token_acc": 0.7838190517616355, + "train_speed(iter/s)": 0.096566 + }, + { + "epoch": 0.04743372502971874, + "grad_norm": 5.050897598266602, + "learning_rate": 9.479553903345727e-06, + "loss": 1.0228119850158692, + "memory(GiB)": 26.73, + "step": 1020, + "token_acc": 0.7647294589178357, + "train_speed(iter/s)": 0.096826 + }, + { + "epoch": 0.04766624328966834, + "grad_norm": 2.706132650375366, + "learning_rate": 9.526022304832714e-06, + "loss": 1.0279125213623046, + "memory(GiB)": 26.73, + "step": 1025, + "token_acc": 0.7501481920569057, + "train_speed(iter/s)": 0.097091 + }, + { + "epoch": 0.047898761549617946, + "grad_norm": 3.322195529937744, + "learning_rate": 9.572490706319703e-06, + "loss": 1.0890856742858888, + "memory(GiB)": 26.73, + "step": 1030, + "token_acc": 0.737261698440208, + "train_speed(iter/s)": 0.097354 + }, + { + "epoch": 0.048131279809567544, + "grad_norm": 2.9123032093048096, + "learning_rate": 9.618959107806693e-06, + "loss": 1.077380657196045, + "memory(GiB)": 26.73, + "step": 1035, + "token_acc": 0.7402678293366552, + "train_speed(iter/s)": 0.097615 + }, + { + "epoch": 0.04836379806951715, + "grad_norm": 3.951082706451416, + "learning_rate": 9.66542750929368e-06, + "loss": 1.0775503158569335, + "memory(GiB)": 26.73, + "step": 1040, + "token_acc": 0.7607033639143731, + "train_speed(iter/s)": 0.09787 + }, + { + "epoch": 0.048596316329466746, + "grad_norm": 4.118011951446533, + "learning_rate": 9.71189591078067e-06, + "loss": 1.1944708824157715, + "memory(GiB)": 26.73, + "step": 1045, + "token_acc": 0.7104085893229943, + "train_speed(iter/s)": 0.098126 + }, + { + "epoch": 0.04882883458941635, + "grad_norm": 4.057410717010498, + "learning_rate": 9.758364312267659e-06, + "loss": 0.9629217147827148, + "memory(GiB)": 26.73, + "step": 1050, + "token_acc": 0.762325448845825, + "train_speed(iter/s)": 0.09838 + }, + { + "epoch": 0.04882883458941635, + "eval_loss": 0.9048006534576416, + "eval_runtime": 290.1861, + "eval_samples_per_second": 11.975, + "eval_steps_per_second": 11.975, + "step": 1050 + }, + { + "epoch": 0.049061352849365955, + "grad_norm": 3.130415439605713, + "learning_rate": 9.804832713754648e-06, + "loss": 0.9988020896911621, + "memory(GiB)": 26.73, + "step": 1055, + "token_acc": 0.7562960773820459, + "train_speed(iter/s)": 0.096023 + }, + { + "epoch": 0.04929387110931555, + "grad_norm": 4.304872989654541, + "learning_rate": 9.851301115241636e-06, + "loss": 1.0478652000427247, + "memory(GiB)": 26.73, + "step": 1060, + "token_acc": 0.7515170871925902, + "train_speed(iter/s)": 0.096272 + }, + { + "epoch": 0.04952638936926516, + "grad_norm": 3.4750263690948486, + "learning_rate": 9.897769516728627e-06, + "loss": 1.090310001373291, + "memory(GiB)": 26.73, + "step": 1065, + "token_acc": 0.7461376404494382, + "train_speed(iter/s)": 0.096527 + }, + { + "epoch": 0.049758907629214755, + "grad_norm": 3.2683334350585938, + "learning_rate": 9.944237918215614e-06, + "loss": 1.0517633438110352, + "memory(GiB)": 26.73, + "step": 1070, + "token_acc": 0.7599436818021823, + "train_speed(iter/s)": 0.09678 + }, + { + "epoch": 0.04999142588916436, + "grad_norm": 3.1977105140686035, + "learning_rate": 9.990706319702603e-06, + "loss": 0.9936330795288086, + "memory(GiB)": 26.73, + "step": 1075, + "token_acc": 0.7530349013657056, + "train_speed(iter/s)": 0.097031 + }, + { + "epoch": 0.05022394414911396, + "grad_norm": 3.543762445449829, + "learning_rate": 9.999999053870585e-06, + "loss": 0.9840545654296875, + "memory(GiB)": 26.73, + "step": 1080, + "token_acc": 0.7613271494826971, + "train_speed(iter/s)": 0.097282 + }, + { + "epoch": 0.05045646240906356, + "grad_norm": 4.944690227508545, + "learning_rate": 9.999995210220447e-06, + "loss": 1.022100067138672, + "memory(GiB)": 26.73, + "step": 1085, + "token_acc": 0.7664176169878096, + "train_speed(iter/s)": 0.097534 + }, + { + "epoch": 0.050688980669013166, + "grad_norm": 3.441666841506958, + "learning_rate": 9.999988409918769e-06, + "loss": 1.0619563102722167, + "memory(GiB)": 29.49, + "step": 1090, + "token_acc": 0.7531625040544924, + "train_speed(iter/s)": 0.097773 + }, + { + "epoch": 0.05092149892896276, + "grad_norm": 5.653553485870361, + "learning_rate": 9.999978652969573e-06, + "loss": 0.9953934669494628, + "memory(GiB)": 29.49, + "step": 1095, + "token_acc": 0.764, + "train_speed(iter/s)": 0.098024 + }, + { + "epoch": 0.05115401718891237, + "grad_norm": 3.904196262359619, + "learning_rate": 9.999965939378626e-06, + "loss": 0.9837069511413574, + "memory(GiB)": 29.49, + "step": 1100, + "token_acc": 0.7674581005586593, + "train_speed(iter/s)": 0.098275 + }, + { + "epoch": 0.05115401718891237, + "eval_loss": 0.8877241015434265, + "eval_runtime": 293.9743, + "eval_samples_per_second": 11.821, + "eval_steps_per_second": 11.821, + "step": 1100 + }, + { + "epoch": 0.051386535448861966, + "grad_norm": 3.1758522987365723, + "learning_rate": 9.999950269153451e-06, + "loss": 1.0526921272277832, + "memory(GiB)": 29.49, + "step": 1105, + "token_acc": 0.7591879537940586, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.05161905370881157, + "grad_norm": 3.341015577316284, + "learning_rate": 9.999931642303309e-06, + "loss": 1.084926223754883, + "memory(GiB)": 29.49, + "step": 1110, + "token_acc": 0.7447089947089947, + "train_speed(iter/s)": 0.09624 + }, + { + "epoch": 0.051851571968761175, + "grad_norm": 4.055420875549316, + "learning_rate": 9.999910058839218e-06, + "loss": 1.0594484329223632, + "memory(GiB)": 29.49, + "step": 1115, + "token_acc": 0.7498171177761521, + "train_speed(iter/s)": 0.096482 + }, + { + "epoch": 0.05208409022871077, + "grad_norm": 3.7540385723114014, + "learning_rate": 9.999885518773939e-06, + "loss": 1.0508157730102539, + "memory(GiB)": 29.49, + "step": 1120, + "token_acc": 0.7722502914885347, + "train_speed(iter/s)": 0.096719 + }, + { + "epoch": 0.05231660848866038, + "grad_norm": 3.7452423572540283, + "learning_rate": 9.999858022121987e-06, + "loss": 0.9673895835876465, + "memory(GiB)": 29.49, + "step": 1125, + "token_acc": 0.7685028725920919, + "train_speed(iter/s)": 0.096952 + }, + { + "epoch": 0.052549126748609974, + "grad_norm": 2.8961267471313477, + "learning_rate": 9.999827568899615e-06, + "loss": 1.046191120147705, + "memory(GiB)": 29.49, + "step": 1130, + "token_acc": 0.7544949329846355, + "train_speed(iter/s)": 0.097192 + }, + { + "epoch": 0.05278164500855958, + "grad_norm": 4.284615516662598, + "learning_rate": 9.999794159124838e-06, + "loss": 1.1306605339050293, + "memory(GiB)": 29.49, + "step": 1135, + "token_acc": 0.7502649240551043, + "train_speed(iter/s)": 0.097431 + }, + { + "epoch": 0.05301416326850918, + "grad_norm": 3.700716257095337, + "learning_rate": 9.999757792817407e-06, + "loss": 1.0605037689208985, + "memory(GiB)": 29.49, + "step": 1140, + "token_acc": 0.7426001160766106, + "train_speed(iter/s)": 0.097663 + }, + { + "epoch": 0.05324668152845878, + "grad_norm": 3.2940375804901123, + "learning_rate": 9.999718469998829e-06, + "loss": 1.070816421508789, + "memory(GiB)": 29.49, + "step": 1145, + "token_acc": 0.7607361963190185, + "train_speed(iter/s)": 0.097904 + }, + { + "epoch": 0.053479199788408385, + "grad_norm": 3.6671321392059326, + "learning_rate": 9.999676190692356e-06, + "loss": 1.008139419555664, + "memory(GiB)": 29.49, + "step": 1150, + "token_acc": 0.7638326585695007, + "train_speed(iter/s)": 0.09814 + }, + { + "epoch": 0.053479199788408385, + "eval_loss": 0.8825203776359558, + "eval_runtime": 293.4828, + "eval_samples_per_second": 11.841, + "eval_steps_per_second": 11.841, + "step": 1150 + }, + { + "epoch": 0.05371171804835798, + "grad_norm": 3.003434181213379, + "learning_rate": 9.99963095492299e-06, + "loss": 0.7977881908416748, + "memory(GiB)": 29.49, + "step": 1155, + "token_acc": 0.7611431316042268, + "train_speed(iter/s)": 0.09597 + }, + { + "epoch": 0.05394423630830759, + "grad_norm": 3.953749418258667, + "learning_rate": 9.999582762717479e-06, + "loss": 1.0369236946105957, + "memory(GiB)": 29.49, + "step": 1160, + "token_acc": 0.7574827321565618, + "train_speed(iter/s)": 0.096205 + }, + { + "epoch": 0.05417675456825719, + "grad_norm": 3.2749271392822266, + "learning_rate": 9.999531614104323e-06, + "loss": 1.0453920364379883, + "memory(GiB)": 29.49, + "step": 1165, + "token_acc": 0.7414411477013368, + "train_speed(iter/s)": 0.096432 + }, + { + "epoch": 0.05440927282820679, + "grad_norm": 3.203056812286377, + "learning_rate": 9.999477509113764e-06, + "loss": 1.0291184425354003, + "memory(GiB)": 29.49, + "step": 1170, + "token_acc": 0.7581111468037263, + "train_speed(iter/s)": 0.096664 + }, + { + "epoch": 0.054641791088156394, + "grad_norm": 3.7450356483459473, + "learning_rate": 9.999420447777799e-06, + "loss": 0.9435734748840332, + "memory(GiB)": 29.49, + "step": 1175, + "token_acc": 0.7707606420097697, + "train_speed(iter/s)": 0.096896 + }, + { + "epoch": 0.05487430934810599, + "grad_norm": 3.322979211807251, + "learning_rate": 9.999360430130168e-06, + "loss": 0.9902138710021973, + "memory(GiB)": 29.49, + "step": 1180, + "token_acc": 0.7555851950650216, + "train_speed(iter/s)": 0.097124 + }, + { + "epoch": 0.055106827608055596, + "grad_norm": 3.5927646160125732, + "learning_rate": 9.999297456206363e-06, + "loss": 0.9759355545043945, + "memory(GiB)": 29.49, + "step": 1185, + "token_acc": 0.7508080155138979, + "train_speed(iter/s)": 0.097349 + }, + { + "epoch": 0.055339345868005194, + "grad_norm": 3.642490863800049, + "learning_rate": 9.99923152604362e-06, + "loss": 1.0184328079223632, + "memory(GiB)": 29.49, + "step": 1190, + "token_acc": 0.7636363636363637, + "train_speed(iter/s)": 0.097575 + }, + { + "epoch": 0.0555718641279548, + "grad_norm": 3.2297916412353516, + "learning_rate": 9.99916263968093e-06, + "loss": 0.9956600189208984, + "memory(GiB)": 29.49, + "step": 1195, + "token_acc": 0.7647887323943662, + "train_speed(iter/s)": 0.097806 + }, + { + "epoch": 0.0558043823879044, + "grad_norm": 3.0777218341827393, + "learning_rate": 9.999090797159024e-06, + "loss": 1.0422906875610352, + "memory(GiB)": 29.49, + "step": 1200, + "token_acc": 0.7535483870967742, + "train_speed(iter/s)": 0.098027 + }, + { + "epoch": 0.0558043823879044, + "eval_loss": 0.8728676438331604, + "eval_runtime": 295.1145, + "eval_samples_per_second": 11.775, + "eval_steps_per_second": 11.775, + "step": 1200 + }, + { + "epoch": 0.056036900647854, + "grad_norm": 5.250125408172607, + "learning_rate": 9.999015998520385e-06, + "loss": 1.0027048110961914, + "memory(GiB)": 29.49, + "step": 1205, + "token_acc": 0.7636625926668003, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.056269418907803605, + "grad_norm": 4.09804630279541, + "learning_rate": 9.998938243809244e-06, + "loss": 0.8814007759094238, + "memory(GiB)": 29.49, + "step": 1210, + "token_acc": 0.7876106194690266, + "train_speed(iter/s)": 0.096168 + }, + { + "epoch": 0.0565019371677532, + "grad_norm": 3.9543261528015137, + "learning_rate": 9.998857533071582e-06, + "loss": 0.879784107208252, + "memory(GiB)": 29.49, + "step": 1215, + "token_acc": 0.7943156320119671, + "train_speed(iter/s)": 0.096389 + }, + { + "epoch": 0.05673445542770281, + "grad_norm": 4.678823947906494, + "learning_rate": 9.998773866355123e-06, + "loss": 0.9489768981933594, + "memory(GiB)": 29.49, + "step": 1220, + "token_acc": 0.7849514563106796, + "train_speed(iter/s)": 0.096614 + }, + { + "epoch": 0.05696697368765241, + "grad_norm": 3.336897611618042, + "learning_rate": 9.998687243709342e-06, + "loss": 0.9821072578430176, + "memory(GiB)": 29.49, + "step": 1225, + "token_acc": 0.7741603838245373, + "train_speed(iter/s)": 0.096829 + }, + { + "epoch": 0.05719949194760201, + "grad_norm": 4.5111002922058105, + "learning_rate": 9.998597665185463e-06, + "loss": 0.9603194236755371, + "memory(GiB)": 29.49, + "step": 1230, + "token_acc": 0.7746188710341986, + "train_speed(iter/s)": 0.097046 + }, + { + "epoch": 0.05743201020755161, + "grad_norm": 3.5741658210754395, + "learning_rate": 9.998505130836456e-06, + "loss": 0.976735782623291, + "memory(GiB)": 29.49, + "step": 1235, + "token_acc": 0.7599756320438623, + "train_speed(iter/s)": 0.097271 + }, + { + "epoch": 0.05766452846750121, + "grad_norm": 3.4121596813201904, + "learning_rate": 9.998409640717038e-06, + "loss": 0.8603731155395508, + "memory(GiB)": 29.49, + "step": 1240, + "token_acc": 0.7960591133004926, + "train_speed(iter/s)": 0.097495 + }, + { + "epoch": 0.057897046727450815, + "grad_norm": 3.4050662517547607, + "learning_rate": 9.998311194883676e-06, + "loss": 0.9685305595397949, + "memory(GiB)": 29.49, + "step": 1245, + "token_acc": 0.7773972602739726, + "train_speed(iter/s)": 0.097716 + }, + { + "epoch": 0.05812956498740042, + "grad_norm": 3.6689274311065674, + "learning_rate": 9.998209793394586e-06, + "loss": 0.973471736907959, + "memory(GiB)": 29.49, + "step": 1250, + "token_acc": 0.7650891632373114, + "train_speed(iter/s)": 0.097939 + }, + { + "epoch": 0.05812956498740042, + "eval_loss": 0.8615283966064453, + "eval_runtime": 293.4868, + "eval_samples_per_second": 11.84, + "eval_steps_per_second": 11.84, + "step": 1250 + }, + { + "epoch": 0.05836208324735002, + "grad_norm": 2.7972047328948975, + "learning_rate": 9.99810543630973e-06, + "loss": 0.9186818122863769, + "memory(GiB)": 29.49, + "step": 1255, + "token_acc": 0.765333910006888, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.05859460150729962, + "grad_norm": 3.5054550170898438, + "learning_rate": 9.997998123690813e-06, + "loss": 1.0190353393554688, + "memory(GiB)": 29.49, + "step": 1260, + "token_acc": 0.7730008598452278, + "train_speed(iter/s)": 0.096164 + }, + { + "epoch": 0.05882711976724922, + "grad_norm": 4.9383087158203125, + "learning_rate": 9.997887855601296e-06, + "loss": 0.8528729438781738, + "memory(GiB)": 29.49, + "step": 1265, + "token_acc": 0.8101571946795647, + "train_speed(iter/s)": 0.096374 + }, + { + "epoch": 0.059059638027198824, + "grad_norm": 3.853332996368408, + "learning_rate": 9.997774632106384e-06, + "loss": 0.9558304786682129, + "memory(GiB)": 29.49, + "step": 1270, + "token_acc": 0.7764960346070656, + "train_speed(iter/s)": 0.096588 + }, + { + "epoch": 0.05929215628714842, + "grad_norm": 3.9047439098358154, + "learning_rate": 9.997658453273027e-06, + "loss": 0.9520219802856446, + "memory(GiB)": 29.49, + "step": 1275, + "token_acc": 0.7671997024916326, + "train_speed(iter/s)": 0.096802 + }, + { + "epoch": 0.059524674547098026, + "grad_norm": 3.4936561584472656, + "learning_rate": 9.997539319169928e-06, + "loss": 0.982548999786377, + "memory(GiB)": 29.49, + "step": 1280, + "token_acc": 0.7688804554079697, + "train_speed(iter/s)": 0.097014 + }, + { + "epoch": 0.05975719280704763, + "grad_norm": 5.598224639892578, + "learning_rate": 9.997417229867534e-06, + "loss": 1.0808399200439454, + "memory(GiB)": 29.49, + "step": 1285, + "token_acc": 0.7508747375787264, + "train_speed(iter/s)": 0.097217 + }, + { + "epoch": 0.05998971106699723, + "grad_norm": 3.887967109680176, + "learning_rate": 9.997292185438038e-06, + "loss": 1.0092259407043458, + "memory(GiB)": 29.49, + "step": 1290, + "token_acc": 0.7590950779578111, + "train_speed(iter/s)": 0.097424 + }, + { + "epoch": 0.06022222932694683, + "grad_norm": 3.637932538986206, + "learning_rate": 9.997164185955385e-06, + "loss": 1.0872056007385253, + "memory(GiB)": 29.49, + "step": 1295, + "token_acc": 0.7493606138107417, + "train_speed(iter/s)": 0.097631 + }, + { + "epoch": 0.06045474758689643, + "grad_norm": 3.762218475341797, + "learning_rate": 9.997033231495263e-06, + "loss": 0.9506484985351562, + "memory(GiB)": 29.49, + "step": 1300, + "token_acc": 0.7712395543175488, + "train_speed(iter/s)": 0.097841 + }, + { + "epoch": 0.06045474758689643, + "eval_loss": 0.8519848585128784, + "eval_runtime": 292.5996, + "eval_samples_per_second": 11.876, + "eval_steps_per_second": 11.876, + "step": 1300 + }, + { + "epoch": 0.060687265846846035, + "grad_norm": 3.022722005844116, + "learning_rate": 9.996899322135113e-06, + "loss": 1.1480344772338866, + "memory(GiB)": 29.49, + "step": 1305, + "token_acc": 0.7660458955700242, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.06091978410679564, + "grad_norm": 3.4392518997192383, + "learning_rate": 9.996762457954116e-06, + "loss": 1.0385401725769043, + "memory(GiB)": 29.49, + "step": 1310, + "token_acc": 0.7515671395579017, + "train_speed(iter/s)": 0.096145 + }, + { + "epoch": 0.06115230236674524, + "grad_norm": 3.6800425052642822, + "learning_rate": 9.996622639033206e-06, + "loss": 0.9112386703491211, + "memory(GiB)": 29.49, + "step": 1315, + "token_acc": 0.7730819245773732, + "train_speed(iter/s)": 0.096354 + }, + { + "epoch": 0.06138482062669484, + "grad_norm": 4.807951927185059, + "learning_rate": 9.996479865455063e-06, + "loss": 0.9230103492736816, + "memory(GiB)": 29.49, + "step": 1320, + "token_acc": 0.7816488444291135, + "train_speed(iter/s)": 0.096556 + }, + { + "epoch": 0.06161733888664444, + "grad_norm": 3.865591287612915, + "learning_rate": 9.996334137304111e-06, + "loss": 1.008216381072998, + "memory(GiB)": 29.49, + "step": 1325, + "token_acc": 0.7491337491337491, + "train_speed(iter/s)": 0.096759 + }, + { + "epoch": 0.061849857146594044, + "grad_norm": 3.1500022411346436, + "learning_rate": 9.996185454666525e-06, + "loss": 0.9567780494689941, + "memory(GiB)": 29.49, + "step": 1330, + "token_acc": 0.7723855092276145, + "train_speed(iter/s)": 0.096965 + }, + { + "epoch": 0.06208237540654365, + "grad_norm": 4.783307075500488, + "learning_rate": 9.996033817630224e-06, + "loss": 0.8996448516845703, + "memory(GiB)": 29.49, + "step": 1335, + "token_acc": 0.7988252569750367, + "train_speed(iter/s)": 0.097167 + }, + { + "epoch": 0.062314893666493246, + "grad_norm": 3.412679433822632, + "learning_rate": 9.995879226284878e-06, + "loss": 0.874872875213623, + "memory(GiB)": 29.49, + "step": 1340, + "token_acc": 0.7957110609480813, + "train_speed(iter/s)": 0.097368 + }, + { + "epoch": 0.06254741192644285, + "grad_norm": 3.106863498687744, + "learning_rate": 9.995721680721901e-06, + "loss": 1.107049560546875, + "memory(GiB)": 29.49, + "step": 1345, + "token_acc": 0.7413730475844533, + "train_speed(iter/s)": 0.097566 + }, + { + "epoch": 0.06277993018639245, + "grad_norm": 3.066474437713623, + "learning_rate": 9.995561181034454e-06, + "loss": 0.8950592041015625, + "memory(GiB)": 29.49, + "step": 1350, + "token_acc": 0.7750075097626915, + "train_speed(iter/s)": 0.097766 + }, + { + "epoch": 0.06277993018639245, + "eval_loss": 0.8605217337608337, + "eval_runtime": 294.6974, + "eval_samples_per_second": 11.792, + "eval_steps_per_second": 11.792, + "step": 1350 + }, + { + "epoch": 0.06301244844634205, + "grad_norm": 4.106268405914307, + "learning_rate": 9.995397727317447e-06, + "loss": 1.0116336822509766, + "memory(GiB)": 29.49, + "step": 1355, + "token_acc": 0.7674015773172834, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.06324496670629165, + "grad_norm": 3.730259418487549, + "learning_rate": 9.99523131966753e-06, + "loss": 1.0562795639038085, + "memory(GiB)": 29.49, + "step": 1360, + "token_acc": 0.7494160827494161, + "train_speed(iter/s)": 0.096117 + }, + { + "epoch": 0.06347748496624125, + "grad_norm": 4.691348552703857, + "learning_rate": 9.995061958183111e-06, + "loss": 1.1530324935913085, + "memory(GiB)": 29.49, + "step": 1365, + "token_acc": 0.7197295636140135, + "train_speed(iter/s)": 0.096302 + }, + { + "epoch": 0.06371000322619086, + "grad_norm": 4.493622303009033, + "learning_rate": 9.994889642964338e-06, + "loss": 0.9282937049865723, + "memory(GiB)": 29.49, + "step": 1370, + "token_acc": 0.753072625698324, + "train_speed(iter/s)": 0.096499 + }, + { + "epoch": 0.06394252148614046, + "grad_norm": 3.885254144668579, + "learning_rate": 9.994714374113104e-06, + "loss": 0.9173580169677734, + "memory(GiB)": 29.49, + "step": 1375, + "token_acc": 0.775804289544236, + "train_speed(iter/s)": 0.096698 + }, + { + "epoch": 0.06417503974609005, + "grad_norm": 4.416103839874268, + "learning_rate": 9.994536151733051e-06, + "loss": 0.96749849319458, + "memory(GiB)": 29.49, + "step": 1380, + "token_acc": 0.7750706214689266, + "train_speed(iter/s)": 0.096894 + }, + { + "epoch": 0.06440755800603966, + "grad_norm": 4.307039260864258, + "learning_rate": 9.994354975929567e-06, + "loss": 0.9266422271728516, + "memory(GiB)": 29.49, + "step": 1385, + "token_acc": 0.7925219941348973, + "train_speed(iter/s)": 0.097093 + }, + { + "epoch": 0.06464007626598926, + "grad_norm": 3.3025949001312256, + "learning_rate": 9.99417084680979e-06, + "loss": 1.0316217422485352, + "memory(GiB)": 29.49, + "step": 1390, + "token_acc": 0.7507598784194529, + "train_speed(iter/s)": 0.097289 + }, + { + "epoch": 0.06487259452593887, + "grad_norm": 3.5625064373016357, + "learning_rate": 9.993983764482598e-06, + "loss": 0.9325406074523925, + "memory(GiB)": 29.49, + "step": 1395, + "token_acc": 0.7747720364741641, + "train_speed(iter/s)": 0.097481 + }, + { + "epoch": 0.06510511278588847, + "grad_norm": 2.98435640335083, + "learning_rate": 9.99379372905862e-06, + "loss": 0.9729434967041015, + "memory(GiB)": 29.49, + "step": 1400, + "token_acc": 0.7669683257918553, + "train_speed(iter/s)": 0.097676 + }, + { + "epoch": 0.06510511278588847, + "eval_loss": 0.8391401171684265, + "eval_runtime": 294.1127, + "eval_samples_per_second": 11.815, + "eval_steps_per_second": 11.815, + "step": 1400 + }, + { + "epoch": 0.06533763104583806, + "grad_norm": 3.670905828475952, + "learning_rate": 9.99360074065023e-06, + "loss": 0.9499552726745606, + "memory(GiB)": 29.49, + "step": 1405, + "token_acc": 0.7702438440355989, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.06557014930578767, + "grad_norm": 3.8291659355163574, + "learning_rate": 9.993404799371548e-06, + "loss": 0.8762431144714355, + "memory(GiB)": 29.49, + "step": 1410, + "token_acc": 0.7836134453781513, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.06580266756573727, + "grad_norm": 5.117160797119141, + "learning_rate": 9.99320590533844e-06, + "loss": 0.9930843353271485, + "memory(GiB)": 29.49, + "step": 1415, + "token_acc": 0.7609187965059852, + "train_speed(iter/s)": 0.096282 + }, + { + "epoch": 0.06603518582568688, + "grad_norm": 4.551426410675049, + "learning_rate": 9.993004058668516e-06, + "loss": 1.0043935775756836, + "memory(GiB)": 29.49, + "step": 1420, + "token_acc": 0.7556226921785835, + "train_speed(iter/s)": 0.096472 + }, + { + "epoch": 0.06626770408563648, + "grad_norm": 3.7161145210266113, + "learning_rate": 9.992799259481136e-06, + "loss": 1.058093547821045, + "memory(GiB)": 29.49, + "step": 1425, + "token_acc": 0.745412078745412, + "train_speed(iter/s)": 0.096662 + }, + { + "epoch": 0.06650022234558607, + "grad_norm": 3.9534449577331543, + "learning_rate": 9.992591507897405e-06, + "loss": 0.894005012512207, + "memory(GiB)": 29.49, + "step": 1430, + "token_acc": 0.7950200088928413, + "train_speed(iter/s)": 0.096848 + }, + { + "epoch": 0.06673274060553568, + "grad_norm": 4.306530475616455, + "learning_rate": 9.992380804040172e-06, + "loss": 0.9588717460632324, + "memory(GiB)": 29.49, + "step": 1435, + "token_acc": 0.7775534921275736, + "train_speed(iter/s)": 0.097039 + }, + { + "epoch": 0.06696525886548528, + "grad_norm": 4.580782890319824, + "learning_rate": 9.992167148034033e-06, + "loss": 0.8765263557434082, + "memory(GiB)": 29.49, + "step": 1440, + "token_acc": 0.7939895470383276, + "train_speed(iter/s)": 0.097225 + }, + { + "epoch": 0.06719777712543488, + "grad_norm": 3.7674219608306885, + "learning_rate": 9.991950540005329e-06, + "loss": 0.9246517181396484, + "memory(GiB)": 29.49, + "step": 1445, + "token_acc": 0.7846655791190864, + "train_speed(iter/s)": 0.097413 + }, + { + "epoch": 0.06743029538538449, + "grad_norm": 4.060750961303711, + "learning_rate": 9.991730980082147e-06, + "loss": 0.9379199028015137, + "memory(GiB)": 29.49, + "step": 1450, + "token_acc": 0.7786173026067246, + "train_speed(iter/s)": 0.097597 + }, + { + "epoch": 0.06743029538538449, + "eval_loss": 0.8332929015159607, + "eval_runtime": 294.3264, + "eval_samples_per_second": 11.807, + "eval_steps_per_second": 11.807, + "step": 1450 + }, + { + "epoch": 0.06766281364533408, + "grad_norm": 4.1459197998046875, + "learning_rate": 9.99150846839432e-06, + "loss": 0.8683028221130371, + "memory(GiB)": 29.49, + "step": 1455, + "token_acc": 0.7721197916248606, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.06789533190528368, + "grad_norm": 4.427006721496582, + "learning_rate": 9.991283005073425e-06, + "loss": 0.8849419593811035, + "memory(GiB)": 29.49, + "step": 1460, + "token_acc": 0.7902208201892744, + "train_speed(iter/s)": 0.096063 + }, + { + "epoch": 0.06812785016523329, + "grad_norm": 4.184934139251709, + "learning_rate": 9.991054590252786e-06, + "loss": 0.9550199508666992, + "memory(GiB)": 29.49, + "step": 1465, + "token_acc": 0.7796402289452167, + "train_speed(iter/s)": 0.096248 + }, + { + "epoch": 0.0683603684251829, + "grad_norm": 3.8598761558532715, + "learning_rate": 9.990823224067473e-06, + "loss": 0.9285133361816407, + "memory(GiB)": 29.49, + "step": 1470, + "token_acc": 0.7871080139372822, + "train_speed(iter/s)": 0.096435 + }, + { + "epoch": 0.0685928866851325, + "grad_norm": 4.542653560638428, + "learning_rate": 9.990588906654296e-06, + "loss": 0.8635785102844238, + "memory(GiB)": 29.49, + "step": 1475, + "token_acc": 0.7989153108051731, + "train_speed(iter/s)": 0.096622 + }, + { + "epoch": 0.06882540494508209, + "grad_norm": 4.145790100097656, + "learning_rate": 9.99035163815182e-06, + "loss": 0.8900579452514649, + "memory(GiB)": 29.49, + "step": 1480, + "token_acc": 0.7928062944923192, + "train_speed(iter/s)": 0.096806 + }, + { + "epoch": 0.0690579232050317, + "grad_norm": 4.27974271774292, + "learning_rate": 9.990111418700345e-06, + "loss": 0.9037236213684082, + "memory(GiB)": 29.49, + "step": 1485, + "token_acc": 0.7942857142857143, + "train_speed(iter/s)": 0.096986 + }, + { + "epoch": 0.0692904414649813, + "grad_norm": 4.4757399559021, + "learning_rate": 9.989868248441922e-06, + "loss": 0.9862874984741211, + "memory(GiB)": 29.49, + "step": 1490, + "token_acc": 0.7700774561761109, + "train_speed(iter/s)": 0.097165 + }, + { + "epoch": 0.0695229597249309, + "grad_norm": 5.238999366760254, + "learning_rate": 9.989622127520345e-06, + "loss": 1.1203701972961426, + "memory(GiB)": 29.49, + "step": 1495, + "token_acc": 0.7371184771906794, + "train_speed(iter/s)": 0.097343 + }, + { + "epoch": 0.0697554779848805, + "grad_norm": 4.224093437194824, + "learning_rate": 9.989373056081151e-06, + "loss": 0.9576206207275391, + "memory(GiB)": 29.49, + "step": 1500, + "token_acc": 0.7671414375621066, + "train_speed(iter/s)": 0.097524 + }, + { + "epoch": 0.0697554779848805, + "eval_loss": 0.829556941986084, + "eval_runtime": 292.0969, + "eval_samples_per_second": 11.897, + "eval_steps_per_second": 11.897, + "step": 1500 + }, + { + "epoch": 0.0699879962448301, + "grad_norm": 4.1649675369262695, + "learning_rate": 9.989121034271625e-06, + "loss": 0.9163863182067871, + "memory(GiB)": 29.49, + "step": 1505, + "token_acc": 0.772836721143616, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.0702205145047797, + "grad_norm": 5.037374019622803, + "learning_rate": 9.988866062240796e-06, + "loss": 0.9264779090881348, + "memory(GiB)": 29.49, + "step": 1510, + "token_acc": 0.7805779569892473, + "train_speed(iter/s)": 0.096059 + }, + { + "epoch": 0.0704530327647293, + "grad_norm": 4.991156578063965, + "learning_rate": 9.988608140139436e-06, + "loss": 0.9976637840270997, + "memory(GiB)": 29.49, + "step": 1515, + "token_acc": 0.7648648648648648, + "train_speed(iter/s)": 0.09624 + }, + { + "epoch": 0.07068555102467891, + "grad_norm": 3.67498517036438, + "learning_rate": 9.988347268120062e-06, + "loss": 0.9292027473449707, + "memory(GiB)": 29.49, + "step": 1520, + "token_acc": 0.7783676177836761, + "train_speed(iter/s)": 0.096419 + }, + { + "epoch": 0.0709180692846285, + "grad_norm": 4.408437728881836, + "learning_rate": 9.988083446336936e-06, + "loss": 0.9982312202453614, + "memory(GiB)": 29.49, + "step": 1525, + "token_acc": 0.7497497497497497, + "train_speed(iter/s)": 0.096596 + }, + { + "epoch": 0.0711505875445781, + "grad_norm": 4.5508832931518555, + "learning_rate": 9.987816674946064e-06, + "loss": 0.8968223571777344, + "memory(GiB)": 29.49, + "step": 1530, + "token_acc": 0.7771679473106476, + "train_speed(iter/s)": 0.096768 + }, + { + "epoch": 0.07138310580452771, + "grad_norm": 3.7403552532196045, + "learning_rate": 9.987546954105198e-06, + "loss": 0.8528156280517578, + "memory(GiB)": 29.49, + "step": 1535, + "token_acc": 0.7941391941391941, + "train_speed(iter/s)": 0.096944 + }, + { + "epoch": 0.07161562406447732, + "grad_norm": 3.6034836769104004, + "learning_rate": 9.987274283973829e-06, + "loss": 0.9096664428710938, + "memory(GiB)": 29.49, + "step": 1540, + "token_acc": 0.7896147403685092, + "train_speed(iter/s)": 0.097124 + }, + { + "epoch": 0.07184814232442692, + "grad_norm": 5.132664680480957, + "learning_rate": 9.986998664713195e-06, + "loss": 0.9881318092346192, + "memory(GiB)": 29.49, + "step": 1545, + "token_acc": 0.7650876054510058, + "train_speed(iter/s)": 0.097299 + }, + { + "epoch": 0.07208066058437651, + "grad_norm": 4.338294506072998, + "learning_rate": 9.986720096486282e-06, + "loss": 1.01829195022583, + "memory(GiB)": 29.49, + "step": 1550, + "token_acc": 0.7651966626936829, + "train_speed(iter/s)": 0.09748 + }, + { + "epoch": 0.07208066058437651, + "eval_loss": 0.8277249336242676, + "eval_runtime": 290.3989, + "eval_samples_per_second": 11.966, + "eval_steps_per_second": 11.966, + "step": 1550 + }, + { + "epoch": 0.07231317884432611, + "grad_norm": 13.258360862731934, + "learning_rate": 9.986438579457813e-06, + "loss": 0.8957409858703613, + "memory(GiB)": 29.49, + "step": 1555, + "token_acc": 0.7736871986374213, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.07254569710427572, + "grad_norm": 4.9179840087890625, + "learning_rate": 9.98615411379426e-06, + "loss": 0.9114449501037598, + "memory(GiB)": 29.49, + "step": 1560, + "token_acc": 0.7850467289719626, + "train_speed(iter/s)": 0.096075 + }, + { + "epoch": 0.07277821536422532, + "grad_norm": 3.7983877658843994, + "learning_rate": 9.985866699663833e-06, + "loss": 0.9112727165222168, + "memory(GiB)": 29.49, + "step": 1565, + "token_acc": 0.7770820288040076, + "train_speed(iter/s)": 0.096234 + }, + { + "epoch": 0.07301073362417493, + "grad_norm": 3.6494104862213135, + "learning_rate": 9.98557633723649e-06, + "loss": 0.99598388671875, + "memory(GiB)": 29.49, + "step": 1570, + "token_acc": 0.7602203537257176, + "train_speed(iter/s)": 0.096406 + }, + { + "epoch": 0.07324325188412452, + "grad_norm": 5.046865940093994, + "learning_rate": 9.985283026683934e-06, + "loss": 0.9579297065734863, + "memory(GiB)": 29.49, + "step": 1575, + "token_acc": 0.7820121951219512, + "train_speed(iter/s)": 0.096577 + }, + { + "epoch": 0.07347577014407412, + "grad_norm": 4.406998157501221, + "learning_rate": 9.984986768179608e-06, + "loss": 0.8625598907470703, + "memory(GiB)": 29.49, + "step": 1580, + "token_acc": 0.8079896907216495, + "train_speed(iter/s)": 0.096749 + }, + { + "epoch": 0.07370828840402373, + "grad_norm": 4.63735818862915, + "learning_rate": 9.984687561898693e-06, + "loss": 0.9187863349914551, + "memory(GiB)": 29.49, + "step": 1585, + "token_acc": 0.7812375249500998, + "train_speed(iter/s)": 0.096924 + }, + { + "epoch": 0.07394080666397333, + "grad_norm": 5.131118297576904, + "learning_rate": 9.984385408018127e-06, + "loss": 0.8392014503479004, + "memory(GiB)": 29.49, + "step": 1590, + "token_acc": 0.7880321524263174, + "train_speed(iter/s)": 0.097094 + }, + { + "epoch": 0.07417332492392294, + "grad_norm": 4.358776569366455, + "learning_rate": 9.98408030671658e-06, + "loss": 0.8473024368286133, + "memory(GiB)": 29.49, + "step": 1595, + "token_acc": 0.8058499364137346, + "train_speed(iter/s)": 0.09727 + }, + { + "epoch": 0.07440584318387253, + "grad_norm": 5.489317417144775, + "learning_rate": 9.983772258174464e-06, + "loss": 0.9494953155517578, + "memory(GiB)": 29.49, + "step": 1600, + "token_acc": 0.7736131934032984, + "train_speed(iter/s)": 0.097441 + }, + { + "epoch": 0.07440584318387253, + "eval_loss": 0.8192211985588074, + "eval_runtime": 293.9639, + "eval_samples_per_second": 11.821, + "eval_steps_per_second": 11.821, + "step": 1600 + }, + { + "epoch": 0.07463836144382213, + "grad_norm": 5.0594024658203125, + "learning_rate": 9.98346126257394e-06, + "loss": 1.0007355690002442, + "memory(GiB)": 29.49, + "step": 1605, + "token_acc": 0.773707034500416, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.07487087970377174, + "grad_norm": 10.87908935546875, + "learning_rate": 9.983147320098914e-06, + "loss": 0.9566534996032715, + "memory(GiB)": 29.49, + "step": 1610, + "token_acc": 0.7540680473372781, + "train_speed(iter/s)": 0.096063 + }, + { + "epoch": 0.07510339796372134, + "grad_norm": 3.9066755771636963, + "learning_rate": 9.982830430935024e-06, + "loss": 0.8844928741455078, + "memory(GiB)": 29.49, + "step": 1615, + "token_acc": 0.7842778793418648, + "train_speed(iter/s)": 0.096232 + }, + { + "epoch": 0.07533591622367095, + "grad_norm": 4.4570207595825195, + "learning_rate": 9.982510595269658e-06, + "loss": 0.9009759902954102, + "memory(GiB)": 29.49, + "step": 1620, + "token_acc": 0.7826250470101542, + "train_speed(iter/s)": 0.096398 + }, + { + "epoch": 0.07556843448362054, + "grad_norm": 4.537950038909912, + "learning_rate": 9.982187813291944e-06, + "loss": 0.9638691902160644, + "memory(GiB)": 29.49, + "step": 1625, + "token_acc": 0.7793783169067475, + "train_speed(iter/s)": 0.096565 + }, + { + "epoch": 0.07580095274357014, + "grad_norm": 3.762213945388794, + "learning_rate": 9.981862085192756e-06, + "loss": 0.8557533264160156, + "memory(GiB)": 29.49, + "step": 1630, + "token_acc": 0.7996086105675146, + "train_speed(iter/s)": 0.096724 + }, + { + "epoch": 0.07603347100351975, + "grad_norm": 3.7838034629821777, + "learning_rate": 9.981533411164703e-06, + "loss": 0.9432172775268555, + "memory(GiB)": 29.49, + "step": 1635, + "token_acc": 0.7803448275862069, + "train_speed(iter/s)": 0.096889 + }, + { + "epoch": 0.07626598926346935, + "grad_norm": 4.075239181518555, + "learning_rate": 9.981201791402142e-06, + "loss": 0.9049320220947266, + "memory(GiB)": 29.49, + "step": 1640, + "token_acc": 0.7912266450040617, + "train_speed(iter/s)": 0.097056 + }, + { + "epoch": 0.07649850752341895, + "grad_norm": 3.987454414367676, + "learning_rate": 9.980867226101172e-06, + "loss": 0.9458544731140137, + "memory(GiB)": 29.49, + "step": 1645, + "token_acc": 0.7716707683893902, + "train_speed(iter/s)": 0.097219 + }, + { + "epoch": 0.07673102578336854, + "grad_norm": 3.9305269718170166, + "learning_rate": 9.980529715459628e-06, + "loss": 0.9325850486755372, + "memory(GiB)": 29.49, + "step": 1650, + "token_acc": 0.7838125665601704, + "train_speed(iter/s)": 0.097384 + }, + { + "epoch": 0.07673102578336854, + "eval_loss": 0.8100156188011169, + "eval_runtime": 294.3166, + "eval_samples_per_second": 11.807, + "eval_steps_per_second": 11.807, + "step": 1650 + }, + { + "epoch": 0.07696354404331815, + "grad_norm": 4.639121055603027, + "learning_rate": 9.980189259677093e-06, + "loss": 0.9464486122131348, + "memory(GiB)": 29.49, + "step": 1655, + "token_acc": 0.7757631985155918, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.07719606230326775, + "grad_norm": 4.0201945304870605, + "learning_rate": 9.979845858954889e-06, + "loss": 0.9635875701904297, + "memory(GiB)": 29.49, + "step": 1660, + "token_acc": 0.7598730606488011, + "train_speed(iter/s)": 0.096034 + }, + { + "epoch": 0.07742858056321736, + "grad_norm": 3.4885239601135254, + "learning_rate": 9.979499513496078e-06, + "loss": 0.889770793914795, + "memory(GiB)": 29.49, + "step": 1665, + "token_acc": 0.785060417429513, + "train_speed(iter/s)": 0.096196 + }, + { + "epoch": 0.07766109882316696, + "grad_norm": 3.780503749847412, + "learning_rate": 9.979150223505465e-06, + "loss": 0.8421327590942382, + "memory(GiB)": 29.49, + "step": 1670, + "token_acc": 0.7923784494086727, + "train_speed(iter/s)": 0.096354 + }, + { + "epoch": 0.07789361708311655, + "grad_norm": 3.9452600479125977, + "learning_rate": 9.978797989189598e-06, + "loss": 0.9661630630493164, + "memory(GiB)": 29.49, + "step": 1675, + "token_acc": 0.7715868361029652, + "train_speed(iter/s)": 0.096511 + }, + { + "epoch": 0.07812613534306616, + "grad_norm": 4.459422588348389, + "learning_rate": 9.978442810756761e-06, + "loss": 0.8917196273803711, + "memory(GiB)": 29.49, + "step": 1680, + "token_acc": 0.792498980839788, + "train_speed(iter/s)": 0.096672 + }, + { + "epoch": 0.07835865360301576, + "grad_norm": 6.250596046447754, + "learning_rate": 9.978084688416983e-06, + "loss": 1.031515884399414, + "memory(GiB)": 29.49, + "step": 1685, + "token_acc": 0.7625, + "train_speed(iter/s)": 0.096831 + }, + { + "epoch": 0.07859117186296537, + "grad_norm": 4.471418857574463, + "learning_rate": 9.977723622382034e-06, + "loss": 0.8646291732788086, + "memory(GiB)": 29.49, + "step": 1690, + "token_acc": 0.7904109589041096, + "train_speed(iter/s)": 0.096988 + }, + { + "epoch": 0.07882369012291497, + "grad_norm": 4.630346775054932, + "learning_rate": 9.977359612865424e-06, + "loss": 0.9914566993713378, + "memory(GiB)": 29.49, + "step": 1695, + "token_acc": 0.7661343978709249, + "train_speed(iter/s)": 0.097147 + }, + { + "epoch": 0.07905620838286456, + "grad_norm": 3.786259651184082, + "learning_rate": 9.9769926600824e-06, + "loss": 0.8837844848632812, + "memory(GiB)": 29.49, + "step": 1700, + "token_acc": 0.7803897309000928, + "train_speed(iter/s)": 0.097304 + }, + { + "epoch": 0.07905620838286456, + "eval_loss": 0.8090683221817017, + "eval_runtime": 294.3788, + "eval_samples_per_second": 11.805, + "eval_steps_per_second": 11.805, + "step": 1700 + }, + { + "epoch": 0.07928872664281417, + "grad_norm": 4.507750511169434, + "learning_rate": 9.976622764249956e-06, + "loss": 0.9460622787475585, + "memory(GiB)": 29.49, + "step": 1705, + "token_acc": 0.7773199512601808, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.07952124490276377, + "grad_norm": 3.4761149883270264, + "learning_rate": 9.97624992558682e-06, + "loss": 0.8971858024597168, + "memory(GiB)": 29.49, + "step": 1710, + "token_acc": 0.7684088940225238, + "train_speed(iter/s)": 0.096007 + }, + { + "epoch": 0.07975376316271338, + "grad_norm": 3.712315559387207, + "learning_rate": 9.975874144313465e-06, + "loss": 1.006467914581299, + "memory(GiB)": 29.49, + "step": 1715, + "token_acc": 0.7429280397022332, + "train_speed(iter/s)": 0.096157 + }, + { + "epoch": 0.07998628142266298, + "grad_norm": 3.706094741821289, + "learning_rate": 9.975495420652102e-06, + "loss": 0.7353443622589111, + "memory(GiB)": 29.49, + "step": 1720, + "token_acc": 0.8296415626258559, + "train_speed(iter/s)": 0.096317 + }, + { + "epoch": 0.08021879968261257, + "grad_norm": 5.486309051513672, + "learning_rate": 9.97511375482668e-06, + "loss": 0.9426840782165528, + "memory(GiB)": 29.49, + "step": 1725, + "token_acc": 0.7679036458333334, + "train_speed(iter/s)": 0.096471 + }, + { + "epoch": 0.08045131794256218, + "grad_norm": 5.596253871917725, + "learning_rate": 9.974729147062891e-06, + "loss": 0.939633560180664, + "memory(GiB)": 29.49, + "step": 1730, + "token_acc": 0.7654434250764526, + "train_speed(iter/s)": 0.096628 + }, + { + "epoch": 0.08068383620251178, + "grad_norm": 3.46860671043396, + "learning_rate": 9.974341597588166e-06, + "loss": 0.8142014503479004, + "memory(GiB)": 29.49, + "step": 1735, + "token_acc": 0.7989148864021702, + "train_speed(iter/s)": 0.096779 + }, + { + "epoch": 0.08091635446246138, + "grad_norm": 4.415549278259277, + "learning_rate": 9.973951106631672e-06, + "loss": 0.8912237167358399, + "memory(GiB)": 29.49, + "step": 1740, + "token_acc": 0.7891541885876163, + "train_speed(iter/s)": 0.096933 + }, + { + "epoch": 0.08114887272241098, + "grad_norm": 3.439880847930908, + "learning_rate": 9.973557674424324e-06, + "loss": 0.9315021514892579, + "memory(GiB)": 29.49, + "step": 1745, + "token_acc": 0.7705095771593784, + "train_speed(iter/s)": 0.097084 + }, + { + "epoch": 0.08138139098236058, + "grad_norm": 4.43915319442749, + "learning_rate": 9.973161301198766e-06, + "loss": 0.945796012878418, + "memory(GiB)": 29.49, + "step": 1750, + "token_acc": 0.7663170163170163, + "train_speed(iter/s)": 0.097239 + }, + { + "epoch": 0.08138139098236058, + "eval_loss": 0.8076890110969543, + "eval_runtime": 290.1412, + "eval_samples_per_second": 11.977, + "eval_steps_per_second": 11.977, + "step": 1750 + }, + { + "epoch": 0.08161390924231018, + "grad_norm": 3.757079839706421, + "learning_rate": 9.972761987189387e-06, + "loss": 0.939511489868164, + "memory(GiB)": 29.49, + "step": 1755, + "token_acc": 0.7777243512686125, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.08184642750225979, + "grad_norm": 4.13332986831665, + "learning_rate": 9.972359732632316e-06, + "loss": 0.8005983352661132, + "memory(GiB)": 29.49, + "step": 1760, + "token_acc": 0.8006837606837607, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.0820789457622094, + "grad_norm": 5.232628345489502, + "learning_rate": 9.971954537765414e-06, + "loss": 0.8781660079956055, + "memory(GiB)": 29.49, + "step": 1765, + "token_acc": 0.7958847736625514, + "train_speed(iter/s)": 0.096157 + }, + { + "epoch": 0.08231146402215898, + "grad_norm": 4.8083696365356445, + "learning_rate": 9.971546402828289e-06, + "loss": 0.7852079391479492, + "memory(GiB)": 29.49, + "step": 1770, + "token_acc": 0.8230055658627087, + "train_speed(iter/s)": 0.096309 + }, + { + "epoch": 0.08254398228210859, + "grad_norm": 5.42658805847168, + "learning_rate": 9.971135328062282e-06, + "loss": 1.06929292678833, + "memory(GiB)": 29.49, + "step": 1775, + "token_acc": 0.746515397082658, + "train_speed(iter/s)": 0.096455 + }, + { + "epoch": 0.08277650054205819, + "grad_norm": 4.100043773651123, + "learning_rate": 9.970721313710475e-06, + "loss": 0.8212204933166504, + "memory(GiB)": 29.49, + "step": 1780, + "token_acc": 0.8087121212121212, + "train_speed(iter/s)": 0.096608 + }, + { + "epoch": 0.0830090188020078, + "grad_norm": 5.11602258682251, + "learning_rate": 9.970304360017686e-06, + "loss": 0.8928379058837891, + "memory(GiB)": 29.49, + "step": 1785, + "token_acc": 0.7898375388869685, + "train_speed(iter/s)": 0.096758 + }, + { + "epoch": 0.0832415370619574, + "grad_norm": 4.066575050354004, + "learning_rate": 9.969884467230472e-06, + "loss": 0.8791553497314453, + "memory(GiB)": 29.49, + "step": 1790, + "token_acc": 0.7894538606403013, + "train_speed(iter/s)": 0.096909 + }, + { + "epoch": 0.08347405532190699, + "grad_norm": 3.9396798610687256, + "learning_rate": 9.969461635597134e-06, + "loss": 0.8068005561828613, + "memory(GiB)": 29.49, + "step": 1795, + "token_acc": 0.8042886317222601, + "train_speed(iter/s)": 0.097058 + }, + { + "epoch": 0.0837065735818566, + "grad_norm": 4.261980056762695, + "learning_rate": 9.9690358653677e-06, + "loss": 0.9571780204772949, + "memory(GiB)": 29.49, + "step": 1800, + "token_acc": 0.7771060056428859, + "train_speed(iter/s)": 0.097209 + }, + { + "epoch": 0.0837065735818566, + "eval_loss": 0.8003239035606384, + "eval_runtime": 291.1492, + "eval_samples_per_second": 11.935, + "eval_steps_per_second": 11.935, + "step": 1800 + }, + { + "epoch": 0.0839390918418062, + "grad_norm": 4.323093414306641, + "learning_rate": 9.968607156793944e-06, + "loss": 0.8221240997314453, + "memory(GiB)": 29.49, + "step": 1805, + "token_acc": 0.7793450881612091, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.0841716101017558, + "grad_norm": 3.989581346511841, + "learning_rate": 9.96817551012937e-06, + "loss": 0.9691762924194336, + "memory(GiB)": 29.49, + "step": 1810, + "token_acc": 0.7553009992688277, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.08440412836170541, + "grad_norm": 3.824002265930176, + "learning_rate": 9.96774092562923e-06, + "loss": 0.8493466377258301, + "memory(GiB)": 29.49, + "step": 1815, + "token_acc": 0.7916417910447762, + "train_speed(iter/s)": 0.096142 + }, + { + "epoch": 0.084636646621655, + "grad_norm": 5.909646511077881, + "learning_rate": 9.967303403550505e-06, + "loss": 0.9302974700927734, + "memory(GiB)": 29.49, + "step": 1820, + "token_acc": 0.7567913071268776, + "train_speed(iter/s)": 0.096292 + }, + { + "epoch": 0.0848691648816046, + "grad_norm": 5.779644966125488, + "learning_rate": 9.966862944151914e-06, + "loss": 1.206606101989746, + "memory(GiB)": 29.49, + "step": 1825, + "token_acc": 0.7205930807248765, + "train_speed(iter/s)": 0.09644 + }, + { + "epoch": 0.08510168314155421, + "grad_norm": 5.22266960144043, + "learning_rate": 9.966419547693915e-06, + "loss": 0.8638523101806641, + "memory(GiB)": 29.49, + "step": 1830, + "token_acc": 0.8013672616786935, + "train_speed(iter/s)": 0.096588 + }, + { + "epoch": 0.08533420140150381, + "grad_norm": 6.209298610687256, + "learning_rate": 9.965973214438702e-06, + "loss": 0.8310011863708496, + "memory(GiB)": 29.49, + "step": 1835, + "token_acc": 0.7985272459499264, + "train_speed(iter/s)": 0.096731 + }, + { + "epoch": 0.08556671966145342, + "grad_norm": 4.538772106170654, + "learning_rate": 9.965523944650206e-06, + "loss": 0.8197231292724609, + "memory(GiB)": 29.49, + "step": 1840, + "token_acc": 0.8025325119780972, + "train_speed(iter/s)": 0.096881 + }, + { + "epoch": 0.08579923792140301, + "grad_norm": 4.979494571685791, + "learning_rate": 9.965071738594095e-06, + "loss": 0.8975962638854981, + "memory(GiB)": 29.49, + "step": 1845, + "token_acc": 0.7914086687306502, + "train_speed(iter/s)": 0.09703 + }, + { + "epoch": 0.08603175618135261, + "grad_norm": 4.453029155731201, + "learning_rate": 9.964616596537768e-06, + "loss": 0.8544903755187988, + "memory(GiB)": 29.49, + "step": 1850, + "token_acc": 0.8063121487246001, + "train_speed(iter/s)": 0.09718 + }, + { + "epoch": 0.08603175618135261, + "eval_loss": 0.8005815744400024, + "eval_runtime": 292.6623, + "eval_samples_per_second": 11.874, + "eval_steps_per_second": 11.874, + "step": 1850 + }, + { + "epoch": 0.08626427444130222, + "grad_norm": 4.928404331207275, + "learning_rate": 9.96415851875037e-06, + "loss": 0.8729582786560058, + "memory(GiB)": 29.49, + "step": 1855, + "token_acc": 0.7795837812685525, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.08649679270125182, + "grad_norm": 4.308002948760986, + "learning_rate": 9.963697505502776e-06, + "loss": 0.9543866157531739, + "memory(GiB)": 29.49, + "step": 1860, + "token_acc": 0.7780277465316835, + "train_speed(iter/s)": 0.096 + }, + { + "epoch": 0.08672931096120143, + "grad_norm": 4.0189595222473145, + "learning_rate": 9.963233557067593e-06, + "loss": 0.9054231643676758, + "memory(GiB)": 29.49, + "step": 1865, + "token_acc": 0.8008409785932722, + "train_speed(iter/s)": 0.096149 + }, + { + "epoch": 0.08696182922115102, + "grad_norm": 4.4174299240112305, + "learning_rate": 9.962766673719172e-06, + "loss": 0.8594171524047851, + "memory(GiB)": 29.49, + "step": 1870, + "token_acc": 0.7968021319120586, + "train_speed(iter/s)": 0.096292 + }, + { + "epoch": 0.08719434748110062, + "grad_norm": 4.788074016571045, + "learning_rate": 9.962296855733594e-06, + "loss": 1.0707283973693849, + "memory(GiB)": 29.49, + "step": 1875, + "token_acc": 0.7581755593803786, + "train_speed(iter/s)": 0.096436 + }, + { + "epoch": 0.08742686574105023, + "grad_norm": 4.352080345153809, + "learning_rate": 9.961824103388678e-06, + "loss": 0.9819015502929688, + "memory(GiB)": 29.49, + "step": 1880, + "token_acc": 0.7637842723711962, + "train_speed(iter/s)": 0.096579 + }, + { + "epoch": 0.08765938400099983, + "grad_norm": 3.994741201400757, + "learning_rate": 9.961348416963974e-06, + "loss": 0.8944009780883789, + "memory(GiB)": 29.49, + "step": 1885, + "token_acc": 0.7813012575177692, + "train_speed(iter/s)": 0.09672 + }, + { + "epoch": 0.08789190226094944, + "grad_norm": 4.152418613433838, + "learning_rate": 9.960869796740776e-06, + "loss": 0.9587595939636231, + "memory(GiB)": 29.49, + "step": 1890, + "token_acc": 0.7640990371389271, + "train_speed(iter/s)": 0.096864 + }, + { + "epoch": 0.08812442052089903, + "grad_norm": 5.073063850402832, + "learning_rate": 9.9603882430021e-06, + "loss": 0.9772052764892578, + "memory(GiB)": 29.49, + "step": 1895, + "token_acc": 0.7661883094154708, + "train_speed(iter/s)": 0.097008 + }, + { + "epoch": 0.08835693878084863, + "grad_norm": 4.3130059242248535, + "learning_rate": 9.959903756032707e-06, + "loss": 0.9528634071350097, + "memory(GiB)": 29.49, + "step": 1900, + "token_acc": 0.7809652379345258, + "train_speed(iter/s)": 0.097149 + }, + { + "epoch": 0.08835693878084863, + "eval_loss": 0.7958551049232483, + "eval_runtime": 291.8739, + "eval_samples_per_second": 11.906, + "eval_steps_per_second": 11.906, + "step": 1900 + }, + { + "epoch": 0.08858945704079824, + "grad_norm": 4.308380603790283, + "learning_rate": 9.959416336119091e-06, + "loss": 0.9988663673400879, + "memory(GiB)": 29.49, + "step": 1905, + "token_acc": 0.7786771526197706, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.08882197530074784, + "grad_norm": 4.396267414093018, + "learning_rate": 9.958925983549475e-06, + "loss": 0.7884073734283448, + "memory(GiB)": 29.49, + "step": 1910, + "token_acc": 0.8175895765472313, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.08905449356069745, + "grad_norm": 4.666249752044678, + "learning_rate": 9.958432698613822e-06, + "loss": 0.9932140350341797, + "memory(GiB)": 29.49, + "step": 1915, + "token_acc": 0.7660390516039052, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.08928701182064704, + "grad_norm": 3.9180893898010254, + "learning_rate": 9.957936481603825e-06, + "loss": 0.7315858364105224, + "memory(GiB)": 29.49, + "step": 1920, + "token_acc": 0.8133279807306303, + "train_speed(iter/s)": 0.096274 + }, + { + "epoch": 0.08951953008059664, + "grad_norm": 4.10879373550415, + "learning_rate": 9.957437332812911e-06, + "loss": 0.9591626167297364, + "memory(GiB)": 29.49, + "step": 1925, + "token_acc": 0.7715167888846005, + "train_speed(iter/s)": 0.096411 + }, + { + "epoch": 0.08975204834054624, + "grad_norm": 5.104907035827637, + "learning_rate": 9.956935252536246e-06, + "loss": 0.9636960029602051, + "memory(GiB)": 29.49, + "step": 1930, + "token_acc": 0.7698966769058923, + "train_speed(iter/s)": 0.096546 + }, + { + "epoch": 0.08998456660049585, + "grad_norm": 4.108262062072754, + "learning_rate": 9.956430241070726e-06, + "loss": 0.8493362426757812, + "memory(GiB)": 29.49, + "step": 1935, + "token_acc": 0.7907935417382342, + "train_speed(iter/s)": 0.096687 + }, + { + "epoch": 0.09021708486044544, + "grad_norm": 3.6644999980926514, + "learning_rate": 9.955922298714974e-06, + "loss": 1.0462658882141114, + "memory(GiB)": 29.49, + "step": 1940, + "token_acc": 0.7325724861196792, + "train_speed(iter/s)": 0.096823 + }, + { + "epoch": 0.09044960312039504, + "grad_norm": 5.004022598266602, + "learning_rate": 9.955411425769357e-06, + "loss": 0.9329165458679199, + "memory(GiB)": 29.49, + "step": 1945, + "token_acc": 0.768247656915649, + "train_speed(iter/s)": 0.09696 + }, + { + "epoch": 0.09068212138034465, + "grad_norm": 3.4844934940338135, + "learning_rate": 9.954897622535969e-06, + "loss": 0.9116199493408204, + "memory(GiB)": 29.49, + "step": 1950, + "token_acc": 0.7900699766744419, + "train_speed(iter/s)": 0.097095 + }, + { + "epoch": 0.09068212138034465, + "eval_loss": 0.7911024689674377, + "eval_runtime": 290.3316, + "eval_samples_per_second": 11.969, + "eval_steps_per_second": 11.969, + "step": 1950 + }, + { + "epoch": 0.09091463964029425, + "grad_norm": 5.0355753898620605, + "learning_rate": 9.954380889318636e-06, + "loss": 0.9376407623291015, + "memory(GiB)": 29.49, + "step": 1955, + "token_acc": 0.7810113058315128, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.09114715790024386, + "grad_norm": 5.319980144500732, + "learning_rate": 9.95386122642292e-06, + "loss": 0.9036301612854004, + "memory(GiB)": 29.49, + "step": 1960, + "token_acc": 0.7804032766225583, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.09137967616019345, + "grad_norm": 4.639838695526123, + "learning_rate": 9.953338634156113e-06, + "loss": 0.7657929420471191, + "memory(GiB)": 29.49, + "step": 1965, + "token_acc": 0.810907786149982, + "train_speed(iter/s)": 0.096118 + }, + { + "epoch": 0.09161219442014305, + "grad_norm": 4.133523941040039, + "learning_rate": 9.95281311282724e-06, + "loss": 0.9569430351257324, + "memory(GiB)": 29.49, + "step": 1970, + "token_acc": 0.7655044739278001, + "train_speed(iter/s)": 0.096255 + }, + { + "epoch": 0.09184471268009266, + "grad_norm": 6.160674571990967, + "learning_rate": 9.95228466274706e-06, + "loss": 0.9236304283142089, + "memory(GiB)": 29.49, + "step": 1975, + "token_acc": 0.7819602272727273, + "train_speed(iter/s)": 0.096393 + }, + { + "epoch": 0.09207723094004226, + "grad_norm": 4.117314338684082, + "learning_rate": 9.951753284228058e-06, + "loss": 0.9396333694458008, + "memory(GiB)": 29.49, + "step": 1980, + "token_acc": 0.7647058823529411, + "train_speed(iter/s)": 0.096529 + }, + { + "epoch": 0.09230974919999187, + "grad_norm": 3.8997716903686523, + "learning_rate": 9.951218977584456e-06, + "loss": 0.9236691474914551, + "memory(GiB)": 29.49, + "step": 1985, + "token_acc": 0.7829246139872843, + "train_speed(iter/s)": 0.096665 + }, + { + "epoch": 0.09254226745994146, + "grad_norm": 5.6404852867126465, + "learning_rate": 9.950681743132209e-06, + "loss": 0.8209335327148437, + "memory(GiB)": 29.49, + "step": 1990, + "token_acc": 0.7989203778677463, + "train_speed(iter/s)": 0.096799 + }, + { + "epoch": 0.09277478571989106, + "grad_norm": 5.0860209465026855, + "learning_rate": 9.950141581188997e-06, + "loss": 1.0448697090148926, + "memory(GiB)": 29.49, + "step": 1995, + "token_acc": 0.7433050293925539, + "train_speed(iter/s)": 0.096933 + }, + { + "epoch": 0.09300730397984067, + "grad_norm": 4.093280792236328, + "learning_rate": 9.949598492074234e-06, + "loss": 0.8636885643005371, + "memory(GiB)": 29.49, + "step": 2000, + "token_acc": 0.7966202193892677, + "train_speed(iter/s)": 0.097068 + }, + { + "epoch": 0.09300730397984067, + "eval_loss": 0.7961020469665527, + "eval_runtime": 289.8869, + "eval_samples_per_second": 11.987, + "eval_steps_per_second": 11.987, + "step": 2000 + }, + { + "epoch": 0.09323982223979027, + "grad_norm": 3.603029251098633, + "learning_rate": 9.949052476109067e-06, + "loss": 0.9437604904174804, + "memory(GiB)": 29.49, + "step": 2005, + "token_acc": 0.7814692859768866, + "train_speed(iter/s)": 0.095853 + }, + { + "epoch": 0.09347234049973988, + "grad_norm": 4.166951656341553, + "learning_rate": 9.948503533616374e-06, + "loss": 0.8821152687072754, + "memory(GiB)": 29.49, + "step": 2010, + "token_acc": 0.780439121756487, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.09370485875968947, + "grad_norm": 5.039173126220703, + "learning_rate": 9.947951664920758e-06, + "loss": 1.0004417419433593, + "memory(GiB)": 29.49, + "step": 2015, + "token_acc": 0.7596216568819308, + "train_speed(iter/s)": 0.096123 + }, + { + "epoch": 0.09393737701963907, + "grad_norm": 5.216615200042725, + "learning_rate": 9.947396870348555e-06, + "loss": 0.9004398345947265, + "memory(GiB)": 29.49, + "step": 2020, + "token_acc": 0.7764618800888231, + "train_speed(iter/s)": 0.096257 + }, + { + "epoch": 0.09416989527958868, + "grad_norm": 3.9363391399383545, + "learning_rate": 9.946839150227838e-06, + "loss": 0.8003036499023437, + "memory(GiB)": 29.49, + "step": 2025, + "token_acc": 0.8057581573896353, + "train_speed(iter/s)": 0.096393 + }, + { + "epoch": 0.09440241353953828, + "grad_norm": 3.8237192630767822, + "learning_rate": 9.946278504888401e-06, + "loss": 0.8571130752563476, + "memory(GiB)": 29.49, + "step": 2030, + "token_acc": 0.79004329004329, + "train_speed(iter/s)": 0.096526 + }, + { + "epoch": 0.09463493179948788, + "grad_norm": 5.707287788391113, + "learning_rate": 9.945714934661767e-06, + "loss": 0.8530313491821289, + "memory(GiB)": 29.49, + "step": 2035, + "token_acc": 0.7946362968405584, + "train_speed(iter/s)": 0.096662 + }, + { + "epoch": 0.09486745005943747, + "grad_norm": 4.582156658172607, + "learning_rate": 9.9451484398812e-06, + "loss": 0.8503716468811036, + "memory(GiB)": 29.49, + "step": 2040, + "token_acc": 0.7962447844228094, + "train_speed(iter/s)": 0.096795 + }, + { + "epoch": 0.09509996831938708, + "grad_norm": 5.141015529632568, + "learning_rate": 9.94457902088168e-06, + "loss": 0.8094500541687012, + "memory(GiB)": 29.49, + "step": 2045, + "token_acc": 0.801535974130962, + "train_speed(iter/s)": 0.09693 + }, + { + "epoch": 0.09533248657933668, + "grad_norm": 5.945651054382324, + "learning_rate": 9.944006677999923e-06, + "loss": 0.7684842109680176, + "memory(GiB)": 29.49, + "step": 2050, + "token_acc": 0.8084388185654009, + "train_speed(iter/s)": 0.097063 + }, + { + "epoch": 0.09533248657933668, + "eval_loss": 0.7812900543212891, + "eval_runtime": 290.1308, + "eval_samples_per_second": 11.977, + "eval_steps_per_second": 11.977, + "step": 2050 + }, + { + "epoch": 0.09556500483928629, + "grad_norm": 4.04123067855835, + "learning_rate": 9.943431411574377e-06, + "loss": 0.7828290462493896, + "memory(GiB)": 29.49, + "step": 2055, + "token_acc": 0.7829557178160413, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.09579752309923589, + "grad_norm": 5.821208477020264, + "learning_rate": 9.942853221945208e-06, + "loss": 0.9211565971374511, + "memory(GiB)": 29.49, + "step": 2060, + "token_acc": 0.7775191220485533, + "train_speed(iter/s)": 0.096006 + }, + { + "epoch": 0.09603004135918548, + "grad_norm": 4.410974502563477, + "learning_rate": 9.942272109454322e-06, + "loss": 0.78861083984375, + "memory(GiB)": 29.49, + "step": 2065, + "token_acc": 0.7906564163217031, + "train_speed(iter/s)": 0.096136 + }, + { + "epoch": 0.09626255961913509, + "grad_norm": 3.8728504180908203, + "learning_rate": 9.94168807444535e-06, + "loss": 0.8335811614990234, + "memory(GiB)": 29.49, + "step": 2070, + "token_acc": 0.8082344213649851, + "train_speed(iter/s)": 0.096268 + }, + { + "epoch": 0.09649507787908469, + "grad_norm": 4.204519271850586, + "learning_rate": 9.941101117263648e-06, + "loss": 0.8729138374328613, + "memory(GiB)": 29.49, + "step": 2075, + "token_acc": 0.7818115144847818, + "train_speed(iter/s)": 0.096399 + }, + { + "epoch": 0.0967275961390343, + "grad_norm": 5.9173383712768555, + "learning_rate": 9.9405112382563e-06, + "loss": 0.8117104530334472, + "memory(GiB)": 29.49, + "step": 2080, + "token_acc": 0.8, + "train_speed(iter/s)": 0.096531 + }, + { + "epoch": 0.0969601143989839, + "grad_norm": 4.552997589111328, + "learning_rate": 9.939918437772122e-06, + "loss": 0.9070040702819824, + "memory(GiB)": 29.49, + "step": 2085, + "token_acc": 0.7808084127505751, + "train_speed(iter/s)": 0.096658 + }, + { + "epoch": 0.09719263265893349, + "grad_norm": 3.836216449737549, + "learning_rate": 9.939322716161654e-06, + "loss": 0.8257838249206543, + "memory(GiB)": 29.49, + "step": 2090, + "token_acc": 0.8128583128583129, + "train_speed(iter/s)": 0.096791 + }, + { + "epoch": 0.0974251509188831, + "grad_norm": 4.201406002044678, + "learning_rate": 9.938724073777167e-06, + "loss": 0.9113821029663086, + "memory(GiB)": 29.49, + "step": 2095, + "token_acc": 0.7799520766773163, + "train_speed(iter/s)": 0.09692 + }, + { + "epoch": 0.0976576691788327, + "grad_norm": 4.389884948730469, + "learning_rate": 9.938122510972652e-06, + "loss": 0.9145685195922851, + "memory(GiB)": 29.49, + "step": 2100, + "token_acc": 0.786472148541114, + "train_speed(iter/s)": 0.097049 + }, + { + "epoch": 0.0976576691788327, + "eval_loss": 0.7848458290100098, + "eval_runtime": 294.2657, + "eval_samples_per_second": 11.809, + "eval_steps_per_second": 11.809, + "step": 2100 + }, + { + "epoch": 0.0978901874387823, + "grad_norm": 4.336113929748535, + "learning_rate": 9.937518028103837e-06, + "loss": 0.985959815979004, + "memory(GiB)": 29.49, + "step": 2105, + "token_acc": 0.7828502454076004, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.09812270569873191, + "grad_norm": 5.265202522277832, + "learning_rate": 9.936910625528169e-06, + "loss": 0.9030218124389648, + "memory(GiB)": 29.49, + "step": 2110, + "token_acc": 0.7816537467700259, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.0983552239586815, + "grad_norm": 5.062147617340088, + "learning_rate": 9.936300303604823e-06, + "loss": 0.8893616676330567, + "memory(GiB)": 29.49, + "step": 2115, + "token_acc": 0.7948929159802306, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.0985877422186311, + "grad_norm": 5.708126544952393, + "learning_rate": 9.935687062694702e-06, + "loss": 0.8678428649902343, + "memory(GiB)": 29.49, + "step": 2120, + "token_acc": 0.7893436838390967, + "train_speed(iter/s)": 0.096256 + }, + { + "epoch": 0.09882026047858071, + "grad_norm": 5.069558620452881, + "learning_rate": 9.935070903160436e-06, + "loss": 0.9081151962280274, + "memory(GiB)": 29.49, + "step": 2125, + "token_acc": 0.7773660205245154, + "train_speed(iter/s)": 0.096378 + }, + { + "epoch": 0.09905277873853031, + "grad_norm": 3.8125367164611816, + "learning_rate": 9.934451825366375e-06, + "loss": 0.846406364440918, + "memory(GiB)": 29.49, + "step": 2130, + "token_acc": 0.7957292506043513, + "train_speed(iter/s)": 0.096505 + }, + { + "epoch": 0.0992852969984799, + "grad_norm": 5.074999809265137, + "learning_rate": 9.933829829678603e-06, + "loss": 0.9698441505432129, + "memory(GiB)": 29.49, + "step": 2135, + "token_acc": 0.7694581280788177, + "train_speed(iter/s)": 0.096632 + }, + { + "epoch": 0.09951781525842951, + "grad_norm": 6.1027021408081055, + "learning_rate": 9.933204916464922e-06, + "loss": 0.8034770965576172, + "memory(GiB)": 29.49, + "step": 2140, + "token_acc": 0.7895142636854279, + "train_speed(iter/s)": 0.096757 + }, + { + "epoch": 0.09975033351837911, + "grad_norm": 6.004554748535156, + "learning_rate": 9.932577086094866e-06, + "loss": 0.8347911834716797, + "memory(GiB)": 29.49, + "step": 2145, + "token_acc": 0.7937138728323699, + "train_speed(iter/s)": 0.096885 + }, + { + "epoch": 0.09998285177832872, + "grad_norm": 5.652978897094727, + "learning_rate": 9.931946338939688e-06, + "loss": 0.8349695205688477, + "memory(GiB)": 29.49, + "step": 2150, + "token_acc": 0.8160493827160494, + "train_speed(iter/s)": 0.097009 + }, + { + "epoch": 0.09998285177832872, + "eval_loss": 0.7777920365333557, + "eval_runtime": 291.8039, + "eval_samples_per_second": 11.909, + "eval_steps_per_second": 11.909, + "step": 2150 + }, + { + "epoch": 0.10021537003827832, + "grad_norm": 5.332450866699219, + "learning_rate": 9.931312675372368e-06, + "loss": 0.7858267784118652, + "memory(GiB)": 29.49, + "step": 2155, + "token_acc": 0.7848584531899057, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.10044788829822791, + "grad_norm": 5.089801788330078, + "learning_rate": 9.930676095767612e-06, + "loss": 0.8572509765625, + "memory(GiB)": 29.49, + "step": 2160, + "token_acc": 0.7963917525773195, + "train_speed(iter/s)": 0.095999 + }, + { + "epoch": 0.10068040655817752, + "grad_norm": 4.407390594482422, + "learning_rate": 9.93003660050185e-06, + "loss": 0.8731700897216796, + "memory(GiB)": 29.49, + "step": 2165, + "token_acc": 0.7905614881157423, + "train_speed(iter/s)": 0.096124 + }, + { + "epoch": 0.10091292481812712, + "grad_norm": 4.490730285644531, + "learning_rate": 9.929394189953232e-06, + "loss": 0.8779221534729004, + "memory(GiB)": 29.49, + "step": 2170, + "token_acc": 0.7994467496542186, + "train_speed(iter/s)": 0.09625 + }, + { + "epoch": 0.10114544307807673, + "grad_norm": 4.566867828369141, + "learning_rate": 9.92874886450164e-06, + "loss": 0.8151620864868164, + "memory(GiB)": 29.49, + "step": 2175, + "token_acc": 0.809959721713658, + "train_speed(iter/s)": 0.096375 + }, + { + "epoch": 0.10137796133802633, + "grad_norm": 4.89373254776001, + "learning_rate": 9.92810062452867e-06, + "loss": 0.9058709144592285, + "memory(GiB)": 29.49, + "step": 2180, + "token_acc": 0.7809948032665182, + "train_speed(iter/s)": 0.096497 + }, + { + "epoch": 0.10161047959797592, + "grad_norm": 5.802387714385986, + "learning_rate": 9.92744947041765e-06, + "loss": 0.8258605003356934, + "memory(GiB)": 29.49, + "step": 2185, + "token_acc": 0.7867144252686421, + "train_speed(iter/s)": 0.09662 + }, + { + "epoch": 0.10184299785792553, + "grad_norm": 4.026666641235352, + "learning_rate": 9.926795402553624e-06, + "loss": 0.7584074974060059, + "memory(GiB)": 29.49, + "step": 2190, + "token_acc": 0.8099924299772899, + "train_speed(iter/s)": 0.096746 + }, + { + "epoch": 0.10207551611787513, + "grad_norm": 4.953476428985596, + "learning_rate": 9.926138421323365e-06, + "loss": 0.9181832313537598, + "memory(GiB)": 29.49, + "step": 2195, + "token_acc": 0.7740170940170941, + "train_speed(iter/s)": 0.09687 + }, + { + "epoch": 0.10230803437782474, + "grad_norm": 3.769212245941162, + "learning_rate": 9.925478527115369e-06, + "loss": 0.901462173461914, + "memory(GiB)": 29.49, + "step": 2200, + "token_acc": 0.7800376647834275, + "train_speed(iter/s)": 0.096994 + }, + { + "epoch": 0.10230803437782474, + "eval_loss": 0.7704504132270813, + "eval_runtime": 292.4552, + "eval_samples_per_second": 11.882, + "eval_steps_per_second": 11.882, + "step": 2200 + }, + { + "epoch": 0.10254055263777434, + "grad_norm": 4.067014694213867, + "learning_rate": 9.924815720319845e-06, + "loss": 0.876547908782959, + "memory(GiB)": 29.49, + "step": 2205, + "token_acc": 0.7853665544045586, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.10277307089772393, + "grad_norm": 4.733416557312012, + "learning_rate": 9.924150001328736e-06, + "loss": 0.8199963569641113, + "memory(GiB)": 29.49, + "step": 2210, + "token_acc": 0.7988871224165341, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.10300558915767354, + "grad_norm": 4.051647663116455, + "learning_rate": 9.923481370535702e-06, + "loss": 0.8695549011230469, + "memory(GiB)": 29.49, + "step": 2215, + "token_acc": 0.7721365971592626, + "train_speed(iter/s)": 0.096119 + }, + { + "epoch": 0.10323810741762314, + "grad_norm": 4.613498687744141, + "learning_rate": 9.922809828336122e-06, + "loss": 0.8630804061889649, + "memory(GiB)": 29.49, + "step": 2220, + "token_acc": 0.7895302975977053, + "train_speed(iter/s)": 0.09624 + }, + { + "epoch": 0.10347062567757274, + "grad_norm": 5.266293525695801, + "learning_rate": 9.922135375127103e-06, + "loss": 0.8668428421020508, + "memory(GiB)": 29.49, + "step": 2225, + "token_acc": 0.7758620689655172, + "train_speed(iter/s)": 0.096363 + }, + { + "epoch": 0.10370314393752235, + "grad_norm": 4.555314540863037, + "learning_rate": 9.921458011307468e-06, + "loss": 0.9019486427307128, + "memory(GiB)": 29.49, + "step": 2230, + "token_acc": 0.776064610866373, + "train_speed(iter/s)": 0.096484 + }, + { + "epoch": 0.10393566219747194, + "grad_norm": 4.664549350738525, + "learning_rate": 9.920777737277764e-06, + "loss": 0.8585104942321777, + "memory(GiB)": 29.49, + "step": 2235, + "token_acc": 0.7947368421052632, + "train_speed(iter/s)": 0.096605 + }, + { + "epoch": 0.10416818045742154, + "grad_norm": 3.9962921142578125, + "learning_rate": 9.920094553440257e-06, + "loss": 0.966301441192627, + "memory(GiB)": 29.49, + "step": 2240, + "token_acc": 0.7736486486486487, + "train_speed(iter/s)": 0.096721 + }, + { + "epoch": 0.10440069871737115, + "grad_norm": 5.359710216522217, + "learning_rate": 9.919408460198937e-06, + "loss": 0.8347146034240722, + "memory(GiB)": 29.49, + "step": 2245, + "token_acc": 0.7948028673835126, + "train_speed(iter/s)": 0.096841 + }, + { + "epoch": 0.10463321697732075, + "grad_norm": 4.495904445648193, + "learning_rate": 9.918719457959509e-06, + "loss": 0.9961088180541993, + "memory(GiB)": 29.49, + "step": 2250, + "token_acc": 0.7679214402618658, + "train_speed(iter/s)": 0.096961 + }, + { + "epoch": 0.10463321697732075, + "eval_loss": 0.771256685256958, + "eval_runtime": 291.6294, + "eval_samples_per_second": 11.916, + "eval_steps_per_second": 11.916, + "step": 2250 + }, + { + "epoch": 0.10486573523727036, + "grad_norm": 4.535305500030518, + "learning_rate": 9.918027547129405e-06, + "loss": 0.7924888134002686, + "memory(GiB)": 29.49, + "step": 2255, + "token_acc": 0.7862682171411193, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.10509825349721995, + "grad_norm": 4.197555065155029, + "learning_rate": 9.91733272811777e-06, + "loss": 0.8547355651855468, + "memory(GiB)": 29.49, + "step": 2260, + "token_acc": 0.7923462986198243, + "train_speed(iter/s)": 0.095994 + }, + { + "epoch": 0.10533077175716955, + "grad_norm": 4.672827243804932, + "learning_rate": 9.916635001335473e-06, + "loss": 0.8752650260925293, + "memory(GiB)": 29.49, + "step": 2265, + "token_acc": 0.7856060606060606, + "train_speed(iter/s)": 0.096114 + }, + { + "epoch": 0.10556329001711916, + "grad_norm": 5.364903926849365, + "learning_rate": 9.9159343671951e-06, + "loss": 0.8720294952392578, + "memory(GiB)": 29.49, + "step": 2270, + "token_acc": 0.7919115105426893, + "train_speed(iter/s)": 0.096231 + }, + { + "epoch": 0.10579580827706876, + "grad_norm": 4.762035846710205, + "learning_rate": 9.915230826110962e-06, + "loss": 0.8258625984191894, + "memory(GiB)": 29.49, + "step": 2275, + "token_acc": 0.8075097108329736, + "train_speed(iter/s)": 0.096349 + }, + { + "epoch": 0.10602832653701837, + "grad_norm": 4.820277690887451, + "learning_rate": 9.91452437849908e-06, + "loss": 0.8725085258483887, + "memory(GiB)": 29.49, + "step": 2280, + "token_acc": 0.7912274736484189, + "train_speed(iter/s)": 0.096465 + }, + { + "epoch": 0.10626084479696796, + "grad_norm": 4.2000555992126465, + "learning_rate": 9.9138150247772e-06, + "loss": 0.8382984161376953, + "memory(GiB)": 29.49, + "step": 2285, + "token_acc": 0.811981234211476, + "train_speed(iter/s)": 0.096584 + }, + { + "epoch": 0.10649336305691756, + "grad_norm": 4.270804405212402, + "learning_rate": 9.913102765364786e-06, + "loss": 0.871574592590332, + "memory(GiB)": 29.49, + "step": 2290, + "token_acc": 0.7906079125120618, + "train_speed(iter/s)": 0.096702 + }, + { + "epoch": 0.10672588131686717, + "grad_norm": 4.381374359130859, + "learning_rate": 9.912387600683016e-06, + "loss": 0.8795125007629394, + "memory(GiB)": 29.49, + "step": 2295, + "token_acc": 0.7884333821376281, + "train_speed(iter/s)": 0.096816 + }, + { + "epoch": 0.10695839957681677, + "grad_norm": 5.970802307128906, + "learning_rate": 9.91166953115479e-06, + "loss": 0.9401533126831054, + "memory(GiB)": 29.49, + "step": 2300, + "token_acc": 0.7745098039215687, + "train_speed(iter/s)": 0.096934 + }, + { + "epoch": 0.10695839957681677, + "eval_loss": 0.7655821442604065, + "eval_runtime": 291.9701, + "eval_samples_per_second": 11.902, + "eval_steps_per_second": 11.902, + "step": 2300 + }, + { + "epoch": 0.10719091783676638, + "grad_norm": 6.104898929595947, + "learning_rate": 9.910948557204727e-06, + "loss": 0.8890548706054687, + "memory(GiB)": 29.49, + "step": 2305, + "token_acc": 0.7863885505481121, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.10742343609671597, + "grad_norm": 3.8734943866729736, + "learning_rate": 9.910224679259159e-06, + "loss": 0.892047119140625, + "memory(GiB)": 29.49, + "step": 2310, + "token_acc": 0.787044220325834, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.10765595435666557, + "grad_norm": 5.029067039489746, + "learning_rate": 9.909497897746139e-06, + "loss": 0.9151164054870605, + "memory(GiB)": 29.49, + "step": 2315, + "token_acc": 0.7836443032949583, + "train_speed(iter/s)": 0.096095 + }, + { + "epoch": 0.10788847261661517, + "grad_norm": 3.318230628967285, + "learning_rate": 9.908768213095432e-06, + "loss": 0.8855794906616211, + "memory(GiB)": 29.49, + "step": 2320, + "token_acc": 0.785736726358781, + "train_speed(iter/s)": 0.096209 + }, + { + "epoch": 0.10812099087656478, + "grad_norm": 5.197476863861084, + "learning_rate": 9.908035625738525e-06, + "loss": 0.9641876220703125, + "memory(GiB)": 29.49, + "step": 2325, + "token_acc": 0.7662420382165606, + "train_speed(iter/s)": 0.096324 + }, + { + "epoch": 0.10835350913651438, + "grad_norm": 5.319194793701172, + "learning_rate": 9.907300136108622e-06, + "loss": 0.9029970169067383, + "memory(GiB)": 29.49, + "step": 2330, + "token_acc": 0.7849344978165939, + "train_speed(iter/s)": 0.096438 + }, + { + "epoch": 0.10858602739646397, + "grad_norm": 4.619373321533203, + "learning_rate": 9.906561744640638e-06, + "loss": 0.7368559837341309, + "memory(GiB)": 29.49, + "step": 2335, + "token_acc": 0.8242205151378219, + "train_speed(iter/s)": 0.096555 + }, + { + "epoch": 0.10881854565641358, + "grad_norm": 9.945032119750977, + "learning_rate": 9.905820451771206e-06, + "loss": 0.8624940872192383, + "memory(GiB)": 29.49, + "step": 2340, + "token_acc": 0.8019303399076794, + "train_speed(iter/s)": 0.096672 + }, + { + "epoch": 0.10905106391636318, + "grad_norm": 4.2016730308532715, + "learning_rate": 9.905076257938677e-06, + "loss": 0.8212512969970703, + "memory(GiB)": 29.49, + "step": 2345, + "token_acc": 0.794137022397892, + "train_speed(iter/s)": 0.09679 + }, + { + "epoch": 0.10928358217631279, + "grad_norm": 3.608508348464966, + "learning_rate": 9.904329163583115e-06, + "loss": 0.890407943725586, + "memory(GiB)": 29.49, + "step": 2350, + "token_acc": 0.7811355311355311, + "train_speed(iter/s)": 0.096907 + }, + { + "epoch": 0.10928358217631279, + "eval_loss": 0.768470823764801, + "eval_runtime": 290.2717, + "eval_samples_per_second": 11.972, + "eval_steps_per_second": 11.972, + "step": 2350 + }, + { + "epoch": 0.10951610043626238, + "grad_norm": 5.3882646560668945, + "learning_rate": 9.903579169146302e-06, + "loss": 0.8198015213012695, + "memory(GiB)": 29.49, + "step": 2355, + "token_acc": 0.7874883825273211, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.10974861869621198, + "grad_norm": 4.694791793823242, + "learning_rate": 9.90282627507173e-06, + "loss": 0.9625364303588867, + "memory(GiB)": 29.49, + "step": 2360, + "token_acc": 0.771689497716895, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.10998113695616159, + "grad_norm": 4.6519694328308105, + "learning_rate": 9.902070481804608e-06, + "loss": 0.9932247161865234, + "memory(GiB)": 29.49, + "step": 2365, + "token_acc": 0.7187100076785258, + "train_speed(iter/s)": 0.0961 + }, + { + "epoch": 0.11021365521611119, + "grad_norm": 8.811151504516602, + "learning_rate": 9.901311789791863e-06, + "loss": 0.9222234725952149, + "memory(GiB)": 29.49, + "step": 2370, + "token_acc": 0.7835302293259208, + "train_speed(iter/s)": 0.09621 + }, + { + "epoch": 0.1104461734760608, + "grad_norm": 5.532461643218994, + "learning_rate": 9.900550199482132e-06, + "loss": 0.8486194610595703, + "memory(GiB)": 29.49, + "step": 2375, + "token_acc": 0.7865168539325843, + "train_speed(iter/s)": 0.096321 + }, + { + "epoch": 0.11067869173601039, + "grad_norm": 5.290581226348877, + "learning_rate": 9.899785711325767e-06, + "loss": 0.7777493000030518, + "memory(GiB)": 29.49, + "step": 2380, + "token_acc": 0.8072541966426858, + "train_speed(iter/s)": 0.096434 + }, + { + "epoch": 0.11091120999595999, + "grad_norm": 5.493096828460693, + "learning_rate": 9.89901832577483e-06, + "loss": 0.8348598480224609, + "memory(GiB)": 29.49, + "step": 2385, + "token_acc": 0.8035310095065641, + "train_speed(iter/s)": 0.096549 + }, + { + "epoch": 0.1111437282559096, + "grad_norm": 6.49524450302124, + "learning_rate": 9.898248043283105e-06, + "loss": 0.9553499221801758, + "memory(GiB)": 29.49, + "step": 2390, + "token_acc": 0.7736389684813754, + "train_speed(iter/s)": 0.096661 + }, + { + "epoch": 0.1113762465158592, + "grad_norm": 4.445330619812012, + "learning_rate": 9.897474864306082e-06, + "loss": 0.8675954818725586, + "memory(GiB)": 29.49, + "step": 2395, + "token_acc": 0.7960128159487362, + "train_speed(iter/s)": 0.096775 + }, + { + "epoch": 0.1116087647758088, + "grad_norm": 4.415380477905273, + "learning_rate": 9.896698789300963e-06, + "loss": 0.7713698387145996, + "memory(GiB)": 29.49, + "step": 2400, + "token_acc": 0.8173153296266878, + "train_speed(iter/s)": 0.096889 + }, + { + "epoch": 0.1116087647758088, + "eval_loss": 0.7590782642364502, + "eval_runtime": 293.8879, + "eval_samples_per_second": 11.824, + "eval_steps_per_second": 11.824, + "step": 2400 + }, + { + "epoch": 0.1118412830357584, + "grad_norm": 5.605898857116699, + "learning_rate": 9.89591981872667e-06, + "loss": 0.8945033073425293, + "memory(GiB)": 29.49, + "step": 2405, + "token_acc": 0.7878308273982239, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.112073801295708, + "grad_norm": 4.756052017211914, + "learning_rate": 9.895137953043826e-06, + "loss": 0.8087597846984863, + "memory(GiB)": 29.49, + "step": 2410, + "token_acc": 0.7922782386726228, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.1123063195556576, + "grad_norm": 5.700892925262451, + "learning_rate": 9.894353192714779e-06, + "loss": 0.8685206413269043, + "memory(GiB)": 29.49, + "step": 2415, + "token_acc": 0.786042944785276, + "train_speed(iter/s)": 0.096088 + }, + { + "epoch": 0.11253883781560721, + "grad_norm": 4.865542888641357, + "learning_rate": 9.893565538203575e-06, + "loss": 0.8942924499511719, + "memory(GiB)": 29.49, + "step": 2420, + "token_acc": 0.7881873727087576, + "train_speed(iter/s)": 0.096202 + }, + { + "epoch": 0.11277135607555681, + "grad_norm": 4.80360746383667, + "learning_rate": 9.892774989975986e-06, + "loss": 0.8122398376464843, + "memory(GiB)": 29.49, + "step": 2425, + "token_acc": 0.8057724957555178, + "train_speed(iter/s)": 0.096315 + }, + { + "epoch": 0.1130038743355064, + "grad_norm": 4.724724292755127, + "learning_rate": 9.891981548499483e-06, + "loss": 0.7987810611724854, + "memory(GiB)": 29.49, + "step": 2430, + "token_acc": 0.797164667393675, + "train_speed(iter/s)": 0.09643 + }, + { + "epoch": 0.11323639259545601, + "grad_norm": 4.790870666503906, + "learning_rate": 9.891185214243254e-06, + "loss": 0.7731681823730469, + "memory(GiB)": 29.49, + "step": 2435, + "token_acc": 0.8126709206927986, + "train_speed(iter/s)": 0.096547 + }, + { + "epoch": 0.11346891085540561, + "grad_norm": 5.879446506500244, + "learning_rate": 9.890385987678192e-06, + "loss": 0.9109359741210937, + "memory(GiB)": 29.49, + "step": 2440, + "token_acc": 0.7865546218487395, + "train_speed(iter/s)": 0.096661 + }, + { + "epoch": 0.11370142911535522, + "grad_norm": 4.92235803604126, + "learning_rate": 9.889583869276911e-06, + "loss": 0.8439332962036132, + "memory(GiB)": 29.49, + "step": 2445, + "token_acc": 0.7941460276616276, + "train_speed(iter/s)": 0.096777 + }, + { + "epoch": 0.11393394737530482, + "grad_norm": 5.367386341094971, + "learning_rate": 9.888778859513723e-06, + "loss": 0.8719941139221191, + "memory(GiB)": 29.49, + "step": 2450, + "token_acc": 0.7949034114262228, + "train_speed(iter/s)": 0.096891 + }, + { + "epoch": 0.11393394737530482, + "eval_loss": 0.7564442753791809, + "eval_runtime": 290.7043, + "eval_samples_per_second": 11.954, + "eval_steps_per_second": 11.954, + "step": 2450 + }, + { + "epoch": 0.11416646563525441, + "grad_norm": 4.416979789733887, + "learning_rate": 9.88797095886466e-06, + "loss": 0.8568400382995606, + "memory(GiB)": 29.49, + "step": 2455, + "token_acc": 0.788566903689721, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.11439898389520402, + "grad_norm": 4.938545227050781, + "learning_rate": 9.887160167807452e-06, + "loss": 0.7519895553588867, + "memory(GiB)": 29.49, + "step": 2460, + "token_acc": 0.8103386809269162, + "train_speed(iter/s)": 0.096007 + }, + { + "epoch": 0.11463150215515362, + "grad_norm": 4.431781768798828, + "learning_rate": 9.88634648682155e-06, + "loss": 0.8250004768371582, + "memory(GiB)": 29.49, + "step": 2465, + "token_acc": 0.7998715065852875, + "train_speed(iter/s)": 0.096116 + }, + { + "epoch": 0.11486402041510323, + "grad_norm": 6.4470086097717285, + "learning_rate": 9.885529916388108e-06, + "loss": 0.8673094749450684, + "memory(GiB)": 29.49, + "step": 2470, + "token_acc": 0.7979130434782609, + "train_speed(iter/s)": 0.096226 + }, + { + "epoch": 0.11509653867505283, + "grad_norm": 5.069904804229736, + "learning_rate": 9.884710456989987e-06, + "loss": 0.8118146896362305, + "memory(GiB)": 29.49, + "step": 2475, + "token_acc": 0.7955861070911722, + "train_speed(iter/s)": 0.096335 + }, + { + "epoch": 0.11532905693500242, + "grad_norm": 4.7304911613464355, + "learning_rate": 9.883888109111763e-06, + "loss": 0.8498652458190918, + "memory(GiB)": 29.49, + "step": 2480, + "token_acc": 0.7961630695443646, + "train_speed(iter/s)": 0.096445 + }, + { + "epoch": 0.11556157519495203, + "grad_norm": 5.705682754516602, + "learning_rate": 9.883062873239711e-06, + "loss": 0.8524469375610352, + "memory(GiB)": 29.49, + "step": 2485, + "token_acc": 0.7909624008278717, + "train_speed(iter/s)": 0.096555 + }, + { + "epoch": 0.11579409345490163, + "grad_norm": 5.161575794219971, + "learning_rate": 9.88223474986182e-06, + "loss": 0.8959260940551758, + "memory(GiB)": 29.49, + "step": 2490, + "token_acc": 0.783974862529458, + "train_speed(iter/s)": 0.096666 + }, + { + "epoch": 0.11602661171485124, + "grad_norm": 4.681116580963135, + "learning_rate": 9.881403739467788e-06, + "loss": 0.8818140029907227, + "memory(GiB)": 29.49, + "step": 2495, + "token_acc": 0.7958260869565218, + "train_speed(iter/s)": 0.096774 + }, + { + "epoch": 0.11625912997480084, + "grad_norm": 4.936962127685547, + "learning_rate": 9.88056984254901e-06, + "loss": 0.8515759468078613, + "memory(GiB)": 29.49, + "step": 2500, + "token_acc": 0.7895465559016961, + "train_speed(iter/s)": 0.096882 + }, + { + "epoch": 0.11625912997480084, + "eval_loss": 0.7618740797042847, + "eval_runtime": 288.9198, + "eval_samples_per_second": 12.028, + "eval_steps_per_second": 12.028, + "step": 2500 + }, + { + "epoch": 0.11649164823475043, + "grad_norm": 5.392852306365967, + "learning_rate": 9.879733059598602e-06, + "loss": 0.7953616619110108, + "memory(GiB)": 29.49, + "step": 2505, + "token_acc": 0.789321072698728, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.11672416649470004, + "grad_norm": 4.226475238800049, + "learning_rate": 9.878893391111377e-06, + "loss": 1.033120346069336, + "memory(GiB)": 29.49, + "step": 2510, + "token_acc": 0.7546312478154491, + "train_speed(iter/s)": 0.096024 + }, + { + "epoch": 0.11695668475464964, + "grad_norm": 4.294797897338867, + "learning_rate": 9.878050837583857e-06, + "loss": 0.8275615692138671, + "memory(GiB)": 29.49, + "step": 2515, + "token_acc": 0.7989915966386555, + "train_speed(iter/s)": 0.096131 + }, + { + "epoch": 0.11718920301459924, + "grad_norm": 5.027287006378174, + "learning_rate": 9.87720539951427e-06, + "loss": 0.8524178504943848, + "memory(GiB)": 29.49, + "step": 2520, + "token_acc": 0.809153713298791, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.11742172127454885, + "grad_norm": 4.703906536102295, + "learning_rate": 9.876357077402548e-06, + "loss": 0.9237653732299804, + "memory(GiB)": 29.49, + "step": 2525, + "token_acc": 0.7695299837925446, + "train_speed(iter/s)": 0.096347 + }, + { + "epoch": 0.11765423953449844, + "grad_norm": 5.149855136871338, + "learning_rate": 9.875505871750332e-06, + "loss": 0.8868477821350098, + "memory(GiB)": 29.49, + "step": 2530, + "token_acc": 0.7888123226591002, + "train_speed(iter/s)": 0.096455 + }, + { + "epoch": 0.11788675779444804, + "grad_norm": 4.148962020874023, + "learning_rate": 9.874651783060965e-06, + "loss": 0.8953413009643555, + "memory(GiB)": 29.49, + "step": 2535, + "token_acc": 0.787868038311458, + "train_speed(iter/s)": 0.096564 + }, + { + "epoch": 0.11811927605439765, + "grad_norm": 4.259050369262695, + "learning_rate": 9.873794811839496e-06, + "loss": 0.7834689617156982, + "memory(GiB)": 29.49, + "step": 2540, + "token_acc": 0.8124745209947004, + "train_speed(iter/s)": 0.096673 + }, + { + "epoch": 0.11835179431434725, + "grad_norm": 5.244549751281738, + "learning_rate": 9.872934958592682e-06, + "loss": 0.7544142723083496, + "memory(GiB)": 29.49, + "step": 2545, + "token_acc": 0.8242142025611175, + "train_speed(iter/s)": 0.096783 + }, + { + "epoch": 0.11858431257429684, + "grad_norm": 4.926893711090088, + "learning_rate": 9.872072223828976e-06, + "loss": 0.8376446723937988, + "memory(GiB)": 29.49, + "step": 2550, + "token_acc": 0.7962895598399418, + "train_speed(iter/s)": 0.09689 + }, + { + "epoch": 0.11858431257429684, + "eval_loss": 0.7521718144416809, + "eval_runtime": 291.5566, + "eval_samples_per_second": 11.919, + "eval_steps_per_second": 11.919, + "step": 2550 + }, + { + "epoch": 0.11881683083424645, + "grad_norm": 5.548832893371582, + "learning_rate": 9.871206608058542e-06, + "loss": 0.851633358001709, + "memory(GiB)": 29.49, + "step": 2555, + "token_acc": 0.789743054942763, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.11904934909419605, + "grad_norm": 4.430542469024658, + "learning_rate": 9.870338111793245e-06, + "loss": 0.8358804702758789, + "memory(GiB)": 29.49, + "step": 2560, + "token_acc": 0.7926465717124875, + "train_speed(iter/s)": 0.096038 + }, + { + "epoch": 0.11928186735414566, + "grad_norm": 5.821200370788574, + "learning_rate": 9.869466735546655e-06, + "loss": 0.7679703235626221, + "memory(GiB)": 29.49, + "step": 2565, + "token_acc": 0.8165555945282357, + "train_speed(iter/s)": 0.096144 + }, + { + "epoch": 0.11951438561409526, + "grad_norm": 4.216818809509277, + "learning_rate": 9.86859247983404e-06, + "loss": 0.7578719139099122, + "memory(GiB)": 29.49, + "step": 2570, + "token_acc": 0.8305369127516778, + "train_speed(iter/s)": 0.09625 + }, + { + "epoch": 0.11974690387404485, + "grad_norm": 4.801919937133789, + "learning_rate": 9.867715345172378e-06, + "loss": 0.8901889801025391, + "memory(GiB)": 29.49, + "step": 2575, + "token_acc": 0.7774348422496571, + "train_speed(iter/s)": 0.096356 + }, + { + "epoch": 0.11997942213399446, + "grad_norm": 6.391806125640869, + "learning_rate": 9.866835332080345e-06, + "loss": 0.8922554969787597, + "memory(GiB)": 29.49, + "step": 2580, + "token_acc": 0.7786946736684172, + "train_speed(iter/s)": 0.096461 + }, + { + "epoch": 0.12021194039394406, + "grad_norm": 4.466254234313965, + "learning_rate": 9.86595244107832e-06, + "loss": 0.901337718963623, + "memory(GiB)": 29.49, + "step": 2585, + "token_acc": 0.7783475783475784, + "train_speed(iter/s)": 0.096565 + }, + { + "epoch": 0.12044445865389367, + "grad_norm": 5.171731472015381, + "learning_rate": 9.865066672688381e-06, + "loss": 0.8802291870117187, + "memory(GiB)": 29.49, + "step": 2590, + "token_acc": 0.8067769261799113, + "train_speed(iter/s)": 0.096671 + }, + { + "epoch": 0.12067697691384327, + "grad_norm": 4.936208248138428, + "learning_rate": 9.864178027434312e-06, + "loss": 0.7710800647735596, + "memory(GiB)": 29.49, + "step": 2595, + "token_acc": 0.8116624411445128, + "train_speed(iter/s)": 0.096779 + }, + { + "epoch": 0.12090949517379286, + "grad_norm": 4.140257835388184, + "learning_rate": 9.863286505841599e-06, + "loss": 0.902885627746582, + "memory(GiB)": 29.49, + "step": 2600, + "token_acc": 0.7825848849945235, + "train_speed(iter/s)": 0.096884 + }, + { + "epoch": 0.12090949517379286, + "eval_loss": 0.750512421131134, + "eval_runtime": 290.6496, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 11.956, + "step": 2600 + }, + { + "epoch": 0.12114201343374247, + "grad_norm": 5.622610092163086, + "learning_rate": 9.862392108437423e-06, + "loss": 0.8916511535644531, + "memory(GiB)": 29.49, + "step": 2605, + "token_acc": 0.7897618549904324, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.12137453169369207, + "grad_norm": 4.701568126678467, + "learning_rate": 9.861494835750669e-06, + "loss": 0.9207223892211914, + "memory(GiB)": 29.49, + "step": 2610, + "token_acc": 0.782055262340888, + "train_speed(iter/s)": 0.09605 + }, + { + "epoch": 0.12160704995364167, + "grad_norm": 4.3934431076049805, + "learning_rate": 9.860594688311924e-06, + "loss": 0.820650863647461, + "memory(GiB)": 29.49, + "step": 2615, + "token_acc": 0.801851217003771, + "train_speed(iter/s)": 0.096152 + }, + { + "epoch": 0.12183956821359128, + "grad_norm": 5.942111968994141, + "learning_rate": 9.859691666653471e-06, + "loss": 0.975086784362793, + "memory(GiB)": 29.49, + "step": 2620, + "token_acc": 0.7700453857791225, + "train_speed(iter/s)": 0.096256 + }, + { + "epoch": 0.12207208647354087, + "grad_norm": 5.694136142730713, + "learning_rate": 9.858785771309296e-06, + "loss": 0.8277214050292969, + "memory(GiB)": 29.49, + "step": 2625, + "token_acc": 0.7854190154077415, + "train_speed(iter/s)": 0.09636 + }, + { + "epoch": 0.12230460473349047, + "grad_norm": 4.6577630043029785, + "learning_rate": 9.857877002815081e-06, + "loss": 0.8744843482971192, + "memory(GiB)": 29.49, + "step": 2630, + "token_acc": 0.7825768667642753, + "train_speed(iter/s)": 0.096464 + }, + { + "epoch": 0.12253712299344008, + "grad_norm": 3.98728084564209, + "learning_rate": 9.856965361708213e-06, + "loss": 0.8487506866455078, + "memory(GiB)": 29.49, + "step": 2635, + "token_acc": 0.7898970398970399, + "train_speed(iter/s)": 0.096568 + }, + { + "epoch": 0.12276964125338968, + "grad_norm": 4.995297908782959, + "learning_rate": 9.856050848527768e-06, + "loss": 0.7177443027496337, + "memory(GiB)": 29.49, + "step": 2640, + "token_acc": 0.8281311734492296, + "train_speed(iter/s)": 0.096671 + }, + { + "epoch": 0.12300215951333929, + "grad_norm": 6.051422595977783, + "learning_rate": 9.855133463814529e-06, + "loss": 0.9563394546508789, + "memory(GiB)": 29.49, + "step": 2645, + "token_acc": 0.7816429170159263, + "train_speed(iter/s)": 0.096774 + }, + { + "epoch": 0.12323467777328888, + "grad_norm": 5.628708362579346, + "learning_rate": 9.854213208110974e-06, + "loss": 0.8455151557922364, + "memory(GiB)": 29.49, + "step": 2650, + "token_acc": 0.7891130567186905, + "train_speed(iter/s)": 0.096878 + }, + { + "epoch": 0.12323467777328888, + "eval_loss": 0.7496427297592163, + "eval_runtime": 289.7636, + "eval_samples_per_second": 11.993, + "eval_steps_per_second": 11.993, + "step": 2650 + }, + { + "epoch": 0.12346719603323848, + "grad_norm": 3.999803066253662, + "learning_rate": 9.853290081961278e-06, + "loss": 0.8988096237182617, + "memory(GiB)": 29.49, + "step": 2655, + "token_acc": 0.7904755841344624, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.12369971429318809, + "grad_norm": 4.903356552124023, + "learning_rate": 9.852364085911313e-06, + "loss": 0.8998661041259766, + "memory(GiB)": 29.49, + "step": 2660, + "token_acc": 0.781635581061693, + "train_speed(iter/s)": 0.096066 + }, + { + "epoch": 0.12393223255313769, + "grad_norm": 5.046064853668213, + "learning_rate": 9.851435220508652e-06, + "loss": 0.8154894828796386, + "memory(GiB)": 29.49, + "step": 2665, + "token_acc": 0.814495254529767, + "train_speed(iter/s)": 0.096168 + }, + { + "epoch": 0.1241647508130873, + "grad_norm": 4.473296165466309, + "learning_rate": 9.850503486302559e-06, + "loss": 0.8603778839111328, + "memory(GiB)": 29.49, + "step": 2670, + "token_acc": 0.7875688434303698, + "train_speed(iter/s)": 0.096271 + }, + { + "epoch": 0.12439726907303689, + "grad_norm": 5.46212911605835, + "learning_rate": 9.849568883843997e-06, + "loss": 0.7482133865356445, + "memory(GiB)": 29.49, + "step": 2675, + "token_acc": 0.8121958202118523, + "train_speed(iter/s)": 0.096369 + }, + { + "epoch": 0.12462978733298649, + "grad_norm": 5.4718170166015625, + "learning_rate": 9.848631413685627e-06, + "loss": 0.7595831871032714, + "memory(GiB)": 29.49, + "step": 2680, + "token_acc": 0.8065456902138691, + "train_speed(iter/s)": 0.09647 + }, + { + "epoch": 0.1248623055929361, + "grad_norm": 4.871596336364746, + "learning_rate": 9.847691076381803e-06, + "loss": 0.8095902442932129, + "memory(GiB)": 29.49, + "step": 2685, + "token_acc": 0.8086928934010152, + "train_speed(iter/s)": 0.096571 + }, + { + "epoch": 0.1250948238528857, + "grad_norm": 4.615569591522217, + "learning_rate": 9.846747872488578e-06, + "loss": 0.7411964416503907, + "memory(GiB)": 29.49, + "step": 2690, + "token_acc": 0.8041069100391134, + "train_speed(iter/s)": 0.09667 + }, + { + "epoch": 0.1253273421128353, + "grad_norm": 4.434457302093506, + "learning_rate": 9.845801802563693e-06, + "loss": 0.9427834510803222, + "memory(GiB)": 29.49, + "step": 2695, + "token_acc": 0.7910832719233604, + "train_speed(iter/s)": 0.096768 + }, + { + "epoch": 0.1255598603727849, + "grad_norm": 5.101200103759766, + "learning_rate": 9.844852867166592e-06, + "loss": 0.7691882133483887, + "memory(GiB)": 29.49, + "step": 2700, + "token_acc": 0.8168316831683168, + "train_speed(iter/s)": 0.096871 + }, + { + "epoch": 0.1255598603727849, + "eval_loss": 0.7434446215629578, + "eval_runtime": 290.5786, + "eval_samples_per_second": 11.959, + "eval_steps_per_second": 11.959, + "step": 2700 + }, + { + "epoch": 0.12579237863273451, + "grad_norm": 5.41193151473999, + "learning_rate": 9.843901066858408e-06, + "loss": 0.75772123336792, + "memory(GiB)": 29.49, + "step": 2705, + "token_acc": 0.792011822502696, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.1260248968926841, + "grad_norm": 5.032259464263916, + "learning_rate": 9.842946402201971e-06, + "loss": 0.7770239353179932, + "memory(GiB)": 29.49, + "step": 2710, + "token_acc": 0.8157417482771128, + "train_speed(iter/s)": 0.09607 + }, + { + "epoch": 0.1262574151526337, + "grad_norm": 5.005000591278076, + "learning_rate": 9.841988873761804e-06, + "loss": 0.9538334846496582, + "memory(GiB)": 29.49, + "step": 2715, + "token_acc": 0.7771830043492807, + "train_speed(iter/s)": 0.096169 + }, + { + "epoch": 0.1264899334125833, + "grad_norm": 4.789558410644531, + "learning_rate": 9.84102848210412e-06, + "loss": 0.8359936714172364, + "memory(GiB)": 29.49, + "step": 2720, + "token_acc": 0.7862092862092862, + "train_speed(iter/s)": 0.096268 + }, + { + "epoch": 0.1267224516725329, + "grad_norm": 3.9694697856903076, + "learning_rate": 9.840065227796833e-06, + "loss": 0.866541576385498, + "memory(GiB)": 29.49, + "step": 2725, + "token_acc": 0.8002589834898025, + "train_speed(iter/s)": 0.096367 + }, + { + "epoch": 0.1269549699324825, + "grad_norm": 4.770729064941406, + "learning_rate": 9.839099111409543e-06, + "loss": 0.7865410804748535, + "memory(GiB)": 29.49, + "step": 2730, + "token_acc": 0.8031796502384738, + "train_speed(iter/s)": 0.096462 + }, + { + "epoch": 0.1271874881924321, + "grad_norm": 5.078505516052246, + "learning_rate": 9.838130133513543e-06, + "loss": 0.8374693870544434, + "memory(GiB)": 29.49, + "step": 2735, + "token_acc": 0.7991054789414834, + "train_speed(iter/s)": 0.09656 + }, + { + "epoch": 0.12742000645238172, + "grad_norm": 5.44411039352417, + "learning_rate": 9.83715829468182e-06, + "loss": 0.7986952304840088, + "memory(GiB)": 29.49, + "step": 2740, + "token_acc": 0.801953125, + "train_speed(iter/s)": 0.096659 + }, + { + "epoch": 0.12765252471233132, + "grad_norm": 4.840211391448975, + "learning_rate": 9.836183595489054e-06, + "loss": 0.8615546226501465, + "memory(GiB)": 29.49, + "step": 2745, + "token_acc": 0.7914564413050791, + "train_speed(iter/s)": 0.09676 + }, + { + "epoch": 0.12788504297228093, + "grad_norm": 4.512420177459717, + "learning_rate": 9.835206036511613e-06, + "loss": 0.8429337501525879, + "memory(GiB)": 29.49, + "step": 2750, + "token_acc": 0.7959511568123393, + "train_speed(iter/s)": 0.09686 + }, + { + "epoch": 0.12788504297228093, + "eval_loss": 0.754675567150116, + "eval_runtime": 289.7348, + "eval_samples_per_second": 11.994, + "eval_steps_per_second": 11.994, + "step": 2750 + }, + { + "epoch": 0.12811756123223053, + "grad_norm": 5.223292350769043, + "learning_rate": 9.834225618327558e-06, + "loss": 0.8188864707946777, + "memory(GiB)": 29.49, + "step": 2755, + "token_acc": 0.7912945004465994, + "train_speed(iter/s)": 0.095978 + }, + { + "epoch": 0.1283500794921801, + "grad_norm": 4.913177967071533, + "learning_rate": 9.833242341516643e-06, + "loss": 0.8221258163452149, + "memory(GiB)": 29.49, + "step": 2760, + "token_acc": 0.803325079589671, + "train_speed(iter/s)": 0.096077 + }, + { + "epoch": 0.1285825977521297, + "grad_norm": 6.017866134643555, + "learning_rate": 9.832256206660305e-06, + "loss": 1.0436551094055175, + "memory(GiB)": 29.49, + "step": 2765, + "token_acc": 0.7476943346508564, + "train_speed(iter/s)": 0.096174 + }, + { + "epoch": 0.12881511601207932, + "grad_norm": 4.889126777648926, + "learning_rate": 9.83126721434168e-06, + "loss": 0.8034382820129394, + "memory(GiB)": 29.49, + "step": 2770, + "token_acc": 0.8095238095238095, + "train_speed(iter/s)": 0.096272 + }, + { + "epoch": 0.12904763427202892, + "grad_norm": 4.911211967468262, + "learning_rate": 9.83027536514559e-06, + "loss": 0.7517318725585938, + "memory(GiB)": 29.49, + "step": 2775, + "token_acc": 0.8236705317872851, + "train_speed(iter/s)": 0.09637 + }, + { + "epoch": 0.12928015253197853, + "grad_norm": 5.036534786224365, + "learning_rate": 9.829280659658544e-06, + "loss": 0.8068610191345215, + "memory(GiB)": 29.49, + "step": 2780, + "token_acc": 0.7897165458141068, + "train_speed(iter/s)": 0.096469 + }, + { + "epoch": 0.12951267079192813, + "grad_norm": 5.624305725097656, + "learning_rate": 9.828283098468741e-06, + "loss": 0.8407914161682128, + "memory(GiB)": 29.49, + "step": 2785, + "token_acc": 0.7917938284164123, + "train_speed(iter/s)": 0.096567 + }, + { + "epoch": 0.12974518905187774, + "grad_norm": 5.084197521209717, + "learning_rate": 9.827282682166074e-06, + "loss": 0.8023724555969238, + "memory(GiB)": 29.49, + "step": 2790, + "token_acc": 0.7984674329501916, + "train_speed(iter/s)": 0.096663 + }, + { + "epoch": 0.12997770731182734, + "grad_norm": 5.027306079864502, + "learning_rate": 9.826279411342117e-06, + "loss": 0.9611904144287109, + "memory(GiB)": 29.49, + "step": 2795, + "token_acc": 0.7777015437392796, + "train_speed(iter/s)": 0.096757 + }, + { + "epoch": 0.13021022557177694, + "grad_norm": 4.208934307098389, + "learning_rate": 9.825273286590133e-06, + "loss": 0.8066798210144043, + "memory(GiB)": 29.49, + "step": 2800, + "token_acc": 0.8011676938047356, + "train_speed(iter/s)": 0.096853 + }, + { + "epoch": 0.13021022557177694, + "eval_loss": 0.7422595620155334, + "eval_runtime": 289.8504, + "eval_samples_per_second": 11.989, + "eval_steps_per_second": 11.989, + "step": 2800 + }, + { + "epoch": 0.13044274383172655, + "grad_norm": 6.204680919647217, + "learning_rate": 9.82426430850508e-06, + "loss": 0.8447407722473145, + "memory(GiB)": 29.49, + "step": 2805, + "token_acc": 0.7921122457184209, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.13067526209167613, + "grad_norm": 6.124194622039795, + "learning_rate": 9.823252477683594e-06, + "loss": 0.8883560180664063, + "memory(GiB)": 29.49, + "step": 2810, + "token_acc": 0.7881856540084389, + "train_speed(iter/s)": 0.096085 + }, + { + "epoch": 0.13090778035162573, + "grad_norm": 5.090604782104492, + "learning_rate": 9.822237794724003e-06, + "loss": 0.9355738639831543, + "memory(GiB)": 29.49, + "step": 2815, + "token_acc": 0.7807560137457045, + "train_speed(iter/s)": 0.09618 + }, + { + "epoch": 0.13114029861157533, + "grad_norm": 4.79983377456665, + "learning_rate": 9.821220260226319e-06, + "loss": 0.9572502136230469, + "memory(GiB)": 29.49, + "step": 2820, + "token_acc": 0.785137861466039, + "train_speed(iter/s)": 0.096277 + }, + { + "epoch": 0.13137281687152494, + "grad_norm": 4.211757659912109, + "learning_rate": 9.820199874792245e-06, + "loss": 0.8392532348632813, + "memory(GiB)": 29.49, + "step": 2825, + "token_acc": 0.7980360065466449, + "train_speed(iter/s)": 0.096372 + }, + { + "epoch": 0.13160533513147454, + "grad_norm": 6.155999660491943, + "learning_rate": 9.819176639025162e-06, + "loss": 0.8681906700134278, + "memory(GiB)": 29.49, + "step": 2830, + "token_acc": 0.7908745247148289, + "train_speed(iter/s)": 0.096468 + }, + { + "epoch": 0.13183785339142415, + "grad_norm": 5.698853492736816, + "learning_rate": 9.818150553530144e-06, + "loss": 0.803080940246582, + "memory(GiB)": 29.49, + "step": 2835, + "token_acc": 0.8136070853462157, + "train_speed(iter/s)": 0.096563 + }, + { + "epoch": 0.13207037165137375, + "grad_norm": 4.489797115325928, + "learning_rate": 9.817121618913948e-06, + "loss": 0.8084547042846679, + "memory(GiB)": 29.49, + "step": 2840, + "token_acc": 0.804380664652568, + "train_speed(iter/s)": 0.096661 + }, + { + "epoch": 0.13230288991132336, + "grad_norm": 5.172839164733887, + "learning_rate": 9.81608983578501e-06, + "loss": 0.6797237873077393, + "memory(GiB)": 29.49, + "step": 2845, + "token_acc": 0.8252279635258358, + "train_speed(iter/s)": 0.096756 + }, + { + "epoch": 0.13253540817127296, + "grad_norm": 4.8296942710876465, + "learning_rate": 9.815055204753458e-06, + "loss": 0.7829336166381836, + "memory(GiB)": 29.49, + "step": 2850, + "token_acc": 0.801798800799467, + "train_speed(iter/s)": 0.096851 + }, + { + "epoch": 0.13253540817127296, + "eval_loss": 0.7429930567741394, + "eval_runtime": 288.7556, + "eval_samples_per_second": 12.034, + "eval_steps_per_second": 12.034, + "step": 2850 + }, + { + "epoch": 0.13276792643122254, + "grad_norm": 5.827232360839844, + "learning_rate": 9.814017726431105e-06, + "loss": 0.8832127571105957, + "memory(GiB)": 29.49, + "step": 2855, + "token_acc": 0.7922617522736006, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.13300044469117214, + "grad_norm": 3.6636886596679688, + "learning_rate": 9.812977401431437e-06, + "loss": 0.8548738479614257, + "memory(GiB)": 29.49, + "step": 2860, + "token_acc": 0.7891721569750788, + "train_speed(iter/s)": 0.096096 + }, + { + "epoch": 0.13323296295112175, + "grad_norm": 4.969047546386719, + "learning_rate": 9.811934230369636e-06, + "loss": 0.76055908203125, + "memory(GiB)": 29.49, + "step": 2865, + "token_acc": 0.8173374613003096, + "train_speed(iter/s)": 0.096191 + }, + { + "epoch": 0.13346548121107135, + "grad_norm": 2.693516492843628, + "learning_rate": 9.810888213862556e-06, + "loss": 0.8962690353393554, + "memory(GiB)": 33.07, + "step": 2870, + "token_acc": 0.751769587503051, + "train_speed(iter/s)": 0.096278 + }, + { + "epoch": 0.13369799947102096, + "grad_norm": 5.445571422576904, + "learning_rate": 9.809839352528743e-06, + "loss": 0.8073612213134765, + "memory(GiB)": 33.07, + "step": 2875, + "token_acc": 0.8058076225045372, + "train_speed(iter/s)": 0.096372 + }, + { + "epoch": 0.13393051773097056, + "grad_norm": 4.459766387939453, + "learning_rate": 9.808787646988422e-06, + "loss": 0.8136503219604492, + "memory(GiB)": 33.07, + "step": 2880, + "token_acc": 0.8049738219895288, + "train_speed(iter/s)": 0.096466 + }, + { + "epoch": 0.13416303599092017, + "grad_norm": 5.122330665588379, + "learning_rate": 9.807733097863494e-06, + "loss": 0.8207123756408692, + "memory(GiB)": 33.07, + "step": 2885, + "token_acc": 0.8027571580063627, + "train_speed(iter/s)": 0.09656 + }, + { + "epoch": 0.13439555425086977, + "grad_norm": 2.956819772720337, + "learning_rate": 9.80667570577755e-06, + "loss": 0.9861713409423828, + "memory(GiB)": 33.07, + "step": 2890, + "token_acc": 0.7597343797162692, + "train_speed(iter/s)": 0.09665 + }, + { + "epoch": 0.13462807251081937, + "grad_norm": 4.836545467376709, + "learning_rate": 9.805615471355859e-06, + "loss": 0.7751515865325928, + "memory(GiB)": 33.07, + "step": 2895, + "token_acc": 0.8123145400593472, + "train_speed(iter/s)": 0.096744 + }, + { + "epoch": 0.13486059077076898, + "grad_norm": 4.383913993835449, + "learning_rate": 9.804552395225368e-06, + "loss": 0.8893208503723145, + "memory(GiB)": 33.07, + "step": 2900, + "token_acc": 0.7776507573592455, + "train_speed(iter/s)": 0.096838 + }, + { + "epoch": 0.13486059077076898, + "eval_loss": 0.7408275008201599, + "eval_runtime": 290.8079, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 11.949, + "step": 2900 + }, + { + "epoch": 0.13509310903071856, + "grad_norm": 5.118293762207031, + "learning_rate": 9.80348647801471e-06, + "loss": 0.8451316833496094, + "memory(GiB)": 33.07, + "step": 2905, + "token_acc": 0.7927192454344772, + "train_speed(iter/s)": 0.095998 + }, + { + "epoch": 0.13532562729066816, + "grad_norm": 4.629569053649902, + "learning_rate": 9.802417720354189e-06, + "loss": 0.8460866928100585, + "memory(GiB)": 33.07, + "step": 2910, + "token_acc": 0.8077192982456141, + "train_speed(iter/s)": 0.096086 + }, + { + "epoch": 0.13555814555061776, + "grad_norm": 5.260866641998291, + "learning_rate": 9.801346122875801e-06, + "loss": 0.75780029296875, + "memory(GiB)": 33.07, + "step": 2915, + "token_acc": 0.8127731092436975, + "train_speed(iter/s)": 0.096177 + }, + { + "epoch": 0.13579066381056737, + "grad_norm": 4.477482318878174, + "learning_rate": 9.800271686213213e-06, + "loss": 0.9403352737426758, + "memory(GiB)": 33.07, + "step": 2920, + "token_acc": 0.773038605230386, + "train_speed(iter/s)": 0.096264 + }, + { + "epoch": 0.13602318207051697, + "grad_norm": 4.261521816253662, + "learning_rate": 9.799194411001768e-06, + "loss": 0.8434426307678222, + "memory(GiB)": 33.07, + "step": 2925, + "token_acc": 0.7681672025723473, + "train_speed(iter/s)": 0.096355 + }, + { + "epoch": 0.13625570033046658, + "grad_norm": 3.634920358657837, + "learning_rate": 9.798114297878496e-06, + "loss": 0.8066039085388184, + "memory(GiB)": 33.07, + "step": 2930, + "token_acc": 0.797752808988764, + "train_speed(iter/s)": 0.096447 + }, + { + "epoch": 0.13648821859041618, + "grad_norm": 5.318452835083008, + "learning_rate": 9.797031347482101e-06, + "loss": 0.8009425163269043, + "memory(GiB)": 33.07, + "step": 2935, + "token_acc": 0.8004246284501062, + "train_speed(iter/s)": 0.096537 + }, + { + "epoch": 0.1367207368503658, + "grad_norm": 6.3095316886901855, + "learning_rate": 9.795945560452967e-06, + "loss": 0.8986371040344239, + "memory(GiB)": 33.07, + "step": 2940, + "token_acc": 0.7920758550626481, + "train_speed(iter/s)": 0.096628 + }, + { + "epoch": 0.1369532551103154, + "grad_norm": 3.810253381729126, + "learning_rate": 9.794856937433148e-06, + "loss": 0.8121430397033691, + "memory(GiB)": 33.07, + "step": 2945, + "token_acc": 0.8095394736842105, + "train_speed(iter/s)": 0.096717 + }, + { + "epoch": 0.137185773370265, + "grad_norm": 4.707873344421387, + "learning_rate": 9.793765479066385e-06, + "loss": 0.8329290390014649, + "memory(GiB)": 33.07, + "step": 2950, + "token_acc": 0.7846309403437816, + "train_speed(iter/s)": 0.096809 + }, + { + "epoch": 0.137185773370265, + "eval_loss": 0.7358382940292358, + "eval_runtime": 292.6301, + "eval_samples_per_second": 11.875, + "eval_steps_per_second": 11.875, + "step": 2950 + }, + { + "epoch": 0.13741829163021457, + "grad_norm": 5.086911201477051, + "learning_rate": 9.79267118599809e-06, + "loss": 0.8006107330322265, + "memory(GiB)": 33.07, + "step": 2955, + "token_acc": 0.7935293080510988, + "train_speed(iter/s)": 0.09598 + }, + { + "epoch": 0.13765080989016418, + "grad_norm": 4.827671527862549, + "learning_rate": 9.791574058875351e-06, + "loss": 0.777289867401123, + "memory(GiB)": 33.07, + "step": 2960, + "token_acc": 0.8047665687234737, + "train_speed(iter/s)": 0.09607 + }, + { + "epoch": 0.13788332815011378, + "grad_norm": 5.354944705963135, + "learning_rate": 9.790474098346933e-06, + "loss": 0.7382638931274415, + "memory(GiB)": 33.07, + "step": 2965, + "token_acc": 0.820254110612855, + "train_speed(iter/s)": 0.096161 + }, + { + "epoch": 0.1381158464100634, + "grad_norm": 5.039259910583496, + "learning_rate": 9.78937130506328e-06, + "loss": 0.8652180671691895, + "memory(GiB)": 33.07, + "step": 2970, + "token_acc": 0.7841680129240711, + "train_speed(iter/s)": 0.096251 + }, + { + "epoch": 0.138348364670013, + "grad_norm": 5.581478118896484, + "learning_rate": 9.788265679676503e-06, + "loss": 0.7484108448028565, + "memory(GiB)": 33.07, + "step": 2975, + "token_acc": 0.8159857904085257, + "train_speed(iter/s)": 0.096344 + }, + { + "epoch": 0.1385808829299626, + "grad_norm": 4.969134330749512, + "learning_rate": 9.787157222840395e-06, + "loss": 0.8082466125488281, + "memory(GiB)": 33.07, + "step": 2980, + "token_acc": 0.8080651415277239, + "train_speed(iter/s)": 0.096435 + }, + { + "epoch": 0.1388134011899122, + "grad_norm": 4.701774597167969, + "learning_rate": 9.786045935210423e-06, + "loss": 0.894073486328125, + "memory(GiB)": 33.07, + "step": 2985, + "token_acc": 0.7753647777400746, + "train_speed(iter/s)": 0.096526 + }, + { + "epoch": 0.1390459194498618, + "grad_norm": 5.563474655151367, + "learning_rate": 9.78493181744372e-06, + "loss": 0.7509230136871338, + "memory(GiB)": 33.07, + "step": 2990, + "token_acc": 0.819632881085395, + "train_speed(iter/s)": 0.096619 + }, + { + "epoch": 0.1392784377098114, + "grad_norm": 6.414600849151611, + "learning_rate": 9.783814870199101e-06, + "loss": 0.8086988449096679, + "memory(GiB)": 33.07, + "step": 2995, + "token_acc": 0.8048202291584354, + "train_speed(iter/s)": 0.09671 + }, + { + "epoch": 0.139510955969761, + "grad_norm": 4.5219597816467285, + "learning_rate": 9.782695094137056e-06, + "loss": 0.7585030555725097, + "memory(GiB)": 33.07, + "step": 3000, + "token_acc": 0.8189102564102564, + "train_speed(iter/s)": 0.096802 + }, + { + "epoch": 0.139510955969761, + "eval_loss": 0.7317752838134766, + "eval_runtime": 290.2766, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 11.971, + "step": 3000 + }, + { + "epoch": 0.1397434742297106, + "grad_norm": 4.116754055023193, + "learning_rate": 9.781572489919735e-06, + "loss": 0.8825703620910644, + "memory(GiB)": 33.07, + "step": 3005, + "token_acc": 0.7937784522003035, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.1399759924896602, + "grad_norm": 5.839756488800049, + "learning_rate": 9.780447058210973e-06, + "loss": 0.8361115455627441, + "memory(GiB)": 33.07, + "step": 3010, + "token_acc": 0.7739352020385876, + "train_speed(iter/s)": 0.096081 + }, + { + "epoch": 0.1402085107496098, + "grad_norm": 4.884430885314941, + "learning_rate": 9.779318799676274e-06, + "loss": 0.7522700786590576, + "memory(GiB)": 33.07, + "step": 3015, + "token_acc": 0.8246367239101717, + "train_speed(iter/s)": 0.096168 + }, + { + "epoch": 0.1404410290095594, + "grad_norm": 4.92686128616333, + "learning_rate": 9.778187714982808e-06, + "loss": 0.7651845455169678, + "memory(GiB)": 33.07, + "step": 3020, + "token_acc": 0.7955761683910096, + "train_speed(iter/s)": 0.096256 + }, + { + "epoch": 0.140673547269509, + "grad_norm": 4.128361225128174, + "learning_rate": 9.777053804799423e-06, + "loss": 0.8679119110107422, + "memory(GiB)": 33.07, + "step": 3025, + "token_acc": 0.78975487115022, + "train_speed(iter/s)": 0.096345 + }, + { + "epoch": 0.1409060655294586, + "grad_norm": 5.358954429626465, + "learning_rate": 9.775917069796635e-06, + "loss": 0.8796347618103028, + "memory(GiB)": 33.07, + "step": 3030, + "token_acc": 0.7975945017182131, + "train_speed(iter/s)": 0.096434 + }, + { + "epoch": 0.14113858378940822, + "grad_norm": 4.029304027557373, + "learning_rate": 9.77477751064663e-06, + "loss": 0.7879987239837647, + "memory(GiB)": 33.07, + "step": 3035, + "token_acc": 0.8097361575622445, + "train_speed(iter/s)": 0.096523 + }, + { + "epoch": 0.14137110204935782, + "grad_norm": 5.047082424163818, + "learning_rate": 9.773635128023263e-06, + "loss": 0.7463294506072998, + "memory(GiB)": 33.07, + "step": 3040, + "token_acc": 0.815464587394412, + "train_speed(iter/s)": 0.09661 + }, + { + "epoch": 0.14160362030930743, + "grad_norm": 4.665285587310791, + "learning_rate": 9.772489922602064e-06, + "loss": 0.7370441913604736, + "memory(GiB)": 33.07, + "step": 3045, + "token_acc": 0.8179551122194514, + "train_speed(iter/s)": 0.096699 + }, + { + "epoch": 0.141836138569257, + "grad_norm": 3.9613771438598633, + "learning_rate": 9.771341895060223e-06, + "loss": 0.8843655586242676, + "memory(GiB)": 33.07, + "step": 3050, + "token_acc": 0.77269800386349, + "train_speed(iter/s)": 0.096789 + }, + { + "epoch": 0.141836138569257, + "eval_loss": 0.7323087453842163, + "eval_runtime": 291.6112, + "eval_samples_per_second": 11.917, + "eval_steps_per_second": 11.917, + "step": 3050 + }, + { + "epoch": 0.1420686568292066, + "grad_norm": 4.098989963531494, + "learning_rate": 9.770191046076609e-06, + "loss": 0.8098397254943848, + "memory(GiB)": 33.07, + "step": 3055, + "token_acc": 0.7947033102312605, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.1423011750891562, + "grad_norm": 5.723684787750244, + "learning_rate": 9.769037376331752e-06, + "loss": 0.7382633686065674, + "memory(GiB)": 33.07, + "step": 3060, + "token_acc": 0.8312332951508209, + "train_speed(iter/s)": 0.096076 + }, + { + "epoch": 0.14253369334910582, + "grad_norm": 4.521707057952881, + "learning_rate": 9.767880886507853e-06, + "loss": 0.9055806159973144, + "memory(GiB)": 33.07, + "step": 3065, + "token_acc": 0.7789291882556131, + "train_speed(iter/s)": 0.096164 + }, + { + "epoch": 0.14276621160905542, + "grad_norm": 5.3605732917785645, + "learning_rate": 9.766721577288781e-06, + "loss": 0.822453784942627, + "memory(GiB)": 33.07, + "step": 3070, + "token_acc": 0.7985028072364317, + "train_speed(iter/s)": 0.096251 + }, + { + "epoch": 0.14299872986900503, + "grad_norm": 5.691252708435059, + "learning_rate": 9.76555944936007e-06, + "loss": 0.9621199607849121, + "memory(GiB)": 33.07, + "step": 3075, + "token_acc": 0.7724851143842055, + "train_speed(iter/s)": 0.096336 + }, + { + "epoch": 0.14323124812895463, + "grad_norm": 6.443828105926514, + "learning_rate": 9.764394503408922e-06, + "loss": 0.8782764434814453, + "memory(GiB)": 33.07, + "step": 3080, + "token_acc": 0.8062077198567449, + "train_speed(iter/s)": 0.096424 + }, + { + "epoch": 0.14346376638890423, + "grad_norm": 5.440097808837891, + "learning_rate": 9.763226740124209e-06, + "loss": 0.7793534278869629, + "memory(GiB)": 33.07, + "step": 3085, + "token_acc": 0.8125247720967103, + "train_speed(iter/s)": 0.096513 + }, + { + "epoch": 0.14369628464885384, + "grad_norm": 6.047162055969238, + "learning_rate": 9.76205616019646e-06, + "loss": 0.7774827480316162, + "memory(GiB)": 33.07, + "step": 3090, + "token_acc": 0.8274209012464045, + "train_speed(iter/s)": 0.096601 + }, + { + "epoch": 0.14392880290880344, + "grad_norm": 4.667301654815674, + "learning_rate": 9.760882764317879e-06, + "loss": 0.8556358337402343, + "memory(GiB)": 33.07, + "step": 3095, + "token_acc": 0.7904670505438259, + "train_speed(iter/s)": 0.096685 + }, + { + "epoch": 0.14416132116875302, + "grad_norm": 4.598082065582275, + "learning_rate": 9.75970655318233e-06, + "loss": 0.7539079189300537, + "memory(GiB)": 33.07, + "step": 3100, + "token_acc": 0.8225, + "train_speed(iter/s)": 0.096774 + }, + { + "epoch": 0.14416132116875302, + "eval_loss": 0.7284711003303528, + "eval_runtime": 293.8493, + "eval_samples_per_second": 11.826, + "eval_steps_per_second": 11.826, + "step": 3100 + }, + { + "epoch": 0.14439383942870263, + "grad_norm": 5.339951992034912, + "learning_rate": 9.758527527485342e-06, + "loss": 0.7430771350860595, + "memory(GiB)": 33.07, + "step": 3105, + "token_acc": 0.7953026327403449, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.14462635768865223, + "grad_norm": 5.519880294799805, + "learning_rate": 9.757345687924112e-06, + "loss": 0.8814563751220703, + "memory(GiB)": 33.07, + "step": 3110, + "token_acc": 0.801297371116422, + "train_speed(iter/s)": 0.096069 + }, + { + "epoch": 0.14485887594860183, + "grad_norm": 4.690358638763428, + "learning_rate": 9.756161035197495e-06, + "loss": 0.8213804244995118, + "memory(GiB)": 33.07, + "step": 3115, + "token_acc": 0.7810784630287375, + "train_speed(iter/s)": 0.096154 + }, + { + "epoch": 0.14509139420855144, + "grad_norm": 5.900088787078857, + "learning_rate": 9.754973570006014e-06, + "loss": 0.7102549076080322, + "memory(GiB)": 33.07, + "step": 3120, + "token_acc": 0.8243793997776955, + "train_speed(iter/s)": 0.09624 + }, + { + "epoch": 0.14532391246850104, + "grad_norm": 5.579226970672607, + "learning_rate": 9.753783293051854e-06, + "loss": 0.6997756481170654, + "memory(GiB)": 33.07, + "step": 3125, + "token_acc": 0.818785578747628, + "train_speed(iter/s)": 0.096327 + }, + { + "epoch": 0.14555643072845065, + "grad_norm": 4.501831531524658, + "learning_rate": 9.752590205038863e-06, + "loss": 0.9300002098083496, + "memory(GiB)": 33.07, + "step": 3130, + "token_acc": 0.7713385826771654, + "train_speed(iter/s)": 0.096412 + }, + { + "epoch": 0.14578894898840025, + "grad_norm": 5.4228034019470215, + "learning_rate": 9.75139430667255e-06, + "loss": 0.7476221561431885, + "memory(GiB)": 33.07, + "step": 3135, + "token_acc": 0.8206948076204706, + "train_speed(iter/s)": 0.096499 + }, + { + "epoch": 0.14602146724834986, + "grad_norm": 5.298704624176025, + "learning_rate": 9.750195598660088e-06, + "loss": 0.7928246021270752, + "memory(GiB)": 33.07, + "step": 3140, + "token_acc": 0.8, + "train_speed(iter/s)": 0.096581 + }, + { + "epoch": 0.14625398550829946, + "grad_norm": 5.123478889465332, + "learning_rate": 9.748994081710308e-06, + "loss": 0.9278718948364257, + "memory(GiB)": 33.07, + "step": 3145, + "token_acc": 0.7674144037780402, + "train_speed(iter/s)": 0.096666 + }, + { + "epoch": 0.14648650376824904, + "grad_norm": 6.589613437652588, + "learning_rate": 9.747789756533706e-06, + "loss": 0.8111718177795411, + "memory(GiB)": 33.07, + "step": 3150, + "token_acc": 0.7971469329529244, + "train_speed(iter/s)": 0.096752 + }, + { + "epoch": 0.14648650376824904, + "eval_loss": 0.7266234159469604, + "eval_runtime": 294.8453, + "eval_samples_per_second": 11.786, + "eval_steps_per_second": 11.786, + "step": 3150 + }, + { + "epoch": 0.14671902202819864, + "grad_norm": 5.1463942527771, + "learning_rate": 9.746582623842434e-06, + "loss": 0.7378671169281006, + "memory(GiB)": 33.07, + "step": 3155, + "token_acc": 0.7954281596541094, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.14695154028814825, + "grad_norm": 4.170019626617432, + "learning_rate": 9.745372684350309e-06, + "loss": 0.8750859260559082, + "memory(GiB)": 33.07, + "step": 3160, + "token_acc": 0.7874066861408634, + "train_speed(iter/s)": 0.096054 + }, + { + "epoch": 0.14718405854809785, + "grad_norm": 5.746112823486328, + "learning_rate": 9.744159938772807e-06, + "loss": 0.7857075691223144, + "memory(GiB)": 33.07, + "step": 3165, + "token_acc": 0.8218029350104822, + "train_speed(iter/s)": 0.096138 + }, + { + "epoch": 0.14741657680804746, + "grad_norm": 5.502606391906738, + "learning_rate": 9.742944387827059e-06, + "loss": 0.8238849639892578, + "memory(GiB)": 33.07, + "step": 3170, + "token_acc": 0.7904761904761904, + "train_speed(iter/s)": 0.096224 + }, + { + "epoch": 0.14764909506799706, + "grad_norm": 5.353701114654541, + "learning_rate": 9.741726032231858e-06, + "loss": 0.832034683227539, + "memory(GiB)": 33.07, + "step": 3175, + "token_acc": 0.8003101977510663, + "train_speed(iter/s)": 0.096309 + }, + { + "epoch": 0.14788161332794666, + "grad_norm": 4.378777980804443, + "learning_rate": 9.740504872707656e-06, + "loss": 0.9383623123168945, + "memory(GiB)": 33.07, + "step": 3180, + "token_acc": 0.7763684913217623, + "train_speed(iter/s)": 0.096394 + }, + { + "epoch": 0.14811413158789627, + "grad_norm": 6.112171173095703, + "learning_rate": 9.739280909976566e-06, + "loss": 0.8390913963317871, + "memory(GiB)": 33.07, + "step": 3185, + "token_acc": 0.8102409638554217, + "train_speed(iter/s)": 0.09648 + }, + { + "epoch": 0.14834664984784587, + "grad_norm": 7.882232666015625, + "learning_rate": 9.738054144762347e-06, + "loss": 0.7900448322296143, + "memory(GiB)": 33.07, + "step": 3190, + "token_acc": 0.807822489657766, + "train_speed(iter/s)": 0.096562 + }, + { + "epoch": 0.14857916810779548, + "grad_norm": 5.705371856689453, + "learning_rate": 9.736824577790428e-06, + "loss": 0.8507783889770508, + "memory(GiB)": 33.07, + "step": 3195, + "token_acc": 0.7648448043184886, + "train_speed(iter/s)": 0.096644 + }, + { + "epoch": 0.14881168636774506, + "grad_norm": 4.271676063537598, + "learning_rate": 9.735592209787893e-06, + "loss": 0.77920503616333, + "memory(GiB)": 33.07, + "step": 3200, + "token_acc": 0.8135593220338984, + "train_speed(iter/s)": 0.096727 + }, + { + "epoch": 0.14881168636774506, + "eval_loss": 0.7255586981773376, + "eval_runtime": 291.2062, + "eval_samples_per_second": 11.933, + "eval_steps_per_second": 11.933, + "step": 3200 + }, + { + "epoch": 0.14904420462769466, + "grad_norm": 5.626006603240967, + "learning_rate": 9.734357041483473e-06, + "loss": 0.8095316886901855, + "memory(GiB)": 33.07, + "step": 3205, + "token_acc": 0.7956126741414885, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.14927672288764426, + "grad_norm": 5.581325531005859, + "learning_rate": 9.733119073607563e-06, + "loss": 0.9211545944213867, + "memory(GiB)": 33.07, + "step": 3210, + "token_acc": 0.7900207900207901, + "train_speed(iter/s)": 0.096046 + }, + { + "epoch": 0.14950924114759387, + "grad_norm": 4.774792671203613, + "learning_rate": 9.731878306892213e-06, + "loss": 0.7863685607910156, + "memory(GiB)": 33.07, + "step": 3215, + "token_acc": 0.8199121522693997, + "train_speed(iter/s)": 0.09613 + }, + { + "epoch": 0.14974175940754347, + "grad_norm": 6.042647838592529, + "learning_rate": 9.730634742071128e-06, + "loss": 0.7859435081481934, + "memory(GiB)": 33.07, + "step": 3220, + "token_acc": 0.8172798677139314, + "train_speed(iter/s)": 0.096213 + }, + { + "epoch": 0.14997427766749308, + "grad_norm": 6.267547607421875, + "learning_rate": 9.729388379879663e-06, + "loss": 0.8451736450195313, + "memory(GiB)": 33.07, + "step": 3225, + "token_acc": 0.7858958068614994, + "train_speed(iter/s)": 0.096294 + }, + { + "epoch": 0.15020679592744268, + "grad_norm": 5.5586347579956055, + "learning_rate": 9.728139221054833e-06, + "loss": 0.8673666000366211, + "memory(GiB)": 33.07, + "step": 3230, + "token_acc": 0.8081487341772152, + "train_speed(iter/s)": 0.096378 + }, + { + "epoch": 0.1504393141873923, + "grad_norm": 4.9623823165893555, + "learning_rate": 9.726887266335302e-06, + "loss": 0.7912930011749267, + "memory(GiB)": 33.07, + "step": 3235, + "token_acc": 0.812361049355269, + "train_speed(iter/s)": 0.096461 + }, + { + "epoch": 0.1506718324473419, + "grad_norm": 3.856882095336914, + "learning_rate": 9.72563251646139e-06, + "loss": 0.7687274932861328, + "memory(GiB)": 33.07, + "step": 3240, + "token_acc": 0.8265987909640471, + "train_speed(iter/s)": 0.096543 + }, + { + "epoch": 0.1509043507072915, + "grad_norm": 6.322175025939941, + "learning_rate": 9.72437497217507e-06, + "loss": 0.8785791397094727, + "memory(GiB)": 33.07, + "step": 3245, + "token_acc": 0.7953172205438066, + "train_speed(iter/s)": 0.096625 + }, + { + "epoch": 0.15113686896724107, + "grad_norm": 5.029903411865234, + "learning_rate": 9.723114634219968e-06, + "loss": 0.7664390563964844, + "memory(GiB)": 33.07, + "step": 3250, + "token_acc": 0.8135489777235276, + "train_speed(iter/s)": 0.096708 + }, + { + "epoch": 0.15113686896724107, + "eval_loss": 0.7299229502677917, + "eval_runtime": 294.3976, + "eval_samples_per_second": 11.804, + "eval_steps_per_second": 11.804, + "step": 3250 + }, + { + "epoch": 0.15136938722719068, + "grad_norm": 4.417317867279053, + "learning_rate": 9.721851503341357e-06, + "loss": 0.8709222793579101, + "memory(GiB)": 33.07, + "step": 3255, + "token_acc": 0.7953129374125175, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.15160190548714028, + "grad_norm": 5.620341777801514, + "learning_rate": 9.72058558028617e-06, + "loss": 0.7596760272979737, + "memory(GiB)": 33.07, + "step": 3260, + "token_acc": 0.8017127799736495, + "train_speed(iter/s)": 0.096033 + }, + { + "epoch": 0.15183442374708989, + "grad_norm": 4.787825107574463, + "learning_rate": 9.719316865802983e-06, + "loss": 0.7746444702148437, + "memory(GiB)": 33.07, + "step": 3265, + "token_acc": 0.8201466615206484, + "train_speed(iter/s)": 0.096116 + }, + { + "epoch": 0.1520669420070395, + "grad_norm": 4.002923965454102, + "learning_rate": 9.718045360642028e-06, + "loss": 0.8768243789672852, + "memory(GiB)": 33.07, + "step": 3270, + "token_acc": 0.778902677988243, + "train_speed(iter/s)": 0.096199 + }, + { + "epoch": 0.1522994602669891, + "grad_norm": 6.7627272605896, + "learning_rate": 9.716771065555184e-06, + "loss": 0.7489274978637696, + "memory(GiB)": 33.07, + "step": 3275, + "token_acc": 0.8207650273224044, + "train_speed(iter/s)": 0.096282 + }, + { + "epoch": 0.1525319785269387, + "grad_norm": 4.953536033630371, + "learning_rate": 9.71549398129598e-06, + "loss": 0.8561309814453125, + "memory(GiB)": 33.07, + "step": 3280, + "token_acc": 0.7854435178165277, + "train_speed(iter/s)": 0.096362 + }, + { + "epoch": 0.1527644967868883, + "grad_norm": 5.244431018829346, + "learning_rate": 9.7142141086196e-06, + "loss": 0.8115564346313476, + "memory(GiB)": 33.07, + "step": 3285, + "token_acc": 0.7993816557883888, + "train_speed(iter/s)": 0.096444 + }, + { + "epoch": 0.1529970150468379, + "grad_norm": 5.603392601013184, + "learning_rate": 9.712931448282864e-06, + "loss": 0.9105894088745117, + "memory(GiB)": 33.07, + "step": 3290, + "token_acc": 0.76864, + "train_speed(iter/s)": 0.096527 + }, + { + "epoch": 0.15322953330678749, + "grad_norm": 5.889342784881592, + "learning_rate": 9.711646001044259e-06, + "loss": 0.7476118087768555, + "memory(GiB)": 33.07, + "step": 3295, + "token_acc": 0.8253218884120171, + "train_speed(iter/s)": 0.09661 + }, + { + "epoch": 0.1534620515667371, + "grad_norm": 5.446425437927246, + "learning_rate": 9.7103577676639e-06, + "loss": 0.8373539924621582, + "memory(GiB)": 33.07, + "step": 3300, + "token_acc": 0.8043965903992822, + "train_speed(iter/s)": 0.096693 + }, + { + "epoch": 0.1534620515667371, + "eval_loss": 0.7242446541786194, + "eval_runtime": 290.5903, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 11.958, + "step": 3300 + }, + { + "epoch": 0.1536945698266867, + "grad_norm": 6.903223991394043, + "learning_rate": 9.709066748903566e-06, + "loss": 0.784368896484375, + "memory(GiB)": 33.07, + "step": 3305, + "token_acc": 0.7966411332970097, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.1539270880866363, + "grad_norm": 4.271733283996582, + "learning_rate": 9.707772945526672e-06, + "loss": 0.8733412742614746, + "memory(GiB)": 33.07, + "step": 3310, + "token_acc": 0.7872619829284307, + "train_speed(iter/s)": 0.096037 + }, + { + "epoch": 0.1541596063465859, + "grad_norm": 5.4446940422058105, + "learning_rate": 9.706476358298286e-06, + "loss": 0.8001057624816894, + "memory(GiB)": 33.07, + "step": 3315, + "token_acc": 0.8016831320892792, + "train_speed(iter/s)": 0.09612 + }, + { + "epoch": 0.1543921246065355, + "grad_norm": 5.717871189117432, + "learning_rate": 9.70517698798512e-06, + "loss": 0.7779604911804199, + "memory(GiB)": 33.07, + "step": 3320, + "token_acc": 0.8100538599640934, + "train_speed(iter/s)": 0.096202 + }, + { + "epoch": 0.1546246428664851, + "grad_norm": 4.912775039672852, + "learning_rate": 9.703874835355533e-06, + "loss": 0.77957763671875, + "memory(GiB)": 33.07, + "step": 3325, + "token_acc": 0.800769442154438, + "train_speed(iter/s)": 0.096284 + }, + { + "epoch": 0.15485716112643472, + "grad_norm": 5.523812294006348, + "learning_rate": 9.702569901179524e-06, + "loss": 0.8424034118652344, + "memory(GiB)": 33.07, + "step": 3330, + "token_acc": 0.7964601769911505, + "train_speed(iter/s)": 0.096364 + }, + { + "epoch": 0.15508967938638432, + "grad_norm": 5.215954303741455, + "learning_rate": 9.701262186228744e-06, + "loss": 0.7897037029266357, + "memory(GiB)": 33.07, + "step": 3335, + "token_acc": 0.8032228075612023, + "train_speed(iter/s)": 0.096438 + }, + { + "epoch": 0.15532219764633393, + "grad_norm": 4.996734142303467, + "learning_rate": 9.699951691276486e-06, + "loss": 0.7758293151855469, + "memory(GiB)": 33.07, + "step": 3340, + "token_acc": 0.8208223311957752, + "train_speed(iter/s)": 0.09652 + }, + { + "epoch": 0.1555547159062835, + "grad_norm": 5.724847793579102, + "learning_rate": 9.698638417097683e-06, + "loss": 0.7793337821960449, + "memory(GiB)": 33.07, + "step": 3345, + "token_acc": 0.8164094232331438, + "train_speed(iter/s)": 0.096602 + }, + { + "epoch": 0.1557872341662331, + "grad_norm": 6.377450466156006, + "learning_rate": 9.697322364468917e-06, + "loss": 0.8371264457702636, + "memory(GiB)": 33.07, + "step": 3350, + "token_acc": 0.7927991528415108, + "train_speed(iter/s)": 0.096683 + }, + { + "epoch": 0.1557872341662331, + "eval_loss": 0.7258186340332031, + "eval_runtime": 292.1213, + "eval_samples_per_second": 11.896, + "eval_steps_per_second": 11.896, + "step": 3350 + }, + { + "epoch": 0.1560197524261827, + "grad_norm": 6.011385440826416, + "learning_rate": 9.69600353416841e-06, + "loss": 0.798521089553833, + "memory(GiB)": 33.07, + "step": 3355, + "token_acc": 0.7971320658088471, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.15625227068613232, + "grad_norm": 6.298033237457275, + "learning_rate": 9.694681926976025e-06, + "loss": 0.7486701488494873, + "memory(GiB)": 33.07, + "step": 3360, + "token_acc": 0.8195459792227779, + "train_speed(iter/s)": 0.096035 + }, + { + "epoch": 0.15648478894608192, + "grad_norm": 6.298468112945557, + "learning_rate": 9.693357543673274e-06, + "loss": 0.8666628837585449, + "memory(GiB)": 33.07, + "step": 3365, + "token_acc": 0.7910060536177573, + "train_speed(iter/s)": 0.096112 + }, + { + "epoch": 0.15671730720603153, + "grad_norm": 5.438319206237793, + "learning_rate": 9.6920303850433e-06, + "loss": 0.6734135627746582, + "memory(GiB)": 33.07, + "step": 3370, + "token_acc": 0.8503155996393147, + "train_speed(iter/s)": 0.096192 + }, + { + "epoch": 0.15694982546598113, + "grad_norm": 5.32026481628418, + "learning_rate": 9.690700451870898e-06, + "loss": 0.8130708694458008, + "memory(GiB)": 33.07, + "step": 3375, + "token_acc": 0.8077761627906976, + "train_speed(iter/s)": 0.096272 + }, + { + "epoch": 0.15718234372593073, + "grad_norm": 5.873499870300293, + "learning_rate": 9.689367744942494e-06, + "loss": 0.7929094314575196, + "memory(GiB)": 33.07, + "step": 3380, + "token_acc": 0.8097868981846882, + "train_speed(iter/s)": 0.096354 + }, + { + "epoch": 0.15741486198588034, + "grad_norm": 6.561089992523193, + "learning_rate": 9.688032265046162e-06, + "loss": 0.6968857765197753, + "memory(GiB)": 33.07, + "step": 3385, + "token_acc": 0.851013672795851, + "train_speed(iter/s)": 0.096434 + }, + { + "epoch": 0.15764738024582994, + "grad_norm": 6.442429065704346, + "learning_rate": 9.686694012971612e-06, + "loss": 0.8454565048217774, + "memory(GiB)": 33.07, + "step": 3390, + "token_acc": 0.7944564434845213, + "train_speed(iter/s)": 0.096514 + }, + { + "epoch": 0.15787989850577952, + "grad_norm": 5.067663669586182, + "learning_rate": 9.685352989510193e-06, + "loss": 0.7534542083740234, + "memory(GiB)": 33.07, + "step": 3395, + "token_acc": 0.8213627992633518, + "train_speed(iter/s)": 0.096595 + }, + { + "epoch": 0.15811241676572912, + "grad_norm": 3.8264877796173096, + "learning_rate": 9.684009195454893e-06, + "loss": 0.9291213989257813, + "memory(GiB)": 33.07, + "step": 3400, + "token_acc": 0.7741023466214306, + "train_speed(iter/s)": 0.096675 + }, + { + "epoch": 0.15811241676572912, + "eval_loss": 0.7275504469871521, + "eval_runtime": 294.5423, + "eval_samples_per_second": 11.798, + "eval_steps_per_second": 11.798, + "step": 3400 + }, + { + "epoch": 0.15834493502567873, + "grad_norm": 6.075207710266113, + "learning_rate": 9.68266263160034e-06, + "loss": 0.7632218360900879, + "memory(GiB)": 33.07, + "step": 3405, + "token_acc": 0.7977882080627532, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.15857745328562833, + "grad_norm": 5.3615336418151855, + "learning_rate": 9.681313298742798e-06, + "loss": 0.8492207527160645, + "memory(GiB)": 33.07, + "step": 3410, + "token_acc": 0.7734967892586107, + "train_speed(iter/s)": 0.09603 + }, + { + "epoch": 0.15880997154557794, + "grad_norm": 4.396504878997803, + "learning_rate": 9.67996119768017e-06, + "loss": 0.7675450325012207, + "memory(GiB)": 33.07, + "step": 3415, + "token_acc": 0.8188755020080322, + "train_speed(iter/s)": 0.096108 + }, + { + "epoch": 0.15904248980552754, + "grad_norm": 5.461509704589844, + "learning_rate": 9.678606329211995e-06, + "loss": 0.8365023612976075, + "memory(GiB)": 33.07, + "step": 3420, + "token_acc": 0.7900235928547354, + "train_speed(iter/s)": 0.096186 + }, + { + "epoch": 0.15927500806547715, + "grad_norm": 6.789258003234863, + "learning_rate": 9.677248694139447e-06, + "loss": 0.86660737991333, + "memory(GiB)": 33.07, + "step": 3425, + "token_acc": 0.7959479015918958, + "train_speed(iter/s)": 0.096262 + }, + { + "epoch": 0.15950752632542675, + "grad_norm": 5.055928707122803, + "learning_rate": 9.675888293265341e-06, + "loss": 0.729840087890625, + "memory(GiB)": 33.07, + "step": 3430, + "token_acc": 0.8189922480620155, + "train_speed(iter/s)": 0.096339 + }, + { + "epoch": 0.15974004458537636, + "grad_norm": 5.088936805725098, + "learning_rate": 9.674525127394122e-06, + "loss": 0.9462801933288574, + "memory(GiB)": 33.07, + "step": 3435, + "token_acc": 0.7601904195180006, + "train_speed(iter/s)": 0.096415 + }, + { + "epoch": 0.15997256284532596, + "grad_norm": 4.846744060516357, + "learning_rate": 9.67315919733187e-06, + "loss": 0.8493914604187012, + "memory(GiB)": 33.07, + "step": 3440, + "token_acc": 0.8003169572107766, + "train_speed(iter/s)": 0.096494 + }, + { + "epoch": 0.16020508110527554, + "grad_norm": 8.210926055908203, + "learning_rate": 9.671790503886304e-06, + "loss": 0.7502418518066406, + "memory(GiB)": 33.07, + "step": 3445, + "token_acc": 0.8145896656534954, + "train_speed(iter/s)": 0.096573 + }, + { + "epoch": 0.16043759936522514, + "grad_norm": 5.043862342834473, + "learning_rate": 9.670419047866776e-06, + "loss": 0.7558164119720459, + "memory(GiB)": 33.07, + "step": 3450, + "token_acc": 0.8108108108108109, + "train_speed(iter/s)": 0.09665 + }, + { + "epoch": 0.16043759936522514, + "eval_loss": 0.7215369939804077, + "eval_runtime": 292.7346, + "eval_samples_per_second": 11.871, + "eval_steps_per_second": 11.871, + "step": 3450 + }, + { + "epoch": 0.16067011762517475, + "grad_norm": 5.069505214691162, + "learning_rate": 9.669044830084266e-06, + "loss": 0.9493141174316406, + "memory(GiB)": 33.07, + "step": 3455, + "token_acc": 0.7972598793194513, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.16090263588512435, + "grad_norm": 5.921719074249268, + "learning_rate": 9.667667851351394e-06, + "loss": 0.8479342460632324, + "memory(GiB)": 33.07, + "step": 3460, + "token_acc": 0.7883467883467884, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.16113515414507396, + "grad_norm": 5.095674991607666, + "learning_rate": 9.666288112482411e-06, + "loss": 0.7614772319793701, + "memory(GiB)": 33.07, + "step": 3465, + "token_acc": 0.8230712166172107, + "train_speed(iter/s)": 0.096097 + }, + { + "epoch": 0.16136767240502356, + "grad_norm": 5.0426530838012695, + "learning_rate": 9.664905614293198e-06, + "loss": 0.7405894279479981, + "memory(GiB)": 33.07, + "step": 3470, + "token_acc": 0.8121739130434783, + "train_speed(iter/s)": 0.096174 + }, + { + "epoch": 0.16160019066497316, + "grad_norm": 4.290180206298828, + "learning_rate": 9.66352035760127e-06, + "loss": 0.8251392364501953, + "memory(GiB)": 33.07, + "step": 3475, + "token_acc": 0.7998174627319744, + "train_speed(iter/s)": 0.09625 + }, + { + "epoch": 0.16183270892492277, + "grad_norm": 6.105207443237305, + "learning_rate": 9.66213234322577e-06, + "loss": 0.9235754013061523, + "memory(GiB)": 33.07, + "step": 3480, + "token_acc": 0.771029555050341, + "train_speed(iter/s)": 0.096327 + }, + { + "epoch": 0.16206522718487237, + "grad_norm": 5.007493019104004, + "learning_rate": 9.660741571987476e-06, + "loss": 0.7650205135345459, + "memory(GiB)": 33.07, + "step": 3485, + "token_acc": 0.8162409454822722, + "train_speed(iter/s)": 0.096405 + }, + { + "epoch": 0.16229774544482195, + "grad_norm": 3.939481496810913, + "learning_rate": 9.659348044708791e-06, + "loss": 0.8588067054748535, + "memory(GiB)": 33.07, + "step": 3490, + "token_acc": 0.7917938284164123, + "train_speed(iter/s)": 0.096482 + }, + { + "epoch": 0.16253026370477155, + "grad_norm": 5.938999176025391, + "learning_rate": 9.657951762213754e-06, + "loss": 0.6494212627410889, + "memory(GiB)": 33.07, + "step": 3495, + "token_acc": 0.836876691148048, + "train_speed(iter/s)": 0.096559 + }, + { + "epoch": 0.16276278196472116, + "grad_norm": 5.342368125915527, + "learning_rate": 9.656552725328028e-06, + "loss": 0.8250043869018555, + "memory(GiB)": 33.07, + "step": 3500, + "token_acc": 0.7960215778826703, + "train_speed(iter/s)": 0.096637 + }, + { + "epoch": 0.16276278196472116, + "eval_loss": 0.7176188826560974, + "eval_runtime": 296.5613, + "eval_samples_per_second": 11.718, + "eval_steps_per_second": 11.718, + "step": 3500 + }, + { + "epoch": 0.16299530022467076, + "grad_norm": 5.9379353523254395, + "learning_rate": 9.655150934878907e-06, + "loss": 0.9059648513793945, + "memory(GiB)": 33.07, + "step": 3505, + "token_acc": 0.7976547804858526, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.16322781848462037, + "grad_norm": 4.100049018859863, + "learning_rate": 9.653746391695314e-06, + "loss": 0.8847969055175782, + "memory(GiB)": 33.07, + "step": 3510, + "token_acc": 0.7722834645669291, + "train_speed(iter/s)": 0.096005 + }, + { + "epoch": 0.16346033674456997, + "grad_norm": 3.792137861251831, + "learning_rate": 9.652339096607796e-06, + "loss": 0.796082878112793, + "memory(GiB)": 33.07, + "step": 3515, + "token_acc": 0.804725959960617, + "train_speed(iter/s)": 0.096082 + }, + { + "epoch": 0.16369285500451958, + "grad_norm": 6.276231288909912, + "learning_rate": 9.650929050448534e-06, + "loss": 0.7875662803649902, + "memory(GiB)": 33.07, + "step": 3520, + "token_acc": 0.7962891379976808, + "train_speed(iter/s)": 0.096161 + }, + { + "epoch": 0.16392537326446918, + "grad_norm": 5.953547477722168, + "learning_rate": 9.649516254051327e-06, + "loss": 0.7139126300811768, + "memory(GiB)": 33.07, + "step": 3525, + "token_acc": 0.8333924140375754, + "train_speed(iter/s)": 0.096242 + }, + { + "epoch": 0.1641578915244188, + "grad_norm": 6.996596336364746, + "learning_rate": 9.648100708251612e-06, + "loss": 0.8537234306335449, + "memory(GiB)": 33.07, + "step": 3530, + "token_acc": 0.7948618139353835, + "train_speed(iter/s)": 0.09632 + }, + { + "epoch": 0.1643904097843684, + "grad_norm": 5.0464396476745605, + "learning_rate": 9.646682413886437e-06, + "loss": 0.7961117267608643, + "memory(GiB)": 33.07, + "step": 3535, + "token_acc": 0.8100498930862438, + "train_speed(iter/s)": 0.096397 + }, + { + "epoch": 0.16462292804431797, + "grad_norm": 6.0903167724609375, + "learning_rate": 9.64526137179449e-06, + "loss": 0.906730842590332, + "memory(GiB)": 33.07, + "step": 3540, + "token_acc": 0.7886123423116264, + "train_speed(iter/s)": 0.096476 + }, + { + "epoch": 0.16485544630426757, + "grad_norm": 7.942210674285889, + "learning_rate": 9.643837582816071e-06, + "loss": 0.8420848846435547, + "memory(GiB)": 33.07, + "step": 3545, + "token_acc": 0.7900228236061297, + "train_speed(iter/s)": 0.096552 + }, + { + "epoch": 0.16508796456421718, + "grad_norm": 5.138957500457764, + "learning_rate": 9.642411047793115e-06, + "loss": 0.7483475685119629, + "memory(GiB)": 33.07, + "step": 3550, + "token_acc": 0.8143631436314364, + "train_speed(iter/s)": 0.096631 + }, + { + "epoch": 0.16508796456421718, + "eval_loss": 0.7186556458473206, + "eval_runtime": 294.7293, + "eval_samples_per_second": 11.79, + "eval_steps_per_second": 11.79, + "step": 3550 + }, + { + "epoch": 0.16532048282416678, + "grad_norm": 6.104626655578613, + "learning_rate": 9.640981767569176e-06, + "loss": 0.8763669967651367, + "memory(GiB)": 33.07, + "step": 3555, + "token_acc": 0.7971994052115554, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.16555300108411639, + "grad_norm": 6.483398914337158, + "learning_rate": 9.63954974298943e-06, + "loss": 0.9383742332458496, + "memory(GiB)": 33.07, + "step": 3560, + "token_acc": 0.7736309731426578, + "train_speed(iter/s)": 0.096013 + }, + { + "epoch": 0.165785519344066, + "grad_norm": 5.247052192687988, + "learning_rate": 9.638114974900675e-06, + "loss": 0.801731014251709, + "memory(GiB)": 33.07, + "step": 3565, + "token_acc": 0.802016129032258, + "train_speed(iter/s)": 0.096088 + }, + { + "epoch": 0.1660180376040156, + "grad_norm": 4.652188301086426, + "learning_rate": 9.636677464151339e-06, + "loss": 0.7853640079498291, + "memory(GiB)": 33.07, + "step": 3570, + "token_acc": 0.813795702977761, + "train_speed(iter/s)": 0.096165 + }, + { + "epoch": 0.1662505558639652, + "grad_norm": 4.874006748199463, + "learning_rate": 9.635237211591461e-06, + "loss": 0.7610373497009277, + "memory(GiB)": 33.07, + "step": 3575, + "token_acc": 0.8124118476727785, + "train_speed(iter/s)": 0.09624 + }, + { + "epoch": 0.1664830741239148, + "grad_norm": 6.398994445800781, + "learning_rate": 9.633794218072711e-06, + "loss": 0.7691407203674316, + "memory(GiB)": 33.07, + "step": 3580, + "token_acc": 0.803083391730904, + "train_speed(iter/s)": 0.096314 + }, + { + "epoch": 0.1667155923838644, + "grad_norm": 5.58176851272583, + "learning_rate": 9.632348484448375e-06, + "loss": 0.7621356964111328, + "memory(GiB)": 33.07, + "step": 3585, + "token_acc": 0.7978406552494416, + "train_speed(iter/s)": 0.096391 + }, + { + "epoch": 0.16694811064381399, + "grad_norm": 4.73793888092041, + "learning_rate": 9.630900011573358e-06, + "loss": 0.8498669624328613, + "memory(GiB)": 33.07, + "step": 3590, + "token_acc": 0.7809806835066865, + "train_speed(iter/s)": 0.096467 + }, + { + "epoch": 0.1671806289037636, + "grad_norm": 5.0031352043151855, + "learning_rate": 9.629448800304189e-06, + "loss": 0.8297422409057618, + "memory(GiB)": 33.07, + "step": 3595, + "token_acc": 0.8004722550177096, + "train_speed(iter/s)": 0.096543 + }, + { + "epoch": 0.1674131471637132, + "grad_norm": 4.638136386871338, + "learning_rate": 9.627994851499012e-06, + "loss": 0.8547920227050781, + "memory(GiB)": 33.07, + "step": 3600, + "token_acc": 0.8039288361749444, + "train_speed(iter/s)": 0.096619 + }, + { + "epoch": 0.1674131471637132, + "eval_loss": 0.7175089716911316, + "eval_runtime": 293.2233, + "eval_samples_per_second": 11.851, + "eval_steps_per_second": 11.851, + "step": 3600 + }, + { + "epoch": 0.1676456654236628, + "grad_norm": 5.295803070068359, + "learning_rate": 9.626538166017594e-06, + "loss": 0.6819862842559814, + "memory(GiB)": 33.07, + "step": 3605, + "token_acc": 0.7997306548454071, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.1678781836836124, + "grad_norm": 6.202095031738281, + "learning_rate": 9.625078744721315e-06, + "loss": 0.9101020812988281, + "memory(GiB)": 33.07, + "step": 3610, + "token_acc": 0.7704337562702862, + "train_speed(iter/s)": 0.096011 + }, + { + "epoch": 0.168110701943562, + "grad_norm": 6.182511329650879, + "learning_rate": 9.623616588473178e-06, + "loss": 0.7675065517425537, + "memory(GiB)": 33.07, + "step": 3615, + "token_acc": 0.8091299019607843, + "train_speed(iter/s)": 0.096086 + }, + { + "epoch": 0.1683432202035116, + "grad_norm": 5.590625762939453, + "learning_rate": 9.622151698137801e-06, + "loss": 0.8312381744384766, + "memory(GiB)": 33.07, + "step": 3620, + "token_acc": 0.8003894839337877, + "train_speed(iter/s)": 0.09616 + }, + { + "epoch": 0.16857573846346122, + "grad_norm": 7.963487148284912, + "learning_rate": 9.62068407458142e-06, + "loss": 0.6981609344482422, + "memory(GiB)": 33.07, + "step": 3625, + "token_acc": 0.8294966835739368, + "train_speed(iter/s)": 0.096236 + }, + { + "epoch": 0.16880825672341082, + "grad_norm": 7.476184368133545, + "learning_rate": 9.619213718671882e-06, + "loss": 0.7124053478240967, + "memory(GiB)": 33.07, + "step": 3630, + "token_acc": 0.8472818311874106, + "train_speed(iter/s)": 0.096309 + }, + { + "epoch": 0.16904077498336043, + "grad_norm": 5.105494499206543, + "learning_rate": 9.617740631278659e-06, + "loss": 0.7903679847717285, + "memory(GiB)": 33.07, + "step": 3635, + "token_acc": 0.8045439358503174, + "train_speed(iter/s)": 0.096384 + }, + { + "epoch": 0.16927329324331, + "grad_norm": 5.298689365386963, + "learning_rate": 9.61626481327283e-06, + "loss": 0.8809099197387695, + "memory(GiB)": 33.07, + "step": 3640, + "token_acc": 0.8095663835493965, + "train_speed(iter/s)": 0.096459 + }, + { + "epoch": 0.1695058115032596, + "grad_norm": 5.424408912658691, + "learning_rate": 9.614786265527092e-06, + "loss": 0.8258963584899902, + "memory(GiB)": 33.07, + "step": 3645, + "token_acc": 0.7723616865708018, + "train_speed(iter/s)": 0.096531 + }, + { + "epoch": 0.1697383297632092, + "grad_norm": 6.607914924621582, + "learning_rate": 9.613304988915754e-06, + "loss": 0.8794160842895508, + "memory(GiB)": 33.07, + "step": 3650, + "token_acc": 0.7938718662952646, + "train_speed(iter/s)": 0.096605 + }, + { + "epoch": 0.1697383297632092, + "eval_loss": 0.7148188948631287, + "eval_runtime": 293.6307, + "eval_samples_per_second": 11.835, + "eval_steps_per_second": 11.835, + "step": 3650 + }, + { + "epoch": 0.16997084802315882, + "grad_norm": 5.146786212921143, + "learning_rate": 9.611820984314746e-06, + "loss": 0.7711294651031494, + "memory(GiB)": 33.07, + "step": 3655, + "token_acc": 0.7995883755992884, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.17020336628310842, + "grad_norm": 7.579797744750977, + "learning_rate": 9.610334252601603e-06, + "loss": 0.6962503910064697, + "memory(GiB)": 33.07, + "step": 3660, + "token_acc": 0.8216216216216217, + "train_speed(iter/s)": 0.096005 + }, + { + "epoch": 0.17043588454305802, + "grad_norm": 3.413147211074829, + "learning_rate": 9.608844794655475e-06, + "loss": 0.8060155868530273, + "memory(GiB)": 33.07, + "step": 3665, + "token_acc": 0.7950065703022339, + "train_speed(iter/s)": 0.096078 + }, + { + "epoch": 0.17066840280300763, + "grad_norm": 7.161190986633301, + "learning_rate": 9.607352611357123e-06, + "loss": 0.6839473247528076, + "memory(GiB)": 33.07, + "step": 3670, + "token_acc": 0.8454003407155025, + "train_speed(iter/s)": 0.096151 + }, + { + "epoch": 0.17090092106295723, + "grad_norm": 5.271682262420654, + "learning_rate": 9.605857703588924e-06, + "loss": 0.8989489555358887, + "memory(GiB)": 33.07, + "step": 3675, + "token_acc": 0.798501872659176, + "train_speed(iter/s)": 0.096226 + }, + { + "epoch": 0.17113343932290684, + "grad_norm": 5.448130130767822, + "learning_rate": 9.604360072234861e-06, + "loss": 0.7826570987701416, + "memory(GiB)": 33.07, + "step": 3680, + "token_acc": 0.8154887854422345, + "train_speed(iter/s)": 0.0963 + }, + { + "epoch": 0.17136595758285642, + "grad_norm": 4.372930526733398, + "learning_rate": 9.60285971818053e-06, + "loss": 0.7660940170288086, + "memory(GiB)": 33.07, + "step": 3685, + "token_acc": 0.8209019947961839, + "train_speed(iter/s)": 0.096375 + }, + { + "epoch": 0.17159847584280602, + "grad_norm": 4.681722164154053, + "learning_rate": 9.601356642313138e-06, + "loss": 0.8155523300170898, + "memory(GiB)": 33.07, + "step": 3690, + "token_acc": 0.7983606557377049, + "train_speed(iter/s)": 0.096448 + }, + { + "epoch": 0.17183099410275562, + "grad_norm": 5.451058864593506, + "learning_rate": 9.5998508455215e-06, + "loss": 0.737110185623169, + "memory(GiB)": 33.07, + "step": 3695, + "token_acc": 0.8147727272727273, + "train_speed(iter/s)": 0.096521 + }, + { + "epoch": 0.17206351236270523, + "grad_norm": 6.284342288970947, + "learning_rate": 9.598342328696035e-06, + "loss": 0.8151021003723145, + "memory(GiB)": 33.07, + "step": 3700, + "token_acc": 0.8129205921938089, + "train_speed(iter/s)": 0.096595 + }, + { + "epoch": 0.17206351236270523, + "eval_loss": 0.7119737863540649, + "eval_runtime": 291.2382, + "eval_samples_per_second": 11.932, + "eval_steps_per_second": 11.932, + "step": 3700 + }, + { + "epoch": 0.17229603062265483, + "grad_norm": 5.116153240203857, + "learning_rate": 9.596831092728784e-06, + "loss": 0.7929253101348877, + "memory(GiB)": 33.07, + "step": 3705, + "token_acc": 0.7995435069875465, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.17252854888260444, + "grad_norm": 7.7103590965271, + "learning_rate": 9.595317138513383e-06, + "loss": 0.7780908584594727, + "memory(GiB)": 33.07, + "step": 3710, + "token_acc": 0.8088341781317886, + "train_speed(iter/s)": 0.096008 + }, + { + "epoch": 0.17276106714255404, + "grad_norm": 6.119678497314453, + "learning_rate": 9.593800466945077e-06, + "loss": 0.8345657348632812, + "memory(GiB)": 33.07, + "step": 3715, + "token_acc": 0.803952321204517, + "train_speed(iter/s)": 0.096082 + }, + { + "epoch": 0.17299358540250365, + "grad_norm": 5.879092693328857, + "learning_rate": 9.592281078920729e-06, + "loss": 0.8818518638610839, + "memory(GiB)": 33.07, + "step": 3720, + "token_acc": 0.7890543817111189, + "train_speed(iter/s)": 0.096154 + }, + { + "epoch": 0.17322610366245325, + "grad_norm": 6.872639179229736, + "learning_rate": 9.590758975338793e-06, + "loss": 0.8579018592834473, + "memory(GiB)": 33.07, + "step": 3725, + "token_acc": 0.7804423128164135, + "train_speed(iter/s)": 0.096225 + }, + { + "epoch": 0.17345862192240286, + "grad_norm": 6.415402889251709, + "learning_rate": 9.589234157099336e-06, + "loss": 0.8382321357727051, + "memory(GiB)": 33.07, + "step": 3730, + "token_acc": 0.8034565916398714, + "train_speed(iter/s)": 0.096299 + }, + { + "epoch": 0.17369114018235243, + "grad_norm": 6.094632625579834, + "learning_rate": 9.587706625104035e-06, + "loss": 0.7249147415161132, + "memory(GiB)": 33.07, + "step": 3735, + "token_acc": 0.8148740503798481, + "train_speed(iter/s)": 0.096372 + }, + { + "epoch": 0.17392365844230204, + "grad_norm": 5.106109142303467, + "learning_rate": 9.58617638025616e-06, + "loss": 0.858515739440918, + "memory(GiB)": 33.07, + "step": 3740, + "token_acc": 0.7806090739589807, + "train_speed(iter/s)": 0.096444 + }, + { + "epoch": 0.17415617670225164, + "grad_norm": 7.242406845092773, + "learning_rate": 9.584643423460599e-06, + "loss": 0.7636741638183594, + "memory(GiB)": 33.07, + "step": 3745, + "token_acc": 0.8018543956043956, + "train_speed(iter/s)": 0.096516 + }, + { + "epoch": 0.17438869496220125, + "grad_norm": 7.00205135345459, + "learning_rate": 9.583107755623832e-06, + "loss": 0.7448306560516358, + "memory(GiB)": 33.07, + "step": 3750, + "token_acc": 0.8144083969465649, + "train_speed(iter/s)": 0.096589 + }, + { + "epoch": 0.17438869496220125, + "eval_loss": 0.7071701884269714, + "eval_runtime": 290.4999, + "eval_samples_per_second": 11.962, + "eval_steps_per_second": 11.962, + "step": 3750 + }, + { + "epoch": 0.17462121322215085, + "grad_norm": 5.753495216369629, + "learning_rate": 9.58156937765395e-06, + "loss": 0.6926516532897949, + "memory(GiB)": 33.07, + "step": 3755, + "token_acc": 0.8004340981729714, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.17485373148210046, + "grad_norm": 4.843967437744141, + "learning_rate": 9.58002829046064e-06, + "loss": 0.7503583431243896, + "memory(GiB)": 33.07, + "step": 3760, + "token_acc": 0.8114439324116743, + "train_speed(iter/s)": 0.09601 + }, + { + "epoch": 0.17508624974205006, + "grad_norm": 6.034976959228516, + "learning_rate": 9.578484494955196e-06, + "loss": 0.7742047786712647, + "memory(GiB)": 33.07, + "step": 3765, + "token_acc": 0.8205445544554455, + "train_speed(iter/s)": 0.096081 + }, + { + "epoch": 0.17531876800199966, + "grad_norm": 5.2408061027526855, + "learning_rate": 9.576937992050515e-06, + "loss": 0.7521644115447998, + "memory(GiB)": 33.07, + "step": 3770, + "token_acc": 0.8303755674783326, + "train_speed(iter/s)": 0.096153 + }, + { + "epoch": 0.17555128626194927, + "grad_norm": 5.320487976074219, + "learning_rate": 9.575388782661086e-06, + "loss": 0.7956186294555664, + "memory(GiB)": 33.07, + "step": 3775, + "token_acc": 0.8172245204964272, + "train_speed(iter/s)": 0.096223 + }, + { + "epoch": 0.17578380452189887, + "grad_norm": 5.391150951385498, + "learning_rate": 9.573836867703007e-06, + "loss": 0.8982381820678711, + "memory(GiB)": 33.07, + "step": 3780, + "token_acc": 0.7747972551466001, + "train_speed(iter/s)": 0.096291 + }, + { + "epoch": 0.17601632278184845, + "grad_norm": 4.874892234802246, + "learning_rate": 9.572282248093976e-06, + "loss": 0.9775179862976074, + "memory(GiB)": 33.07, + "step": 3785, + "token_acc": 0.7860677578987438, + "train_speed(iter/s)": 0.096362 + }, + { + "epoch": 0.17624884104179805, + "grad_norm": 5.059951305389404, + "learning_rate": 9.570724924753284e-06, + "loss": 0.7578266143798829, + "memory(GiB)": 33.07, + "step": 3790, + "token_acc": 0.805045871559633, + "train_speed(iter/s)": 0.096433 + }, + { + "epoch": 0.17648135930174766, + "grad_norm": 4.954115867614746, + "learning_rate": 9.569164898601826e-06, + "loss": 0.8646341323852539, + "memory(GiB)": 33.07, + "step": 3795, + "token_acc": 0.7873303167420814, + "train_speed(iter/s)": 0.096505 + }, + { + "epoch": 0.17671387756169726, + "grad_norm": 5.7353057861328125, + "learning_rate": 9.567602170562092e-06, + "loss": 0.8507672309875488, + "memory(GiB)": 33.07, + "step": 3800, + "token_acc": 0.7837078651685393, + "train_speed(iter/s)": 0.096577 + }, + { + "epoch": 0.17671387756169726, + "eval_loss": 0.7128520011901855, + "eval_runtime": 290.7916, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 11.95, + "step": 3800 + }, + { + "epoch": 0.17694639582164687, + "grad_norm": 5.397838592529297, + "learning_rate": 9.566036741558173e-06, + "loss": 0.8797189712524414, + "memory(GiB)": 33.07, + "step": 3805, + "token_acc": 0.7995074755331697, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.17717891408159647, + "grad_norm": 4.222261905670166, + "learning_rate": 9.564468612515756e-06, + "loss": 0.8522190093994141, + "memory(GiB)": 33.07, + "step": 3810, + "token_acc": 0.796123226029768, + "train_speed(iter/s)": 0.09601 + }, + { + "epoch": 0.17741143234154608, + "grad_norm": 4.521677017211914, + "learning_rate": 9.562897784362121e-06, + "loss": 0.7106293678283692, + "memory(GiB)": 33.07, + "step": 3815, + "token_acc": 0.8250936329588014, + "train_speed(iter/s)": 0.096079 + }, + { + "epoch": 0.17764395060149568, + "grad_norm": 5.949578285217285, + "learning_rate": 9.561324258026151e-06, + "loss": 0.8217846870422363, + "memory(GiB)": 33.07, + "step": 3820, + "token_acc": 0.8033730974907446, + "train_speed(iter/s)": 0.09615 + }, + { + "epoch": 0.17787646886144529, + "grad_norm": 5.23591947555542, + "learning_rate": 9.559748034438319e-06, + "loss": 0.7070714950561523, + "memory(GiB)": 33.07, + "step": 3825, + "token_acc": 0.8170782754290498, + "train_speed(iter/s)": 0.09622 + }, + { + "epoch": 0.1781089871213949, + "grad_norm": 7.460520267486572, + "learning_rate": 9.558169114530694e-06, + "loss": 0.6729435920715332, + "memory(GiB)": 33.07, + "step": 3830, + "token_acc": 0.8424681144432954, + "train_speed(iter/s)": 0.096292 + }, + { + "epoch": 0.17834150538134447, + "grad_norm": 5.60178279876709, + "learning_rate": 9.556587499236942e-06, + "loss": 0.7683042049407959, + "memory(GiB)": 33.07, + "step": 3835, + "token_acc": 0.8206605222734255, + "train_speed(iter/s)": 0.096362 + }, + { + "epoch": 0.17857402364129407, + "grad_norm": 6.029583930969238, + "learning_rate": 9.555003189492318e-06, + "loss": 0.7823711395263672, + "memory(GiB)": 33.07, + "step": 3840, + "token_acc": 0.8051906440243511, + "train_speed(iter/s)": 0.096428 + }, + { + "epoch": 0.17880654190124368, + "grad_norm": 4.636234283447266, + "learning_rate": 9.553416186233674e-06, + "loss": 0.7817121982574463, + "memory(GiB)": 33.07, + "step": 3845, + "token_acc": 0.8130484547882487, + "train_speed(iter/s)": 0.096497 + }, + { + "epoch": 0.17903906016119328, + "grad_norm": 5.539736747741699, + "learning_rate": 9.551826490399459e-06, + "loss": 0.8406240463256835, + "memory(GiB)": 33.07, + "step": 3850, + "token_acc": 0.8003020007550019, + "train_speed(iter/s)": 0.096566 + }, + { + "epoch": 0.17903906016119328, + "eval_loss": 0.7051939368247986, + "eval_runtime": 289.0798, + "eval_samples_per_second": 12.021, + "eval_steps_per_second": 12.021, + "step": 3850 + }, + { + "epoch": 0.17927157842114289, + "grad_norm": 4.831907749176025, + "learning_rate": 9.550234102929702e-06, + "loss": 0.7266098022460937, + "memory(GiB)": 33.07, + "step": 3855, + "token_acc": 0.8010063495866778, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.1795040966810925, + "grad_norm": 3.985592842102051, + "learning_rate": 9.548639024766036e-06, + "loss": 0.8843966484069824, + "memory(GiB)": 33.07, + "step": 3860, + "token_acc": 0.8019549511262218, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.1797366149410421, + "grad_norm": 5.885317802429199, + "learning_rate": 9.547041256851676e-06, + "loss": 0.9130638122558594, + "memory(GiB)": 33.07, + "step": 3865, + "token_acc": 0.7749216300940439, + "train_speed(iter/s)": 0.096077 + }, + { + "epoch": 0.1799691332009917, + "grad_norm": 5.011005401611328, + "learning_rate": 9.545440800131437e-06, + "loss": 0.7087615013122559, + "memory(GiB)": 33.07, + "step": 3870, + "token_acc": 0.8194444444444444, + "train_speed(iter/s)": 0.096146 + }, + { + "epoch": 0.1802016514609413, + "grad_norm": 5.256019592285156, + "learning_rate": 9.543837655551711e-06, + "loss": 0.8234603881835938, + "memory(GiB)": 33.07, + "step": 3875, + "token_acc": 0.7919126328217237, + "train_speed(iter/s)": 0.096216 + }, + { + "epoch": 0.18043416972089088, + "grad_norm": 5.069699287414551, + "learning_rate": 9.542231824060494e-06, + "loss": 0.6185711860656739, + "memory(GiB)": 33.07, + "step": 3880, + "token_acc": 0.8525206922498119, + "train_speed(iter/s)": 0.096286 + }, + { + "epoch": 0.18066668798084048, + "grad_norm": 4.84377384185791, + "learning_rate": 9.54062330660736e-06, + "loss": 0.8174562454223633, + "memory(GiB)": 33.07, + "step": 3885, + "token_acc": 0.7946681792399319, + "train_speed(iter/s)": 0.096354 + }, + { + "epoch": 0.1808992062407901, + "grad_norm": 6.560987949371338, + "learning_rate": 9.539012104143474e-06, + "loss": 0.7553558349609375, + "memory(GiB)": 33.07, + "step": 3890, + "token_acc": 0.8088002532446977, + "train_speed(iter/s)": 0.096422 + }, + { + "epoch": 0.1811317245007397, + "grad_norm": 5.4929938316345215, + "learning_rate": 9.537398217621593e-06, + "loss": 0.7413972377777099, + "memory(GiB)": 33.07, + "step": 3895, + "token_acc": 0.8199160625715376, + "train_speed(iter/s)": 0.096493 + }, + { + "epoch": 0.1813642427606893, + "grad_norm": 5.737190246582031, + "learning_rate": 9.535781647996057e-06, + "loss": 0.7559893131256104, + "memory(GiB)": 33.07, + "step": 3900, + "token_acc": 0.8125879043600562, + "train_speed(iter/s)": 0.096561 + }, + { + "epoch": 0.1813642427606893, + "eval_loss": 0.7063232660293579, + "eval_runtime": 293.3208, + "eval_samples_per_second": 11.847, + "eval_steps_per_second": 11.847, + "step": 3900 + }, + { + "epoch": 0.1815967610206389, + "grad_norm": 5.9086761474609375, + "learning_rate": 9.53416239622279e-06, + "loss": 0.7419236660003662, + "memory(GiB)": 33.07, + "step": 3905, + "token_acc": 0.8012286055247203, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.1818292792805885, + "grad_norm": 4.9416184425354, + "learning_rate": 9.53254046325931e-06, + "loss": 0.653481912612915, + "memory(GiB)": 33.07, + "step": 3910, + "token_acc": 0.8408619975134687, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.1820617975405381, + "grad_norm": 4.461050033569336, + "learning_rate": 9.530915850064715e-06, + "loss": 0.702120590209961, + "memory(GiB)": 33.07, + "step": 3915, + "token_acc": 0.8253035644339992, + "train_speed(iter/s)": 0.09607 + }, + { + "epoch": 0.18229431580048772, + "grad_norm": 4.964265823364258, + "learning_rate": 9.52928855759969e-06, + "loss": 0.7623435974121093, + "memory(GiB)": 33.07, + "step": 3920, + "token_acc": 0.8236765318882868, + "train_speed(iter/s)": 0.096139 + }, + { + "epoch": 0.18252683406043732, + "grad_norm": 5.344741344451904, + "learning_rate": 9.5276585868265e-06, + "loss": 0.6239171028137207, + "memory(GiB)": 33.07, + "step": 3925, + "token_acc": 0.8420608108108109, + "train_speed(iter/s)": 0.096208 + }, + { + "epoch": 0.1827593523203869, + "grad_norm": 5.035322189331055, + "learning_rate": 9.526025938708999e-06, + "loss": 0.713650131225586, + "memory(GiB)": 33.07, + "step": 3930, + "token_acc": 0.8268041237113402, + "train_speed(iter/s)": 0.096277 + }, + { + "epoch": 0.1829918705803365, + "grad_norm": 5.159491062164307, + "learning_rate": 9.524390614212622e-06, + "loss": 0.6837416172027588, + "memory(GiB)": 33.07, + "step": 3935, + "token_acc": 0.8171238570241064, + "train_speed(iter/s)": 0.096347 + }, + { + "epoch": 0.1832243888402861, + "grad_norm": 6.687197208404541, + "learning_rate": 9.522752614304387e-06, + "loss": 0.724323844909668, + "memory(GiB)": 33.07, + "step": 3940, + "token_acc": 0.8087412587412588, + "train_speed(iter/s)": 0.096415 + }, + { + "epoch": 0.1834569071002357, + "grad_norm": 4.916758060455322, + "learning_rate": 9.521111939952895e-06, + "loss": 0.6912760734558105, + "memory(GiB)": 33.07, + "step": 3945, + "token_acc": 0.830335934848999, + "train_speed(iter/s)": 0.096482 + }, + { + "epoch": 0.18368942536018532, + "grad_norm": 5.0996527671813965, + "learning_rate": 9.519468592128324e-06, + "loss": 0.7524893283843994, + "memory(GiB)": 33.07, + "step": 3950, + "token_acc": 0.806738715829625, + "train_speed(iter/s)": 0.09655 + }, + { + "epoch": 0.18368942536018532, + "eval_loss": 0.7087674736976624, + "eval_runtime": 291.2471, + "eval_samples_per_second": 11.931, + "eval_steps_per_second": 11.931, + "step": 3950 + }, + { + "epoch": 0.18392194362013492, + "grad_norm": 6.056982517242432, + "learning_rate": 9.51782257180244e-06, + "loss": 0.8986503601074218, + "memory(GiB)": 33.07, + "step": 3955, + "token_acc": 0.8012853224468968, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.18415446188008452, + "grad_norm": 5.4020280838012695, + "learning_rate": 9.516173879948583e-06, + "loss": 0.7656956672668457, + "memory(GiB)": 33.07, + "step": 3960, + "token_acc": 0.8064238600516203, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.18438698014003413, + "grad_norm": 7.4822869300842285, + "learning_rate": 9.514522517541678e-06, + "loss": 0.7231870651245117, + "memory(GiB)": 33.07, + "step": 3965, + "token_acc": 0.8162162162162162, + "train_speed(iter/s)": 0.09607 + }, + { + "epoch": 0.18461949839998373, + "grad_norm": 4.300986289978027, + "learning_rate": 9.512868485558223e-06, + "loss": 0.846955394744873, + "memory(GiB)": 33.07, + "step": 3970, + "token_acc": 0.7907311456534254, + "train_speed(iter/s)": 0.096136 + }, + { + "epoch": 0.18485201665993334, + "grad_norm": 3.9788742065429688, + "learning_rate": 9.511211784976301e-06, + "loss": 0.8556766510009766, + "memory(GiB)": 33.07, + "step": 3975, + "token_acc": 0.7815327301756253, + "train_speed(iter/s)": 0.096204 + }, + { + "epoch": 0.18508453491988291, + "grad_norm": 6.10697078704834, + "learning_rate": 9.509552416775572e-06, + "loss": 0.73006272315979, + "memory(GiB)": 33.07, + "step": 3980, + "token_acc": 0.8176037483266398, + "train_speed(iter/s)": 0.096273 + }, + { + "epoch": 0.18531705317983252, + "grad_norm": 6.847419738769531, + "learning_rate": 9.507890381937266e-06, + "loss": 0.7203133583068848, + "memory(GiB)": 33.07, + "step": 3985, + "token_acc": 0.8189987163029525, + "train_speed(iter/s)": 0.09634 + }, + { + "epoch": 0.18554957143978212, + "grad_norm": 7.144951343536377, + "learning_rate": 9.506225681444202e-06, + "loss": 0.7901617527008057, + "memory(GiB)": 33.07, + "step": 3990, + "token_acc": 0.8129742033383915, + "train_speed(iter/s)": 0.096409 + }, + { + "epoch": 0.18578208969973173, + "grad_norm": 4.346937656402588, + "learning_rate": 9.504558316280761e-06, + "loss": 0.7825525760650635, + "memory(GiB)": 33.07, + "step": 3995, + "token_acc": 0.8113915416098226, + "train_speed(iter/s)": 0.096475 + }, + { + "epoch": 0.18601460795968133, + "grad_norm": 4.778166770935059, + "learning_rate": 9.502888287432915e-06, + "loss": 0.7969249248504638, + "memory(GiB)": 33.07, + "step": 4000, + "token_acc": 0.822502030869212, + "train_speed(iter/s)": 0.096543 + }, + { + "epoch": 0.18601460795968133, + "eval_loss": 0.699872612953186, + "eval_runtime": 292.1449, + "eval_samples_per_second": 11.895, + "eval_steps_per_second": 11.895, + "step": 4000 + }, + { + "epoch": 0.18624712621963094, + "grad_norm": 6.564324378967285, + "learning_rate": 9.501215595888201e-06, + "loss": 0.739466381072998, + "memory(GiB)": 33.07, + "step": 4005, + "token_acc": 0.8025343189017952, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.18647964447958054, + "grad_norm": 4.453283786773682, + "learning_rate": 9.499540242635732e-06, + "loss": 1.0081160545349122, + "memory(GiB)": 33.07, + "step": 4010, + "token_acc": 0.7659355723098012, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.18671216273953015, + "grad_norm": 6.239867687225342, + "learning_rate": 9.497862228666196e-06, + "loss": 0.8795578956604004, + "memory(GiB)": 33.07, + "step": 4015, + "token_acc": 0.7943651664837176, + "train_speed(iter/s)": 0.096065 + }, + { + "epoch": 0.18694468099947975, + "grad_norm": 7.037559509277344, + "learning_rate": 9.496181554971856e-06, + "loss": 0.8672590255737305, + "memory(GiB)": 33.07, + "step": 4020, + "token_acc": 0.7891297891297891, + "train_speed(iter/s)": 0.096132 + }, + { + "epoch": 0.18717719925942936, + "grad_norm": 6.250262260437012, + "learning_rate": 9.494498222546545e-06, + "loss": 0.6804422378540039, + "memory(GiB)": 33.07, + "step": 4025, + "token_acc": 0.8410415856976292, + "train_speed(iter/s)": 0.096199 + }, + { + "epoch": 0.18740971751937893, + "grad_norm": 5.407467365264893, + "learning_rate": 9.49281223238567e-06, + "loss": 0.7347963809967041, + "memory(GiB)": 33.07, + "step": 4030, + "token_acc": 0.8263201320132013, + "train_speed(iter/s)": 0.096267 + }, + { + "epoch": 0.18764223577932854, + "grad_norm": 5.787410736083984, + "learning_rate": 9.491123585486211e-06, + "loss": 0.7938172817230225, + "memory(GiB)": 33.07, + "step": 4035, + "token_acc": 0.8069164265129684, + "train_speed(iter/s)": 0.096332 + }, + { + "epoch": 0.18787475403927814, + "grad_norm": 6.597839832305908, + "learning_rate": 9.489432282846714e-06, + "loss": 0.7364625930786133, + "memory(GiB)": 33.07, + "step": 4040, + "token_acc": 0.8278404163052906, + "train_speed(iter/s)": 0.0964 + }, + { + "epoch": 0.18810727229922775, + "grad_norm": 6.383011817932129, + "learning_rate": 9.487738325467299e-06, + "loss": 0.6238168716430664, + "memory(GiB)": 33.07, + "step": 4045, + "token_acc": 0.8554265118141771, + "train_speed(iter/s)": 0.096467 + }, + { + "epoch": 0.18833979055917735, + "grad_norm": 4.523214817047119, + "learning_rate": 9.486041714349655e-06, + "loss": 0.7638760089874268, + "memory(GiB)": 33.07, + "step": 4050, + "token_acc": 0.816711590296496, + "train_speed(iter/s)": 0.096536 + }, + { + "epoch": 0.18833979055917735, + "eval_loss": 0.7085168957710266, + "eval_runtime": 293.1328, + "eval_samples_per_second": 11.855, + "eval_steps_per_second": 11.855, + "step": 4050 + }, + { + "epoch": 0.18857230881912695, + "grad_norm": 4.8166584968566895, + "learning_rate": 9.484342450497043e-06, + "loss": 0.6987978458404541, + "memory(GiB)": 33.07, + "step": 4055, + "token_acc": 0.802425636969392, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.18880482707907656, + "grad_norm": 6.945908546447754, + "learning_rate": 9.482640534914289e-06, + "loss": 0.7164095401763916, + "memory(GiB)": 33.07, + "step": 4060, + "token_acc": 0.8259895444361464, + "train_speed(iter/s)": 0.095995 + }, + { + "epoch": 0.18903734533902616, + "grad_norm": 6.610608100891113, + "learning_rate": 9.480935968607784e-06, + "loss": 0.630191707611084, + "memory(GiB)": 33.07, + "step": 4065, + "token_acc": 0.844571975131516, + "train_speed(iter/s)": 0.096062 + }, + { + "epoch": 0.18926986359897577, + "grad_norm": 5.781813621520996, + "learning_rate": 9.479228752585498e-06, + "loss": 0.79894118309021, + "memory(GiB)": 33.07, + "step": 4070, + "token_acc": 0.80873330927463, + "train_speed(iter/s)": 0.096127 + }, + { + "epoch": 0.18950238185892535, + "grad_norm": 8.563652992248535, + "learning_rate": 9.477518887856958e-06, + "loss": 1.0313690185546875, + "memory(GiB)": 33.07, + "step": 4075, + "token_acc": 0.7563822027716995, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.18973490011887495, + "grad_norm": 3.825904130935669, + "learning_rate": 9.475806375433256e-06, + "loss": 0.7562622547149658, + "memory(GiB)": 33.07, + "step": 4080, + "token_acc": 0.8031704095112285, + "train_speed(iter/s)": 0.09626 + }, + { + "epoch": 0.18996741837882455, + "grad_norm": 5.252847671508789, + "learning_rate": 9.474091216327058e-06, + "loss": 0.806545352935791, + "memory(GiB)": 33.07, + "step": 4085, + "token_acc": 0.8209828393135725, + "train_speed(iter/s)": 0.096327 + }, + { + "epoch": 0.19019993663877416, + "grad_norm": 4.933413982391357, + "learning_rate": 9.47237341155259e-06, + "loss": 0.8696954727172852, + "memory(GiB)": 33.07, + "step": 4090, + "token_acc": 0.7778093883357041, + "train_speed(iter/s)": 0.096392 + }, + { + "epoch": 0.19043245489872376, + "grad_norm": 5.949667930603027, + "learning_rate": 9.470652962125639e-06, + "loss": 0.7424722194671631, + "memory(GiB)": 33.07, + "step": 4095, + "token_acc": 0.8045484508899143, + "train_speed(iter/s)": 0.096456 + }, + { + "epoch": 0.19066497315867337, + "grad_norm": 4.2154998779296875, + "learning_rate": 9.468929869063564e-06, + "loss": 0.8286898612976075, + "memory(GiB)": 33.07, + "step": 4100, + "token_acc": 0.7962100031065549, + "train_speed(iter/s)": 0.096522 + }, + { + "epoch": 0.19066497315867337, + "eval_loss": 0.6997935771942139, + "eval_runtime": 295.0785, + "eval_samples_per_second": 11.777, + "eval_steps_per_second": 11.777, + "step": 4100 + }, + { + "epoch": 0.19089749141862297, + "grad_norm": 5.990864276885986, + "learning_rate": 9.46720413338528e-06, + "loss": 1.0568864822387696, + "memory(GiB)": 33.07, + "step": 4105, + "token_acc": 0.8014819783132915, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.19113000967857258, + "grad_norm": 5.360263824462891, + "learning_rate": 9.465475756111271e-06, + "loss": 0.7643206596374512, + "memory(GiB)": 33.07, + "step": 4110, + "token_acc": 0.8110726643598616, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.19136252793852218, + "grad_norm": 5.552672863006592, + "learning_rate": 9.46374473826358e-06, + "loss": 0.9097006797790528, + "memory(GiB)": 33.07, + "step": 4115, + "token_acc": 0.7756370416407706, + "train_speed(iter/s)": 0.096049 + }, + { + "epoch": 0.19159504619847179, + "grad_norm": 6.712371349334717, + "learning_rate": 9.462011080865809e-06, + "loss": 0.7000391960144043, + "memory(GiB)": 33.07, + "step": 4120, + "token_acc": 0.8365650969529086, + "train_speed(iter/s)": 0.096115 + }, + { + "epoch": 0.19182756445842136, + "grad_norm": 6.444034099578857, + "learning_rate": 9.460274784943122e-06, + "loss": 0.6805448055267334, + "memory(GiB)": 33.07, + "step": 4125, + "token_acc": 0.827490454703228, + "train_speed(iter/s)": 0.09618 + }, + { + "epoch": 0.19206008271837097, + "grad_norm": 7.107831954956055, + "learning_rate": 9.458535851522247e-06, + "loss": 0.7243359565734864, + "memory(GiB)": 33.07, + "step": 4130, + "token_acc": 0.8241419205553413, + "train_speed(iter/s)": 0.096244 + }, + { + "epoch": 0.19229260097832057, + "grad_norm": 7.023250102996826, + "learning_rate": 9.45679428163147e-06, + "loss": 0.786740779876709, + "memory(GiB)": 33.07, + "step": 4135, + "token_acc": 0.8085681204168275, + "train_speed(iter/s)": 0.096305 + }, + { + "epoch": 0.19252511923827018, + "grad_norm": 5.895570278167725, + "learning_rate": 9.455050076300633e-06, + "loss": 0.6871311664581299, + "memory(GiB)": 33.07, + "step": 4140, + "token_acc": 0.8327688399661304, + "train_speed(iter/s)": 0.096369 + }, + { + "epoch": 0.19275763749821978, + "grad_norm": 7.156490325927734, + "learning_rate": 9.453303236561138e-06, + "loss": 0.7330933570861816, + "memory(GiB)": 33.07, + "step": 4145, + "token_acc": 0.8441851712457659, + "train_speed(iter/s)": 0.096435 + }, + { + "epoch": 0.19299015575816938, + "grad_norm": 5.157409191131592, + "learning_rate": 9.451553763445946e-06, + "loss": 0.7420677185058594, + "memory(GiB)": 33.07, + "step": 4150, + "token_acc": 0.8215586307356154, + "train_speed(iter/s)": 0.096499 + }, + { + "epoch": 0.19299015575816938, + "eval_loss": 0.7018148899078369, + "eval_runtime": 295.0708, + "eval_samples_per_second": 11.777, + "eval_steps_per_second": 11.777, + "step": 4150 + }, + { + "epoch": 0.193222674018119, + "grad_norm": 4.892160892486572, + "learning_rate": 9.449801657989574e-06, + "loss": 0.7619297504425049, + "memory(GiB)": 33.07, + "step": 4155, + "token_acc": 0.8021788578953669, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.1934551922780686, + "grad_norm": 5.372450351715088, + "learning_rate": 9.448046921228098e-06, + "loss": 0.9046992301940918, + "memory(GiB)": 33.07, + "step": 4160, + "token_acc": 0.7911676646706587, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.1936877105380182, + "grad_norm": 5.386012077331543, + "learning_rate": 9.446289554199146e-06, + "loss": 0.7827619552612305, + "memory(GiB)": 33.07, + "step": 4165, + "token_acc": 0.8042328042328042, + "train_speed(iter/s)": 0.096033 + }, + { + "epoch": 0.1939202287979678, + "grad_norm": 5.557136535644531, + "learning_rate": 9.444529557941904e-06, + "loss": 0.8229413032531738, + "memory(GiB)": 33.07, + "step": 4170, + "token_acc": 0.7890600440113172, + "train_speed(iter/s)": 0.096098 + }, + { + "epoch": 0.19415274705791738, + "grad_norm": 4.749749183654785, + "learning_rate": 9.442766933497112e-06, + "loss": 0.7320784568786621, + "memory(GiB)": 33.07, + "step": 4175, + "token_acc": 0.8313518273888155, + "train_speed(iter/s)": 0.096163 + }, + { + "epoch": 0.19438526531786698, + "grad_norm": 5.676370620727539, + "learning_rate": 9.441001681907065e-06, + "loss": 0.8156270980834961, + "memory(GiB)": 33.07, + "step": 4180, + "token_acc": 0.8094858509366282, + "train_speed(iter/s)": 0.096227 + }, + { + "epoch": 0.1946177835778166, + "grad_norm": 4.829561233520508, + "learning_rate": 9.43923380421561e-06, + "loss": 0.8413228034973145, + "memory(GiB)": 33.07, + "step": 4185, + "token_acc": 0.7903622933520928, + "train_speed(iter/s)": 0.096292 + }, + { + "epoch": 0.1948503018377662, + "grad_norm": 5.902974605560303, + "learning_rate": 9.437463301468146e-06, + "loss": 0.7763947010040283, + "memory(GiB)": 33.07, + "step": 4190, + "token_acc": 0.8305927342256214, + "train_speed(iter/s)": 0.096355 + }, + { + "epoch": 0.1950828200977158, + "grad_norm": 4.831693172454834, + "learning_rate": 9.435690174711629e-06, + "loss": 0.6980354309082031, + "memory(GiB)": 33.07, + "step": 4195, + "token_acc": 0.8259067357512954, + "train_speed(iter/s)": 0.096419 + }, + { + "epoch": 0.1953153383576654, + "grad_norm": 6.849177837371826, + "learning_rate": 9.433914424994564e-06, + "loss": 0.8142841339111329, + "memory(GiB)": 33.07, + "step": 4200, + "token_acc": 0.8020400453343408, + "train_speed(iter/s)": 0.096482 + }, + { + "epoch": 0.1953153383576654, + "eval_loss": 0.6981073021888733, + "eval_runtime": 294.9067, + "eval_samples_per_second": 11.783, + "eval_steps_per_second": 11.783, + "step": 4200 + }, + { + "epoch": 0.195547856617615, + "grad_norm": 5.831681728363037, + "learning_rate": 9.432136053367003e-06, + "loss": 0.8038248062133789, + "memory(GiB)": 33.07, + "step": 4205, + "token_acc": 0.8027805933102947, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.1957803748775646, + "grad_norm": 4.811705112457275, + "learning_rate": 9.430355060880555e-06, + "loss": 0.7309530258178711, + "memory(GiB)": 33.07, + "step": 4210, + "token_acc": 0.8190003104625893, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.19601289313751422, + "grad_norm": 5.662635326385498, + "learning_rate": 9.428571448588373e-06, + "loss": 0.9009736061096192, + "memory(GiB)": 33.07, + "step": 4215, + "token_acc": 0.7850264057160609, + "train_speed(iter/s)": 0.096022 + }, + { + "epoch": 0.19624541139746382, + "grad_norm": 4.918367385864258, + "learning_rate": 9.426785217545166e-06, + "loss": 0.7335701942443847, + "memory(GiB)": 33.07, + "step": 4220, + "token_acc": 0.8282166264229045, + "train_speed(iter/s)": 0.096087 + }, + { + "epoch": 0.1964779296574134, + "grad_norm": 5.138798236846924, + "learning_rate": 9.424996368807184e-06, + "loss": 0.6669015407562255, + "memory(GiB)": 33.07, + "step": 4225, + "token_acc": 0.8363567649281936, + "train_speed(iter/s)": 0.09615 + }, + { + "epoch": 0.196710447917363, + "grad_norm": 7.676051616668701, + "learning_rate": 9.423204903432232e-06, + "loss": 0.6710890769958496, + "memory(GiB)": 33.07, + "step": 4230, + "token_acc": 0.8273730684326711, + "train_speed(iter/s)": 0.096213 + }, + { + "epoch": 0.1969429661773126, + "grad_norm": 4.74376106262207, + "learning_rate": 9.421410822479656e-06, + "loss": 0.7752367496490479, + "memory(GiB)": 33.07, + "step": 4235, + "token_acc": 0.8122767132185774, + "train_speed(iter/s)": 0.096277 + }, + { + "epoch": 0.1971754844372622, + "grad_norm": 6.3608222007751465, + "learning_rate": 9.41961412701035e-06, + "loss": 0.8766127586364746, + "memory(GiB)": 33.07, + "step": 4240, + "token_acc": 0.782608695652174, + "train_speed(iter/s)": 0.096339 + }, + { + "epoch": 0.19740800269721182, + "grad_norm": 6.932170867919922, + "learning_rate": 9.417814818086758e-06, + "loss": 0.7886018753051758, + "memory(GiB)": 33.07, + "step": 4245, + "token_acc": 0.8145620022753128, + "train_speed(iter/s)": 0.096402 + }, + { + "epoch": 0.19764052095716142, + "grad_norm": 7.067446231842041, + "learning_rate": 9.41601289677287e-06, + "loss": 0.7627155780792236, + "memory(GiB)": 33.07, + "step": 4250, + "token_acc": 0.8150163220892275, + "train_speed(iter/s)": 0.096467 + }, + { + "epoch": 0.19764052095716142, + "eval_loss": 0.6957509517669678, + "eval_runtime": 294.9761, + "eval_samples_per_second": 11.781, + "eval_steps_per_second": 11.781, + "step": 4250 + }, + { + "epoch": 0.19787303921711102, + "grad_norm": 5.023551940917969, + "learning_rate": 9.414208364134211e-06, + "loss": 0.7709908962249756, + "memory(GiB)": 33.07, + "step": 4255, + "token_acc": 0.8032265285239192, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.19810555747706063, + "grad_norm": 6.044614791870117, + "learning_rate": 9.412401221237863e-06, + "loss": 0.8650611877441406, + "memory(GiB)": 33.07, + "step": 4260, + "token_acc": 0.7993119266055045, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.19833807573701023, + "grad_norm": 6.721257209777832, + "learning_rate": 9.410591469152442e-06, + "loss": 0.8071677207946777, + "memory(GiB)": 33.07, + "step": 4265, + "token_acc": 0.8033012379642366, + "train_speed(iter/s)": 0.096014 + }, + { + "epoch": 0.1985705939969598, + "grad_norm": 6.265713214874268, + "learning_rate": 9.408779108948108e-06, + "loss": 0.7521049499511718, + "memory(GiB)": 33.07, + "step": 4270, + "token_acc": 0.8141247833622184, + "train_speed(iter/s)": 0.096078 + }, + { + "epoch": 0.19880311225690941, + "grad_norm": 5.308954238891602, + "learning_rate": 9.40696414169657e-06, + "loss": 0.7319036006927491, + "memory(GiB)": 33.07, + "step": 4275, + "token_acc": 0.8273524720893142, + "train_speed(iter/s)": 0.096142 + }, + { + "epoch": 0.19903563051685902, + "grad_norm": 7.264330863952637, + "learning_rate": 9.405146568471073e-06, + "loss": 0.807645320892334, + "memory(GiB)": 33.07, + "step": 4280, + "token_acc": 0.7988970588235295, + "train_speed(iter/s)": 0.096204 + }, + { + "epoch": 0.19926814877680862, + "grad_norm": 6.443247318267822, + "learning_rate": 9.403326390346404e-06, + "loss": 0.8453804016113281, + "memory(GiB)": 33.07, + "step": 4285, + "token_acc": 0.8029490616621984, + "train_speed(iter/s)": 0.096268 + }, + { + "epoch": 0.19950066703675823, + "grad_norm": 4.0575056076049805, + "learning_rate": 9.40150360839889e-06, + "loss": 0.9449618339538575, + "memory(GiB)": 33.07, + "step": 4290, + "token_acc": 0.773838630806846, + "train_speed(iter/s)": 0.096329 + }, + { + "epoch": 0.19973318529670783, + "grad_norm": 4.946542739868164, + "learning_rate": 9.3996782237064e-06, + "loss": 0.7974777221679688, + "memory(GiB)": 33.07, + "step": 4295, + "token_acc": 0.8088344469190795, + "train_speed(iter/s)": 0.096388 + }, + { + "epoch": 0.19996570355665744, + "grad_norm": 5.26956844329834, + "learning_rate": 9.397850237348336e-06, + "loss": 0.8243688583374024, + "memory(GiB)": 33.07, + "step": 4300, + "token_acc": 0.7927695287282117, + "train_speed(iter/s)": 0.09645 + }, + { + "epoch": 0.19996570355665744, + "eval_loss": 0.6982521414756775, + "eval_runtime": 294.9008, + "eval_samples_per_second": 11.784, + "eval_steps_per_second": 11.784, + "step": 4300 + }, + { + "epoch": 0.20019822181660704, + "grad_norm": 6.252737522125244, + "learning_rate": 9.396019650405646e-06, + "loss": 0.7586172580718994, + "memory(GiB)": 33.07, + "step": 4305, + "token_acc": 0.803494358900446, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.20043074007655665, + "grad_norm": 5.019197940826416, + "learning_rate": 9.394186463960814e-06, + "loss": 0.8452945709228515, + "memory(GiB)": 33.07, + "step": 4310, + "token_acc": 0.7955331865366467, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.20066325833650625, + "grad_norm": 6.704598903656006, + "learning_rate": 9.392350679097857e-06, + "loss": 0.9175315856933594, + "memory(GiB)": 33.07, + "step": 4315, + "token_acc": 0.7831031681559708, + "train_speed(iter/s)": 0.096002 + }, + { + "epoch": 0.20089577659645583, + "grad_norm": 6.991080284118652, + "learning_rate": 9.390512296902331e-06, + "loss": 0.7498832225799561, + "memory(GiB)": 33.07, + "step": 4320, + "token_acc": 0.8220264317180617, + "train_speed(iter/s)": 0.096066 + }, + { + "epoch": 0.20112829485640543, + "grad_norm": 4.867962837219238, + "learning_rate": 9.388671318461331e-06, + "loss": 0.739860725402832, + "memory(GiB)": 33.07, + "step": 4325, + "token_acc": 0.8180700676090965, + "train_speed(iter/s)": 0.096128 + }, + { + "epoch": 0.20136081311635504, + "grad_norm": 5.916605472564697, + "learning_rate": 9.386827744863483e-06, + "loss": 1.0181119918823243, + "memory(GiB)": 33.07, + "step": 4330, + "token_acc": 0.754506128334535, + "train_speed(iter/s)": 0.096191 + }, + { + "epoch": 0.20159333137630464, + "grad_norm": 7.321384906768799, + "learning_rate": 9.384981577198946e-06, + "loss": 0.7450331211090088, + "memory(GiB)": 33.07, + "step": 4335, + "token_acc": 0.8184615384615385, + "train_speed(iter/s)": 0.096254 + }, + { + "epoch": 0.20182584963625425, + "grad_norm": 4.210700035095215, + "learning_rate": 9.383132816559422e-06, + "loss": 0.7693531513214111, + "memory(GiB)": 33.07, + "step": 4340, + "token_acc": 0.8010770784247728, + "train_speed(iter/s)": 0.096315 + }, + { + "epoch": 0.20205836789620385, + "grad_norm": 6.521642684936523, + "learning_rate": 9.381281464038134e-06, + "loss": 0.7599985122680664, + "memory(GiB)": 33.07, + "step": 4345, + "token_acc": 0.8084516799445791, + "train_speed(iter/s)": 0.096376 + }, + { + "epoch": 0.20229088615615345, + "grad_norm": 5.679013252258301, + "learning_rate": 9.37942752072985e-06, + "loss": 0.7193542003631592, + "memory(GiB)": 33.07, + "step": 4350, + "token_acc": 0.81234499862221, + "train_speed(iter/s)": 0.096437 + }, + { + "epoch": 0.20229088615615345, + "eval_loss": 0.6899635791778564, + "eval_runtime": 296.4949, + "eval_samples_per_second": 11.72, + "eval_steps_per_second": 11.72, + "step": 4350 + }, + { + "epoch": 0.20252340441610306, + "grad_norm": 5.6605939865112305, + "learning_rate": 9.377570987730857e-06, + "loss": 0.7166150093078614, + "memory(GiB)": 33.07, + "step": 4355, + "token_acc": 0.8045957678568568, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.20275592267605266, + "grad_norm": 6.247797012329102, + "learning_rate": 9.375711866138986e-06, + "loss": 0.8742569923400879, + "memory(GiB)": 33.07, + "step": 4360, + "token_acc": 0.7985636114911081, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.20298844093600227, + "grad_norm": 4.9714813232421875, + "learning_rate": 9.373850157053591e-06, + "loss": 0.7221141338348389, + "memory(GiB)": 33.07, + "step": 4365, + "token_acc": 0.8208573256557902, + "train_speed(iter/s)": 0.095991 + }, + { + "epoch": 0.20322095919595184, + "grad_norm": 6.083737850189209, + "learning_rate": 9.37198586157556e-06, + "loss": 0.7091527938842773, + "memory(GiB)": 33.07, + "step": 4370, + "token_acc": 0.8148148148148148, + "train_speed(iter/s)": 0.096052 + }, + { + "epoch": 0.20345347745590145, + "grad_norm": 5.963134765625, + "learning_rate": 9.370118980807303e-06, + "loss": 0.7980108261108398, + "memory(GiB)": 33.07, + "step": 4375, + "token_acc": 0.8181049069373942, + "train_speed(iter/s)": 0.096113 + }, + { + "epoch": 0.20368599571585105, + "grad_norm": 6.6309285163879395, + "learning_rate": 9.36824951585277e-06, + "loss": 0.7637380123138428, + "memory(GiB)": 33.07, + "step": 4380, + "token_acc": 0.803343949044586, + "train_speed(iter/s)": 0.096175 + }, + { + "epoch": 0.20391851397580066, + "grad_norm": 7.040733814239502, + "learning_rate": 9.36637746781743e-06, + "loss": 0.7601099967956543, + "memory(GiB)": 33.07, + "step": 4385, + "token_acc": 0.825043630017452, + "train_speed(iter/s)": 0.096237 + }, + { + "epoch": 0.20415103223575026, + "grad_norm": 7.548309803009033, + "learning_rate": 9.364502837808284e-06, + "loss": 0.7506435871124267, + "memory(GiB)": 33.07, + "step": 4390, + "token_acc": 0.8206075533661741, + "train_speed(iter/s)": 0.096298 + }, + { + "epoch": 0.20438355049569987, + "grad_norm": 6.83359956741333, + "learning_rate": 9.36262562693386e-06, + "loss": 0.8372472763061524, + "memory(GiB)": 33.07, + "step": 4395, + "token_acc": 0.7879834254143646, + "train_speed(iter/s)": 0.096359 + }, + { + "epoch": 0.20461606875564947, + "grad_norm": 4.644917964935303, + "learning_rate": 9.360745836304207e-06, + "loss": 0.8313392639160156, + "memory(GiB)": 33.07, + "step": 4400, + "token_acc": 0.8097439544807966, + "train_speed(iter/s)": 0.09642 + }, + { + "epoch": 0.20461606875564947, + "eval_loss": 0.692659318447113, + "eval_runtime": 294.9376, + "eval_samples_per_second": 11.782, + "eval_steps_per_second": 11.782, + "step": 4400 + }, + { + "epoch": 0.20484858701559908, + "grad_norm": 6.488748550415039, + "learning_rate": 9.358863467030907e-06, + "loss": 0.7236376762390136, + "memory(GiB)": 33.07, + "step": 4405, + "token_acc": 0.8043207610413241, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.20508110527554868, + "grad_norm": 4.683987140655518, + "learning_rate": 9.356978520227062e-06, + "loss": 0.7017735958099365, + "memory(GiB)": 33.07, + "step": 4410, + "token_acc": 0.8319974350753446, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.20531362353549829, + "grad_norm": 4.9379658699035645, + "learning_rate": 9.355090997007297e-06, + "loss": 0.7702983856201172, + "memory(GiB)": 33.07, + "step": 4415, + "token_acc": 0.8051425030978935, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.20554614179544786, + "grad_norm": 5.977777481079102, + "learning_rate": 9.353200898487767e-06, + "loss": 0.6408030986785889, + "memory(GiB)": 33.07, + "step": 4420, + "token_acc": 0.8345717637856863, + "train_speed(iter/s)": 0.096044 + }, + { + "epoch": 0.20577866005539747, + "grad_norm": 6.222667694091797, + "learning_rate": 9.35130822578614e-06, + "loss": 0.77198486328125, + "memory(GiB)": 33.07, + "step": 4425, + "token_acc": 0.8058188950637464, + "train_speed(iter/s)": 0.096106 + }, + { + "epoch": 0.20601117831534707, + "grad_norm": 4.507582187652588, + "learning_rate": 9.349412980021618e-06, + "loss": 0.7829318523406983, + "memory(GiB)": 33.07, + "step": 4430, + "token_acc": 0.8062455642299503, + "train_speed(iter/s)": 0.096167 + }, + { + "epoch": 0.20624369657529668, + "grad_norm": 8.059425354003906, + "learning_rate": 9.347515162314914e-06, + "loss": 0.7704340934753418, + "memory(GiB)": 33.07, + "step": 4435, + "token_acc": 0.7938834023574387, + "train_speed(iter/s)": 0.096226 + }, + { + "epoch": 0.20647621483524628, + "grad_norm": 6.580410957336426, + "learning_rate": 9.345614773788268e-06, + "loss": 0.6889129638671875, + "memory(GiB)": 33.07, + "step": 4440, + "token_acc": 0.8137973137973138, + "train_speed(iter/s)": 0.096287 + }, + { + "epoch": 0.20670873309519588, + "grad_norm": 3.9273412227630615, + "learning_rate": 9.343711815565438e-06, + "loss": 0.7775119304656982, + "memory(GiB)": 33.07, + "step": 4445, + "token_acc": 0.8012727798669367, + "train_speed(iter/s)": 0.096348 + }, + { + "epoch": 0.2069412513551455, + "grad_norm": 7.893680572509766, + "learning_rate": 9.3418062887717e-06, + "loss": 0.7683738708496094, + "memory(GiB)": 33.07, + "step": 4450, + "token_acc": 0.8174030658250676, + "train_speed(iter/s)": 0.096407 + }, + { + "epoch": 0.2069412513551455, + "eval_loss": 0.6881858706474304, + "eval_runtime": 292.9637, + "eval_samples_per_second": 11.862, + "eval_steps_per_second": 11.862, + "step": 4450 + }, + { + "epoch": 0.2071737696150951, + "grad_norm": 5.971506595611572, + "learning_rate": 9.339898194533854e-06, + "loss": 0.780084228515625, + "memory(GiB)": 33.07, + "step": 4455, + "token_acc": 0.803921568627451, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.2074062878750447, + "grad_norm": 5.8725128173828125, + "learning_rate": 9.337987533980214e-06, + "loss": 0.9864715576171875, + "memory(GiB)": 33.07, + "step": 4460, + "token_acc": 0.7490438364224772, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.2076388061349943, + "grad_norm": 6.447871208190918, + "learning_rate": 9.336074308240613e-06, + "loss": 0.7951199531555175, + "memory(GiB)": 33.07, + "step": 4465, + "token_acc": 0.791958495460441, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.20787132439494388, + "grad_norm": 6.752201080322266, + "learning_rate": 9.334158518446398e-06, + "loss": 0.7589597225189209, + "memory(GiB)": 33.07, + "step": 4470, + "token_acc": 0.8137150936726758, + "train_speed(iter/s)": 0.096037 + }, + { + "epoch": 0.20810384265489348, + "grad_norm": 5.577692985534668, + "learning_rate": 9.332240165730439e-06, + "loss": 0.8426610946655273, + "memory(GiB)": 33.07, + "step": 4475, + "token_acc": 0.8084077380952381, + "train_speed(iter/s)": 0.096097 + }, + { + "epoch": 0.2083363609148431, + "grad_norm": 5.145914077758789, + "learning_rate": 9.330319251227114e-06, + "loss": 0.7314593315124511, + "memory(GiB)": 33.07, + "step": 4480, + "token_acc": 0.8349742147210502, + "train_speed(iter/s)": 0.096157 + }, + { + "epoch": 0.2085688791747927, + "grad_norm": 4.913346767425537, + "learning_rate": 9.328395776072318e-06, + "loss": 0.6858441829681396, + "memory(GiB)": 33.07, + "step": 4485, + "token_acc": 0.819038642789821, + "train_speed(iter/s)": 0.096217 + }, + { + "epoch": 0.2088013974347423, + "grad_norm": 6.451091289520264, + "learning_rate": 9.326469741403463e-06, + "loss": 0.680500841140747, + "memory(GiB)": 33.07, + "step": 4490, + "token_acc": 0.8409187579753297, + "train_speed(iter/s)": 0.096277 + }, + { + "epoch": 0.2090339156946919, + "grad_norm": 6.490147590637207, + "learning_rate": 9.324541148359473e-06, + "loss": 0.7878528594970703, + "memory(GiB)": 33.07, + "step": 4495, + "token_acc": 0.8057866184448463, + "train_speed(iter/s)": 0.096338 + }, + { + "epoch": 0.2092664339546415, + "grad_norm": 4.894039630889893, + "learning_rate": 9.322609998080784e-06, + "loss": 0.6724793434143066, + "memory(GiB)": 33.07, + "step": 4500, + "token_acc": 0.8291692692067458, + "train_speed(iter/s)": 0.096397 + }, + { + "epoch": 0.2092664339546415, + "eval_loss": 0.686882734298706, + "eval_runtime": 291.9685, + "eval_samples_per_second": 11.902, + "eval_steps_per_second": 11.902, + "step": 4500 + }, + { + "epoch": 0.2094989522145911, + "grad_norm": 6.31425142288208, + "learning_rate": 9.320676291709348e-06, + "loss": 0.6531912326812744, + "memory(GiB)": 33.07, + "step": 4505, + "token_acc": 0.8051886489265973, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.20973147047454072, + "grad_norm": 7.33244514465332, + "learning_rate": 9.31874003038862e-06, + "loss": 0.7912643909454345, + "memory(GiB)": 33.07, + "step": 4510, + "token_acc": 0.8016850291639663, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.2099639887344903, + "grad_norm": 5.7589335441589355, + "learning_rate": 9.316801215263574e-06, + "loss": 0.7930665016174316, + "memory(GiB)": 33.07, + "step": 4515, + "token_acc": 0.8037995414346545, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.2101965069944399, + "grad_norm": 5.287975788116455, + "learning_rate": 9.31485984748069e-06, + "loss": 0.8597976684570312, + "memory(GiB)": 33.07, + "step": 4520, + "token_acc": 0.8099725166862976, + "train_speed(iter/s)": 0.096036 + }, + { + "epoch": 0.2104290252543895, + "grad_norm": 5.207564353942871, + "learning_rate": 9.31291592818796e-06, + "loss": 0.7041160583496093, + "memory(GiB)": 33.07, + "step": 4525, + "token_acc": 0.8289205702647657, + "train_speed(iter/s)": 0.096095 + }, + { + "epoch": 0.2106615435143391, + "grad_norm": 5.495445251464844, + "learning_rate": 9.310969458534882e-06, + "loss": 0.7163251876831055, + "memory(GiB)": 33.07, + "step": 4530, + "token_acc": 0.8275154004106776, + "train_speed(iter/s)": 0.096154 + }, + { + "epoch": 0.2108940617742887, + "grad_norm": 5.854438781738281, + "learning_rate": 9.309020439672465e-06, + "loss": 0.7081255435943603, + "memory(GiB)": 33.07, + "step": 4535, + "token_acc": 0.8392466053438458, + "train_speed(iter/s)": 0.096215 + }, + { + "epoch": 0.21112658003423831, + "grad_norm": 5.526129722595215, + "learning_rate": 9.307068872753223e-06, + "loss": 0.7211766719818116, + "memory(GiB)": 33.07, + "step": 4540, + "token_acc": 0.8206380208333334, + "train_speed(iter/s)": 0.096275 + }, + { + "epoch": 0.21135909829418792, + "grad_norm": 7.228954315185547, + "learning_rate": 9.30511475893118e-06, + "loss": 0.7566389083862305, + "memory(GiB)": 33.07, + "step": 4545, + "token_acc": 0.8173076923076923, + "train_speed(iter/s)": 0.096336 + }, + { + "epoch": 0.21159161655413752, + "grad_norm": 6.528788089752197, + "learning_rate": 9.30315809936186e-06, + "loss": 0.8317952156066895, + "memory(GiB)": 33.07, + "step": 4550, + "token_acc": 0.7946549391069012, + "train_speed(iter/s)": 0.096396 + }, + { + "epoch": 0.21159161655413752, + "eval_loss": 0.6860916018486023, + "eval_runtime": 292.6928, + "eval_samples_per_second": 11.873, + "eval_steps_per_second": 11.873, + "step": 4550 + }, + { + "epoch": 0.21182413481408713, + "grad_norm": 5.846338272094727, + "learning_rate": 9.3011988952023e-06, + "loss": 0.8814240455627441, + "memory(GiB)": 33.07, + "step": 4555, + "token_acc": 0.8044239350750274, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.21205665307403673, + "grad_norm": 5.912703514099121, + "learning_rate": 9.299237147611036e-06, + "loss": 0.8261652946472168, + "memory(GiB)": 33.07, + "step": 4560, + "token_acc": 0.7952633219071362, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.2122891713339863, + "grad_norm": 6.0356526374816895, + "learning_rate": 9.29727285774811e-06, + "loss": 0.8617947578430176, + "memory(GiB)": 33.07, + "step": 4565, + "token_acc": 0.7860512129380054, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.21252168959393591, + "grad_norm": 5.160603046417236, + "learning_rate": 9.295306026775066e-06, + "loss": 0.7188240051269531, + "memory(GiB)": 33.07, + "step": 4570, + "token_acc": 0.8221818181818182, + "train_speed(iter/s)": 0.096038 + }, + { + "epoch": 0.21275420785388552, + "grad_norm": 6.349583625793457, + "learning_rate": 9.293336655854955e-06, + "loss": 0.6778467178344727, + "memory(GiB)": 33.07, + "step": 4575, + "token_acc": 0.839390386869871, + "train_speed(iter/s)": 0.096098 + }, + { + "epoch": 0.21298672611383512, + "grad_norm": 5.2883100509643555, + "learning_rate": 9.291364746152325e-06, + "loss": 0.9108637809753418, + "memory(GiB)": 33.07, + "step": 4580, + "token_acc": 0.7718334297281666, + "train_speed(iter/s)": 0.096157 + }, + { + "epoch": 0.21321924437378473, + "grad_norm": 6.238943099975586, + "learning_rate": 9.289390298833226e-06, + "loss": 0.7632899284362793, + "memory(GiB)": 33.07, + "step": 4585, + "token_acc": 0.8032896945283652, + "train_speed(iter/s)": 0.096216 + }, + { + "epoch": 0.21345176263373433, + "grad_norm": 5.001307487487793, + "learning_rate": 9.287413315065212e-06, + "loss": 0.8181890487670899, + "memory(GiB)": 33.07, + "step": 4590, + "token_acc": 0.8045580110497238, + "train_speed(iter/s)": 0.096275 + }, + { + "epoch": 0.21368428089368394, + "grad_norm": 6.3453755378723145, + "learning_rate": 9.285433796017333e-06, + "loss": 0.7735485076904297, + "memory(GiB)": 33.07, + "step": 4595, + "token_acc": 0.7959317585301837, + "train_speed(iter/s)": 0.096334 + }, + { + "epoch": 0.21391679915363354, + "grad_norm": 5.127955436706543, + "learning_rate": 9.28345174286014e-06, + "loss": 0.6808011531829834, + "memory(GiB)": 33.07, + "step": 4600, + "token_acc": 0.8435283687943262, + "train_speed(iter/s)": 0.096394 + }, + { + "epoch": 0.21391679915363354, + "eval_loss": 0.6844122409820557, + "eval_runtime": 290.6503, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 11.956, + "step": 4600 + }, + { + "epoch": 0.21414931741358315, + "grad_norm": 6.679483413696289, + "learning_rate": 9.281467156765684e-06, + "loss": 0.7091818809509277, + "memory(GiB)": 33.07, + "step": 4605, + "token_acc": 0.8054765092063391, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.21438183567353275, + "grad_norm": 7.544936656951904, + "learning_rate": 9.279480038907508e-06, + "loss": 0.7468667984008789, + "memory(GiB)": 33.07, + "step": 4610, + "token_acc": 0.8214285714285714, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.21461435393348233, + "grad_norm": 4.884216785430908, + "learning_rate": 9.27749039046066e-06, + "loss": 0.7384253025054932, + "memory(GiB)": 33.07, + "step": 4615, + "token_acc": 0.8045409674234946, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.21484687219343193, + "grad_norm": 6.339810848236084, + "learning_rate": 9.275498212601679e-06, + "loss": 0.8337714195251464, + "memory(GiB)": 33.07, + "step": 4620, + "token_acc": 0.8185096153846154, + "train_speed(iter/s)": 0.096043 + }, + { + "epoch": 0.21507939045338154, + "grad_norm": 5.720736980438232, + "learning_rate": 9.273503506508601e-06, + "loss": 0.6793076038360596, + "memory(GiB)": 33.07, + "step": 4625, + "token_acc": 0.8336227856894756, + "train_speed(iter/s)": 0.096102 + }, + { + "epoch": 0.21531190871333114, + "grad_norm": 5.042727470397949, + "learning_rate": 9.27150627336096e-06, + "loss": 0.727689790725708, + "memory(GiB)": 33.07, + "step": 4630, + "token_acc": 0.8223185265438786, + "train_speed(iter/s)": 0.096159 + }, + { + "epoch": 0.21554442697328074, + "grad_norm": 6.311602592468262, + "learning_rate": 9.269506514339776e-06, + "loss": 0.834522819519043, + "memory(GiB)": 33.07, + "step": 4635, + "token_acc": 0.7994505494505495, + "train_speed(iter/s)": 0.096218 + }, + { + "epoch": 0.21577694523323035, + "grad_norm": 6.29012393951416, + "learning_rate": 9.267504230627573e-06, + "loss": 0.6527014255523682, + "memory(GiB)": 33.07, + "step": 4640, + "token_acc": 0.8425959125134457, + "train_speed(iter/s)": 0.096274 + }, + { + "epoch": 0.21600946349317995, + "grad_norm": 4.891956806182861, + "learning_rate": 9.26549942340836e-06, + "loss": 0.7832115173339844, + "memory(GiB)": 33.07, + "step": 4645, + "token_acc": 0.8165349143610013, + "train_speed(iter/s)": 0.096332 + }, + { + "epoch": 0.21624198175312956, + "grad_norm": 5.229694843292236, + "learning_rate": 9.263492093867646e-06, + "loss": 0.7881430149078369, + "memory(GiB)": 33.07, + "step": 4650, + "token_acc": 0.8165953243332236, + "train_speed(iter/s)": 0.09639 + }, + { + "epoch": 0.21624198175312956, + "eval_loss": 0.6870741844177246, + "eval_runtime": 288.3894, + "eval_samples_per_second": 12.05, + "eval_steps_per_second": 12.05, + "step": 4650 + }, + { + "epoch": 0.21647450001307916, + "grad_norm": 6.321075439453125, + "learning_rate": 9.261482243192422e-06, + "loss": 0.7271718978881836, + "memory(GiB)": 33.07, + "step": 4655, + "token_acc": 0.80499289693371, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.21670701827302877, + "grad_norm": 6.790338039398193, + "learning_rate": 9.259469872571179e-06, + "loss": 0.8134382247924805, + "memory(GiB)": 33.07, + "step": 4660, + "token_acc": 0.7886208138278719, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.21693953653297834, + "grad_norm": 6.273148536682129, + "learning_rate": 9.257454983193888e-06, + "loss": 0.8073574066162109, + "memory(GiB)": 33.07, + "step": 4665, + "token_acc": 0.8144235186035829, + "train_speed(iter/s)": 0.095991 + }, + { + "epoch": 0.21717205479292795, + "grad_norm": 5.982663631439209, + "learning_rate": 9.255437576252022e-06, + "loss": 0.785725736618042, + "memory(GiB)": 33.07, + "step": 4670, + "token_acc": 0.7970684039087947, + "train_speed(iter/s)": 0.096049 + }, + { + "epoch": 0.21740457305287755, + "grad_norm": 6.031878471374512, + "learning_rate": 9.253417652938532e-06, + "loss": 0.8083737373352051, + "memory(GiB)": 33.07, + "step": 4675, + "token_acc": 0.7987055016181229, + "train_speed(iter/s)": 0.096107 + }, + { + "epoch": 0.21763709131282716, + "grad_norm": 7.036018371582031, + "learning_rate": 9.251395214447862e-06, + "loss": 0.787720775604248, + "memory(GiB)": 33.07, + "step": 4680, + "token_acc": 0.8159931212381771, + "train_speed(iter/s)": 0.096166 + }, + { + "epoch": 0.21786960957277676, + "grad_norm": 6.4246931076049805, + "learning_rate": 9.249370261975943e-06, + "loss": 0.6878273010253906, + "memory(GiB)": 33.07, + "step": 4685, + "token_acc": 0.8285606631499624, + "train_speed(iter/s)": 0.096224 + }, + { + "epoch": 0.21810212783272637, + "grad_norm": 3.8756697177886963, + "learning_rate": 9.247342796720192e-06, + "loss": 0.7691972732543946, + "memory(GiB)": 33.07, + "step": 4690, + "token_acc": 0.8043310131477185, + "train_speed(iter/s)": 0.09628 + }, + { + "epoch": 0.21833464609267597, + "grad_norm": 6.382996082305908, + "learning_rate": 9.245312819879508e-06, + "loss": 0.7722404956817627, + "memory(GiB)": 33.07, + "step": 4695, + "token_acc": 0.8183962264150944, + "train_speed(iter/s)": 0.096337 + }, + { + "epoch": 0.21856716435262558, + "grad_norm": 5.762166976928711, + "learning_rate": 9.243280332654286e-06, + "loss": 0.6639322757720947, + "memory(GiB)": 33.07, + "step": 4700, + "token_acc": 0.8204067562909342, + "train_speed(iter/s)": 0.096394 + }, + { + "epoch": 0.21856716435262558, + "eval_loss": 0.6851846575737, + "eval_runtime": 292.3554, + "eval_samples_per_second": 11.886, + "eval_steps_per_second": 11.886, + "step": 4700 + }, + { + "epoch": 0.21879968261257518, + "grad_norm": 5.3950324058532715, + "learning_rate": 9.241245336246392e-06, + "loss": 0.7640002250671387, + "memory(GiB)": 33.07, + "step": 4705, + "token_acc": 0.8051870060613825, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.21903220087252476, + "grad_norm": 6.712535381317139, + "learning_rate": 9.239207831859184e-06, + "loss": 0.7891267776489258, + "memory(GiB)": 33.07, + "step": 4710, + "token_acc": 0.8250728862973761, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.21926471913247436, + "grad_norm": 6.155306816101074, + "learning_rate": 9.237167820697504e-06, + "loss": 0.813847827911377, + "memory(GiB)": 33.07, + "step": 4715, + "token_acc": 0.7971908187735526, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.21949723739242397, + "grad_norm": 4.275435447692871, + "learning_rate": 9.23512530396767e-06, + "loss": 0.6743597030639649, + "memory(GiB)": 33.07, + "step": 4720, + "token_acc": 0.8179708222811671, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.21972975565237357, + "grad_norm": 6.898558616638184, + "learning_rate": 9.233080282877486e-06, + "loss": 0.7777416229248046, + "memory(GiB)": 33.07, + "step": 4725, + "token_acc": 0.8053097345132744, + "train_speed(iter/s)": 0.096102 + }, + { + "epoch": 0.21996227391232318, + "grad_norm": 4.393642902374268, + "learning_rate": 9.231032758636241e-06, + "loss": 0.8509530067443848, + "memory(GiB)": 33.07, + "step": 4730, + "token_acc": 0.8003134796238245, + "train_speed(iter/s)": 0.096158 + }, + { + "epoch": 0.22019479217227278, + "grad_norm": 5.334609508514404, + "learning_rate": 9.22898273245469e-06, + "loss": 0.7725913047790527, + "memory(GiB)": 33.07, + "step": 4735, + "token_acc": 0.8223370429252782, + "train_speed(iter/s)": 0.096216 + }, + { + "epoch": 0.22042731043222238, + "grad_norm": 6.812933444976807, + "learning_rate": 9.226930205545086e-06, + "loss": 0.6820529460906982, + "memory(GiB)": 33.07, + "step": 4740, + "token_acc": 0.8412623645784267, + "train_speed(iter/s)": 0.096271 + }, + { + "epoch": 0.220659828692172, + "grad_norm": 5.598426342010498, + "learning_rate": 9.224875179121145e-06, + "loss": 0.7706835746765137, + "memory(GiB)": 33.07, + "step": 4745, + "token_acc": 0.806554756195044, + "train_speed(iter/s)": 0.096328 + }, + { + "epoch": 0.2208923469521216, + "grad_norm": 6.014193058013916, + "learning_rate": 9.22281765439807e-06, + "loss": 0.8362375259399414, + "memory(GiB)": 33.07, + "step": 4750, + "token_acc": 0.7828804347826087, + "train_speed(iter/s)": 0.096385 + }, + { + "epoch": 0.2208923469521216, + "eval_loss": 0.6855367422103882, + "eval_runtime": 293.1276, + "eval_samples_per_second": 11.855, + "eval_steps_per_second": 11.855, + "step": 4750 + }, + { + "epoch": 0.2211248652120712, + "grad_norm": 3.9696576595306396, + "learning_rate": 9.22075763259254e-06, + "loss": 0.7803917407989502, + "memory(GiB)": 33.07, + "step": 4755, + "token_acc": 0.8054952423526407, + "train_speed(iter/s)": 0.095869 + }, + { + "epoch": 0.22135738347202077, + "grad_norm": 4.370530605316162, + "learning_rate": 9.21869511492271e-06, + "loss": 0.8625608444213867, + "memory(GiB)": 33.07, + "step": 4760, + "token_acc": 0.7932675960557634, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.22158990173197038, + "grad_norm": 5.993688583374023, + "learning_rate": 9.216630102608205e-06, + "loss": 0.8630349159240722, + "memory(GiB)": 33.07, + "step": 4765, + "token_acc": 0.8071367884451996, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.22182241999191998, + "grad_norm": 6.219290256500244, + "learning_rate": 9.214562596870138e-06, + "loss": 0.7270864963531494, + "memory(GiB)": 33.07, + "step": 4770, + "token_acc": 0.8224020442930153, + "train_speed(iter/s)": 0.096036 + }, + { + "epoch": 0.2220549382518696, + "grad_norm": 5.150698184967041, + "learning_rate": 9.212492598931081e-06, + "loss": 0.8439167976379395, + "memory(GiB)": 33.07, + "step": 4775, + "token_acc": 0.7991869918699187, + "train_speed(iter/s)": 0.096088 + }, + { + "epoch": 0.2222874565118192, + "grad_norm": 5.613473415374756, + "learning_rate": 9.210420110015098e-06, + "loss": 0.7923439979553223, + "memory(GiB)": 33.07, + "step": 4780, + "token_acc": 0.7985475213135459, + "train_speed(iter/s)": 0.096145 + }, + { + "epoch": 0.2225199747717688, + "grad_norm": 5.041796684265137, + "learning_rate": 9.208345131347704e-06, + "loss": 0.8898165702819825, + "memory(GiB)": 33.07, + "step": 4785, + "token_acc": 0.7721759809750297, + "train_speed(iter/s)": 0.096199 + }, + { + "epoch": 0.2227524930317184, + "grad_norm": 7.280837535858154, + "learning_rate": 9.206267664155906e-06, + "loss": 0.8083833694458008, + "memory(GiB)": 33.07, + "step": 4790, + "token_acc": 0.801067615658363, + "train_speed(iter/s)": 0.096254 + }, + { + "epoch": 0.222985011291668, + "grad_norm": 7.027002334594727, + "learning_rate": 9.204187709668173e-06, + "loss": 0.7799443244934082, + "memory(GiB)": 33.07, + "step": 4795, + "token_acc": 0.8110749185667753, + "train_speed(iter/s)": 0.09631 + }, + { + "epoch": 0.2232175295516176, + "grad_norm": 6.834471225738525, + "learning_rate": 9.202105269114444e-06, + "loss": 0.7543346405029296, + "memory(GiB)": 33.07, + "step": 4800, + "token_acc": 0.8283281039892425, + "train_speed(iter/s)": 0.096367 + }, + { + "epoch": 0.2232175295516176, + "eval_loss": 0.6796162724494934, + "eval_runtime": 294.619, + "eval_samples_per_second": 11.795, + "eval_steps_per_second": 11.795, + "step": 4800 + }, + { + "epoch": 0.22345004781156722, + "grad_norm": 5.5002031326293945, + "learning_rate": 9.200020343726132e-06, + "loss": 0.724638032913208, + "memory(GiB)": 33.07, + "step": 4805, + "token_acc": 0.8070286010610714, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.2236825660715168, + "grad_norm": 7.37080192565918, + "learning_rate": 9.197932934736117e-06, + "loss": 0.6563894271850585, + "memory(GiB)": 33.07, + "step": 4810, + "token_acc": 0.8334597875569044, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.2239150843314664, + "grad_norm": 7.196676731109619, + "learning_rate": 9.195843043378751e-06, + "loss": 0.8106472015380859, + "memory(GiB)": 33.07, + "step": 4815, + "token_acc": 0.7981157469717362, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.224147602591416, + "grad_norm": 5.868455410003662, + "learning_rate": 9.193750670889849e-06, + "loss": 0.9501402854919434, + "memory(GiB)": 33.07, + "step": 4820, + "token_acc": 0.7806595035198222, + "train_speed(iter/s)": 0.096023 + }, + { + "epoch": 0.2243801208513656, + "grad_norm": 4.000607490539551, + "learning_rate": 9.191655818506694e-06, + "loss": 0.8487631797790527, + "memory(GiB)": 33.07, + "step": 4825, + "token_acc": 0.8006230529595015, + "train_speed(iter/s)": 0.096079 + }, + { + "epoch": 0.2246126391113152, + "grad_norm": 5.362325668334961, + "learning_rate": 9.18955848746804e-06, + "loss": 0.8604891777038575, + "memory(GiB)": 33.07, + "step": 4830, + "token_acc": 0.7964179104477612, + "train_speed(iter/s)": 0.096132 + }, + { + "epoch": 0.22484515737126481, + "grad_norm": 6.270984172821045, + "learning_rate": 9.1874586790141e-06, + "loss": 0.713783073425293, + "memory(GiB)": 33.07, + "step": 4835, + "token_acc": 0.8275732531930879, + "train_speed(iter/s)": 0.096187 + }, + { + "epoch": 0.22507767563121442, + "grad_norm": 6.523890495300293, + "learning_rate": 9.18535639438656e-06, + "loss": 0.8093732833862305, + "memory(GiB)": 33.07, + "step": 4840, + "token_acc": 0.811935610522183, + "train_speed(iter/s)": 0.096243 + }, + { + "epoch": 0.22531019389116402, + "grad_norm": 6.198504447937012, + "learning_rate": 9.183251634828563e-06, + "loss": 0.678268575668335, + "memory(GiB)": 33.07, + "step": 4845, + "token_acc": 0.8220858895705522, + "train_speed(iter/s)": 0.096299 + }, + { + "epoch": 0.22554271215111363, + "grad_norm": 5.541542053222656, + "learning_rate": 9.181144401584718e-06, + "loss": 0.7477645874023438, + "memory(GiB)": 33.07, + "step": 4850, + "token_acc": 0.8140192198982475, + "train_speed(iter/s)": 0.096353 + }, + { + "epoch": 0.22554271215111363, + "eval_loss": 0.6794469356536865, + "eval_runtime": 289.7339, + "eval_samples_per_second": 11.994, + "eval_steps_per_second": 11.994, + "step": 4850 + }, + { + "epoch": 0.22577523041106323, + "grad_norm": 7.808995246887207, + "learning_rate": 9.1790346959011e-06, + "loss": 0.8346663475036621, + "memory(GiB)": 33.07, + "step": 4855, + "token_acc": 0.8062872047622096, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.2260077486710128, + "grad_norm": 5.512000560760498, + "learning_rate": 9.17692251902524e-06, + "loss": 0.7062819004058838, + "memory(GiB)": 33.07, + "step": 4860, + "token_acc": 0.833842627960275, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.2262402669309624, + "grad_norm": 6.988223552703857, + "learning_rate": 9.174807872206134e-06, + "loss": 0.8444395065307617, + "memory(GiB)": 33.07, + "step": 4865, + "token_acc": 0.7886693999254566, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.22647278519091202, + "grad_norm": 5.697296142578125, + "learning_rate": 9.172690756694238e-06, + "loss": 0.823847484588623, + "memory(GiB)": 33.07, + "step": 4870, + "token_acc": 0.7910394265232975, + "train_speed(iter/s)": 0.096024 + }, + { + "epoch": 0.22670530345086162, + "grad_norm": 5.049820899963379, + "learning_rate": 9.17057117374147e-06, + "loss": 0.8216519355773926, + "memory(GiB)": 33.07, + "step": 4875, + "token_acc": 0.7899603698811096, + "train_speed(iter/s)": 0.096077 + }, + { + "epoch": 0.22693782171081123, + "grad_norm": 7.270157337188721, + "learning_rate": 9.168449124601202e-06, + "loss": 0.7646757125854492, + "memory(GiB)": 33.07, + "step": 4880, + "token_acc": 0.7954699121027722, + "train_speed(iter/s)": 0.096133 + }, + { + "epoch": 0.22717033997076083, + "grad_norm": 5.253354072570801, + "learning_rate": 9.166324610528268e-06, + "loss": 0.8869535446166992, + "memory(GiB)": 33.07, + "step": 4885, + "token_acc": 0.7827964131517768, + "train_speed(iter/s)": 0.096187 + }, + { + "epoch": 0.22740285823071044, + "grad_norm": 6.635804176330566, + "learning_rate": 9.164197632778958e-06, + "loss": 0.7283540725708008, + "memory(GiB)": 33.07, + "step": 4890, + "token_acc": 0.8065134099616859, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.22763537649066004, + "grad_norm": 6.679150104522705, + "learning_rate": 9.162068192611022e-06, + "loss": 0.6464852809906005, + "memory(GiB)": 33.07, + "step": 4895, + "token_acc": 0.8569292123629113, + "train_speed(iter/s)": 0.096297 + }, + { + "epoch": 0.22786789475060965, + "grad_norm": 7.047290325164795, + "learning_rate": 9.159936291283662e-06, + "loss": 0.8367726325988769, + "memory(GiB)": 33.07, + "step": 4900, + "token_acc": 0.8022113022113022, + "train_speed(iter/s)": 0.096351 + }, + { + "epoch": 0.22786789475060965, + "eval_loss": 0.6772682070732117, + "eval_runtime": 291.6576, + "eval_samples_per_second": 11.915, + "eval_steps_per_second": 11.915, + "step": 4900 + }, + { + "epoch": 0.22810041301055922, + "grad_norm": 7.302158832550049, + "learning_rate": 9.157801930057538e-06, + "loss": 0.6890687465667724, + "memory(GiB)": 33.07, + "step": 4905, + "token_acc": 0.8061630616706574, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.22833293127050883, + "grad_norm": 6.893573760986328, + "learning_rate": 9.15566511019476e-06, + "loss": 0.712225866317749, + "memory(GiB)": 33.07, + "step": 4910, + "token_acc": 0.8429596073990185, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.22856544953045843, + "grad_norm": 5.463949680328369, + "learning_rate": 9.153525832958903e-06, + "loss": 0.7492496967315674, + "memory(GiB)": 33.07, + "step": 4915, + "token_acc": 0.8176162409954159, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.22879796779040804, + "grad_norm": 5.487346172332764, + "learning_rate": 9.151384099614979e-06, + "loss": 0.8953396797180175, + "memory(GiB)": 33.07, + "step": 4920, + "token_acc": 0.7843079635615633, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.22903048605035764, + "grad_norm": 7.811435222625732, + "learning_rate": 9.149239911429468e-06, + "loss": 0.7093976020812989, + "memory(GiB)": 33.07, + "step": 4925, + "token_acc": 0.8245260185558693, + "train_speed(iter/s)": 0.096075 + }, + { + "epoch": 0.22926300431030724, + "grad_norm": 6.169620037078857, + "learning_rate": 9.147093269670291e-06, + "loss": 0.78594970703125, + "memory(GiB)": 33.07, + "step": 4930, + "token_acc": 0.819672131147541, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.22949552257025685, + "grad_norm": 5.188272476196289, + "learning_rate": 9.144944175606826e-06, + "loss": 0.7385846138000488, + "memory(GiB)": 33.07, + "step": 4935, + "token_acc": 0.812938177182919, + "train_speed(iter/s)": 0.096185 + }, + { + "epoch": 0.22972804083020645, + "grad_norm": 5.541247367858887, + "learning_rate": 9.142792630509896e-06, + "loss": 0.7875532627105712, + "memory(GiB)": 33.07, + "step": 4940, + "token_acc": 0.8065062690613352, + "train_speed(iter/s)": 0.096238 + }, + { + "epoch": 0.22996055909015606, + "grad_norm": 6.210489749908447, + "learning_rate": 9.140638635651778e-06, + "loss": 0.838779354095459, + "memory(GiB)": 33.07, + "step": 4945, + "token_acc": 0.7984066505022515, + "train_speed(iter/s)": 0.096291 + }, + { + "epoch": 0.23019307735010566, + "grad_norm": 7.857576370239258, + "learning_rate": 9.138482192306194e-06, + "loss": 0.7631664752960206, + "memory(GiB)": 33.07, + "step": 4950, + "token_acc": 0.8173150266971777, + "train_speed(iter/s)": 0.096345 + }, + { + "epoch": 0.23019307735010566, + "eval_loss": 0.6792007684707642, + "eval_runtime": 288.8644, + "eval_samples_per_second": 12.03, + "eval_steps_per_second": 12.03, + "step": 4950 + }, + { + "epoch": 0.23042559561005524, + "grad_norm": 6.835683822631836, + "learning_rate": 9.136323301748317e-06, + "loss": 0.7649744510650635, + "memory(GiB)": 33.07, + "step": 4955, + "token_acc": 0.8066777775997179, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.23065811387000484, + "grad_norm": 5.466605186462402, + "learning_rate": 9.134161965254767e-06, + "loss": 0.7909045696258545, + "memory(GiB)": 33.07, + "step": 4960, + "token_acc": 0.7993619283941864, + "train_speed(iter/s)": 0.095914 + }, + { + "epoch": 0.23089063212995445, + "grad_norm": 5.472935199737549, + "learning_rate": 9.131998184103603e-06, + "loss": 0.8002223014831543, + "memory(GiB)": 33.07, + "step": 4965, + "token_acc": 0.7913832199546486, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.23112315038990405, + "grad_norm": 5.254642963409424, + "learning_rate": 9.129831959574342e-06, + "loss": 0.7709039688110352, + "memory(GiB)": 33.07, + "step": 4970, + "token_acc": 0.7976470588235294, + "train_speed(iter/s)": 0.09602 + }, + { + "epoch": 0.23135566864985366, + "grad_norm": 5.667941093444824, + "learning_rate": 9.127663292947937e-06, + "loss": 0.813076400756836, + "memory(GiB)": 33.07, + "step": 4975, + "token_acc": 0.8018323719036308, + "train_speed(iter/s)": 0.096075 + }, + { + "epoch": 0.23158818690980326, + "grad_norm": 5.399019241333008, + "learning_rate": 9.125492185506787e-06, + "loss": 0.7386826515197754, + "memory(GiB)": 33.07, + "step": 4980, + "token_acc": 0.8237602568676418, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.23182070516975287, + "grad_norm": 7.110016822814941, + "learning_rate": 9.123318638534737e-06, + "loss": 0.7625212669372559, + "memory(GiB)": 33.07, + "step": 4985, + "token_acc": 0.8365465213746857, + "train_speed(iter/s)": 0.096184 + }, + { + "epoch": 0.23205322342970247, + "grad_norm": 5.49600887298584, + "learning_rate": 9.121142653317071e-06, + "loss": 0.7430119991302491, + "memory(GiB)": 33.07, + "step": 4990, + "token_acc": 0.8249013275923932, + "train_speed(iter/s)": 0.096238 + }, + { + "epoch": 0.23228574168965208, + "grad_norm": 5.538654804229736, + "learning_rate": 9.118964231140516e-06, + "loss": 0.6083118915557861, + "memory(GiB)": 33.07, + "step": 4995, + "token_acc": 0.8432861580945806, + "train_speed(iter/s)": 0.096291 + }, + { + "epoch": 0.23251825994960168, + "grad_norm": 5.317222595214844, + "learning_rate": 9.116783373293238e-06, + "loss": 0.7773871898651123, + "memory(GiB)": 33.07, + "step": 5000, + "token_acc": 0.8092269326683291, + "train_speed(iter/s)": 0.096343 + }, + { + "epoch": 0.23251825994960168, + "eval_loss": 0.6743544340133667, + "eval_runtime": 290.6524, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 11.956, + "step": 5000 + }, + { + "epoch": 0.23275077820955126, + "grad_norm": 6.647306442260742, + "learning_rate": 9.114600081064852e-06, + "loss": 0.7474261283874511, + "memory(GiB)": 33.07, + "step": 5005, + "token_acc": 0.8067668450225882, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.23298329646950086, + "grad_norm": 6.466226100921631, + "learning_rate": 9.1124143557464e-06, + "loss": 0.7601866722106934, + "memory(GiB)": 33.07, + "step": 5010, + "token_acc": 0.8133603238866397, + "train_speed(iter/s)": 0.095914 + }, + { + "epoch": 0.23321581472945047, + "grad_norm": 5.549792766571045, + "learning_rate": 9.110226198630372e-06, + "loss": 0.6700472354888916, + "memory(GiB)": 33.07, + "step": 5015, + "token_acc": 0.8329427519250083, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.23344833298940007, + "grad_norm": 5.168575286865234, + "learning_rate": 9.10803561101069e-06, + "loss": 0.7552637100219727, + "memory(GiB)": 33.07, + "step": 5020, + "token_acc": 0.8173913043478261, + "train_speed(iter/s)": 0.096023 + }, + { + "epoch": 0.23368085124934967, + "grad_norm": 4.69772481918335, + "learning_rate": 9.10584259418272e-06, + "loss": 0.7492507457733154, + "memory(GiB)": 33.07, + "step": 5025, + "token_acc": 0.8177269478198713, + "train_speed(iter/s)": 0.096076 + }, + { + "epoch": 0.23391336950929928, + "grad_norm": 5.720139503479004, + "learning_rate": 9.103647149443258e-06, + "loss": 0.8174025535583496, + "memory(GiB)": 33.07, + "step": 5030, + "token_acc": 0.7844343407328163, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.23414588776924888, + "grad_norm": 4.886946201324463, + "learning_rate": 9.101449278090539e-06, + "loss": 0.7711383819580078, + "memory(GiB)": 33.07, + "step": 5035, + "token_acc": 0.8156508653122648, + "train_speed(iter/s)": 0.096183 + }, + { + "epoch": 0.2343784060291985, + "grad_norm": 5.7471184730529785, + "learning_rate": 9.099248981424232e-06, + "loss": 0.8346371650695801, + "memory(GiB)": 33.07, + "step": 5040, + "token_acc": 0.7889851485148515, + "train_speed(iter/s)": 0.096236 + }, + { + "epoch": 0.2346109242891481, + "grad_norm": 4.732773780822754, + "learning_rate": 9.097046260745439e-06, + "loss": 0.7209797382354737, + "memory(GiB)": 33.07, + "step": 5045, + "token_acc": 0.8060527172144484, + "train_speed(iter/s)": 0.096289 + }, + { + "epoch": 0.2348434425490977, + "grad_norm": 6.407209396362305, + "learning_rate": 9.094841117356698e-06, + "loss": 0.7606247425079345, + "memory(GiB)": 33.07, + "step": 5050, + "token_acc": 0.8119364534134822, + "train_speed(iter/s)": 0.096342 + }, + { + "epoch": 0.2348434425490977, + "eval_loss": 0.6779821515083313, + "eval_runtime": 290.5023, + "eval_samples_per_second": 11.962, + "eval_steps_per_second": 11.962, + "step": 5050 + }, + { + "epoch": 0.23507596080904727, + "grad_norm": 7.122600555419922, + "learning_rate": 9.09263355256198e-06, + "loss": 0.7734743118286133, + "memory(GiB)": 33.07, + "step": 5055, + "token_acc": 0.8066206125464201, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.23530847906899688, + "grad_norm": 6.685431480407715, + "learning_rate": 9.090423567666683e-06, + "loss": 0.687248420715332, + "memory(GiB)": 33.07, + "step": 5060, + "token_acc": 0.8142581888246628, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.23554099732894648, + "grad_norm": 5.379944801330566, + "learning_rate": 9.088211163977644e-06, + "loss": 0.7669831275939941, + "memory(GiB)": 33.07, + "step": 5065, + "token_acc": 0.7954483136824787, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.2357735155888961, + "grad_norm": 6.775158882141113, + "learning_rate": 9.08599634280312e-06, + "loss": 0.637472677230835, + "memory(GiB)": 33.07, + "step": 5070, + "token_acc": 0.8509949097639982, + "train_speed(iter/s)": 0.096022 + }, + { + "epoch": 0.2360060338488457, + "grad_norm": 7.46367883682251, + "learning_rate": 9.083779105452809e-06, + "loss": 0.8237061500549316, + "memory(GiB)": 33.07, + "step": 5075, + "token_acc": 0.8067835365853658, + "train_speed(iter/s)": 0.096075 + }, + { + "epoch": 0.2362385521087953, + "grad_norm": 5.912139892578125, + "learning_rate": 9.081559453237825e-06, + "loss": 0.816553783416748, + "memory(GiB)": 33.07, + "step": 5080, + "token_acc": 0.8075, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.2364710703687449, + "grad_norm": 6.141449451446533, + "learning_rate": 9.079337387470721e-06, + "loss": 0.7349120140075683, + "memory(GiB)": 33.07, + "step": 5085, + "token_acc": 0.8159602901870943, + "train_speed(iter/s)": 0.096182 + }, + { + "epoch": 0.2367035886286945, + "grad_norm": 5.841860294342041, + "learning_rate": 9.077112909465473e-06, + "loss": 0.7999568939208984, + "memory(GiB)": 33.07, + "step": 5090, + "token_acc": 0.8011173184357542, + "train_speed(iter/s)": 0.096236 + }, + { + "epoch": 0.2369361068886441, + "grad_norm": 7.302816867828369, + "learning_rate": 9.074886020537486e-06, + "loss": 0.7663827896118164, + "memory(GiB)": 33.07, + "step": 5095, + "token_acc": 0.8211050724637681, + "train_speed(iter/s)": 0.096289 + }, + { + "epoch": 0.2371686251485937, + "grad_norm": 7.1966094970703125, + "learning_rate": 9.072656722003581e-06, + "loss": 0.7074132919311523, + "memory(GiB)": 33.07, + "step": 5100, + "token_acc": 0.831053901850362, + "train_speed(iter/s)": 0.096341 + }, + { + "epoch": 0.2371686251485937, + "eval_loss": 0.6764135360717773, + "eval_runtime": 290.8232, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 11.949, + "step": 5100 + }, + { + "epoch": 0.2374011434085433, + "grad_norm": 7.087984561920166, + "learning_rate": 9.070425015182019e-06, + "loss": 0.8261497497558594, + "memory(GiB)": 33.07, + "step": 5105, + "token_acc": 0.8059494266814551, + "train_speed(iter/s)": 0.095867 + }, + { + "epoch": 0.2376336616684929, + "grad_norm": 6.17009973526001, + "learning_rate": 9.06819090139247e-06, + "loss": 0.8527179718017578, + "memory(GiB)": 33.07, + "step": 5110, + "token_acc": 0.8023850085178875, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.2378661799284425, + "grad_norm": 7.8616180419921875, + "learning_rate": 9.065954381956042e-06, + "loss": 0.7692588806152344, + "memory(GiB)": 33.07, + "step": 5115, + "token_acc": 0.8109507954125046, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.2380986981883921, + "grad_norm": 8.128244400024414, + "learning_rate": 9.06371545819525e-06, + "loss": 0.6775061607360839, + "memory(GiB)": 33.07, + "step": 5120, + "token_acc": 0.8120456905503635, + "train_speed(iter/s)": 0.096024 + }, + { + "epoch": 0.2383312164483417, + "grad_norm": 5.31683874130249, + "learning_rate": 9.061474131434045e-06, + "loss": 0.7091841697692871, + "memory(GiB)": 33.07, + "step": 5125, + "token_acc": 0.8258521768477894, + "train_speed(iter/s)": 0.096077 + }, + { + "epoch": 0.23856373470829131, + "grad_norm": 5.563629627227783, + "learning_rate": 9.05923040299779e-06, + "loss": 0.7681272983551025, + "memory(GiB)": 33.07, + "step": 5130, + "token_acc": 0.8094393186657204, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.23879625296824092, + "grad_norm": 5.285060405731201, + "learning_rate": 9.056984274213272e-06, + "loss": 0.7310568809509277, + "memory(GiB)": 33.07, + "step": 5135, + "token_acc": 0.8217200251098556, + "train_speed(iter/s)": 0.096181 + }, + { + "epoch": 0.23902877122819052, + "grad_norm": 6.377146244049072, + "learning_rate": 9.054735746408695e-06, + "loss": 0.7459100246429443, + "memory(GiB)": 33.07, + "step": 5140, + "token_acc": 0.8177159590043924, + "train_speed(iter/s)": 0.096234 + }, + { + "epoch": 0.23926128948814013, + "grad_norm": 5.891545295715332, + "learning_rate": 9.052484820913683e-06, + "loss": 0.7460856437683105, + "memory(GiB)": 33.07, + "step": 5145, + "token_acc": 0.8140324405884571, + "train_speed(iter/s)": 0.096286 + }, + { + "epoch": 0.2394938077480897, + "grad_norm": 5.500942707061768, + "learning_rate": 9.050231499059278e-06, + "loss": 0.7375868320465088, + "memory(GiB)": 33.07, + "step": 5150, + "token_acc": 0.8267284991568297, + "train_speed(iter/s)": 0.096338 + }, + { + "epoch": 0.2394938077480897, + "eval_loss": 0.6725084185600281, + "eval_runtime": 289.1326, + "eval_samples_per_second": 12.019, + "eval_steps_per_second": 12.019, + "step": 5150 + }, + { + "epoch": 0.2397263260080393, + "grad_norm": 6.5969319343566895, + "learning_rate": 9.04797578217794e-06, + "loss": 0.7631452560424805, + "memory(GiB)": 33.07, + "step": 5155, + "token_acc": 0.8082897828067316, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.2399588442679889, + "grad_norm": 6.98110294342041, + "learning_rate": 9.045717671603544e-06, + "loss": 0.7008907794952393, + "memory(GiB)": 33.07, + "step": 5160, + "token_acc": 0.8269662921348314, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.24019136252793852, + "grad_norm": 7.913124084472656, + "learning_rate": 9.043457168671378e-06, + "loss": 0.7300667285919189, + "memory(GiB)": 33.07, + "step": 5165, + "token_acc": 0.8395061728395061, + "train_speed(iter/s)": 0.095975 + }, + { + "epoch": 0.24042388078788812, + "grad_norm": 6.288471698760986, + "learning_rate": 9.041194274718151e-06, + "loss": 0.7818746566772461, + "memory(GiB)": 33.07, + "step": 5170, + "token_acc": 0.8066581306017926, + "train_speed(iter/s)": 0.096026 + }, + { + "epoch": 0.24065639904783773, + "grad_norm": 5.895214080810547, + "learning_rate": 9.038928991081976e-06, + "loss": 0.7084590911865234, + "memory(GiB)": 33.07, + "step": 5175, + "token_acc": 0.817533129459735, + "train_speed(iter/s)": 0.096077 + }, + { + "epoch": 0.24088891730778733, + "grad_norm": 4.603113174438477, + "learning_rate": 9.036661319102393e-06, + "loss": 0.764235258102417, + "memory(GiB)": 33.07, + "step": 5180, + "token_acc": 0.8124600638977636, + "train_speed(iter/s)": 0.096127 + }, + { + "epoch": 0.24112143556773694, + "grad_norm": 6.578195095062256, + "learning_rate": 9.034391260120342e-06, + "loss": 0.732538890838623, + "memory(GiB)": 33.07, + "step": 5185, + "token_acc": 0.8214139762975071, + "train_speed(iter/s)": 0.09618 + }, + { + "epoch": 0.24135395382768654, + "grad_norm": 7.548125267028809, + "learning_rate": 9.032118815478177e-06, + "loss": 0.7742302894592286, + "memory(GiB)": 33.07, + "step": 5190, + "token_acc": 0.812288993923025, + "train_speed(iter/s)": 0.096233 + }, + { + "epoch": 0.24158647208763614, + "grad_norm": 6.740896701812744, + "learning_rate": 9.029843986519667e-06, + "loss": 0.7679877758026123, + "memory(GiB)": 33.07, + "step": 5195, + "token_acc": 0.8307624504186867, + "train_speed(iter/s)": 0.096285 + }, + { + "epoch": 0.24181899034758572, + "grad_norm": 5.745598316192627, + "learning_rate": 9.02756677458999e-06, + "loss": 0.7190179824829102, + "memory(GiB)": 33.07, + "step": 5200, + "token_acc": 0.8271690286899412, + "train_speed(iter/s)": 0.096336 + }, + { + "epoch": 0.24181899034758572, + "eval_loss": 0.6732439994812012, + "eval_runtime": 293.5982, + "eval_samples_per_second": 11.836, + "eval_steps_per_second": 11.836, + "step": 5200 + }, + { + "epoch": 0.24205150860753533, + "grad_norm": 6.52875280380249, + "learning_rate": 9.025287181035731e-06, + "loss": 0.7002001285552979, + "memory(GiB)": 33.07, + "step": 5205, + "token_acc": 0.808760162764112, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.24228402686748493, + "grad_norm": 4.988480091094971, + "learning_rate": 9.023005207204883e-06, + "loss": 0.7857324600219726, + "memory(GiB)": 33.07, + "step": 5210, + "token_acc": 0.7975120939875605, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.24251654512743454, + "grad_norm": 6.880677700042725, + "learning_rate": 9.020720854446847e-06, + "loss": 0.7423035621643066, + "memory(GiB)": 33.07, + "step": 5215, + "token_acc": 0.8371703641691084, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.24274906338738414, + "grad_norm": 6.216911315917969, + "learning_rate": 9.018434124112434e-06, + "loss": 0.7673687934875488, + "memory(GiB)": 33.07, + "step": 5220, + "token_acc": 0.80249753532698, + "train_speed(iter/s)": 0.09602 + }, + { + "epoch": 0.24298158164733374, + "grad_norm": 5.229708194732666, + "learning_rate": 9.016145017553856e-06, + "loss": 0.8729522705078125, + "memory(GiB)": 33.07, + "step": 5225, + "token_acc": 0.7923865300146413, + "train_speed(iter/s)": 0.096072 + }, + { + "epoch": 0.24321409990728335, + "grad_norm": 5.493806838989258, + "learning_rate": 9.013853536124732e-06, + "loss": 0.806683349609375, + "memory(GiB)": 33.07, + "step": 5230, + "token_acc": 0.806822262118492, + "train_speed(iter/s)": 0.096125 + }, + { + "epoch": 0.24344661816723295, + "grad_norm": 6.720024108886719, + "learning_rate": 9.011559681180088e-06, + "loss": 0.7355064392089844, + "memory(GiB)": 33.07, + "step": 5235, + "token_acc": 0.8318548387096775, + "train_speed(iter/s)": 0.096177 + }, + { + "epoch": 0.24367913642718256, + "grad_norm": 7.687710285186768, + "learning_rate": 9.009263454076349e-06, + "loss": 0.7195593833923339, + "memory(GiB)": 33.07, + "step": 5240, + "token_acc": 0.8145440554059253, + "train_speed(iter/s)": 0.096229 + }, + { + "epoch": 0.24391165468713216, + "grad_norm": 7.271475315093994, + "learning_rate": 9.006964856171347e-06, + "loss": 0.6926477432250977, + "memory(GiB)": 33.07, + "step": 5245, + "token_acc": 0.8299108872530027, + "train_speed(iter/s)": 0.09628 + }, + { + "epoch": 0.24414417294708174, + "grad_norm": 6.054494857788086, + "learning_rate": 9.004663888824312e-06, + "loss": 0.7744291305541993, + "memory(GiB)": 33.07, + "step": 5250, + "token_acc": 0.7921225382932167, + "train_speed(iter/s)": 0.096332 + }, + { + "epoch": 0.24414417294708174, + "eval_loss": 0.6698423624038696, + "eval_runtime": 289.3444, + "eval_samples_per_second": 12.01, + "eval_steps_per_second": 12.01, + "step": 5250 + }, + { + "epoch": 0.24437669120703134, + "grad_norm": 5.884529113769531, + "learning_rate": 9.002360553395877e-06, + "loss": 0.6975594997406006, + "memory(GiB)": 33.07, + "step": 5255, + "token_acc": 0.808309229555614, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.24460920946698095, + "grad_norm": 5.387625217437744, + "learning_rate": 9.000054851248078e-06, + "loss": 0.721724557876587, + "memory(GiB)": 33.07, + "step": 5260, + "token_acc": 0.820926243567753, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.24484172772693055, + "grad_norm": 5.464982986450195, + "learning_rate": 8.997746783744346e-06, + "loss": 0.8935551643371582, + "memory(GiB)": 33.07, + "step": 5265, + "token_acc": 0.7821548821548822, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.24507424598688016, + "grad_norm": 7.145682334899902, + "learning_rate": 8.995436352249512e-06, + "loss": 0.6936135768890381, + "memory(GiB)": 33.07, + "step": 5270, + "token_acc": 0.8183799491463858, + "train_speed(iter/s)": 0.096028 + }, + { + "epoch": 0.24530676424682976, + "grad_norm": 6.86348819732666, + "learning_rate": 8.993123558129806e-06, + "loss": 0.7699307918548584, + "memory(GiB)": 33.07, + "step": 5275, + "token_acc": 0.8124552612741589, + "train_speed(iter/s)": 0.09608 + }, + { + "epoch": 0.24553928250677937, + "grad_norm": 6.941252708435059, + "learning_rate": 8.990808402752856e-06, + "loss": 0.7396425247192383, + "memory(GiB)": 33.07, + "step": 5280, + "token_acc": 0.8251801289343952, + "train_speed(iter/s)": 0.096132 + }, + { + "epoch": 0.24577180076672897, + "grad_norm": 5.640344619750977, + "learning_rate": 8.988490887487683e-06, + "loss": 0.7938966751098633, + "memory(GiB)": 33.07, + "step": 5285, + "token_acc": 0.8080515297906602, + "train_speed(iter/s)": 0.096184 + }, + { + "epoch": 0.24600431902667858, + "grad_norm": 5.452683448791504, + "learning_rate": 8.986171013704703e-06, + "loss": 0.7618279933929444, + "memory(GiB)": 33.07, + "step": 5290, + "token_acc": 0.8115112756647593, + "train_speed(iter/s)": 0.096235 + }, + { + "epoch": 0.24623683728662815, + "grad_norm": 5.495558738708496, + "learning_rate": 8.983848782775735e-06, + "loss": 0.8416355133056641, + "memory(GiB)": 33.07, + "step": 5295, + "token_acc": 0.7899057464274856, + "train_speed(iter/s)": 0.096287 + }, + { + "epoch": 0.24646935554657776, + "grad_norm": 6.56035041809082, + "learning_rate": 8.981524196073981e-06, + "loss": 0.9013174057006836, + "memory(GiB)": 33.07, + "step": 5300, + "token_acc": 0.8080099091659785, + "train_speed(iter/s)": 0.096338 + }, + { + "epoch": 0.24646935554657776, + "eval_loss": 0.6700074672698975, + "eval_runtime": 288.0838, + "eval_samples_per_second": 12.062, + "eval_steps_per_second": 12.062, + "step": 5300 + }, + { + "epoch": 0.24670187380652736, + "grad_norm": 6.2724833488464355, + "learning_rate": 8.979197254974045e-06, + "loss": 0.7496028900146484, + "memory(GiB)": 33.07, + "step": 5305, + "token_acc": 0.8083637677087931, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.24693439206647697, + "grad_norm": 4.500880241394043, + "learning_rate": 8.976867960851915e-06, + "loss": 0.7409276008605957, + "memory(GiB)": 33.07, + "step": 5310, + "token_acc": 0.8254813600983204, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.24716691032642657, + "grad_norm": 6.141285419464111, + "learning_rate": 8.974536315084976e-06, + "loss": 0.8386311531066895, + "memory(GiB)": 33.07, + "step": 5315, + "token_acc": 0.7797459893048129, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.24739942858637617, + "grad_norm": 5.962406635284424, + "learning_rate": 8.972202319052004e-06, + "loss": 0.6776054859161377, + "memory(GiB)": 33.07, + "step": 5320, + "token_acc": 0.8359511343804538, + "train_speed(iter/s)": 0.09604 + }, + { + "epoch": 0.24763194684632578, + "grad_norm": 6.162694931030273, + "learning_rate": 8.969865974133161e-06, + "loss": 0.7959749221801757, + "memory(GiB)": 33.07, + "step": 5325, + "token_acc": 0.7978366281238344, + "train_speed(iter/s)": 0.096091 + }, + { + "epoch": 0.24786446510627538, + "grad_norm": 6.722538948059082, + "learning_rate": 8.96752728171e-06, + "loss": 0.7133076190948486, + "memory(GiB)": 33.07, + "step": 5330, + "token_acc": 0.8228179143510951, + "train_speed(iter/s)": 0.096139 + }, + { + "epoch": 0.248096983366225, + "grad_norm": 5.973226070404053, + "learning_rate": 8.965186243165461e-06, + "loss": 0.7359925270080566, + "memory(GiB)": 33.07, + "step": 5335, + "token_acc": 0.8213429256594724, + "train_speed(iter/s)": 0.096191 + }, + { + "epoch": 0.2483295016261746, + "grad_norm": 7.109038829803467, + "learning_rate": 8.962842859883875e-06, + "loss": 0.714454174041748, + "memory(GiB)": 33.07, + "step": 5340, + "token_acc": 0.8213627992633518, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.24856201988612417, + "grad_norm": 5.056357383728027, + "learning_rate": 8.960497133250954e-06, + "loss": 0.7473381519317627, + "memory(GiB)": 33.07, + "step": 5345, + "token_acc": 0.8127871362940275, + "train_speed(iter/s)": 0.096292 + }, + { + "epoch": 0.24879453814607377, + "grad_norm": 6.350715637207031, + "learning_rate": 8.958149064653802e-06, + "loss": 0.7907238483428956, + "memory(GiB)": 33.07, + "step": 5350, + "token_acc": 0.8073970690858339, + "train_speed(iter/s)": 0.096342 + }, + { + "epoch": 0.24879453814607377, + "eval_loss": 0.6689327359199524, + "eval_runtime": 290.5795, + "eval_samples_per_second": 11.959, + "eval_steps_per_second": 11.959, + "step": 5350 + }, + { + "epoch": 0.24902705640602338, + "grad_norm": 5.9941534996032715, + "learning_rate": 8.955798655480901e-06, + "loss": 0.7376707077026368, + "memory(GiB)": 33.07, + "step": 5355, + "token_acc": 0.8085616190529794, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.24925957466597298, + "grad_norm": 5.566898345947266, + "learning_rate": 8.953445907122123e-06, + "loss": 0.854979419708252, + "memory(GiB)": 33.07, + "step": 5360, + "token_acc": 0.7967625899280576, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.2494920929259226, + "grad_norm": 5.000674724578857, + "learning_rate": 8.95109082096872e-06, + "loss": 0.7167798042297363, + "memory(GiB)": 33.07, + "step": 5365, + "token_acc": 0.8103186646433991, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.2497246111858722, + "grad_norm": 7.17212438583374, + "learning_rate": 8.948733398413326e-06, + "loss": 0.9033098220825195, + "memory(GiB)": 33.07, + "step": 5370, + "token_acc": 0.7684180468303826, + "train_speed(iter/s)": 0.096042 + }, + { + "epoch": 0.2499571294458218, + "grad_norm": 6.104959487915039, + "learning_rate": 8.946373640849958e-06, + "loss": 0.8636404991149902, + "memory(GiB)": 33.07, + "step": 5375, + "token_acc": 0.8045325779036827, + "train_speed(iter/s)": 0.096091 + }, + { + "epoch": 0.2501896477057714, + "grad_norm": 5.245123386383057, + "learning_rate": 8.944011549674016e-06, + "loss": 0.5826745986938476, + "memory(GiB)": 33.07, + "step": 5380, + "token_acc": 0.8652193577566711, + "train_speed(iter/s)": 0.096141 + }, + { + "epoch": 0.250422165965721, + "grad_norm": 6.003298282623291, + "learning_rate": 8.941647126282275e-06, + "loss": 0.7276782989501953, + "memory(GiB)": 33.07, + "step": 5385, + "token_acc": 0.8281549673954737, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.2506546842256706, + "grad_norm": 5.025068759918213, + "learning_rate": 8.939280372072891e-06, + "loss": 0.7485162734985351, + "memory(GiB)": 33.07, + "step": 5390, + "token_acc": 0.8045441304981066, + "train_speed(iter/s)": 0.096243 + }, + { + "epoch": 0.2508872024856202, + "grad_norm": 6.471608638763428, + "learning_rate": 8.9369112884454e-06, + "loss": 0.8305785179138183, + "memory(GiB)": 33.07, + "step": 5395, + "token_acc": 0.7878504672897196, + "train_speed(iter/s)": 0.096294 + }, + { + "epoch": 0.2511197207455698, + "grad_norm": 7.496315002441406, + "learning_rate": 8.934539876800716e-06, + "loss": 0.8001940727233887, + "memory(GiB)": 33.07, + "step": 5400, + "token_acc": 0.8088871411718442, + "train_speed(iter/s)": 0.096344 + }, + { + "epoch": 0.2511197207455698, + "eval_loss": 0.6697515845298767, + "eval_runtime": 291.0629, + "eval_samples_per_second": 11.939, + "eval_steps_per_second": 11.939, + "step": 5400 + }, + { + "epoch": 0.2513522390055194, + "grad_norm": 6.354772567749023, + "learning_rate": 8.932166138541127e-06, + "loss": 0.765593433380127, + "memory(GiB)": 33.07, + "step": 5405, + "token_acc": 0.8085344593189706, + "train_speed(iter/s)": 0.095895 + }, + { + "epoch": 0.25158475726546903, + "grad_norm": 7.054717063903809, + "learning_rate": 8.929790075070295e-06, + "loss": 0.7476856708526611, + "memory(GiB)": 33.07, + "step": 5410, + "token_acc": 0.8313807531380754, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.2518172755254186, + "grad_norm": 5.243678569793701, + "learning_rate": 8.92741168779326e-06, + "loss": 0.7400431632995605, + "memory(GiB)": 33.07, + "step": 5415, + "token_acc": 0.8249431633647288, + "train_speed(iter/s)": 0.095994 + }, + { + "epoch": 0.2520497937853682, + "grad_norm": 7.673439979553223, + "learning_rate": 8.925030978116441e-06, + "loss": 0.6828903675079345, + "memory(GiB)": 33.07, + "step": 5420, + "token_acc": 0.8329528158295282, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.2522823120453178, + "grad_norm": 5.734298229217529, + "learning_rate": 8.92264794744762e-06, + "loss": 0.7182761192321777, + "memory(GiB)": 33.07, + "step": 5425, + "token_acc": 0.8171109733415995, + "train_speed(iter/s)": 0.096095 + }, + { + "epoch": 0.2525148303052674, + "grad_norm": 6.604122161865234, + "learning_rate": 8.920262597195959e-06, + "loss": 0.7213967800140381, + "memory(GiB)": 33.07, + "step": 5430, + "token_acc": 0.8148869836321122, + "train_speed(iter/s)": 0.096146 + }, + { + "epoch": 0.252747348565217, + "grad_norm": 5.710094451904297, + "learning_rate": 8.917874928771988e-06, + "loss": 0.7809642314910888, + "memory(GiB)": 33.07, + "step": 5435, + "token_acc": 0.7974232456140351, + "train_speed(iter/s)": 0.096195 + }, + { + "epoch": 0.2529798668251666, + "grad_norm": 6.227021217346191, + "learning_rate": 8.91548494358761e-06, + "loss": 0.9448799133300781, + "memory(GiB)": 33.07, + "step": 5440, + "token_acc": 0.7778573987817986, + "train_speed(iter/s)": 0.096245 + }, + { + "epoch": 0.25321238508511623, + "grad_norm": 6.427996635437012, + "learning_rate": 8.913092643056095e-06, + "loss": 0.7775098323822022, + "memory(GiB)": 33.07, + "step": 5445, + "token_acc": 0.8085705348764467, + "train_speed(iter/s)": 0.096293 + }, + { + "epoch": 0.2534449033450658, + "grad_norm": 5.984553337097168, + "learning_rate": 8.910698028592087e-06, + "loss": 0.7316121578216552, + "memory(GiB)": 33.07, + "step": 5450, + "token_acc": 0.8200064745872451, + "train_speed(iter/s)": 0.096344 + }, + { + "epoch": 0.2534449033450658, + "eval_loss": 0.6686436533927917, + "eval_runtime": 291.087, + "eval_samples_per_second": 11.938, + "eval_steps_per_second": 11.938, + "step": 5450 + }, + { + "epoch": 0.25367742160501544, + "grad_norm": 5.354024410247803, + "learning_rate": 8.908301101611594e-06, + "loss": 0.7186790466308594, + "memory(GiB)": 33.07, + "step": 5455, + "token_acc": 0.8094466720128308, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.253909939864965, + "grad_norm": 7.332178592681885, + "learning_rate": 8.905901863531992e-06, + "loss": 0.9101710319519043, + "memory(GiB)": 33.07, + "step": 5460, + "token_acc": 0.7733913584084551, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.25414245812491465, + "grad_norm": 5.764885902404785, + "learning_rate": 8.903500315772026e-06, + "loss": 0.7292781352996827, + "memory(GiB)": 33.07, + "step": 5465, + "token_acc": 0.8157294832826748, + "train_speed(iter/s)": 0.095999 + }, + { + "epoch": 0.2543749763848642, + "grad_norm": 5.920174598693848, + "learning_rate": 8.901096459751805e-06, + "loss": 0.7806157112121582, + "memory(GiB)": 33.07, + "step": 5470, + "token_acc": 0.7944377267230955, + "train_speed(iter/s)": 0.096048 + }, + { + "epoch": 0.2546074946448138, + "grad_norm": 6.420112609863281, + "learning_rate": 8.898690296892804e-06, + "loss": 0.7653073310852051, + "memory(GiB)": 33.07, + "step": 5475, + "token_acc": 0.8201388888888889, + "train_speed(iter/s)": 0.096097 + }, + { + "epoch": 0.25484001290476344, + "grad_norm": 6.058072090148926, + "learning_rate": 8.896281828617861e-06, + "loss": 0.944705867767334, + "memory(GiB)": 33.07, + "step": 5480, + "token_acc": 0.7741743528711693, + "train_speed(iter/s)": 0.096147 + }, + { + "epoch": 0.255072531164713, + "grad_norm": 6.223658561706543, + "learning_rate": 8.893871056351178e-06, + "loss": 0.7866355895996093, + "memory(GiB)": 33.07, + "step": 5485, + "token_acc": 0.8078734858681023, + "train_speed(iter/s)": 0.096195 + }, + { + "epoch": 0.25530504942466264, + "grad_norm": 4.456324100494385, + "learning_rate": 8.891457981518317e-06, + "loss": 0.7789161682128907, + "memory(GiB)": 33.07, + "step": 5490, + "token_acc": 0.8106844741235393, + "train_speed(iter/s)": 0.096244 + }, + { + "epoch": 0.2555375676846122, + "grad_norm": 6.5606160163879395, + "learning_rate": 8.889042605546206e-06, + "loss": 0.8363648414611816, + "memory(GiB)": 33.07, + "step": 5495, + "token_acc": 0.803680981595092, + "train_speed(iter/s)": 0.096292 + }, + { + "epoch": 0.25577008594456185, + "grad_norm": 5.641964435577393, + "learning_rate": 8.886624929863128e-06, + "loss": 0.8033831596374512, + "memory(GiB)": 33.07, + "step": 5500, + "token_acc": 0.8071581196581197, + "train_speed(iter/s)": 0.096341 + }, + { + "epoch": 0.25577008594456185, + "eval_loss": 0.6669542193412781, + "eval_runtime": 290.9922, + "eval_samples_per_second": 11.942, + "eval_steps_per_second": 11.942, + "step": 5500 + }, + { + "epoch": 0.25600260420451143, + "grad_norm": 5.638400554656982, + "learning_rate": 8.884204955898734e-06, + "loss": 0.8716331481933594, + "memory(GiB)": 33.07, + "step": 5505, + "token_acc": 0.8083292041622046, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.25623512246446106, + "grad_norm": 6.0181989669799805, + "learning_rate": 8.881782685084027e-06, + "loss": 0.6829388618469239, + "memory(GiB)": 33.07, + "step": 5510, + "token_acc": 0.8382509776039815, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.25646764072441064, + "grad_norm": 6.096235752105713, + "learning_rate": 8.879358118851369e-06, + "loss": 0.7325149536132812, + "memory(GiB)": 33.07, + "step": 5515, + "token_acc": 0.8188976377952756, + "train_speed(iter/s)": 0.095999 + }, + { + "epoch": 0.2567001589843602, + "grad_norm": 7.176323413848877, + "learning_rate": 8.876931258634483e-06, + "loss": 0.7665040969848633, + "memory(GiB)": 33.07, + "step": 5520, + "token_acc": 0.8297587131367292, + "train_speed(iter/s)": 0.096048 + }, + { + "epoch": 0.25693267724430985, + "grad_norm": 7.309201240539551, + "learning_rate": 8.874502105868447e-06, + "loss": 0.7806261539459228, + "memory(GiB)": 33.07, + "step": 5525, + "token_acc": 0.8165064102564102, + "train_speed(iter/s)": 0.096097 + }, + { + "epoch": 0.2571651955042594, + "grad_norm": 6.094222545623779, + "learning_rate": 8.872070661989691e-06, + "loss": 0.7837971210479736, + "memory(GiB)": 33.07, + "step": 5530, + "token_acc": 0.8048245614035088, + "train_speed(iter/s)": 0.096145 + }, + { + "epoch": 0.25739771376420906, + "grad_norm": 5.346552848815918, + "learning_rate": 8.869636928436006e-06, + "loss": 0.919887924194336, + "memory(GiB)": 33.07, + "step": 5535, + "token_acc": 0.7646528403967539, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.25763023202415863, + "grad_norm": 5.813202381134033, + "learning_rate": 8.867200906646532e-06, + "loss": 0.6144495487213135, + "memory(GiB)": 33.07, + "step": 5540, + "token_acc": 0.8432231962238705, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.25786275028410827, + "grad_norm": 5.423628330230713, + "learning_rate": 8.864762598061764e-06, + "loss": 0.7319612026214599, + "memory(GiB)": 33.07, + "step": 5545, + "token_acc": 0.8209011737978038, + "train_speed(iter/s)": 0.096288 + }, + { + "epoch": 0.25809526854405784, + "grad_norm": 6.213625907897949, + "learning_rate": 8.86232200412355e-06, + "loss": 0.7926487445831298, + "memory(GiB)": 33.07, + "step": 5550, + "token_acc": 0.8025708635464733, + "train_speed(iter/s)": 0.096337 + }, + { + "epoch": 0.25809526854405784, + "eval_loss": 0.6657958626747131, + "eval_runtime": 290.1857, + "eval_samples_per_second": 11.975, + "eval_steps_per_second": 11.975, + "step": 5550 + }, + { + "epoch": 0.2583277868040075, + "grad_norm": 6.262226104736328, + "learning_rate": 8.859879126275088e-06, + "loss": 0.9141180038452148, + "memory(GiB)": 33.07, + "step": 5555, + "token_acc": 0.8079669387201519, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.25856030506395705, + "grad_norm": 8.025341033935547, + "learning_rate": 8.857433965960926e-06, + "loss": 0.799907112121582, + "memory(GiB)": 33.07, + "step": 5560, + "token_acc": 0.8073394495412844, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.25879282332390663, + "grad_norm": 5.492082118988037, + "learning_rate": 8.854986524626965e-06, + "loss": 0.9118124008178711, + "memory(GiB)": 33.07, + "step": 5565, + "token_acc": 0.7928526249209361, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.25902534158385626, + "grad_norm": 5.745024681091309, + "learning_rate": 8.852536803720452e-06, + "loss": 0.8177533149719238, + "memory(GiB)": 33.07, + "step": 5570, + "token_acc": 0.8142504118616145, + "train_speed(iter/s)": 0.096046 + }, + { + "epoch": 0.25925785984380584, + "grad_norm": 4.650488376617432, + "learning_rate": 8.850084804689982e-06, + "loss": 0.8530696868896485, + "memory(GiB)": 33.07, + "step": 5575, + "token_acc": 0.7954545454545454, + "train_speed(iter/s)": 0.096095 + }, + { + "epoch": 0.25949037810375547, + "grad_norm": 7.37318229675293, + "learning_rate": 8.847630528985496e-06, + "loss": 0.7755809783935547, + "memory(GiB)": 33.07, + "step": 5580, + "token_acc": 0.7971880492091389, + "train_speed(iter/s)": 0.096143 + }, + { + "epoch": 0.25972289636370505, + "grad_norm": 6.992980003356934, + "learning_rate": 8.845173978058288e-06, + "loss": 0.7328850269317627, + "memory(GiB)": 33.07, + "step": 5585, + "token_acc": 0.8208566108007449, + "train_speed(iter/s)": 0.096191 + }, + { + "epoch": 0.2599554146236547, + "grad_norm": 4.85237979888916, + "learning_rate": 8.842715153360987e-06, + "loss": 0.6974979400634765, + "memory(GiB)": 33.07, + "step": 5590, + "token_acc": 0.8187894541403639, + "train_speed(iter/s)": 0.09624 + }, + { + "epoch": 0.26018793288360426, + "grad_norm": 6.679464817047119, + "learning_rate": 8.840254056347574e-06, + "loss": 0.815558910369873, + "memory(GiB)": 33.07, + "step": 5595, + "token_acc": 0.8056239015817224, + "train_speed(iter/s)": 0.096289 + }, + { + "epoch": 0.2604204511435539, + "grad_norm": 5.637943744659424, + "learning_rate": 8.837790688473373e-06, + "loss": 0.9436439514160156, + "memory(GiB)": 33.07, + "step": 5600, + "token_acc": 0.781738907412157, + "train_speed(iter/s)": 0.096337 + }, + { + "epoch": 0.2604204511435539, + "eval_loss": 0.6708033680915833, + "eval_runtime": 291.4602, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 11.923, + "step": 5600 + }, + { + "epoch": 0.26065296940350346, + "grad_norm": 6.944599628448486, + "learning_rate": 8.835325051195047e-06, + "loss": 0.6876493930816651, + "memory(GiB)": 33.07, + "step": 5605, + "token_acc": 0.8092388568871987, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.2608854876634531, + "grad_norm": 6.316150188446045, + "learning_rate": 8.832857145970606e-06, + "loss": 0.8054632186889649, + "memory(GiB)": 33.07, + "step": 5610, + "token_acc": 0.8057960105382009, + "train_speed(iter/s)": 0.095953 + }, + { + "epoch": 0.2611180059234027, + "grad_norm": 6.6088175773620605, + "learning_rate": 8.830386974259398e-06, + "loss": 0.784023666381836, + "memory(GiB)": 33.07, + "step": 5615, + "token_acc": 0.8076171875, + "train_speed(iter/s)": 0.096 + }, + { + "epoch": 0.26135052418335225, + "grad_norm": 5.055639743804932, + "learning_rate": 8.827914537522111e-06, + "loss": 0.7353429794311523, + "memory(GiB)": 33.07, + "step": 5620, + "token_acc": 0.8256467941507312, + "train_speed(iter/s)": 0.096049 + }, + { + "epoch": 0.2615830424433019, + "grad_norm": 6.476213455200195, + "learning_rate": 8.825439837220772e-06, + "loss": 0.7795384883880615, + "memory(GiB)": 33.07, + "step": 5625, + "token_acc": 0.8153323262839879, + "train_speed(iter/s)": 0.096097 + }, + { + "epoch": 0.26181556070325146, + "grad_norm": 5.233215808868408, + "learning_rate": 8.82296287481875e-06, + "loss": 0.8795851707458496, + "memory(GiB)": 33.07, + "step": 5630, + "token_acc": 0.7775357809583074, + "train_speed(iter/s)": 0.096145 + }, + { + "epoch": 0.2620480789632011, + "grad_norm": 5.791959285736084, + "learning_rate": 8.820483651780746e-06, + "loss": 0.7681045532226562, + "memory(GiB)": 33.07, + "step": 5635, + "token_acc": 0.8212083847102343, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.26228059722315067, + "grad_norm": 7.508996963500977, + "learning_rate": 8.818002169572808e-06, + "loss": 0.8352363586425782, + "memory(GiB)": 33.07, + "step": 5640, + "token_acc": 0.8109121199500208, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.2625131154831003, + "grad_norm": 5.6559906005859375, + "learning_rate": 8.815518429662304e-06, + "loss": 0.7900479316711426, + "memory(GiB)": 33.07, + "step": 5645, + "token_acc": 0.8133054684778823, + "train_speed(iter/s)": 0.09629 + }, + { + "epoch": 0.2627456337430499, + "grad_norm": 6.982070446014404, + "learning_rate": 8.813032433517953e-06, + "loss": 0.7146542549133301, + "memory(GiB)": 33.07, + "step": 5650, + "token_acc": 0.8333946303788158, + "train_speed(iter/s)": 0.096338 + }, + { + "epoch": 0.2627456337430499, + "eval_loss": 0.6662178039550781, + "eval_runtime": 295.0199, + "eval_samples_per_second": 11.779, + "eval_steps_per_second": 11.779, + "step": 5650 + }, + { + "epoch": 0.2629781520029995, + "grad_norm": 6.79498815536499, + "learning_rate": 8.810544182609799e-06, + "loss": 0.7456812381744384, + "memory(GiB)": 33.07, + "step": 5655, + "token_acc": 0.8098742446251013, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.2632106702629491, + "grad_norm": 4.711836814880371, + "learning_rate": 8.80805367840922e-06, + "loss": 0.88272705078125, + "memory(GiB)": 33.07, + "step": 5660, + "token_acc": 0.7980769230769231, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.26344318852289866, + "grad_norm": 6.486434459686279, + "learning_rate": 8.805560922388932e-06, + "loss": 0.7468509674072266, + "memory(GiB)": 33.07, + "step": 5665, + "token_acc": 0.8165413533834587, + "train_speed(iter/s)": 0.095996 + }, + { + "epoch": 0.2636757067828483, + "grad_norm": 5.890286922454834, + "learning_rate": 8.803065916022974e-06, + "loss": 0.7029210567474365, + "memory(GiB)": 33.07, + "step": 5670, + "token_acc": 0.8349483204134367, + "train_speed(iter/s)": 0.096044 + }, + { + "epoch": 0.2639082250427979, + "grad_norm": 5.925194263458252, + "learning_rate": 8.800568660786724e-06, + "loss": 0.8480375289916993, + "memory(GiB)": 33.07, + "step": 5675, + "token_acc": 0.7957658779576587, + "train_speed(iter/s)": 0.096092 + }, + { + "epoch": 0.2641407433027475, + "grad_norm": 5.214552402496338, + "learning_rate": 8.798069158156884e-06, + "loss": 0.7835229396820068, + "memory(GiB)": 33.07, + "step": 5680, + "token_acc": 0.8250564334085779, + "train_speed(iter/s)": 0.096138 + }, + { + "epoch": 0.2643732615626971, + "grad_norm": 4.562428951263428, + "learning_rate": 8.795567409611487e-06, + "loss": 0.6990029335021972, + "memory(GiB)": 33.07, + "step": 5685, + "token_acc": 0.81728, + "train_speed(iter/s)": 0.096185 + }, + { + "epoch": 0.2646057798226467, + "grad_norm": 4.071775913238525, + "learning_rate": 8.793063416629895e-06, + "loss": 0.6751185417175293, + "memory(GiB)": 33.07, + "step": 5690, + "token_acc": 0.8259620907524411, + "train_speed(iter/s)": 0.096232 + }, + { + "epoch": 0.2648382980825963, + "grad_norm": 5.290688991546631, + "learning_rate": 8.790557180692796e-06, + "loss": 0.6121927738189697, + "memory(GiB)": 33.07, + "step": 5695, + "token_acc": 0.845578231292517, + "train_speed(iter/s)": 0.09628 + }, + { + "epoch": 0.2650708163425459, + "grad_norm": 7.041702747344971, + "learning_rate": 8.788048703282204e-06, + "loss": 0.7391871929168701, + "memory(GiB)": 33.07, + "step": 5700, + "token_acc": 0.8141878274889157, + "train_speed(iter/s)": 0.096327 + }, + { + "epoch": 0.2650708163425459, + "eval_loss": 0.6662322878837585, + "eval_runtime": 292.6362, + "eval_samples_per_second": 11.875, + "eval_steps_per_second": 11.875, + "step": 5700 + }, + { + "epoch": 0.2653033346024955, + "grad_norm": 6.969703674316406, + "learning_rate": 8.785537985881463e-06, + "loss": 0.7058303356170654, + "memory(GiB)": 33.07, + "step": 5705, + "token_acc": 0.8105295109073336, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.2655358528624451, + "grad_norm": 6.892284393310547, + "learning_rate": 8.783025029975231e-06, + "loss": 0.7768474102020264, + "memory(GiB)": 33.07, + "step": 5710, + "token_acc": 0.8084707646176912, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.2657683711223947, + "grad_norm": 5.746127128601074, + "learning_rate": 8.780509837049501e-06, + "loss": 0.7630928516387939, + "memory(GiB)": 33.07, + "step": 5715, + "token_acc": 0.810706787963611, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.2660008893823443, + "grad_norm": 5.716222763061523, + "learning_rate": 8.777992408591587e-06, + "loss": 0.7920127391815186, + "memory(GiB)": 33.07, + "step": 5720, + "token_acc": 0.8055358410220014, + "train_speed(iter/s)": 0.096044 + }, + { + "epoch": 0.2662334076422939, + "grad_norm": 6.537604331970215, + "learning_rate": 8.775472746090114e-06, + "loss": 0.6463952541351319, + "memory(GiB)": 33.07, + "step": 5725, + "token_acc": 0.8377331925378388, + "train_speed(iter/s)": 0.096092 + }, + { + "epoch": 0.2664659259022435, + "grad_norm": 5.381245136260986, + "learning_rate": 8.772950851035043e-06, + "loss": 0.7079122543334961, + "memory(GiB)": 33.07, + "step": 5730, + "token_acc": 0.8135184067592034, + "train_speed(iter/s)": 0.096139 + }, + { + "epoch": 0.2666984441621931, + "grad_norm": 4.338988304138184, + "learning_rate": 8.770426724917645e-06, + "loss": 0.7638363838195801, + "memory(GiB)": 33.07, + "step": 5735, + "token_acc": 0.8122582564712884, + "train_speed(iter/s)": 0.096185 + }, + { + "epoch": 0.2669309624221427, + "grad_norm": 5.95316743850708, + "learning_rate": 8.767900369230516e-06, + "loss": 0.7845072269439697, + "memory(GiB)": 33.07, + "step": 5740, + "token_acc": 0.8254620123203286, + "train_speed(iter/s)": 0.096232 + }, + { + "epoch": 0.26716348068209234, + "grad_norm": 5.003815650939941, + "learning_rate": 8.765371785467565e-06, + "loss": 0.7775311946868897, + "memory(GiB)": 33.07, + "step": 5745, + "token_acc": 0.8154538634658665, + "train_speed(iter/s)": 0.09628 + }, + { + "epoch": 0.2673959989420419, + "grad_norm": 7.775989532470703, + "learning_rate": 8.762840975124025e-06, + "loss": 0.7087774276733398, + "memory(GiB)": 33.07, + "step": 5750, + "token_acc": 0.8237026131762973, + "train_speed(iter/s)": 0.096326 + }, + { + "epoch": 0.2673959989420419, + "eval_loss": 0.6630222797393799, + "eval_runtime": 294.8049, + "eval_samples_per_second": 11.787, + "eval_steps_per_second": 11.787, + "step": 5750 + }, + { + "epoch": 0.26762851720199154, + "grad_norm": 5.743710041046143, + "learning_rate": 8.760307939696441e-06, + "loss": 0.7595709323883056, + "memory(GiB)": 33.07, + "step": 5755, + "token_acc": 0.810019710644546, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.2678610354619411, + "grad_norm": 6.3062615394592285, + "learning_rate": 8.757772680682674e-06, + "loss": 0.6787972450256348, + "memory(GiB)": 33.07, + "step": 5760, + "token_acc": 0.8221516474791584, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.2680935537218907, + "grad_norm": 9.692729949951172, + "learning_rate": 8.755235199581902e-06, + "loss": 0.9254583358764649, + "memory(GiB)": 33.07, + "step": 5765, + "token_acc": 0.7801642451042324, + "train_speed(iter/s)": 0.095991 + }, + { + "epoch": 0.26832607198184033, + "grad_norm": 7.099086284637451, + "learning_rate": 8.752695497894616e-06, + "loss": 0.8400090217590332, + "memory(GiB)": 33.07, + "step": 5770, + "token_acc": 0.7888177837655777, + "train_speed(iter/s)": 0.096038 + }, + { + "epoch": 0.2685585902417899, + "grad_norm": 5.777800559997559, + "learning_rate": 8.750153577122622e-06, + "loss": 0.6479124546051025, + "memory(GiB)": 33.07, + "step": 5775, + "token_acc": 0.843238930993219, + "train_speed(iter/s)": 0.096085 + }, + { + "epoch": 0.26879110850173954, + "grad_norm": 6.415759086608887, + "learning_rate": 8.747609438769032e-06, + "loss": 0.8482369422912598, + "memory(GiB)": 33.07, + "step": 5780, + "token_acc": 0.7988942639944713, + "train_speed(iter/s)": 0.09613 + }, + { + "epoch": 0.2690236267616891, + "grad_norm": 6.494517803192139, + "learning_rate": 8.745063084338275e-06, + "loss": 0.7210347652435303, + "memory(GiB)": 33.07, + "step": 5785, + "token_acc": 0.8194831013916501, + "train_speed(iter/s)": 0.096177 + }, + { + "epoch": 0.26925614502163875, + "grad_norm": 6.0870137214660645, + "learning_rate": 8.742514515336092e-06, + "loss": 0.8033206939697266, + "memory(GiB)": 33.07, + "step": 5790, + "token_acc": 0.8005087209302325, + "train_speed(iter/s)": 0.096223 + }, + { + "epoch": 0.2694886632815883, + "grad_norm": 4.545366287231445, + "learning_rate": 8.739963733269526e-06, + "loss": 0.8455084800720215, + "memory(GiB)": 33.07, + "step": 5795, + "token_acc": 0.7944677871148459, + "train_speed(iter/s)": 0.096269 + }, + { + "epoch": 0.26972118154153796, + "grad_norm": 5.711040496826172, + "learning_rate": 8.737410739646935e-06, + "loss": 0.7394067287445069, + "memory(GiB)": 33.07, + "step": 5800, + "token_acc": 0.8074335148990708, + "train_speed(iter/s)": 0.096315 + }, + { + "epoch": 0.26972118154153796, + "eval_loss": 0.659167468547821, + "eval_runtime": 294.1469, + "eval_samples_per_second": 11.814, + "eval_steps_per_second": 11.814, + "step": 5800 + }, + { + "epoch": 0.26995369980148753, + "grad_norm": 7.04903507232666, + "learning_rate": 8.734855535977984e-06, + "loss": 0.7887364864349365, + "memory(GiB)": 33.07, + "step": 5805, + "token_acc": 0.8100934818594013, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.2701862180614371, + "grad_norm": 5.3122334480285645, + "learning_rate": 8.732298123773644e-06, + "loss": 0.7698288917541504, + "memory(GiB)": 33.07, + "step": 5810, + "token_acc": 0.8110236220472441, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.27041873632138674, + "grad_norm": 3.649322032928467, + "learning_rate": 8.72973850454619e-06, + "loss": 0.808299446105957, + "memory(GiB)": 33.07, + "step": 5815, + "token_acc": 0.7795212765957447, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.2706512545813363, + "grad_norm": 7.034269332885742, + "learning_rate": 8.727176679809203e-06, + "loss": 0.9111138343811035, + "memory(GiB)": 33.07, + "step": 5820, + "token_acc": 0.7846249610955494, + "train_speed(iter/s)": 0.096029 + }, + { + "epoch": 0.27088377284128595, + "grad_norm": 5.195382118225098, + "learning_rate": 8.724612651077573e-06, + "loss": 0.7312620162963868, + "memory(GiB)": 33.07, + "step": 5825, + "token_acc": 0.8101784147411524, + "train_speed(iter/s)": 0.096074 + }, + { + "epoch": 0.27111629110123553, + "grad_norm": 5.934696197509766, + "learning_rate": 8.722046419867488e-06, + "loss": 0.8077304840087891, + "memory(GiB)": 33.07, + "step": 5830, + "token_acc": 0.8055733504163998, + "train_speed(iter/s)": 0.096119 + }, + { + "epoch": 0.27134880936118516, + "grad_norm": 5.654599189758301, + "learning_rate": 8.719477987696436e-06, + "loss": 0.7460138320922851, + "memory(GiB)": 33.07, + "step": 5835, + "token_acc": 0.8146775389177169, + "train_speed(iter/s)": 0.096165 + }, + { + "epoch": 0.27158132762113474, + "grad_norm": 4.112966537475586, + "learning_rate": 8.716907356083217e-06, + "loss": 0.8577005386352539, + "memory(GiB)": 33.07, + "step": 5840, + "token_acc": 0.7860661505981703, + "train_speed(iter/s)": 0.09621 + }, + { + "epoch": 0.27181384588108437, + "grad_norm": 7.279501438140869, + "learning_rate": 8.714334526547918e-06, + "loss": 0.7479821681976319, + "memory(GiB)": 33.07, + "step": 5845, + "token_acc": 0.8225538971807629, + "train_speed(iter/s)": 0.096257 + }, + { + "epoch": 0.27204636414103395, + "grad_norm": 6.161126613616943, + "learning_rate": 8.711759500611937e-06, + "loss": 0.7519172191619873, + "memory(GiB)": 33.07, + "step": 5850, + "token_acc": 0.8014393195943735, + "train_speed(iter/s)": 0.096302 + }, + { + "epoch": 0.27204636414103395, + "eval_loss": 0.6617825627326965, + "eval_runtime": 295.4158, + "eval_samples_per_second": 11.763, + "eval_steps_per_second": 11.763, + "step": 5850 + }, + { + "epoch": 0.2722788824009836, + "grad_norm": 7.260519027709961, + "learning_rate": 8.709182279797963e-06, + "loss": 0.7220078945159912, + "memory(GiB)": 33.07, + "step": 5855, + "token_acc": 0.8102235397804664, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.27251140066093316, + "grad_norm": 6.423417568206787, + "learning_rate": 8.706602865629989e-06, + "loss": 0.6432157039642334, + "memory(GiB)": 33.07, + "step": 5860, + "token_acc": 0.8444360333080999, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.27274391892088273, + "grad_norm": 5.766536712646484, + "learning_rate": 8.704021259633302e-06, + "loss": 0.7632864475250244, + "memory(GiB)": 33.07, + "step": 5865, + "token_acc": 0.8184250764525994, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.27297643718083237, + "grad_norm": 5.202002048492432, + "learning_rate": 8.701437463334485e-06, + "loss": 0.7291605472564697, + "memory(GiB)": 33.07, + "step": 5870, + "token_acc": 0.8104372355430184, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.27320895544078194, + "grad_norm": 7.422544956207275, + "learning_rate": 8.698851478261416e-06, + "loss": 0.8415180206298828, + "memory(GiB)": 33.07, + "step": 5875, + "token_acc": 0.7878695519658641, + "train_speed(iter/s)": 0.096065 + }, + { + "epoch": 0.2734414737007316, + "grad_norm": 4.930156707763672, + "learning_rate": 8.696263305943268e-06, + "loss": 0.7762112140655517, + "memory(GiB)": 33.07, + "step": 5880, + "token_acc": 0.8077803203661327, + "train_speed(iter/s)": 0.096111 + }, + { + "epoch": 0.27367399196068115, + "grad_norm": 5.615900993347168, + "learning_rate": 8.693672947910507e-06, + "loss": 0.8112217903137207, + "memory(GiB)": 33.07, + "step": 5885, + "token_acc": 0.7994978479196556, + "train_speed(iter/s)": 0.096156 + }, + { + "epoch": 0.2739065102206308, + "grad_norm": 8.410451889038086, + "learning_rate": 8.691080405694891e-06, + "loss": 0.7312962532043457, + "memory(GiB)": 33.07, + "step": 5890, + "token_acc": 0.8337928318235526, + "train_speed(iter/s)": 0.096202 + }, + { + "epoch": 0.27413902848058036, + "grad_norm": 6.97584867477417, + "learning_rate": 8.688485680829473e-06, + "loss": 0.7963497161865234, + "memory(GiB)": 33.07, + "step": 5895, + "token_acc": 0.7905405405405406, + "train_speed(iter/s)": 0.096248 + }, + { + "epoch": 0.27437154674053, + "grad_norm": 7.579192161560059, + "learning_rate": 8.685888774848591e-06, + "loss": 0.8456910133361817, + "memory(GiB)": 33.07, + "step": 5900, + "token_acc": 0.7953077184631078, + "train_speed(iter/s)": 0.096293 + }, + { + "epoch": 0.27437154674053, + "eval_loss": 0.6617845892906189, + "eval_runtime": 294.0898, + "eval_samples_per_second": 11.816, + "eval_steps_per_second": 11.816, + "step": 5900 + }, + { + "epoch": 0.27460406500047957, + "grad_norm": 5.993030071258545, + "learning_rate": 8.683289689287877e-06, + "loss": 0.6374862670898438, + "memory(GiB)": 33.07, + "step": 5905, + "token_acc": 0.811827697650861, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.27483658326042915, + "grad_norm": 7.107336044311523, + "learning_rate": 8.68068842568425e-06, + "loss": 0.724134111404419, + "memory(GiB)": 33.07, + "step": 5910, + "token_acc": 0.832606108687029, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.2750691015203788, + "grad_norm": 6.270671367645264, + "learning_rate": 8.678084985575918e-06, + "loss": 0.7967855930328369, + "memory(GiB)": 33.07, + "step": 5915, + "token_acc": 0.8057302585604472, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.27530161978032835, + "grad_norm": 6.208034038543701, + "learning_rate": 8.675479370502375e-06, + "loss": 0.7351973533630372, + "memory(GiB)": 33.07, + "step": 5920, + "token_acc": 0.8105228105228105, + "train_speed(iter/s)": 0.096016 + }, + { + "epoch": 0.275534138040278, + "grad_norm": 5.249854564666748, + "learning_rate": 8.672871582004404e-06, + "loss": 0.6611810684204101, + "memory(GiB)": 33.07, + "step": 5925, + "token_acc": 0.8356846473029046, + "train_speed(iter/s)": 0.096062 + }, + { + "epoch": 0.27576665630022756, + "grad_norm": 8.400338172912598, + "learning_rate": 8.670261621624065e-06, + "loss": 0.6317246913909912, + "memory(GiB)": 33.07, + "step": 5930, + "token_acc": 0.8427358961557664, + "train_speed(iter/s)": 0.096108 + }, + { + "epoch": 0.2759991745601772, + "grad_norm": 8.131949424743652, + "learning_rate": 8.667649490904715e-06, + "loss": 0.7783225059509278, + "memory(GiB)": 33.07, + "step": 5935, + "token_acc": 0.8031975637609441, + "train_speed(iter/s)": 0.096155 + }, + { + "epoch": 0.2762316928201268, + "grad_norm": 6.932249069213867, + "learning_rate": 8.665035191390982e-06, + "loss": 0.7265284061431885, + "memory(GiB)": 33.07, + "step": 5940, + "token_acc": 0.8292762090230444, + "train_speed(iter/s)": 0.096199 + }, + { + "epoch": 0.2764642110800764, + "grad_norm": 8.016256332397461, + "learning_rate": 8.662418724628786e-06, + "loss": 0.7612427711486817, + "memory(GiB)": 33.07, + "step": 5945, + "token_acc": 0.8056741915802319, + "train_speed(iter/s)": 0.096244 + }, + { + "epoch": 0.276696729340026, + "grad_norm": 6.207606315612793, + "learning_rate": 8.659800092165324e-06, + "loss": 0.6891643524169921, + "memory(GiB)": 33.07, + "step": 5950, + "token_acc": 0.8315262718932444, + "train_speed(iter/s)": 0.09629 + }, + { + "epoch": 0.276696729340026, + "eval_loss": 0.6595419049263, + "eval_runtime": 290.2901, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 11.971, + "step": 5950 + }, + { + "epoch": 0.27692924759997556, + "grad_norm": 5.775126934051514, + "learning_rate": 8.657179295549072e-06, + "loss": 0.6428290367126465, + "memory(GiB)": 33.07, + "step": 5955, + "token_acc": 0.8115010354450015, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.2771617658599252, + "grad_norm": 6.465238094329834, + "learning_rate": 8.65455633632979e-06, + "loss": 0.7096127510070801, + "memory(GiB)": 33.07, + "step": 5960, + "token_acc": 0.8188382015096817, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.27739428411987477, + "grad_norm": 7.701021194458008, + "learning_rate": 8.651931216058514e-06, + "loss": 0.7280645370483398, + "memory(GiB)": 33.07, + "step": 5965, + "token_acc": 0.8177358490566038, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.2776268023798244, + "grad_norm": 6.742284297943115, + "learning_rate": 8.649303936287557e-06, + "loss": 0.6810751914978027, + "memory(GiB)": 33.07, + "step": 5970, + "token_acc": 0.8265813788201848, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.277859320639774, + "grad_norm": 7.681085586547852, + "learning_rate": 8.646674498570515e-06, + "loss": 0.7115217208862304, + "memory(GiB)": 33.07, + "step": 5975, + "token_acc": 0.8235088943146146, + "train_speed(iter/s)": 0.096064 + }, + { + "epoch": 0.2780918388997236, + "grad_norm": 6.155812740325928, + "learning_rate": 8.64404290446225e-06, + "loss": 0.8212972640991211, + "memory(GiB)": 33.07, + "step": 5980, + "token_acc": 0.7921840759678598, + "train_speed(iter/s)": 0.096109 + }, + { + "epoch": 0.2783243571596732, + "grad_norm": 6.859022617340088, + "learning_rate": 8.641409155518911e-06, + "loss": 0.8141220092773438, + "memory(GiB)": 33.07, + "step": 5985, + "token_acc": 0.7958908723475918, + "train_speed(iter/s)": 0.096155 + }, + { + "epoch": 0.2785568754196228, + "grad_norm": 6.1459808349609375, + "learning_rate": 8.63877325329791e-06, + "loss": 0.7060758590698242, + "memory(GiB)": 33.07, + "step": 5990, + "token_acc": 0.8293323330832708, + "train_speed(iter/s)": 0.096201 + }, + { + "epoch": 0.2787893936795724, + "grad_norm": 4.698131084442139, + "learning_rate": 8.63613519935794e-06, + "loss": 0.8317166328430176, + "memory(GiB)": 33.07, + "step": 5995, + "token_acc": 0.7865638058502089, + "train_speed(iter/s)": 0.096246 + }, + { + "epoch": 0.279021911939522, + "grad_norm": 7.586791515350342, + "learning_rate": 8.633494995258963e-06, + "loss": 0.7706262588500976, + "memory(GiB)": 33.07, + "step": 6000, + "token_acc": 0.811706629055007, + "train_speed(iter/s)": 0.096292 + }, + { + "epoch": 0.279021911939522, + "eval_loss": 0.6601762175559998, + "eval_runtime": 290.388, + "eval_samples_per_second": 11.967, + "eval_steps_per_second": 11.967, + "step": 6000 + }, + { + "epoch": 0.2792544301994716, + "grad_norm": 6.393497943878174, + "learning_rate": 8.630852642562214e-06, + "loss": 0.7850899219512939, + "memory(GiB)": 33.07, + "step": 6005, + "token_acc": 0.810933776269228, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.2794869484594212, + "grad_norm": 7.869515419006348, + "learning_rate": 8.628208142830196e-06, + "loss": 0.735156774520874, + "memory(GiB)": 33.07, + "step": 6010, + "token_acc": 0.8219298245614035, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.2797194667193708, + "grad_norm": 6.392009258270264, + "learning_rate": 8.625561497626684e-06, + "loss": 0.6847464084625244, + "memory(GiB)": 33.07, + "step": 6015, + "token_acc": 0.8271791767554479, + "train_speed(iter/s)": 0.095985 + }, + { + "epoch": 0.2799519849793204, + "grad_norm": 7.729519844055176, + "learning_rate": 8.622912708516722e-06, + "loss": 0.7716471195220947, + "memory(GiB)": 33.07, + "step": 6020, + "token_acc": 0.8147534189805222, + "train_speed(iter/s)": 0.096032 + }, + { + "epoch": 0.28018450323927, + "grad_norm": 6.673350811004639, + "learning_rate": 8.620261777066621e-06, + "loss": 0.751627779006958, + "memory(GiB)": 33.07, + "step": 6025, + "token_acc": 0.8256952604778691, + "train_speed(iter/s)": 0.096079 + }, + { + "epoch": 0.2804170214992196, + "grad_norm": 5.185956954956055, + "learning_rate": 8.617608704843956e-06, + "loss": 0.7098074436187745, + "memory(GiB)": 33.07, + "step": 6030, + "token_acc": 0.8378985181859003, + "train_speed(iter/s)": 0.096126 + }, + { + "epoch": 0.28064953975916923, + "grad_norm": 6.692188739776611, + "learning_rate": 8.614953493417572e-06, + "loss": 0.7026958465576172, + "memory(GiB)": 33.07, + "step": 6035, + "token_acc": 0.825254104769351, + "train_speed(iter/s)": 0.096171 + }, + { + "epoch": 0.2808820580191188, + "grad_norm": 5.576934814453125, + "learning_rate": 8.612296144357578e-06, + "loss": 0.6882329463958741, + "memory(GiB)": 33.07, + "step": 6040, + "token_acc": 0.8241112828438949, + "train_speed(iter/s)": 0.096215 + }, + { + "epoch": 0.28111457627906844, + "grad_norm": 3.9254825115203857, + "learning_rate": 8.60963665923535e-06, + "loss": 0.8447055816650391, + "memory(GiB)": 33.07, + "step": 6045, + "token_acc": 0.8103691180614258, + "train_speed(iter/s)": 0.096259 + }, + { + "epoch": 0.281347094539018, + "grad_norm": 5.394123077392578, + "learning_rate": 8.606975039623516e-06, + "loss": 0.8298517227172851, + "memory(GiB)": 33.07, + "step": 6050, + "token_acc": 0.7894736842105263, + "train_speed(iter/s)": 0.096304 + }, + { + "epoch": 0.281347094539018, + "eval_loss": 0.659683346748352, + "eval_runtime": 295.3634, + "eval_samples_per_second": 11.765, + "eval_steps_per_second": 11.765, + "step": 6050 + }, + { + "epoch": 0.2815796127989676, + "grad_norm": 5.169157028198242, + "learning_rate": 8.604311287095978e-06, + "loss": 0.619074821472168, + "memory(GiB)": 33.07, + "step": 6055, + "token_acc": 0.8119939190270443, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.2818121310589172, + "grad_norm": 5.410036087036133, + "learning_rate": 8.601645403227897e-06, + "loss": 0.6804319858551026, + "memory(GiB)": 33.07, + "step": 6060, + "token_acc": 0.8311770274534553, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.2820446493188668, + "grad_norm": 7.332240581512451, + "learning_rate": 8.59897738959569e-06, + "loss": 0.7271266937255859, + "memory(GiB)": 33.07, + "step": 6065, + "token_acc": 0.8204196933010492, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.28227716757881643, + "grad_norm": 7.151330471038818, + "learning_rate": 8.596307247777036e-06, + "loss": 0.7178050518035889, + "memory(GiB)": 33.07, + "step": 6070, + "token_acc": 0.8232123607617678, + "train_speed(iter/s)": 0.096035 + }, + { + "epoch": 0.282509685838766, + "grad_norm": 6.48547887802124, + "learning_rate": 8.59363497935087e-06, + "loss": 0.7122317314147949, + "memory(GiB)": 33.07, + "step": 6075, + "token_acc": 0.8342954159592529, + "train_speed(iter/s)": 0.09608 + }, + { + "epoch": 0.28274220409871564, + "grad_norm": 6.208950042724609, + "learning_rate": 8.590960585897393e-06, + "loss": 0.5908382892608642, + "memory(GiB)": 33.07, + "step": 6080, + "token_acc": 0.8553921568627451, + "train_speed(iter/s)": 0.096126 + }, + { + "epoch": 0.2829747223586652, + "grad_norm": 9.506049156188965, + "learning_rate": 8.58828406899805e-06, + "loss": 0.8396751403808593, + "memory(GiB)": 33.07, + "step": 6085, + "token_acc": 0.8013124316441852, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.28320724061861485, + "grad_norm": 7.106131076812744, + "learning_rate": 8.585605430235552e-06, + "loss": 0.6826900959014892, + "memory(GiB)": 33.07, + "step": 6090, + "token_acc": 0.8440840398083302, + "train_speed(iter/s)": 0.096217 + }, + { + "epoch": 0.28343975887856443, + "grad_norm": 6.968870639801025, + "learning_rate": 8.58292467119386e-06, + "loss": 0.760336971282959, + "memory(GiB)": 33.07, + "step": 6095, + "token_acc": 0.813989239046887, + "train_speed(iter/s)": 0.096263 + }, + { + "epoch": 0.283672277138514, + "grad_norm": 7.073690891265869, + "learning_rate": 8.580241793458188e-06, + "loss": 0.6863061428070069, + "memory(GiB)": 33.07, + "step": 6100, + "token_acc": 0.8284789644012945, + "train_speed(iter/s)": 0.096308 + }, + { + "epoch": 0.283672277138514, + "eval_loss": 0.6553303599357605, + "eval_runtime": 290.703, + "eval_samples_per_second": 11.954, + "eval_steps_per_second": 11.954, + "step": 6100 + }, + { + "epoch": 0.28390479539846364, + "grad_norm": 4.766737937927246, + "learning_rate": 8.577556798615008e-06, + "loss": 0.7520906925201416, + "memory(GiB)": 33.07, + "step": 6105, + "token_acc": 0.811454285782675, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.2841373136584132, + "grad_norm": 7.36306095123291, + "learning_rate": 8.574869688252036e-06, + "loss": 0.8124327659606934, + "memory(GiB)": 33.07, + "step": 6110, + "token_acc": 0.814280140460398, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.28436983191836285, + "grad_norm": 6.3582329750061035, + "learning_rate": 8.572180463958246e-06, + "loss": 0.8805639266967773, + "memory(GiB)": 33.07, + "step": 6115, + "token_acc": 0.7744294909303686, + "train_speed(iter/s)": 0.096 + }, + { + "epoch": 0.2846023501783124, + "grad_norm": 6.230129718780518, + "learning_rate": 8.569489127323858e-06, + "loss": 0.7166120529174804, + "memory(GiB)": 33.07, + "step": 6120, + "token_acc": 0.8236749116607773, + "train_speed(iter/s)": 0.096044 + }, + { + "epoch": 0.28483486843826206, + "grad_norm": 7.509944915771484, + "learning_rate": 8.566795679940342e-06, + "loss": 0.7509881973266601, + "memory(GiB)": 33.07, + "step": 6125, + "token_acc": 0.8205128205128205, + "train_speed(iter/s)": 0.096088 + }, + { + "epoch": 0.28506738669821163, + "grad_norm": 6.930967330932617, + "learning_rate": 8.56410012340042e-06, + "loss": 0.7765787601470947, + "memory(GiB)": 33.07, + "step": 6130, + "token_acc": 0.820565342072921, + "train_speed(iter/s)": 0.096132 + }, + { + "epoch": 0.28529990495816127, + "grad_norm": 6.196775913238525, + "learning_rate": 8.561402459298055e-06, + "loss": 0.7661912918090821, + "memory(GiB)": 33.07, + "step": 6135, + "token_acc": 0.8045511221945137, + "train_speed(iter/s)": 0.096176 + }, + { + "epoch": 0.28553242321811084, + "grad_norm": 5.132917881011963, + "learning_rate": 8.55870268922846e-06, + "loss": 0.8116618156433105, + "memory(GiB)": 33.07, + "step": 6140, + "token_acc": 0.8019323671497585, + "train_speed(iter/s)": 0.096221 + }, + { + "epoch": 0.2857649414780605, + "grad_norm": 5.738801002502441, + "learning_rate": 8.556000814788091e-06, + "loss": 0.6142177581787109, + "memory(GiB)": 33.07, + "step": 6145, + "token_acc": 0.8393782383419689, + "train_speed(iter/s)": 0.096264 + }, + { + "epoch": 0.28599745973801005, + "grad_norm": 6.459722995758057, + "learning_rate": 8.553296837574651e-06, + "loss": 0.6736063957214355, + "memory(GiB)": 33.07, + "step": 6150, + "token_acc": 0.8397048960429242, + "train_speed(iter/s)": 0.096307 + }, + { + "epoch": 0.28599745973801005, + "eval_loss": 0.6559529304504395, + "eval_runtime": 290.8894, + "eval_samples_per_second": 11.946, + "eval_steps_per_second": 11.946, + "step": 6150 + }, + { + "epoch": 0.28622997799795963, + "grad_norm": 8.525583267211914, + "learning_rate": 8.550590759187086e-06, + "loss": 0.7800402164459228, + "memory(GiB)": 33.07, + "step": 6155, + "token_acc": 0.8106261163562133, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.28646249625790926, + "grad_norm": 5.509976863861084, + "learning_rate": 8.547882581225581e-06, + "loss": 0.7267116069793701, + "memory(GiB)": 33.07, + "step": 6160, + "token_acc": 0.81893528849219, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.28669501451785884, + "grad_norm": 6.860876560211182, + "learning_rate": 8.545172305291566e-06, + "loss": 0.6962712287902832, + "memory(GiB)": 33.07, + "step": 6165, + "token_acc": 0.8284686125549781, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.28692753277780847, + "grad_norm": 7.085202217102051, + "learning_rate": 8.542459932987714e-06, + "loss": 0.7279558181762695, + "memory(GiB)": 33.07, + "step": 6170, + "token_acc": 0.8318947801068639, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.28716005103775805, + "grad_norm": 7.539945602416992, + "learning_rate": 8.539745465917932e-06, + "loss": 0.8185580253601075, + "memory(GiB)": 33.07, + "step": 6175, + "token_acc": 0.8042194092827004, + "train_speed(iter/s)": 0.096088 + }, + { + "epoch": 0.2873925692977077, + "grad_norm": 6.323469638824463, + "learning_rate": 8.537028905687368e-06, + "loss": 0.8452632904052735, + "memory(GiB)": 33.07, + "step": 6180, + "token_acc": 0.8044806517311609, + "train_speed(iter/s)": 0.096132 + }, + { + "epoch": 0.28762508755765726, + "grad_norm": 6.691417217254639, + "learning_rate": 8.53431025390241e-06, + "loss": 0.7386738300323487, + "memory(GiB)": 33.07, + "step": 6185, + "token_acc": 0.8143226282761167, + "train_speed(iter/s)": 0.096174 + }, + { + "epoch": 0.2878576058176069, + "grad_norm": 6.233213424682617, + "learning_rate": 8.531589512170675e-06, + "loss": 0.6917527675628662, + "memory(GiB)": 33.07, + "step": 6190, + "token_acc": 0.8180118416952321, + "train_speed(iter/s)": 0.096217 + }, + { + "epoch": 0.28809012407755646, + "grad_norm": 5.885256290435791, + "learning_rate": 8.528866682101029e-06, + "loss": 0.7740827560424804, + "memory(GiB)": 33.07, + "step": 6195, + "token_acc": 0.8050786838340487, + "train_speed(iter/s)": 0.096261 + }, + { + "epoch": 0.28832264233750604, + "grad_norm": 4.884213447570801, + "learning_rate": 8.526141765303562e-06, + "loss": 0.7537154674530029, + "memory(GiB)": 33.07, + "step": 6200, + "token_acc": 0.8030990173847317, + "train_speed(iter/s)": 0.096305 + }, + { + "epoch": 0.28832264233750604, + "eval_loss": 0.656577467918396, + "eval_runtime": 290.9454, + "eval_samples_per_second": 11.944, + "eval_steps_per_second": 11.944, + "step": 6200 + }, + { + "epoch": 0.2885551605974557, + "grad_norm": 6.681056022644043, + "learning_rate": 8.523414763389601e-06, + "loss": 0.786424970626831, + "memory(GiB)": 33.07, + "step": 6205, + "token_acc": 0.8106969465955142, + "train_speed(iter/s)": 0.095914 + }, + { + "epoch": 0.28878767885740525, + "grad_norm": 6.169203758239746, + "learning_rate": 8.520685677971707e-06, + "loss": 0.6843218326568603, + "memory(GiB)": 33.07, + "step": 6210, + "token_acc": 0.8432510885341074, + "train_speed(iter/s)": 0.095958 + }, + { + "epoch": 0.2890201971173549, + "grad_norm": 5.191234588623047, + "learning_rate": 8.517954510663673e-06, + "loss": 0.7208163261413574, + "memory(GiB)": 33.07, + "step": 6215, + "token_acc": 0.8300854700854701, + "train_speed(iter/s)": 0.096002 + }, + { + "epoch": 0.28925271537730446, + "grad_norm": 6.596996307373047, + "learning_rate": 8.515221263080522e-06, + "loss": 0.7454941272735596, + "memory(GiB)": 33.07, + "step": 6220, + "token_acc": 0.8168147641831852, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.2894852336372541, + "grad_norm": 6.094503879547119, + "learning_rate": 8.512485936838507e-06, + "loss": 0.7535664081573487, + "memory(GiB)": 33.07, + "step": 6225, + "token_acc": 0.8183839881393625, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.28971775189720367, + "grad_norm": 7.548673152923584, + "learning_rate": 8.50974853355511e-06, + "loss": 0.7302347183227539, + "memory(GiB)": 33.07, + "step": 6230, + "token_acc": 0.819743935309973, + "train_speed(iter/s)": 0.096132 + }, + { + "epoch": 0.2899502701571533, + "grad_norm": 6.916913032531738, + "learning_rate": 8.507009054849047e-06, + "loss": 0.7550792217254638, + "memory(GiB)": 33.07, + "step": 6235, + "token_acc": 0.8215641609719059, + "train_speed(iter/s)": 0.096176 + }, + { + "epoch": 0.2901827884171029, + "grad_norm": 8.061797142028809, + "learning_rate": 8.504267502340252e-06, + "loss": 0.764448595046997, + "memory(GiB)": 33.07, + "step": 6240, + "token_acc": 0.8140432098765432, + "train_speed(iter/s)": 0.09622 + }, + { + "epoch": 0.2904153066770525, + "grad_norm": 6.484617233276367, + "learning_rate": 8.501523877649891e-06, + "loss": 0.8045131683349609, + "memory(GiB)": 33.07, + "step": 6245, + "token_acc": 0.7974641307974641, + "train_speed(iter/s)": 0.096263 + }, + { + "epoch": 0.2906478249370021, + "grad_norm": 6.302259922027588, + "learning_rate": 8.498778182400353e-06, + "loss": 0.6931636333465576, + "memory(GiB)": 33.07, + "step": 6250, + "token_acc": 0.8204667863554758, + "train_speed(iter/s)": 0.096306 + }, + { + "epoch": 0.2906478249370021, + "eval_loss": 0.6549901366233826, + "eval_runtime": 289.9607, + "eval_samples_per_second": 11.984, + "eval_steps_per_second": 11.984, + "step": 6250 + }, + { + "epoch": 0.29088034319695166, + "grad_norm": 6.187180519104004, + "learning_rate": 8.496030418215254e-06, + "loss": 0.6815443992614746, + "memory(GiB)": 33.07, + "step": 6255, + "token_acc": 0.8120918964881039, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.2911128614569013, + "grad_norm": 5.392935276031494, + "learning_rate": 8.493280586719428e-06, + "loss": 0.6975905895233154, + "memory(GiB)": 33.07, + "step": 6260, + "token_acc": 0.8076083567196757, + "train_speed(iter/s)": 0.095963 + }, + { + "epoch": 0.29134537971685087, + "grad_norm": 5.913050174713135, + "learning_rate": 8.490528689538939e-06, + "loss": 0.7572990894317627, + "memory(GiB)": 33.07, + "step": 6265, + "token_acc": 0.8055087127599775, + "train_speed(iter/s)": 0.096006 + }, + { + "epoch": 0.2915778979768005, + "grad_norm": 7.170483589172363, + "learning_rate": 8.48777472830107e-06, + "loss": 0.7077459812164306, + "memory(GiB)": 33.07, + "step": 6270, + "token_acc": 0.835335141418055, + "train_speed(iter/s)": 0.096049 + }, + { + "epoch": 0.2918104162367501, + "grad_norm": 6.242068767547607, + "learning_rate": 8.48501870463432e-06, + "loss": 0.5808324337005615, + "memory(GiB)": 33.07, + "step": 6275, + "token_acc": 0.8642131979695431, + "train_speed(iter/s)": 0.096092 + }, + { + "epoch": 0.2920429344966997, + "grad_norm": 6.824711322784424, + "learning_rate": 8.48226062016841e-06, + "loss": 0.6758532524108887, + "memory(GiB)": 33.07, + "step": 6280, + "token_acc": 0.8284789644012945, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.2922754527566493, + "grad_norm": 5.987837791442871, + "learning_rate": 8.479500476534286e-06, + "loss": 0.7241264343261719, + "memory(GiB)": 33.07, + "step": 6285, + "token_acc": 0.8183223811364515, + "train_speed(iter/s)": 0.096178 + }, + { + "epoch": 0.2925079710165989, + "grad_norm": 7.447147369384766, + "learning_rate": 8.476738275364101e-06, + "loss": 0.8527143478393555, + "memory(GiB)": 33.07, + "step": 6290, + "token_acc": 0.7953020134228188, + "train_speed(iter/s)": 0.096222 + }, + { + "epoch": 0.2927404892765485, + "grad_norm": 7.889321804046631, + "learning_rate": 8.47397401829123e-06, + "loss": 0.6823805332183838, + "memory(GiB)": 33.07, + "step": 6295, + "token_acc": 0.8269107257546564, + "train_speed(iter/s)": 0.096265 + }, + { + "epoch": 0.2929730075364981, + "grad_norm": 5.670337677001953, + "learning_rate": 8.471207706950268e-06, + "loss": 0.6378509044647217, + "memory(GiB)": 33.07, + "step": 6300, + "token_acc": 0.8265379113018598, + "train_speed(iter/s)": 0.096308 + }, + { + "epoch": 0.2929730075364981, + "eval_loss": 0.6550729870796204, + "eval_runtime": 290.9136, + "eval_samples_per_second": 11.945, + "eval_steps_per_second": 11.945, + "step": 6300 + }, + { + "epoch": 0.2932055257964477, + "grad_norm": 6.653439521789551, + "learning_rate": 8.468439342977017e-06, + "loss": 0.6482341766357422, + "memory(GiB)": 33.07, + "step": 6305, + "token_acc": 0.8118095907030203, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.2934380440563973, + "grad_norm": 5.770527362823486, + "learning_rate": 8.465668928008494e-06, + "loss": 0.8743701934814453, + "memory(GiB)": 33.07, + "step": 6310, + "token_acc": 0.7887878787878788, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.2936705623163469, + "grad_norm": 7.127387523651123, + "learning_rate": 8.462896463682934e-06, + "loss": 0.6957249641418457, + "memory(GiB)": 33.07, + "step": 6315, + "token_acc": 0.8284452853989278, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.2939030805762965, + "grad_norm": 6.6421308517456055, + "learning_rate": 8.460121951639783e-06, + "loss": 0.7410873889923095, + "memory(GiB)": 33.07, + "step": 6320, + "token_acc": 0.82201203783319, + "train_speed(iter/s)": 0.096052 + }, + { + "epoch": 0.2941355988362461, + "grad_norm": 6.1326775550842285, + "learning_rate": 8.457345393519689e-06, + "loss": 0.68385009765625, + "memory(GiB)": 33.07, + "step": 6325, + "token_acc": 0.8306617344841759, + "train_speed(iter/s)": 0.096095 + }, + { + "epoch": 0.2943681170961957, + "grad_norm": 7.342061519622803, + "learning_rate": 8.454566790964522e-06, + "loss": 0.6733174800872803, + "memory(GiB)": 33.07, + "step": 6330, + "token_acc": 0.8364063023801542, + "train_speed(iter/s)": 0.096138 + }, + { + "epoch": 0.29460063535614534, + "grad_norm": 5.662764549255371, + "learning_rate": 8.451786145617355e-06, + "loss": 0.794578742980957, + "memory(GiB)": 33.07, + "step": 6335, + "token_acc": 0.8155339805825242, + "train_speed(iter/s)": 0.09618 + }, + { + "epoch": 0.2948331536160949, + "grad_norm": 4.566216468811035, + "learning_rate": 8.449003459122467e-06, + "loss": 0.7572064399719238, + "memory(GiB)": 33.07, + "step": 6340, + "token_acc": 0.8042925727195708, + "train_speed(iter/s)": 0.096223 + }, + { + "epoch": 0.2950656718760445, + "grad_norm": 6.372825622558594, + "learning_rate": 8.446218733125347e-06, + "loss": 0.9462939262390136, + "memory(GiB)": 33.07, + "step": 6345, + "token_acc": 0.7759522031366691, + "train_speed(iter/s)": 0.096266 + }, + { + "epoch": 0.2952981901359941, + "grad_norm": 5.942713737487793, + "learning_rate": 8.443431969272691e-06, + "loss": 0.6915206909179688, + "memory(GiB)": 33.07, + "step": 6350, + "token_acc": 0.8340192043895748, + "train_speed(iter/s)": 0.09631 + }, + { + "epoch": 0.2952981901359941, + "eval_loss": 0.6510042548179626, + "eval_runtime": 292.3379, + "eval_samples_per_second": 11.887, + "eval_steps_per_second": 11.887, + "step": 6350 + }, + { + "epoch": 0.2955307083959437, + "grad_norm": 6.594305992126465, + "learning_rate": 8.440643169212396e-06, + "loss": 0.702507734298706, + "memory(GiB)": 33.07, + "step": 6355, + "token_acc": 0.8125821972734563, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.29576322665589333, + "grad_norm": 5.589081764221191, + "learning_rate": 8.43785233459357e-06, + "loss": 0.7530613899230957, + "memory(GiB)": 33.07, + "step": 6360, + "token_acc": 0.8185123966942148, + "train_speed(iter/s)": 0.09597 + }, + { + "epoch": 0.2959957449158429, + "grad_norm": 7.626203536987305, + "learning_rate": 8.435059467066516e-06, + "loss": 0.8047209739685058, + "memory(GiB)": 33.07, + "step": 6365, + "token_acc": 0.7979341510652034, + "train_speed(iter/s)": 0.096011 + }, + { + "epoch": 0.29622826317579254, + "grad_norm": 7.446657657623291, + "learning_rate": 8.432264568282741e-06, + "loss": 0.8075847625732422, + "memory(GiB)": 33.07, + "step": 6370, + "token_acc": 0.7980014275517487, + "train_speed(iter/s)": 0.096052 + }, + { + "epoch": 0.2964607814357421, + "grad_norm": 6.0461859703063965, + "learning_rate": 8.429467639894961e-06, + "loss": 0.7339020729064941, + "memory(GiB)": 33.07, + "step": 6375, + "token_acc": 0.821873557914167, + "train_speed(iter/s)": 0.096095 + }, + { + "epoch": 0.29669329969569175, + "grad_norm": 4.4445600509643555, + "learning_rate": 8.426668683557082e-06, + "loss": 0.6500592231750488, + "memory(GiB)": 33.07, + "step": 6380, + "token_acc": 0.8398605150214592, + "train_speed(iter/s)": 0.096137 + }, + { + "epoch": 0.2969258179556413, + "grad_norm": 5.21086311340332, + "learning_rate": 8.423867700924213e-06, + "loss": 0.7790399074554444, + "memory(GiB)": 33.07, + "step": 6385, + "token_acc": 0.7943536404160475, + "train_speed(iter/s)": 0.096179 + }, + { + "epoch": 0.29715833621559096, + "grad_norm": 6.420889854431152, + "learning_rate": 8.421064693652663e-06, + "loss": 0.7149899482727051, + "memory(GiB)": 33.07, + "step": 6390, + "token_acc": 0.8253719655442443, + "train_speed(iter/s)": 0.096222 + }, + { + "epoch": 0.29739085447554053, + "grad_norm": 6.900496959686279, + "learning_rate": 8.418259663399936e-06, + "loss": 0.6499155998229981, + "memory(GiB)": 33.07, + "step": 6395, + "token_acc": 0.8465002046663938, + "train_speed(iter/s)": 0.096264 + }, + { + "epoch": 0.2976233727354901, + "grad_norm": 7.934140205383301, + "learning_rate": 8.415452611824733e-06, + "loss": 0.6458067893981934, + "memory(GiB)": 33.07, + "step": 6400, + "token_acc": 0.8421300659754948, + "train_speed(iter/s)": 0.096307 + }, + { + "epoch": 0.2976233727354901, + "eval_loss": 0.6483955979347229, + "eval_runtime": 296.3225, + "eval_samples_per_second": 11.727, + "eval_steps_per_second": 11.727, + "step": 6400 + }, + { + "epoch": 0.29785589099543974, + "grad_norm": 5.832469940185547, + "learning_rate": 8.412643540586951e-06, + "loss": 0.8407029151916504, + "memory(GiB)": 33.07, + "step": 6405, + "token_acc": 0.8117392933149475, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.2980884092553893, + "grad_norm": 6.6476054191589355, + "learning_rate": 8.409832451347682e-06, + "loss": 0.6710747241973877, + "memory(GiB)": 33.07, + "step": 6410, + "token_acc": 0.8312541037426132, + "train_speed(iter/s)": 0.095963 + }, + { + "epoch": 0.29832092751533895, + "grad_norm": 4.817314147949219, + "learning_rate": 8.407019345769205e-06, + "loss": 0.7402913093566894, + "memory(GiB)": 33.07, + "step": 6415, + "token_acc": 0.8139926945771284, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.29855344577528853, + "grad_norm": 6.434061050415039, + "learning_rate": 8.404204225515e-06, + "loss": 0.577388858795166, + "memory(GiB)": 33.07, + "step": 6420, + "token_acc": 0.855781723689909, + "train_speed(iter/s)": 0.096047 + }, + { + "epoch": 0.29878596403523816, + "grad_norm": 6.586878776550293, + "learning_rate": 8.401387092249733e-06, + "loss": 0.763947868347168, + "memory(GiB)": 33.07, + "step": 6425, + "token_acc": 0.8243938280675973, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.29901848229518774, + "grad_norm": 6.808994293212891, + "learning_rate": 8.398567947639264e-06, + "loss": 0.8441635131835937, + "memory(GiB)": 33.07, + "step": 6430, + "token_acc": 0.7888157894736842, + "train_speed(iter/s)": 0.09613 + }, + { + "epoch": 0.29925100055513737, + "grad_norm": 7.461640357971191, + "learning_rate": 8.39574679335064e-06, + "loss": 0.9145607948303223, + "memory(GiB)": 33.07, + "step": 6435, + "token_acc": 0.7673830594184576, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.29948351881508695, + "grad_norm": 6.615748405456543, + "learning_rate": 8.392923631052092e-06, + "loss": 0.7803031444549561, + "memory(GiB)": 33.07, + "step": 6440, + "token_acc": 0.8063650306748467, + "train_speed(iter/s)": 0.096213 + }, + { + "epoch": 0.2997160370750365, + "grad_norm": 6.24949312210083, + "learning_rate": 8.390098462413047e-06, + "loss": 0.715507173538208, + "memory(GiB)": 33.07, + "step": 6445, + "token_acc": 0.8268319358366751, + "train_speed(iter/s)": 0.096255 + }, + { + "epoch": 0.29994855533498616, + "grad_norm": 5.609653949737549, + "learning_rate": 8.387271289104116e-06, + "loss": 0.648672342300415, + "memory(GiB)": 33.07, + "step": 6450, + "token_acc": 0.8555355535553555, + "train_speed(iter/s)": 0.096297 + }, + { + "epoch": 0.29994855533498616, + "eval_loss": 0.6544153094291687, + "eval_runtime": 294.7661, + "eval_samples_per_second": 11.789, + "eval_steps_per_second": 11.789, + "step": 6450 + }, + { + "epoch": 0.30018107359493573, + "grad_norm": 7.218892574310303, + "learning_rate": 8.38444211279709e-06, + "loss": 0.7374346256256104, + "memory(GiB)": 33.07, + "step": 6455, + "token_acc": 0.8123859087269816, + "train_speed(iter/s)": 0.095915 + }, + { + "epoch": 0.30041359185488536, + "grad_norm": 7.429409027099609, + "learning_rate": 8.38161093516495e-06, + "loss": 0.6752567291259766, + "memory(GiB)": 33.07, + "step": 6460, + "token_acc": 0.8336781133636739, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.30064611011483494, + "grad_norm": 5.649204254150391, + "learning_rate": 8.37877775788186e-06, + "loss": 0.8494151115417481, + "memory(GiB)": 33.07, + "step": 6465, + "token_acc": 0.7666419203167533, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.3008786283747846, + "grad_norm": 5.821555137634277, + "learning_rate": 8.375942582623162e-06, + "loss": 0.7295107364654541, + "memory(GiB)": 33.07, + "step": 6470, + "token_acc": 0.8188311688311688, + "train_speed(iter/s)": 0.096039 + }, + { + "epoch": 0.30111114663473415, + "grad_norm": 6.589723110198975, + "learning_rate": 8.373105411065386e-06, + "loss": 0.7120474815368653, + "memory(GiB)": 33.07, + "step": 6475, + "token_acc": 0.8293269230769231, + "train_speed(iter/s)": 0.096081 + }, + { + "epoch": 0.3013436648946838, + "grad_norm": 5.835727691650391, + "learning_rate": 8.370266244886238e-06, + "loss": 0.7486866474151611, + "memory(GiB)": 33.07, + "step": 6480, + "token_acc": 0.8098676293622142, + "train_speed(iter/s)": 0.096123 + }, + { + "epoch": 0.30157618315463336, + "grad_norm": 8.453908920288086, + "learning_rate": 8.367425085764604e-06, + "loss": 0.697899341583252, + "memory(GiB)": 33.07, + "step": 6485, + "token_acc": 0.8223976319684262, + "train_speed(iter/s)": 0.096165 + }, + { + "epoch": 0.301808701414583, + "grad_norm": 4.8690032958984375, + "learning_rate": 8.36458193538055e-06, + "loss": 0.8523747444152832, + "memory(GiB)": 33.07, + "step": 6490, + "token_acc": 0.7894736842105263, + "train_speed(iter/s)": 0.096205 + }, + { + "epoch": 0.30204121967453257, + "grad_norm": 6.731339454650879, + "learning_rate": 8.361736795415317e-06, + "loss": 0.6203603267669677, + "memory(GiB)": 33.07, + "step": 6495, + "token_acc": 0.8547215496368039, + "train_speed(iter/s)": 0.096247 + }, + { + "epoch": 0.30227373793448215, + "grad_norm": 6.212488174438477, + "learning_rate": 8.358889667551327e-06, + "loss": 0.6619673728942871, + "memory(GiB)": 33.07, + "step": 6500, + "token_acc": 0.8305249513933896, + "train_speed(iter/s)": 0.096287 + }, + { + "epoch": 0.30227373793448215, + "eval_loss": 0.6488327383995056, + "eval_runtime": 297.4827, + "eval_samples_per_second": 11.681, + "eval_steps_per_second": 11.681, + "step": 6500 + }, + { + "epoch": 0.3025062561944318, + "grad_norm": 4.924098014831543, + "learning_rate": 8.356040553472172e-06, + "loss": 0.6729198455810547, + "memory(GiB)": 33.07, + "step": 6505, + "token_acc": 0.8140951740480994, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.30273877445438135, + "grad_norm": 8.07417106628418, + "learning_rate": 8.35318945486262e-06, + "loss": 0.786518907546997, + "memory(GiB)": 33.07, + "step": 6510, + "token_acc": 0.8138771683075481, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.302971292714331, + "grad_norm": 6.638355255126953, + "learning_rate": 8.350336373408617e-06, + "loss": 0.7544785976409912, + "memory(GiB)": 33.07, + "step": 6515, + "token_acc": 0.8314682943370634, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.30320381097428056, + "grad_norm": 6.213714122772217, + "learning_rate": 8.347481310797277e-06, + "loss": 0.7576951503753662, + "memory(GiB)": 33.07, + "step": 6520, + "token_acc": 0.8079490291262136, + "train_speed(iter/s)": 0.09603 + }, + { + "epoch": 0.3034363292342302, + "grad_norm": 5.810522079467773, + "learning_rate": 8.344624268716888e-06, + "loss": 0.8516165733337402, + "memory(GiB)": 33.07, + "step": 6525, + "token_acc": 0.7934306569343066, + "train_speed(iter/s)": 0.096072 + }, + { + "epoch": 0.30366884749417977, + "grad_norm": 7.226070880889893, + "learning_rate": 8.341765248856904e-06, + "loss": 0.689451026916504, + "memory(GiB)": 33.07, + "step": 6530, + "token_acc": 0.8396111786148238, + "train_speed(iter/s)": 0.096113 + }, + { + "epoch": 0.3039013657541294, + "grad_norm": 5.169941425323486, + "learning_rate": 8.338904252907953e-06, + "loss": 0.7344254970550537, + "memory(GiB)": 33.07, + "step": 6535, + "token_acc": 0.8086441268734751, + "train_speed(iter/s)": 0.096155 + }, + { + "epoch": 0.304133884014079, + "grad_norm": 5.565005779266357, + "learning_rate": 8.33604128256183e-06, + "loss": 0.7079814910888672, + "memory(GiB)": 33.07, + "step": 6540, + "token_acc": 0.8164042661249366, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.30436640227402856, + "grad_norm": 6.246211051940918, + "learning_rate": 8.3331763395115e-06, + "loss": 0.7657256126403809, + "memory(GiB)": 33.07, + "step": 6545, + "token_acc": 0.8109756097560976, + "train_speed(iter/s)": 0.096233 + }, + { + "epoch": 0.3045989205339782, + "grad_norm": 5.423332214355469, + "learning_rate": 8.330309425451089e-06, + "loss": 0.7353767395019531, + "memory(GiB)": 33.07, + "step": 6550, + "token_acc": 0.8268970189701897, + "train_speed(iter/s)": 0.096274 + }, + { + "epoch": 0.3045989205339782, + "eval_loss": 0.6473827958106995, + "eval_runtime": 291.9143, + "eval_samples_per_second": 11.904, + "eval_steps_per_second": 11.904, + "step": 6550 + }, + { + "epoch": 0.30483143879392777, + "grad_norm": 6.537137508392334, + "learning_rate": 8.327440542075892e-06, + "loss": 0.7706812858581543, + "memory(GiB)": 33.07, + "step": 6555, + "token_acc": 0.8133992338267872, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.3050639570538774, + "grad_norm": 2.7098896503448486, + "learning_rate": 8.324569691082374e-06, + "loss": 0.8289295196533203, + "memory(GiB)": 33.07, + "step": 6560, + "token_acc": 0.7824712643678161, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.305296475313827, + "grad_norm": 5.2406816482543945, + "learning_rate": 8.32169687416815e-06, + "loss": 0.6724458694458008, + "memory(GiB)": 33.07, + "step": 6565, + "token_acc": 0.8261464750171116, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.3055289935737766, + "grad_norm": 6.757907390594482, + "learning_rate": 8.318822093032011e-06, + "loss": 0.6793931484222412, + "memory(GiB)": 33.07, + "step": 6570, + "token_acc": 0.8337605272793849, + "train_speed(iter/s)": 0.096026 + }, + { + "epoch": 0.3057615118337262, + "grad_norm": 5.912170886993408, + "learning_rate": 8.3159453493739e-06, + "loss": 0.7322561740875244, + "memory(GiB)": 33.07, + "step": 6575, + "token_acc": 0.8194933145672062, + "train_speed(iter/s)": 0.096067 + }, + { + "epoch": 0.3059940300936758, + "grad_norm": 5.746609687805176, + "learning_rate": 8.313066644894927e-06, + "loss": 0.9141531944274902, + "memory(GiB)": 33.07, + "step": 6580, + "token_acc": 0.778856526429342, + "train_speed(iter/s)": 0.096107 + }, + { + "epoch": 0.3062265483536254, + "grad_norm": 6.435399055480957, + "learning_rate": 8.310185981297359e-06, + "loss": 0.7600241661071777, + "memory(GiB)": 33.07, + "step": 6585, + "token_acc": 0.80836820083682, + "train_speed(iter/s)": 0.096149 + }, + { + "epoch": 0.30645906661357497, + "grad_norm": 7.434659957885742, + "learning_rate": 8.307303360284618e-06, + "loss": 0.7691242218017578, + "memory(GiB)": 33.07, + "step": 6590, + "token_acc": 0.8111338797814208, + "train_speed(iter/s)": 0.09619 + }, + { + "epoch": 0.3066915848735246, + "grad_norm": 8.359783172607422, + "learning_rate": 8.30441878356129e-06, + "loss": 0.7094342708587646, + "memory(GiB)": 33.07, + "step": 6595, + "token_acc": 0.8230152949745084, + "train_speed(iter/s)": 0.096232 + }, + { + "epoch": 0.3069241031334742, + "grad_norm": 7.274518966674805, + "learning_rate": 8.301532252833112e-06, + "loss": 0.6358844757080078, + "memory(GiB)": 33.07, + "step": 6600, + "token_acc": 0.8455056179775281, + "train_speed(iter/s)": 0.096272 + }, + { + "epoch": 0.3069241031334742, + "eval_loss": 0.6463093161582947, + "eval_runtime": 289.4779, + "eval_samples_per_second": 12.004, + "eval_steps_per_second": 12.004, + "step": 6600 + }, + { + "epoch": 0.3071566213934238, + "grad_norm": 6.91964054107666, + "learning_rate": 8.298643769806981e-06, + "loss": 0.7548566818237304, + "memory(GiB)": 33.07, + "step": 6605, + "token_acc": 0.8135487582702081, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.3073891396533734, + "grad_norm": 5.320629119873047, + "learning_rate": 8.295753336190945e-06, + "loss": 0.6493088722229003, + "memory(GiB)": 33.07, + "step": 6610, + "token_acc": 0.8388537402307406, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.307621657913323, + "grad_norm": 6.0627217292785645, + "learning_rate": 8.292860953694208e-06, + "loss": 0.8688552856445313, + "memory(GiB)": 33.07, + "step": 6615, + "token_acc": 0.7757100881488737, + "train_speed(iter/s)": 0.09599 + }, + { + "epoch": 0.3078541761732726, + "grad_norm": 6.99749231338501, + "learning_rate": 8.289966624027123e-06, + "loss": 0.6862592220306396, + "memory(GiB)": 33.07, + "step": 6620, + "token_acc": 0.8159357628165534, + "train_speed(iter/s)": 0.096031 + }, + { + "epoch": 0.30808669443322223, + "grad_norm": 6.315097332000732, + "learning_rate": 8.287070348901198e-06, + "loss": 0.7650423526763916, + "memory(GiB)": 33.07, + "step": 6625, + "token_acc": 0.8128571428571428, + "train_speed(iter/s)": 0.096072 + }, + { + "epoch": 0.3083192126931718, + "grad_norm": 6.0371527671813965, + "learning_rate": 8.28417213002909e-06, + "loss": 0.6402733325958252, + "memory(GiB)": 33.07, + "step": 6630, + "token_acc": 0.8501052631578947, + "train_speed(iter/s)": 0.096113 + }, + { + "epoch": 0.30855173095312144, + "grad_norm": 5.829378604888916, + "learning_rate": 8.281271969124602e-06, + "loss": 0.8457640647888184, + "memory(GiB)": 33.07, + "step": 6635, + "token_acc": 0.7996213316503629, + "train_speed(iter/s)": 0.096153 + }, + { + "epoch": 0.308784249213071, + "grad_norm": 6.845560073852539, + "learning_rate": 8.278369867902693e-06, + "loss": 0.678613805770874, + "memory(GiB)": 33.07, + "step": 6640, + "token_acc": 0.8426270136307311, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.3090167674730206, + "grad_norm": 5.743708610534668, + "learning_rate": 8.275465828079463e-06, + "loss": 0.8091531753540039, + "memory(GiB)": 33.07, + "step": 6645, + "token_acc": 0.811193309745899, + "train_speed(iter/s)": 0.096235 + }, + { + "epoch": 0.3092492857329702, + "grad_norm": 5.360169410705566, + "learning_rate": 8.27255985137216e-06, + "loss": 0.633982801437378, + "memory(GiB)": 33.07, + "step": 6650, + "token_acc": 0.843441466854725, + "train_speed(iter/s)": 0.096275 + }, + { + "epoch": 0.3092492857329702, + "eval_loss": 0.6490738987922668, + "eval_runtime": 290.6445, + "eval_samples_per_second": 11.956, + "eval_steps_per_second": 11.956, + "step": 6650 + }, + { + "epoch": 0.3094818039929198, + "grad_norm": 8.406295776367188, + "learning_rate": 8.26965193949918e-06, + "loss": 0.7386780738830566, + "memory(GiB)": 33.07, + "step": 6655, + "token_acc": 0.8133779961053635, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.30971432225286943, + "grad_norm": 6.63882303237915, + "learning_rate": 8.266742094180058e-06, + "loss": 0.7321940422058105, + "memory(GiB)": 33.07, + "step": 6660, + "token_acc": 0.8038100653966449, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.309946840512819, + "grad_norm": 5.776708602905273, + "learning_rate": 8.263830317135479e-06, + "loss": 0.6394780158996582, + "memory(GiB)": 33.07, + "step": 6665, + "token_acc": 0.850541215653622, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.31017935877276864, + "grad_norm": 6.080323219299316, + "learning_rate": 8.260916610087264e-06, + "loss": 0.903103256225586, + "memory(GiB)": 33.07, + "step": 6670, + "token_acc": 0.7794170564951421, + "train_speed(iter/s)": 0.096032 + }, + { + "epoch": 0.3104118770327182, + "grad_norm": 5.895566463470459, + "learning_rate": 8.258000974758378e-06, + "loss": 0.6670703887939453, + "memory(GiB)": 33.07, + "step": 6675, + "token_acc": 0.8356873822975518, + "train_speed(iter/s)": 0.096072 + }, + { + "epoch": 0.31064439529266785, + "grad_norm": 5.250027179718018, + "learning_rate": 8.255083412872927e-06, + "loss": 0.6507026672363281, + "memory(GiB)": 33.07, + "step": 6680, + "token_acc": 0.8399592252803262, + "train_speed(iter/s)": 0.096113 + }, + { + "epoch": 0.31087691355261743, + "grad_norm": 6.803175449371338, + "learning_rate": 8.252163926156154e-06, + "loss": 0.7843762397766113, + "memory(GiB)": 33.07, + "step": 6685, + "token_acc": 0.7998108448928121, + "train_speed(iter/s)": 0.096154 + }, + { + "epoch": 0.311109431812567, + "grad_norm": 6.703561305999756, + "learning_rate": 8.249242516334444e-06, + "loss": 0.6578123569488525, + "memory(GiB)": 33.07, + "step": 6690, + "token_acc": 0.8411325206449076, + "train_speed(iter/s)": 0.096195 + }, + { + "epoch": 0.31134195007251664, + "grad_norm": 5.961348533630371, + "learning_rate": 8.246319185135317e-06, + "loss": 0.7796625137329102, + "memory(GiB)": 33.07, + "step": 6695, + "token_acc": 0.80397127165188, + "train_speed(iter/s)": 0.096235 + }, + { + "epoch": 0.3115744683324662, + "grad_norm": 7.202671527862549, + "learning_rate": 8.243393934287424e-06, + "loss": 0.6795454502105713, + "memory(GiB)": 33.07, + "step": 6700, + "token_acc": 0.8431786216596343, + "train_speed(iter/s)": 0.096276 + }, + { + "epoch": 0.3115744683324662, + "eval_loss": 0.6488170027732849, + "eval_runtime": 293.4475, + "eval_samples_per_second": 11.842, + "eval_steps_per_second": 11.842, + "step": 6700 + }, + { + "epoch": 0.31180698659241585, + "grad_norm": 6.131471157073975, + "learning_rate": 8.240466765520563e-06, + "loss": 0.7111649513244629, + "memory(GiB)": 33.07, + "step": 6705, + "token_acc": 0.8138903053094525, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.3120395048523654, + "grad_norm": 7.5404815673828125, + "learning_rate": 8.237537680565655e-06, + "loss": 0.6519620895385743, + "memory(GiB)": 33.07, + "step": 6710, + "token_acc": 0.8393972804116133, + "train_speed(iter/s)": 0.095952 + }, + { + "epoch": 0.31227202311231506, + "grad_norm": 5.742206573486328, + "learning_rate": 8.23460668115476e-06, + "loss": 0.7203374862670898, + "memory(GiB)": 33.07, + "step": 6715, + "token_acc": 0.8262962962962963, + "train_speed(iter/s)": 0.095993 + }, + { + "epoch": 0.31250454137226463, + "grad_norm": 5.321285247802734, + "learning_rate": 8.231673769021066e-06, + "loss": 0.893209171295166, + "memory(GiB)": 33.07, + "step": 6720, + "token_acc": 0.797979797979798, + "train_speed(iter/s)": 0.096034 + }, + { + "epoch": 0.31273705963221426, + "grad_norm": 7.4848952293396, + "learning_rate": 8.228738945898897e-06, + "loss": 0.6799060344696045, + "memory(GiB)": 33.07, + "step": 6725, + "token_acc": 0.827831025914093, + "train_speed(iter/s)": 0.096074 + }, + { + "epoch": 0.31296957789216384, + "grad_norm": 5.708653450012207, + "learning_rate": 8.225802213523705e-06, + "loss": 0.6490331649780273, + "memory(GiB)": 33.07, + "step": 6730, + "token_acc": 0.8281505728314239, + "train_speed(iter/s)": 0.096114 + }, + { + "epoch": 0.3132020961521134, + "grad_norm": 4.684449195861816, + "learning_rate": 8.222863573632068e-06, + "loss": 0.7676737785339356, + "memory(GiB)": 33.07, + "step": 6735, + "token_acc": 0.8076923076923077, + "train_speed(iter/s)": 0.096153 + }, + { + "epoch": 0.31343461441206305, + "grad_norm": 5.874281406402588, + "learning_rate": 8.219923027961696e-06, + "loss": 0.6278162002563477, + "memory(GiB)": 33.07, + "step": 6740, + "token_acc": 0.8484974958263773, + "train_speed(iter/s)": 0.096194 + }, + { + "epoch": 0.3136671326720126, + "grad_norm": 6.561712265014648, + "learning_rate": 8.216980578251426e-06, + "loss": 0.6256554603576661, + "memory(GiB)": 33.07, + "step": 6745, + "token_acc": 0.8286576168929111, + "train_speed(iter/s)": 0.096233 + }, + { + "epoch": 0.31389965093196226, + "grad_norm": 7.314164161682129, + "learning_rate": 8.214036226241216e-06, + "loss": 0.6697664737701416, + "memory(GiB)": 33.07, + "step": 6750, + "token_acc": 0.8280954184021204, + "train_speed(iter/s)": 0.096272 + }, + { + "epoch": 0.31389965093196226, + "eval_loss": 0.6476150155067444, + "eval_runtime": 294.4404, + "eval_samples_per_second": 11.802, + "eval_steps_per_second": 11.802, + "step": 6750 + }, + { + "epoch": 0.31413216919191184, + "grad_norm": 6.524766445159912, + "learning_rate": 8.211089973672155e-06, + "loss": 0.8700243949890136, + "memory(GiB)": 33.07, + "step": 6755, + "token_acc": 0.8140937559915639, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.31436468745186147, + "grad_norm": 5.901501655578613, + "learning_rate": 8.208141822286452e-06, + "loss": 0.7633928298950196, + "memory(GiB)": 33.07, + "step": 6760, + "token_acc": 0.8067516362383741, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.31459720571181105, + "grad_norm": 8.813726425170898, + "learning_rate": 8.20519177382744e-06, + "loss": 0.8027013778686524, + "memory(GiB)": 36.53, + "step": 6765, + "token_acc": 0.7644291091593476, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.3148297239717607, + "grad_norm": 7.789068222045898, + "learning_rate": 8.202239830039572e-06, + "loss": 0.6651137828826904, + "memory(GiB)": 36.53, + "step": 6770, + "token_acc": 0.8395303326810176, + "train_speed(iter/s)": 0.096025 + }, + { + "epoch": 0.31506224223171025, + "grad_norm": 5.441076755523682, + "learning_rate": 8.199285992668426e-06, + "loss": 0.7530568599700928, + "memory(GiB)": 36.53, + "step": 6775, + "token_acc": 0.8276004973062577, + "train_speed(iter/s)": 0.096065 + }, + { + "epoch": 0.3152947604916599, + "grad_norm": 5.6672234535217285, + "learning_rate": 8.196330263460698e-06, + "loss": 0.8605976104736328, + "memory(GiB)": 36.53, + "step": 6780, + "token_acc": 0.7902680197762165, + "train_speed(iter/s)": 0.096105 + }, + { + "epoch": 0.31552727875160946, + "grad_norm": 6.022310256958008, + "learning_rate": 8.1933726441642e-06, + "loss": 0.7086090087890625, + "memory(GiB)": 36.53, + "step": 6785, + "token_acc": 0.8303532490187527, + "train_speed(iter/s)": 0.096145 + }, + { + "epoch": 0.31575979701155904, + "grad_norm": 5.349603176116943, + "learning_rate": 8.190413136527861e-06, + "loss": 0.8489409446716308, + "memory(GiB)": 36.53, + "step": 6790, + "token_acc": 0.7902423865755127, + "train_speed(iter/s)": 0.096184 + }, + { + "epoch": 0.3159923152715087, + "grad_norm": 7.828374862670898, + "learning_rate": 8.187451742301735e-06, + "loss": 0.7629014492034912, + "memory(GiB)": 36.53, + "step": 6795, + "token_acc": 0.8210290827740492, + "train_speed(iter/s)": 0.096224 + }, + { + "epoch": 0.31622483353145825, + "grad_norm": 9.842150688171387, + "learning_rate": 8.184488463236984e-06, + "loss": 0.6981842041015625, + "memory(GiB)": 36.53, + "step": 6800, + "token_acc": 0.8441385435168739, + "train_speed(iter/s)": 0.096264 + }, + { + "epoch": 0.31622483353145825, + "eval_loss": 0.6452161073684692, + "eval_runtime": 293.1209, + "eval_samples_per_second": 11.855, + "eval_steps_per_second": 11.855, + "step": 6800 + }, + { + "epoch": 0.3164573517914079, + "grad_norm": 5.845304489135742, + "learning_rate": 8.181523301085883e-06, + "loss": 0.8383314132690429, + "memory(GiB)": 36.53, + "step": 6805, + "token_acc": 0.813117224573554, + "train_speed(iter/s)": 0.095906 + }, + { + "epoch": 0.31668987005135746, + "grad_norm": 7.104671478271484, + "learning_rate": 8.178556257601828e-06, + "loss": 0.7857285499572754, + "memory(GiB)": 36.53, + "step": 6810, + "token_acc": 0.8020100502512563, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.3169223883113071, + "grad_norm": 5.765302658081055, + "learning_rate": 8.175587334539321e-06, + "loss": 0.7419705867767334, + "memory(GiB)": 36.53, + "step": 6815, + "token_acc": 0.8118503118503119, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.31715490657125667, + "grad_norm": 7.719875335693359, + "learning_rate": 8.172616533653978e-06, + "loss": 0.7236505508422851, + "memory(GiB)": 36.53, + "step": 6820, + "token_acc": 0.8233870967741935, + "train_speed(iter/s)": 0.096025 + }, + { + "epoch": 0.3173874248312063, + "grad_norm": 5.352121829986572, + "learning_rate": 8.169643856702528e-06, + "loss": 0.8514439582824707, + "memory(GiB)": 36.53, + "step": 6825, + "token_acc": 0.7984257357973991, + "train_speed(iter/s)": 0.096064 + }, + { + "epoch": 0.3176199430911559, + "grad_norm": 5.204432964324951, + "learning_rate": 8.166669305442803e-06, + "loss": 0.7329915523529053, + "memory(GiB)": 36.53, + "step": 6830, + "token_acc": 0.8289398280802293, + "train_speed(iter/s)": 0.096103 + }, + { + "epoch": 0.31785246135110545, + "grad_norm": 8.581599235534668, + "learning_rate": 8.16369288163375e-06, + "loss": 0.7899059295654297, + "memory(GiB)": 36.53, + "step": 6835, + "token_acc": 0.8126003210272873, + "train_speed(iter/s)": 0.096141 + }, + { + "epoch": 0.3180849796110551, + "grad_norm": 8.947052955627441, + "learning_rate": 8.160714587035418e-06, + "loss": 0.6274521827697754, + "memory(GiB)": 36.53, + "step": 6840, + "token_acc": 0.8581187598528639, + "train_speed(iter/s)": 0.096181 + }, + { + "epoch": 0.31831749787100466, + "grad_norm": 5.990546226501465, + "learning_rate": 8.157734423408964e-06, + "loss": 0.6243311405181885, + "memory(GiB)": 36.53, + "step": 6845, + "token_acc": 0.8362896190753126, + "train_speed(iter/s)": 0.09622 + }, + { + "epoch": 0.3185500161309543, + "grad_norm": 6.6962690353393555, + "learning_rate": 8.154752392516654e-06, + "loss": 0.6459828853607178, + "memory(GiB)": 36.53, + "step": 6850, + "token_acc": 0.8609100310237849, + "train_speed(iter/s)": 0.096259 + }, + { + "epoch": 0.3185500161309543, + "eval_loss": 0.6414746046066284, + "eval_runtime": 296.31, + "eval_samples_per_second": 11.728, + "eval_steps_per_second": 11.728, + "step": 6850 + }, + { + "epoch": 0.31878253439090387, + "grad_norm": 5.734102249145508, + "learning_rate": 8.151768496121852e-06, + "loss": 0.7357370853424072, + "memory(GiB)": 36.53, + "step": 6855, + "token_acc": 0.8145698542054879, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.3190150526508535, + "grad_norm": 7.676290035247803, + "learning_rate": 8.148782735989032e-06, + "loss": 0.7362208366394043, + "memory(GiB)": 36.53, + "step": 6860, + "token_acc": 0.8165252906063463, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.3192475709108031, + "grad_norm": 4.947035789489746, + "learning_rate": 8.145795113883762e-06, + "loss": 0.8011846542358398, + "memory(GiB)": 36.53, + "step": 6865, + "token_acc": 0.8070175438596491, + "train_speed(iter/s)": 0.095977 + }, + { + "epoch": 0.3194800891707527, + "grad_norm": 6.373569488525391, + "learning_rate": 8.142805631572714e-06, + "loss": 0.6174682140350342, + "memory(GiB)": 36.53, + "step": 6870, + "token_acc": 0.8545526212061489, + "train_speed(iter/s)": 0.096015 + }, + { + "epoch": 0.3197126074307023, + "grad_norm": 8.267929077148438, + "learning_rate": 8.139814290823666e-06, + "loss": 0.6648574829101562, + "memory(GiB)": 36.53, + "step": 6875, + "token_acc": 0.8253856942496494, + "train_speed(iter/s)": 0.096054 + }, + { + "epoch": 0.3199451256906519, + "grad_norm": 5.163671016693115, + "learning_rate": 8.13682109340549e-06, + "loss": 0.906800651550293, + "memory(GiB)": 36.53, + "step": 6880, + "token_acc": 0.7814548860443334, + "train_speed(iter/s)": 0.096093 + }, + { + "epoch": 0.3201776439506015, + "grad_norm": 6.802454471588135, + "learning_rate": 8.133826041088151e-06, + "loss": 0.7294719696044922, + "memory(GiB)": 36.53, + "step": 6885, + "token_acc": 0.8328280640970117, + "train_speed(iter/s)": 0.096133 + }, + { + "epoch": 0.3204101622105511, + "grad_norm": 7.575679779052734, + "learning_rate": 8.130829135642719e-06, + "loss": 0.8211429595947266, + "memory(GiB)": 36.53, + "step": 6890, + "token_acc": 0.7984913793103449, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.3206426804705007, + "grad_norm": 8.01412582397461, + "learning_rate": 8.127830378841356e-06, + "loss": 0.8153658866882324, + "memory(GiB)": 36.53, + "step": 6895, + "token_acc": 0.8100911002102312, + "train_speed(iter/s)": 0.096212 + }, + { + "epoch": 0.3208751987304503, + "grad_norm": 6.638772010803223, + "learning_rate": 8.124829772457324e-06, + "loss": 0.8706229209899903, + "memory(GiB)": 36.53, + "step": 6900, + "token_acc": 0.8031319910514542, + "train_speed(iter/s)": 0.096251 + }, + { + "epoch": 0.3208751987304503, + "eval_loss": 0.6478042006492615, + "eval_runtime": 294.3248, + "eval_samples_per_second": 11.807, + "eval_steps_per_second": 11.807, + "step": 6900 + }, + { + "epoch": 0.3211077169903999, + "grad_norm": 7.012734413146973, + "learning_rate": 8.121827318264966e-06, + "loss": 0.7513750076293946, + "memory(GiB)": 36.53, + "step": 6905, + "token_acc": 0.8138943484015884, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.3213402352503495, + "grad_norm": 4.67357063293457, + "learning_rate": 8.118823018039732e-06, + "loss": 0.7380107402801513, + "memory(GiB)": 36.53, + "step": 6910, + "token_acc": 0.814316974054392, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.3215727535102991, + "grad_norm": 5.693792343139648, + "learning_rate": 8.115816873558155e-06, + "loss": 0.7206833839416504, + "memory(GiB)": 36.53, + "step": 6915, + "token_acc": 0.8147345612134345, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.3218052717702487, + "grad_norm": 7.146193027496338, + "learning_rate": 8.112808886597863e-06, + "loss": 0.8876776695251465, + "memory(GiB)": 36.53, + "step": 6920, + "token_acc": 0.7880449684672334, + "train_speed(iter/s)": 0.096012 + }, + { + "epoch": 0.32203779003019833, + "grad_norm": 6.618865013122559, + "learning_rate": 8.109799058937568e-06, + "loss": 0.7404951095581055, + "memory(GiB)": 36.53, + "step": 6925, + "token_acc": 0.805001689760054, + "train_speed(iter/s)": 0.09605 + }, + { + "epoch": 0.3222703082901479, + "grad_norm": 6.573246955871582, + "learning_rate": 8.106787392357077e-06, + "loss": 0.7026708602905274, + "memory(GiB)": 36.53, + "step": 6930, + "token_acc": 0.8272071453108897, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.3225028265500975, + "grad_norm": 7.522039890289307, + "learning_rate": 8.103773888637281e-06, + "loss": 0.712070894241333, + "memory(GiB)": 36.53, + "step": 6935, + "token_acc": 0.8365795724465558, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.3227353448100471, + "grad_norm": 5.250293731689453, + "learning_rate": 8.100758549560157e-06, + "loss": 0.6830814838409424, + "memory(GiB)": 36.53, + "step": 6940, + "token_acc": 0.8091185410334346, + "train_speed(iter/s)": 0.096168 + }, + { + "epoch": 0.3229678630699967, + "grad_norm": 9.969382286071777, + "learning_rate": 8.09774137690877e-06, + "loss": 0.7145745754241943, + "memory(GiB)": 36.53, + "step": 6945, + "token_acc": 0.8318619582664526, + "train_speed(iter/s)": 0.096207 + }, + { + "epoch": 0.32320038132994633, + "grad_norm": 6.600118637084961, + "learning_rate": 8.094722372467264e-06, + "loss": 0.6322240352630615, + "memory(GiB)": 36.53, + "step": 6950, + "token_acc": 0.8467902051621443, + "train_speed(iter/s)": 0.096246 + }, + { + "epoch": 0.32320038132994633, + "eval_loss": 0.6434064507484436, + "eval_runtime": 292.2564, + "eval_samples_per_second": 11.89, + "eval_steps_per_second": 11.89, + "step": 6950 + }, + { + "epoch": 0.3234328995898959, + "grad_norm": 7.271444320678711, + "learning_rate": 8.091701538020871e-06, + "loss": 0.5597502708435058, + "memory(GiB)": 36.53, + "step": 6955, + "token_acc": 0.8157582938388626, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.32366541784984554, + "grad_norm": 6.547903537750244, + "learning_rate": 8.088678875355907e-06, + "loss": 0.7079691410064697, + "memory(GiB)": 36.53, + "step": 6960, + "token_acc": 0.8249725375320396, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.3238979361097951, + "grad_norm": 6.421803951263428, + "learning_rate": 8.08565438625976e-06, + "loss": 0.7322468757629395, + "memory(GiB)": 36.53, + "step": 6965, + "token_acc": 0.822429906542056, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.32413045436974475, + "grad_norm": 5.57395076751709, + "learning_rate": 8.082628072520909e-06, + "loss": 0.7521162509918213, + "memory(GiB)": 36.53, + "step": 6970, + "token_acc": 0.817231548938866, + "train_speed(iter/s)": 0.096015 + }, + { + "epoch": 0.3243629726296943, + "grad_norm": 6.311516761779785, + "learning_rate": 8.079599935928903e-06, + "loss": 0.6746647357940674, + "memory(GiB)": 36.53, + "step": 6975, + "token_acc": 0.8303362001563722, + "train_speed(iter/s)": 0.096054 + }, + { + "epoch": 0.3245954908896439, + "grad_norm": 6.750150203704834, + "learning_rate": 8.076569978274373e-06, + "loss": 0.6810788154602051, + "memory(GiB)": 36.53, + "step": 6980, + "token_acc": 0.8330940416367552, + "train_speed(iter/s)": 0.096092 + }, + { + "epoch": 0.32482800914959353, + "grad_norm": 6.657168388366699, + "learning_rate": 8.073538201349027e-06, + "loss": 0.7422618865966797, + "memory(GiB)": 36.53, + "step": 6985, + "token_acc": 0.8146167557932263, + "train_speed(iter/s)": 0.09613 + }, + { + "epoch": 0.3250605274095431, + "grad_norm": 7.414977550506592, + "learning_rate": 8.070504606945652e-06, + "loss": 0.744996976852417, + "memory(GiB)": 36.53, + "step": 6990, + "token_acc": 0.8114323258869908, + "train_speed(iter/s)": 0.096168 + }, + { + "epoch": 0.32529304566949274, + "grad_norm": 5.731770992279053, + "learning_rate": 8.067469196858101e-06, + "loss": 0.714255428314209, + "memory(GiB)": 36.53, + "step": 6995, + "token_acc": 0.8291497975708502, + "train_speed(iter/s)": 0.096207 + }, + { + "epoch": 0.3255255639294423, + "grad_norm": 6.356823921203613, + "learning_rate": 8.064431972881308e-06, + "loss": 0.8558525085449219, + "memory(GiB)": 36.53, + "step": 7000, + "token_acc": 0.784629981024668, + "train_speed(iter/s)": 0.096245 + }, + { + "epoch": 0.3255255639294423, + "eval_loss": 0.6428462266921997, + "eval_runtime": 289.8023, + "eval_samples_per_second": 11.991, + "eval_steps_per_second": 11.991, + "step": 7000 + }, + { + "epoch": 0.32575808218939195, + "grad_norm": 7.4635443687438965, + "learning_rate": 8.061392936811276e-06, + "loss": 0.6529666900634765, + "memory(GiB)": 36.53, + "step": 7005, + "token_acc": 0.8152108566527458, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.3259906004493415, + "grad_norm": 5.513030052185059, + "learning_rate": 8.058352090445085e-06, + "loss": 0.7095055103302002, + "memory(GiB)": 36.53, + "step": 7010, + "token_acc": 0.8111888111888111, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.32622311870929116, + "grad_norm": 6.568864345550537, + "learning_rate": 8.055309435580874e-06, + "loss": 0.796638298034668, + "memory(GiB)": 36.53, + "step": 7015, + "token_acc": 0.8049580751002552, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.32645563696924074, + "grad_norm": 5.537734508514404, + "learning_rate": 8.052264974017864e-06, + "loss": 0.8192606925964355, + "memory(GiB)": 36.53, + "step": 7020, + "token_acc": 0.8001395186606208, + "train_speed(iter/s)": 0.096018 + }, + { + "epoch": 0.32668815522919037, + "grad_norm": 5.668457984924316, + "learning_rate": 8.049218707556338e-06, + "loss": 0.7112496852874756, + "memory(GiB)": 36.53, + "step": 7025, + "token_acc": 0.8256777108433735, + "train_speed(iter/s)": 0.096057 + }, + { + "epoch": 0.32692067348913995, + "grad_norm": 7.687334060668945, + "learning_rate": 8.046170637997651e-06, + "loss": 0.7744904041290284, + "memory(GiB)": 36.53, + "step": 7030, + "token_acc": 0.8145842596709649, + "train_speed(iter/s)": 0.096096 + }, + { + "epoch": 0.3271531917490895, + "grad_norm": 6.078497886657715, + "learning_rate": 8.043120767144212e-06, + "loss": 0.6508955955505371, + "memory(GiB)": 36.53, + "step": 7035, + "token_acc": 0.834321590512731, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.32738571000903915, + "grad_norm": 5.643200397491455, + "learning_rate": 8.040069096799511e-06, + "loss": 0.8292275428771972, + "memory(GiB)": 36.53, + "step": 7040, + "token_acc": 0.7918106886466365, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.32761822826898873, + "grad_norm": 6.846371173858643, + "learning_rate": 8.037015628768092e-06, + "loss": 0.7285429477691651, + "memory(GiB)": 36.53, + "step": 7045, + "token_acc": 0.8193423597678917, + "train_speed(iter/s)": 0.096211 + }, + { + "epoch": 0.32785074652893836, + "grad_norm": 7.871469974517822, + "learning_rate": 8.033960364855566e-06, + "loss": 0.7744301795959473, + "memory(GiB)": 36.53, + "step": 7050, + "token_acc": 0.818105616093881, + "train_speed(iter/s)": 0.09625 + }, + { + "epoch": 0.32785074652893836, + "eval_loss": 0.6424740552902222, + "eval_runtime": 295.611, + "eval_samples_per_second": 11.755, + "eval_steps_per_second": 11.755, + "step": 7050 + }, + { + "epoch": 0.32808326478888794, + "grad_norm": 6.070770740509033, + "learning_rate": 8.030903306868605e-06, + "loss": 0.7583102226257324, + "memory(GiB)": 36.53, + "step": 7055, + "token_acc": 0.8147746967071057, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.3283157830488376, + "grad_norm": 7.5017523765563965, + "learning_rate": 8.027844456614942e-06, + "loss": 0.8310544967651368, + "memory(GiB)": 36.53, + "step": 7060, + "token_acc": 0.7947409733124019, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.32854830130878715, + "grad_norm": 8.23862361907959, + "learning_rate": 8.024783815903367e-06, + "loss": 0.7447206974029541, + "memory(GiB)": 36.53, + "step": 7065, + "token_acc": 0.8162358642972536, + "train_speed(iter/s)": 0.095977 + }, + { + "epoch": 0.3287808195687368, + "grad_norm": 7.500193119049072, + "learning_rate": 8.021721386543733e-06, + "loss": 0.8137165069580078, + "memory(GiB)": 36.53, + "step": 7070, + "token_acc": 0.7975917431192661, + "train_speed(iter/s)": 0.096015 + }, + { + "epoch": 0.32901333782868636, + "grad_norm": 7.275796413421631, + "learning_rate": 8.018657170346951e-06, + "loss": 0.7593709468841553, + "memory(GiB)": 36.53, + "step": 7075, + "token_acc": 0.8170637970791699, + "train_speed(iter/s)": 0.096051 + }, + { + "epoch": 0.32924585608863594, + "grad_norm": 7.272811412811279, + "learning_rate": 8.015591169124984e-06, + "loss": 0.7790214538574218, + "memory(GiB)": 36.53, + "step": 7080, + "token_acc": 0.8054070112893642, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.32947837434858557, + "grad_norm": 8.291638374328613, + "learning_rate": 8.012523384690853e-06, + "loss": 0.7552329063415527, + "memory(GiB)": 36.53, + "step": 7085, + "token_acc": 0.8216233557908245, + "train_speed(iter/s)": 0.096127 + }, + { + "epoch": 0.32971089260853514, + "grad_norm": 7.049375057220459, + "learning_rate": 8.009453818858637e-06, + "loss": 0.7487932682037354, + "memory(GiB)": 36.53, + "step": 7090, + "token_acc": 0.8164435946462715, + "train_speed(iter/s)": 0.096163 + }, + { + "epoch": 0.3299434108684848, + "grad_norm": 6.85177755355835, + "learning_rate": 8.006382473443461e-06, + "loss": 0.6691460132598877, + "memory(GiB)": 36.53, + "step": 7095, + "token_acc": 0.8258766626360339, + "train_speed(iter/s)": 0.096201 + }, + { + "epoch": 0.33017592912843435, + "grad_norm": 6.34906530380249, + "learning_rate": 8.00330935026151e-06, + "loss": 0.6504391193389892, + "memory(GiB)": 36.53, + "step": 7100, + "token_acc": 0.8318752377329783, + "train_speed(iter/s)": 0.096239 + }, + { + "epoch": 0.33017592912843435, + "eval_loss": 0.6397922039031982, + "eval_runtime": 293.0806, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 11.857, + "step": 7100 + }, + { + "epoch": 0.330408447388384, + "grad_norm": 6.709843158721924, + "learning_rate": 8.000234451130013e-06, + "loss": 0.797484302520752, + "memory(GiB)": 36.53, + "step": 7105, + "token_acc": 0.8149428407203985, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.33064096564833356, + "grad_norm": 7.30489444732666, + "learning_rate": 7.997157777867255e-06, + "loss": 0.7768725395202637, + "memory(GiB)": 36.53, + "step": 7110, + "token_acc": 0.8156225218080888, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.3308734839082832, + "grad_norm": 7.3210577964782715, + "learning_rate": 7.994079332292566e-06, + "loss": 0.7170722007751464, + "memory(GiB)": 36.53, + "step": 7115, + "token_acc": 0.8237371953373366, + "train_speed(iter/s)": 0.095972 + }, + { + "epoch": 0.33110600216823277, + "grad_norm": 5.7141337394714355, + "learning_rate": 7.99099911622633e-06, + "loss": 0.6883097171783448, + "memory(GiB)": 36.53, + "step": 7120, + "token_acc": 0.8426040379068809, + "train_speed(iter/s)": 0.09601 + }, + { + "epoch": 0.33133852042818235, + "grad_norm": 6.0469970703125, + "learning_rate": 7.987917131489971e-06, + "loss": 0.8080909729003907, + "memory(GiB)": 36.53, + "step": 7125, + "token_acc": 0.8011449231696294, + "train_speed(iter/s)": 0.096048 + }, + { + "epoch": 0.331571038688132, + "grad_norm": 5.462447643280029, + "learning_rate": 7.984833379905961e-06, + "loss": 0.7844725608825683, + "memory(GiB)": 36.53, + "step": 7130, + "token_acc": 0.7944785276073619, + "train_speed(iter/s)": 0.096086 + }, + { + "epoch": 0.33180355694808156, + "grad_norm": 7.006442070007324, + "learning_rate": 7.981747863297817e-06, + "loss": 0.6901938438415527, + "memory(GiB)": 36.53, + "step": 7135, + "token_acc": 0.8397600685518424, + "train_speed(iter/s)": 0.096124 + }, + { + "epoch": 0.3320360752080312, + "grad_norm": 5.262815475463867, + "learning_rate": 7.978660583490104e-06, + "loss": 0.6862985134124756, + "memory(GiB)": 36.53, + "step": 7140, + "token_acc": 0.8212121212121212, + "train_speed(iter/s)": 0.096162 + }, + { + "epoch": 0.33226859346798077, + "grad_norm": 6.187898635864258, + "learning_rate": 7.975571542308422e-06, + "loss": 0.691615104675293, + "memory(GiB)": 36.53, + "step": 7145, + "token_acc": 0.835931700074239, + "train_speed(iter/s)": 0.096199 + }, + { + "epoch": 0.3325011117279304, + "grad_norm": 7.044938564300537, + "learning_rate": 7.97248074157942e-06, + "loss": 0.6047670364379882, + "memory(GiB)": 36.53, + "step": 7150, + "token_acc": 0.8514950166112957, + "train_speed(iter/s)": 0.096237 + }, + { + "epoch": 0.3325011117279304, + "eval_loss": 0.640709638595581, + "eval_runtime": 289.414, + "eval_samples_per_second": 12.007, + "eval_steps_per_second": 12.007, + "step": 7150 + }, + { + "epoch": 0.33273362998788, + "grad_norm": 7.655607223510742, + "learning_rate": 7.969388183130779e-06, + "loss": 0.7802319526672363, + "memory(GiB)": 36.53, + "step": 7155, + "token_acc": 0.8151366458130888, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.3329661482478296, + "grad_norm": 6.0845947265625, + "learning_rate": 7.966293868791231e-06, + "loss": 0.6793017387390137, + "memory(GiB)": 36.53, + "step": 7160, + "token_acc": 0.832723279648609, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.3331986665077792, + "grad_norm": 5.9088850021362305, + "learning_rate": 7.963197800390533e-06, + "loss": 0.7765905857086182, + "memory(GiB)": 36.53, + "step": 7165, + "token_acc": 0.8096875895672112, + "train_speed(iter/s)": 0.095975 + }, + { + "epoch": 0.3334311847677288, + "grad_norm": 5.6625895500183105, + "learning_rate": 7.96009997975949e-06, + "loss": 0.6958408832550049, + "memory(GiB)": 36.53, + "step": 7170, + "token_acc": 0.8290849673202615, + "train_speed(iter/s)": 0.096012 + }, + { + "epoch": 0.3336637030276784, + "grad_norm": 6.877390384674072, + "learning_rate": 7.957000408729937e-06, + "loss": 0.7387121200561524, + "memory(GiB)": 36.53, + "step": 7175, + "token_acc": 0.8228388473852721, + "train_speed(iter/s)": 0.096049 + }, + { + "epoch": 0.33389622128762797, + "grad_norm": 8.14645004272461, + "learning_rate": 7.953899089134747e-06, + "loss": 0.7548263072967529, + "memory(GiB)": 36.53, + "step": 7180, + "token_acc": 0.8222559817698443, + "train_speed(iter/s)": 0.096086 + }, + { + "epoch": 0.3341287395475776, + "grad_norm": 6.597789764404297, + "learning_rate": 7.950796022807825e-06, + "loss": 0.8279875755310059, + "memory(GiB)": 36.53, + "step": 7185, + "token_acc": 0.8059187887130076, + "train_speed(iter/s)": 0.096124 + }, + { + "epoch": 0.3343612578075272, + "grad_norm": 7.290713787078857, + "learning_rate": 7.947691211584111e-06, + "loss": 0.7254250049591064, + "memory(GiB)": 36.53, + "step": 7190, + "token_acc": 0.8295350957155879, + "train_speed(iter/s)": 0.096162 + }, + { + "epoch": 0.3345937760674768, + "grad_norm": 6.3813629150390625, + "learning_rate": 7.944584657299574e-06, + "loss": 0.8218119621276856, + "memory(GiB)": 36.53, + "step": 7195, + "token_acc": 0.7869318181818182, + "train_speed(iter/s)": 0.096199 + }, + { + "epoch": 0.3348262943274264, + "grad_norm": 7.908492565155029, + "learning_rate": 7.941476361791219e-06, + "loss": 0.7454845428466796, + "memory(GiB)": 36.53, + "step": 7200, + "token_acc": 0.8088725817211474, + "train_speed(iter/s)": 0.096237 + }, + { + "epoch": 0.3348262943274264, + "eval_loss": 0.6419472694396973, + "eval_runtime": 292.8292, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 11.867, + "step": 7200 + }, + { + "epoch": 0.335058812587376, + "grad_norm": 8.641154289245605, + "learning_rate": 7.938366326897074e-06, + "loss": 0.7571213245391846, + "memory(GiB)": 36.53, + "step": 7205, + "token_acc": 0.8146778119395337, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.3352913308473256, + "grad_norm": 5.167827129364014, + "learning_rate": 7.9352545544562e-06, + "loss": 0.9509881019592286, + "memory(GiB)": 36.53, + "step": 7210, + "token_acc": 0.7677793904208998, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.33552384910727523, + "grad_norm": 7.85144567489624, + "learning_rate": 7.932141046308684e-06, + "loss": 0.7327893733978271, + "memory(GiB)": 36.53, + "step": 7215, + "token_acc": 0.8123628383321141, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.3357563673672248, + "grad_norm": 7.0988287925720215, + "learning_rate": 7.929025804295639e-06, + "loss": 0.7216827392578125, + "memory(GiB)": 36.53, + "step": 7220, + "token_acc": 0.8254152182309772, + "train_speed(iter/s)": 0.096008 + }, + { + "epoch": 0.3359888856271744, + "grad_norm": 7.263143062591553, + "learning_rate": 7.925908830259201e-06, + "loss": 0.8273670196533203, + "memory(GiB)": 36.53, + "step": 7225, + "token_acc": 0.7873343151693667, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.336221403887124, + "grad_norm": 7.280778408050537, + "learning_rate": 7.922790126042539e-06, + "loss": 0.6277379989624023, + "memory(GiB)": 36.53, + "step": 7230, + "token_acc": 0.8471760797342193, + "train_speed(iter/s)": 0.096083 + }, + { + "epoch": 0.3364539221470736, + "grad_norm": 7.691588878631592, + "learning_rate": 7.919669693489835e-06, + "loss": 0.7610618591308593, + "memory(GiB)": 36.53, + "step": 7235, + "token_acc": 0.8135218736190897, + "train_speed(iter/s)": 0.096121 + }, + { + "epoch": 0.3366864404070232, + "grad_norm": 6.554846286773682, + "learning_rate": 7.9165475344463e-06, + "loss": 0.7569685935974121, + "memory(GiB)": 36.53, + "step": 7240, + "token_acc": 0.8254284575528099, + "train_speed(iter/s)": 0.096159 + }, + { + "epoch": 0.3369189586669728, + "grad_norm": 6.927182674407959, + "learning_rate": 7.913423650758158e-06, + "loss": 0.7405023097991943, + "memory(GiB)": 36.53, + "step": 7245, + "token_acc": 0.8135833038556773, + "train_speed(iter/s)": 0.096195 + }, + { + "epoch": 0.33715147692692243, + "grad_norm": 6.637570381164551, + "learning_rate": 7.910298044272661e-06, + "loss": 0.663820457458496, + "memory(GiB)": 36.53, + "step": 7250, + "token_acc": 0.8394245723172629, + "train_speed(iter/s)": 0.096233 + }, + { + "epoch": 0.33715147692692243, + "eval_loss": 0.6380437612533569, + "eval_runtime": 292.3654, + "eval_samples_per_second": 11.886, + "eval_steps_per_second": 11.886, + "step": 7250 + }, + { + "epoch": 0.337383995186872, + "grad_norm": 7.666382312774658, + "learning_rate": 7.90717071683808e-06, + "loss": 0.7239179611206055, + "memory(GiB)": 36.53, + "step": 7255, + "token_acc": 0.8155583638603171, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.33761651344682164, + "grad_norm": 5.590011119842529, + "learning_rate": 7.904041670303695e-06, + "loss": 0.7178312778472901, + "memory(GiB)": 36.53, + "step": 7260, + "token_acc": 0.8258992805755395, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.3378490317067712, + "grad_norm": 6.321069717407227, + "learning_rate": 7.90091090651981e-06, + "loss": 0.7359897136688233, + "memory(GiB)": 36.53, + "step": 7265, + "token_acc": 0.8224368499257058, + "train_speed(iter/s)": 0.095972 + }, + { + "epoch": 0.33808154996672085, + "grad_norm": 7.234593868255615, + "learning_rate": 7.897778427337741e-06, + "loss": 0.6815497398376464, + "memory(GiB)": 36.53, + "step": 7270, + "token_acc": 0.8278571428571428, + "train_speed(iter/s)": 0.09601 + }, + { + "epoch": 0.33831406822667043, + "grad_norm": 7.684406280517578, + "learning_rate": 7.894644234609823e-06, + "loss": 0.8643548965454102, + "memory(GiB)": 36.53, + "step": 7275, + "token_acc": 0.8054187192118226, + "train_speed(iter/s)": 0.096048 + }, + { + "epoch": 0.33854658648662, + "grad_norm": 8.585091590881348, + "learning_rate": 7.891508330189398e-06, + "loss": 0.6693760395050049, + "memory(GiB)": 36.53, + "step": 7280, + "token_acc": 0.8482220294882914, + "train_speed(iter/s)": 0.096086 + }, + { + "epoch": 0.33877910474656964, + "grad_norm": 8.701008796691895, + "learning_rate": 7.888370715930823e-06, + "loss": 0.7502879619598388, + "memory(GiB)": 36.53, + "step": 7285, + "token_acc": 0.806949806949807, + "train_speed(iter/s)": 0.096123 + }, + { + "epoch": 0.3390116230065192, + "grad_norm": 7.461516857147217, + "learning_rate": 7.885231393689467e-06, + "loss": 0.7151779651641845, + "memory(GiB)": 36.53, + "step": 7290, + "token_acc": 0.829172610556348, + "train_speed(iter/s)": 0.096161 + }, + { + "epoch": 0.33924414126646885, + "grad_norm": 6.577462196350098, + "learning_rate": 7.882090365321708e-06, + "loss": 0.8359928131103516, + "memory(GiB)": 36.53, + "step": 7295, + "token_acc": 0.8137787056367433, + "train_speed(iter/s)": 0.096198 + }, + { + "epoch": 0.3394766595264184, + "grad_norm": 7.914944171905518, + "learning_rate": 7.878947632684933e-06, + "loss": 0.7542798519134521, + "memory(GiB)": 36.53, + "step": 7300, + "token_acc": 0.8035133376707873, + "train_speed(iter/s)": 0.096236 + }, + { + "epoch": 0.3394766595264184, + "eval_loss": 0.6390102505683899, + "eval_runtime": 290.9563, + "eval_samples_per_second": 11.943, + "eval_steps_per_second": 11.943, + "step": 7300 + }, + { + "epoch": 0.33970917778636806, + "grad_norm": 7.265176296234131, + "learning_rate": 7.875803197637539e-06, + "loss": 0.6686437606811524, + "memory(GiB)": 36.53, + "step": 7305, + "token_acc": 0.8155821698475727, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.33994169604631763, + "grad_norm": 10.155379295349121, + "learning_rate": 7.872657062038921e-06, + "loss": 0.6359403133392334, + "memory(GiB)": 36.53, + "step": 7310, + "token_acc": 0.8454882571075402, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.34017421430626726, + "grad_norm": 6.692675590515137, + "learning_rate": 7.869509227749495e-06, + "loss": 0.7660014629364014, + "memory(GiB)": 36.53, + "step": 7315, + "token_acc": 0.8152403991533111, + "train_speed(iter/s)": 0.095978 + }, + { + "epoch": 0.34040673256621684, + "grad_norm": 8.65769100189209, + "learning_rate": 7.866359696630666e-06, + "loss": 0.7088188648223877, + "memory(GiB)": 36.53, + "step": 7320, + "token_acc": 0.8209606986899564, + "train_speed(iter/s)": 0.096017 + }, + { + "epoch": 0.3406392508261664, + "grad_norm": 5.40556001663208, + "learning_rate": 7.863208470544852e-06, + "loss": 0.6675843715667724, + "memory(GiB)": 36.53, + "step": 7325, + "token_acc": 0.8324375592791653, + "train_speed(iter/s)": 0.096054 + }, + { + "epoch": 0.34087176908611605, + "grad_norm": 5.316320419311523, + "learning_rate": 7.86005555135547e-06, + "loss": 0.7577543735504151, + "memory(GiB)": 36.53, + "step": 7330, + "token_acc": 0.8117934616559731, + "train_speed(iter/s)": 0.096091 + }, + { + "epoch": 0.3411042873460656, + "grad_norm": 7.521231651306152, + "learning_rate": 7.856900940926937e-06, + "loss": 0.614345932006836, + "memory(GiB)": 36.53, + "step": 7335, + "token_acc": 0.840962904498816, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.34133680560601526, + "grad_norm": 7.031819820404053, + "learning_rate": 7.853744641124672e-06, + "loss": 0.6725636482238769, + "memory(GiB)": 36.53, + "step": 7340, + "token_acc": 0.8362771739130435, + "train_speed(iter/s)": 0.096166 + }, + { + "epoch": 0.34156932386596484, + "grad_norm": 5.72618293762207, + "learning_rate": 7.850586653815093e-06, + "loss": 0.6473200798034668, + "memory(GiB)": 36.53, + "step": 7345, + "token_acc": 0.836783988957902, + "train_speed(iter/s)": 0.096204 + }, + { + "epoch": 0.34180184212591447, + "grad_norm": 7.18748664855957, + "learning_rate": 7.847426980865618e-06, + "loss": 0.8898324012756348, + "memory(GiB)": 36.53, + "step": 7350, + "token_acc": 0.7603327965646807, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.34180184212591447, + "eval_loss": 0.6362443566322327, + "eval_runtime": 291.2812, + "eval_samples_per_second": 11.93, + "eval_steps_per_second": 11.93, + "step": 7350 + }, + { + "epoch": 0.34203436038586404, + "grad_norm": 8.468127250671387, + "learning_rate": 7.844265624144653e-06, + "loss": 0.736899995803833, + "memory(GiB)": 36.53, + "step": 7355, + "token_acc": 0.8153188554564701, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.3422668786458137, + "grad_norm": 6.774470806121826, + "learning_rate": 7.841102585521612e-06, + "loss": 0.8117254257202149, + "memory(GiB)": 36.53, + "step": 7360, + "token_acc": 0.8039726473461413, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.34249939690576325, + "grad_norm": 7.67664098739624, + "learning_rate": 7.837937866866894e-06, + "loss": 0.6635471820831299, + "memory(GiB)": 36.53, + "step": 7365, + "token_acc": 0.8254976704786108, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.34273191516571283, + "grad_norm": 6.061575412750244, + "learning_rate": 7.834771470051895e-06, + "loss": 0.7754477977752685, + "memory(GiB)": 36.53, + "step": 7370, + "token_acc": 0.7987326493663247, + "train_speed(iter/s)": 0.096023 + }, + { + "epoch": 0.34296443342566246, + "grad_norm": 7.205853462219238, + "learning_rate": 7.831603396949005e-06, + "loss": 0.7205926895141601, + "memory(GiB)": 36.53, + "step": 7375, + "token_acc": 0.8245721271393643, + "train_speed(iter/s)": 0.09606 + }, + { + "epoch": 0.34319695168561204, + "grad_norm": 7.258941173553467, + "learning_rate": 7.8284336494316e-06, + "loss": 0.7685527324676513, + "memory(GiB)": 36.53, + "step": 7380, + "token_acc": 0.8014018691588785, + "train_speed(iter/s)": 0.096096 + }, + { + "epoch": 0.34342946994556167, + "grad_norm": 7.458856582641602, + "learning_rate": 7.825262229374054e-06, + "loss": 0.7406332492828369, + "memory(GiB)": 36.53, + "step": 7385, + "token_acc": 0.8115746971736204, + "train_speed(iter/s)": 0.096133 + }, + { + "epoch": 0.34366198820551125, + "grad_norm": 6.199806213378906, + "learning_rate": 7.822089138651723e-06, + "loss": 0.6603247165679932, + "memory(GiB)": 36.53, + "step": 7390, + "token_acc": 0.8321905449296283, + "train_speed(iter/s)": 0.096169 + }, + { + "epoch": 0.3438945064654609, + "grad_norm": 6.644196033477783, + "learning_rate": 7.818914379140953e-06, + "loss": 0.7804720401763916, + "memory(GiB)": 36.53, + "step": 7395, + "token_acc": 0.7989261744966443, + "train_speed(iter/s)": 0.096206 + }, + { + "epoch": 0.34412702472541046, + "grad_norm": 5.659149169921875, + "learning_rate": 7.815737952719081e-06, + "loss": 0.6324204444885254, + "memory(GiB)": 36.53, + "step": 7400, + "token_acc": 0.8367801463569837, + "train_speed(iter/s)": 0.096243 + }, + { + "epoch": 0.34412702472541046, + "eval_loss": 0.6397080421447754, + "eval_runtime": 293.3095, + "eval_samples_per_second": 11.848, + "eval_steps_per_second": 11.848, + "step": 7400 + }, + { + "epoch": 0.3443595429853601, + "grad_norm": 5.597126483917236, + "learning_rate": 7.81255986126442e-06, + "loss": 0.8179545402526855, + "memory(GiB)": 36.53, + "step": 7405, + "token_acc": 0.8155359950203098, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.34459206124530967, + "grad_norm": 6.572484493255615, + "learning_rate": 7.809380106656278e-06, + "loss": 0.7602914333343506, + "memory(GiB)": 36.53, + "step": 7410, + "token_acc": 0.8123157549950868, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.3448245795052593, + "grad_norm": 6.53602933883667, + "learning_rate": 7.806198690774943e-06, + "loss": 0.6647308826446533, + "memory(GiB)": 36.53, + "step": 7415, + "token_acc": 0.8425531914893617, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.3450570977652089, + "grad_norm": 7.273094654083252, + "learning_rate": 7.803015615501679e-06, + "loss": 0.7383760452270508, + "memory(GiB)": 36.53, + "step": 7420, + "token_acc": 0.8075300227198962, + "train_speed(iter/s)": 0.096024 + }, + { + "epoch": 0.34528961602515845, + "grad_norm": 6.734947681427002, + "learning_rate": 7.799830882718743e-06, + "loss": 0.7719890594482421, + "memory(GiB)": 36.53, + "step": 7425, + "token_acc": 0.8220779220779221, + "train_speed(iter/s)": 0.096062 + }, + { + "epoch": 0.3455221342851081, + "grad_norm": 6.639370918273926, + "learning_rate": 7.796644494309361e-06, + "loss": 0.794908094406128, + "memory(GiB)": 36.53, + "step": 7430, + "token_acc": 0.8007067137809187, + "train_speed(iter/s)": 0.096099 + }, + { + "epoch": 0.34575465254505766, + "grad_norm": 4.576010704040527, + "learning_rate": 7.793456452157746e-06, + "loss": 0.7384835243225097, + "memory(GiB)": 36.53, + "step": 7435, + "token_acc": 0.8103640830913558, + "train_speed(iter/s)": 0.096134 + }, + { + "epoch": 0.3459871708050073, + "grad_norm": 7.08085298538208, + "learning_rate": 7.790266758149083e-06, + "loss": 0.7475857257843017, + "memory(GiB)": 36.53, + "step": 7440, + "token_acc": 0.8180333224436459, + "train_speed(iter/s)": 0.096171 + }, + { + "epoch": 0.34621968906495687, + "grad_norm": 6.292354583740234, + "learning_rate": 7.78707541416954e-06, + "loss": 0.702857780456543, + "memory(GiB)": 36.53, + "step": 7445, + "token_acc": 0.821754165356806, + "train_speed(iter/s)": 0.096206 + }, + { + "epoch": 0.3464522073249065, + "grad_norm": 6.977808952331543, + "learning_rate": 7.783882422106254e-06, + "loss": 0.6732274055480957, + "memory(GiB)": 36.53, + "step": 7450, + "token_acc": 0.8363211223694466, + "train_speed(iter/s)": 0.096243 + }, + { + "epoch": 0.3464522073249065, + "eval_loss": 0.6331688165664673, + "eval_runtime": 290.2598, + "eval_samples_per_second": 11.972, + "eval_steps_per_second": 11.972, + "step": 7450 + }, + { + "epoch": 0.3466847255848561, + "grad_norm": 7.257906913757324, + "learning_rate": 7.780687783847341e-06, + "loss": 0.7075543880462647, + "memory(GiB)": 36.53, + "step": 7455, + "token_acc": 0.8163660155905044, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.3469172438448057, + "grad_norm": 6.582991123199463, + "learning_rate": 7.777491501281891e-06, + "loss": 0.7077393531799316, + "memory(GiB)": 36.53, + "step": 7460, + "token_acc": 0.8191687344913151, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.3471497621047553, + "grad_norm": 7.522470951080322, + "learning_rate": 7.77429357629996e-06, + "loss": 0.692666482925415, + "memory(GiB)": 36.53, + "step": 7465, + "token_acc": 0.8261022927689594, + "train_speed(iter/s)": 0.095991 + }, + { + "epoch": 0.34738228036470487, + "grad_norm": 7.295367240905762, + "learning_rate": 7.771094010792585e-06, + "loss": 0.7090956687927246, + "memory(GiB)": 36.53, + "step": 7470, + "token_acc": 0.8261661807580175, + "train_speed(iter/s)": 0.096027 + }, + { + "epoch": 0.3476147986246545, + "grad_norm": 5.637494087219238, + "learning_rate": 7.767892806651765e-06, + "loss": 0.7600067138671875, + "memory(GiB)": 36.53, + "step": 7475, + "token_acc": 0.8067147787888296, + "train_speed(iter/s)": 0.096063 + }, + { + "epoch": 0.3478473168846041, + "grad_norm": 6.288785457611084, + "learning_rate": 7.764689965770472e-06, + "loss": 0.8045848846435547, + "memory(GiB)": 36.53, + "step": 7480, + "token_acc": 0.7907473309608541, + "train_speed(iter/s)": 0.096099 + }, + { + "epoch": 0.3480798351445537, + "grad_norm": 7.792519569396973, + "learning_rate": 7.761485490042642e-06, + "loss": 0.6550180912017822, + "memory(GiB)": 36.53, + "step": 7485, + "token_acc": 0.8389721627408994, + "train_speed(iter/s)": 0.096136 + }, + { + "epoch": 0.3483123534045033, + "grad_norm": 4.879187107086182, + "learning_rate": 7.758279381363184e-06, + "loss": 0.7051380157470704, + "memory(GiB)": 36.53, + "step": 7490, + "token_acc": 0.8250084947332654, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.3485448716644529, + "grad_norm": 7.7562055587768555, + "learning_rate": 7.755071641627968e-06, + "loss": 0.6621471881866455, + "memory(GiB)": 36.53, + "step": 7495, + "token_acc": 0.8392456219128873, + "train_speed(iter/s)": 0.096209 + }, + { + "epoch": 0.3487773899244025, + "grad_norm": 6.802863121032715, + "learning_rate": 7.751862272733825e-06, + "loss": 0.6991421222686768, + "memory(GiB)": 36.53, + "step": 7500, + "token_acc": 0.8280751506557958, + "train_speed(iter/s)": 0.096245 + }, + { + "epoch": 0.3487773899244025, + "eval_loss": 0.6374966502189636, + "eval_runtime": 292.9641, + "eval_samples_per_second": 11.862, + "eval_steps_per_second": 11.862, + "step": 7500 + }, + { + "epoch": 0.3490099081843521, + "grad_norm": 7.602797031402588, + "learning_rate": 7.748651276578563e-06, + "loss": 0.6773967266082763, + "memory(GiB)": 36.53, + "step": 7505, + "token_acc": 0.8170272850529896, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.3492424264443017, + "grad_norm": 5.853288173675537, + "learning_rate": 7.745438655060935e-06, + "loss": 0.8045696258544922, + "memory(GiB)": 36.53, + "step": 7510, + "token_acc": 0.7974137931034483, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.3494749447042513, + "grad_norm": 7.790389060974121, + "learning_rate": 7.742224410080668e-06, + "loss": 0.5909008026123047, + "memory(GiB)": 36.53, + "step": 7515, + "token_acc": 0.8477551020408163, + "train_speed(iter/s)": 0.095993 + }, + { + "epoch": 0.3497074629642009, + "grad_norm": 11.229941368103027, + "learning_rate": 7.739008543538442e-06, + "loss": 0.6343198299407959, + "memory(GiB)": 36.53, + "step": 7520, + "token_acc": 0.8413852073535699, + "train_speed(iter/s)": 0.096029 + }, + { + "epoch": 0.3499399812241505, + "grad_norm": 7.57761812210083, + "learning_rate": 7.735791057335899e-06, + "loss": 0.626180362701416, + "memory(GiB)": 36.53, + "step": 7525, + "token_acc": 0.8515372168284789, + "train_speed(iter/s)": 0.096064 + }, + { + "epoch": 0.3501724994841001, + "grad_norm": 5.744942665100098, + "learning_rate": 7.732571953375638e-06, + "loss": 0.7779204845428467, + "memory(GiB)": 36.53, + "step": 7530, + "token_acc": 0.810126582278481, + "train_speed(iter/s)": 0.096099 + }, + { + "epoch": 0.3504050177440497, + "grad_norm": 6.306331634521484, + "learning_rate": 7.729351233561216e-06, + "loss": 0.6912620067596436, + "memory(GiB)": 36.53, + "step": 7535, + "token_acc": 0.8287752675386445, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.35063753600399933, + "grad_norm": 7.440969467163086, + "learning_rate": 7.72612889979714e-06, + "loss": 0.7543702125549316, + "memory(GiB)": 36.53, + "step": 7540, + "token_acc": 0.8168260038240918, + "train_speed(iter/s)": 0.096171 + }, + { + "epoch": 0.3508700542639489, + "grad_norm": 6.490276336669922, + "learning_rate": 7.72290495398888e-06, + "loss": 0.7501804828643799, + "memory(GiB)": 36.53, + "step": 7545, + "token_acc": 0.8132794068082237, + "train_speed(iter/s)": 0.096206 + }, + { + "epoch": 0.35110257252389854, + "grad_norm": 6.402077674865723, + "learning_rate": 7.719679398042851e-06, + "loss": 0.756907320022583, + "memory(GiB)": 36.53, + "step": 7550, + "token_acc": 0.8172623061362104, + "train_speed(iter/s)": 0.096243 + }, + { + "epoch": 0.35110257252389854, + "eval_loss": 0.6371598839759827, + "eval_runtime": 293.6264, + "eval_samples_per_second": 11.835, + "eval_steps_per_second": 11.835, + "step": 7550 + }, + { + "epoch": 0.3513350907838481, + "grad_norm": 6.457944393157959, + "learning_rate": 7.716452233866427e-06, + "loss": 0.6614902496337891, + "memory(GiB)": 36.53, + "step": 7555, + "token_acc": 0.8163759379208619, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.35156760904379775, + "grad_norm": 9.463583946228027, + "learning_rate": 7.713223463367928e-06, + "loss": 0.831269359588623, + "memory(GiB)": 36.53, + "step": 7560, + "token_acc": 0.7870534135125575, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.3518001273037473, + "grad_norm": 7.320459365844727, + "learning_rate": 7.709993088456625e-06, + "loss": 0.7018909454345703, + "memory(GiB)": 36.53, + "step": 7565, + "token_acc": 0.8254747871643746, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.3520326455636969, + "grad_norm": 6.774960041046143, + "learning_rate": 7.706761111042738e-06, + "loss": 0.7598164558410645, + "memory(GiB)": 36.53, + "step": 7570, + "token_acc": 0.8191214470284238, + "train_speed(iter/s)": 0.096028 + }, + { + "epoch": 0.35226516382364653, + "grad_norm": 5.613641738891602, + "learning_rate": 7.703527533037438e-06, + "loss": 0.7213669776916504, + "memory(GiB)": 36.53, + "step": 7575, + "token_acc": 0.8275735294117647, + "train_speed(iter/s)": 0.096064 + }, + { + "epoch": 0.3524976820835961, + "grad_norm": 8.267657279968262, + "learning_rate": 7.700292356352839e-06, + "loss": 0.6571903705596924, + "memory(GiB)": 36.53, + "step": 7580, + "token_acc": 0.8469924812030075, + "train_speed(iter/s)": 0.096099 + }, + { + "epoch": 0.35273020034354574, + "grad_norm": 6.900925159454346, + "learning_rate": 7.697055582901997e-06, + "loss": 0.7545526504516602, + "memory(GiB)": 36.53, + "step": 7585, + "token_acc": 0.8162962962962963, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.3529627186034953, + "grad_norm": 6.630009174346924, + "learning_rate": 7.693817214598922e-06, + "loss": 0.7359566688537598, + "memory(GiB)": 36.53, + "step": 7590, + "token_acc": 0.815540113708149, + "train_speed(iter/s)": 0.096171 + }, + { + "epoch": 0.35319523686344495, + "grad_norm": 6.38250207901001, + "learning_rate": 7.690577253358555e-06, + "loss": 0.898930549621582, + "memory(GiB)": 36.53, + "step": 7595, + "token_acc": 0.7799352750809061, + "train_speed(iter/s)": 0.096206 + }, + { + "epoch": 0.3534277551233945, + "grad_norm": 7.791952610015869, + "learning_rate": 7.68733570109679e-06, + "loss": 0.6741089820861816, + "memory(GiB)": 36.53, + "step": 7600, + "token_acc": 0.8259507829977628, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.3534277551233945, + "eval_loss": 0.6381203532218933, + "eval_runtime": 291.6684, + "eval_samples_per_second": 11.914, + "eval_steps_per_second": 11.914, + "step": 7600 + }, + { + "epoch": 0.35366027338334416, + "grad_norm": 6.15386962890625, + "learning_rate": 7.684092559730454e-06, + "loss": 0.7473597049713134, + "memory(GiB)": 36.53, + "step": 7605, + "token_acc": 0.8166645391881542, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.35389279164329374, + "grad_norm": 6.232975959777832, + "learning_rate": 7.680847831177318e-06, + "loss": 0.6105194091796875, + "memory(GiB)": 36.53, + "step": 7610, + "token_acc": 0.8553259141494436, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.3541253099032433, + "grad_norm": 7.638478755950928, + "learning_rate": 7.67760151735609e-06, + "loss": 0.8064892768859864, + "memory(GiB)": 36.53, + "step": 7615, + "token_acc": 0.7994109947643979, + "train_speed(iter/s)": 0.095993 + }, + { + "epoch": 0.35435782816319294, + "grad_norm": 6.6157941818237305, + "learning_rate": 7.674353620186416e-06, + "loss": 0.6375001907348633, + "memory(GiB)": 36.53, + "step": 7620, + "token_acc": 0.8336236933797909, + "train_speed(iter/s)": 0.096028 + }, + { + "epoch": 0.3545903464231425, + "grad_norm": 6.55482292175293, + "learning_rate": 7.671104141588877e-06, + "loss": 0.7433343887329101, + "memory(GiB)": 36.53, + "step": 7625, + "token_acc": 0.8180338541666666, + "train_speed(iter/s)": 0.096064 + }, + { + "epoch": 0.35482286468309215, + "grad_norm": 7.421290874481201, + "learning_rate": 7.66785308348499e-06, + "loss": 0.7123981952667237, + "memory(GiB)": 36.53, + "step": 7630, + "token_acc": 0.8225122349102774, + "train_speed(iter/s)": 0.0961 + }, + { + "epoch": 0.35505538294304173, + "grad_norm": 6.6890082359313965, + "learning_rate": 7.664600447797206e-06, + "loss": 0.6621024131774902, + "memory(GiB)": 36.53, + "step": 7635, + "token_acc": 0.8417508417508418, + "train_speed(iter/s)": 0.096136 + }, + { + "epoch": 0.35528790120299136, + "grad_norm": 8.536678314208984, + "learning_rate": 7.661346236448908e-06, + "loss": 0.8194070816040039, + "memory(GiB)": 36.53, + "step": 7640, + "token_acc": 0.7854671280276817, + "train_speed(iter/s)": 0.096171 + }, + { + "epoch": 0.35552041946294094, + "grad_norm": 7.1367597579956055, + "learning_rate": 7.658090451364415e-06, + "loss": 0.735087776184082, + "memory(GiB)": 36.53, + "step": 7645, + "token_acc": 0.8191523778712391, + "train_speed(iter/s)": 0.096206 + }, + { + "epoch": 0.35575293772289057, + "grad_norm": 6.231055736541748, + "learning_rate": 7.65483309446897e-06, + "loss": 0.7065731048583984, + "memory(GiB)": 36.53, + "step": 7650, + "token_acc": 0.8235985887887103, + "train_speed(iter/s)": 0.096241 + }, + { + "epoch": 0.35575293772289057, + "eval_loss": 0.6341753602027893, + "eval_runtime": 292.1601, + "eval_samples_per_second": 11.894, + "eval_steps_per_second": 11.894, + "step": 7650 + }, + { + "epoch": 0.35598545598284015, + "grad_norm": 8.451674461364746, + "learning_rate": 7.65157416768875e-06, + "loss": 0.7524304389953613, + "memory(GiB)": 36.53, + "step": 7655, + "token_acc": 0.8164709654149802, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.3562179742427898, + "grad_norm": 7.157677173614502, + "learning_rate": 7.64831367295086e-06, + "loss": 0.6308634281158447, + "memory(GiB)": 36.53, + "step": 7660, + "token_acc": 0.8431555971312753, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.35645049250273936, + "grad_norm": 6.980820655822754, + "learning_rate": 7.645051612183329e-06, + "loss": 0.7008957862854004, + "memory(GiB)": 36.53, + "step": 7665, + "token_acc": 0.8320689655172414, + "train_speed(iter/s)": 0.095995 + }, + { + "epoch": 0.35668301076268893, + "grad_norm": 6.382811546325684, + "learning_rate": 7.641787987315115e-06, + "loss": 0.6737799644470215, + "memory(GiB)": 36.53, + "step": 7670, + "token_acc": 0.8232436472346786, + "train_speed(iter/s)": 0.09603 + }, + { + "epoch": 0.35691552902263857, + "grad_norm": 7.144225597381592, + "learning_rate": 7.6385228002761e-06, + "loss": 0.6494386196136475, + "memory(GiB)": 36.53, + "step": 7675, + "token_acc": 0.8335089567966281, + "train_speed(iter/s)": 0.096065 + }, + { + "epoch": 0.35714804728258814, + "grad_norm": 7.304563522338867, + "learning_rate": 7.63525605299709e-06, + "loss": 0.6193370819091797, + "memory(GiB)": 36.53, + "step": 7680, + "token_acc": 0.8410814375206067, + "train_speed(iter/s)": 0.096098 + }, + { + "epoch": 0.3573805655425378, + "grad_norm": 5.732212543487549, + "learning_rate": 7.631987747409816e-06, + "loss": 0.7993177890777587, + "memory(GiB)": 36.53, + "step": 7685, + "token_acc": 0.8033457249070632, + "train_speed(iter/s)": 0.096134 + }, + { + "epoch": 0.35761308380248735, + "grad_norm": 8.190177917480469, + "learning_rate": 7.628717885446926e-06, + "loss": 0.7521889686584473, + "memory(GiB)": 36.53, + "step": 7690, + "token_acc": 0.8173354735152488, + "train_speed(iter/s)": 0.096169 + }, + { + "epoch": 0.357845602062437, + "grad_norm": 5.477593421936035, + "learning_rate": 7.625446469041988e-06, + "loss": 0.7286582469940186, + "memory(GiB)": 36.53, + "step": 7695, + "token_acc": 0.815913688469319, + "train_speed(iter/s)": 0.096205 + }, + { + "epoch": 0.35807812032238656, + "grad_norm": 7.711561679840088, + "learning_rate": 7.622173500129495e-06, + "loss": 0.6683283805847168, + "memory(GiB)": 36.53, + "step": 7700, + "token_acc": 0.8445332364693062, + "train_speed(iter/s)": 0.096239 + }, + { + "epoch": 0.35807812032238656, + "eval_loss": 0.6329870223999023, + "eval_runtime": 291.3377, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 11.928, + "step": 7700 + }, + { + "epoch": 0.3583106385823362, + "grad_norm": 7.231578826904297, + "learning_rate": 7.618898980644854e-06, + "loss": 0.7106448173522949, + "memory(GiB)": 36.53, + "step": 7705, + "token_acc": 0.8174452583963813, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.35854315684228577, + "grad_norm": 6.559253215789795, + "learning_rate": 7.6156229125243884e-06, + "loss": 0.6210072040557861, + "memory(GiB)": 36.53, + "step": 7710, + "token_acc": 0.8506024096385543, + "train_speed(iter/s)": 0.09596 + }, + { + "epoch": 0.35877567510223535, + "grad_norm": 7.540843963623047, + "learning_rate": 7.612345297705337e-06, + "loss": 0.6631568908691406, + "memory(GiB)": 36.53, + "step": 7715, + "token_acc": 0.8360078277886497, + "train_speed(iter/s)": 0.095995 + }, + { + "epoch": 0.359008193362185, + "grad_norm": 6.490909576416016, + "learning_rate": 7.6090661381258576e-06, + "loss": 0.7047979831695557, + "memory(GiB)": 36.53, + "step": 7720, + "token_acc": 0.8158925573587017, + "train_speed(iter/s)": 0.096031 + }, + { + "epoch": 0.35924071162213456, + "grad_norm": 4.649106502532959, + "learning_rate": 7.6057854357250194e-06, + "loss": 0.7783477783203125, + "memory(GiB)": 36.53, + "step": 7725, + "token_acc": 0.8007202881152461, + "train_speed(iter/s)": 0.096066 + }, + { + "epoch": 0.3594732298820842, + "grad_norm": 9.474201202392578, + "learning_rate": 7.6025031924427985e-06, + "loss": 0.6698055267333984, + "memory(GiB)": 36.53, + "step": 7730, + "token_acc": 0.8268434134217068, + "train_speed(iter/s)": 0.096101 + }, + { + "epoch": 0.35970574814203377, + "grad_norm": 6.788260459899902, + "learning_rate": 7.599219410220089e-06, + "loss": 0.649644422531128, + "memory(GiB)": 36.53, + "step": 7735, + "token_acc": 0.8305889803673211, + "train_speed(iter/s)": 0.096136 + }, + { + "epoch": 0.3599382664019834, + "grad_norm": 6.228346824645996, + "learning_rate": 7.5959340909986935e-06, + "loss": 0.675438404083252, + "memory(GiB)": 36.53, + "step": 7740, + "token_acc": 0.8261859582542694, + "train_speed(iter/s)": 0.09617 + }, + { + "epoch": 0.360170784661933, + "grad_norm": 4.204248905181885, + "learning_rate": 7.592647236721324e-06, + "loss": 0.7380726337432861, + "memory(GiB)": 36.53, + "step": 7745, + "token_acc": 0.8304862023653088, + "train_speed(iter/s)": 0.096205 + }, + { + "epoch": 0.3604033029218826, + "grad_norm": 6.923390865325928, + "learning_rate": 7.589358849331594e-06, + "loss": 0.7568727493286133, + "memory(GiB)": 36.53, + "step": 7750, + "token_acc": 0.829938570966699, + "train_speed(iter/s)": 0.096239 + }, + { + "epoch": 0.3604033029218826, + "eval_loss": 0.634183406829834, + "eval_runtime": 296.2328, + "eval_samples_per_second": 11.731, + "eval_steps_per_second": 11.731, + "step": 7750 + }, + { + "epoch": 0.3606358211818322, + "grad_norm": 6.226519584655762, + "learning_rate": 7.586068930774032e-06, + "loss": 0.6907257556915283, + "memory(GiB)": 36.53, + "step": 7755, + "token_acc": 0.8169730623249694, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.36086833944178176, + "grad_norm": 6.021866798400879, + "learning_rate": 7.5827774829940685e-06, + "loss": 0.7334243774414062, + "memory(GiB)": 36.53, + "step": 7760, + "token_acc": 0.8235482836060315, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.3611008577017314, + "grad_norm": 6.340511798858643, + "learning_rate": 7.579484507938037e-06, + "loss": 0.7543991088867188, + "memory(GiB)": 36.53, + "step": 7765, + "token_acc": 0.8080444735120994, + "train_speed(iter/s)": 0.09599 + }, + { + "epoch": 0.36133337596168097, + "grad_norm": 7.009573936462402, + "learning_rate": 7.576190007553177e-06, + "loss": 0.7209124565124512, + "memory(GiB)": 36.53, + "step": 7770, + "token_acc": 0.8328840970350404, + "train_speed(iter/s)": 0.096024 + }, + { + "epoch": 0.3615658942216306, + "grad_norm": 6.050121307373047, + "learning_rate": 7.572893983787626e-06, + "loss": 0.6631560802459717, + "memory(GiB)": 36.53, + "step": 7775, + "token_acc": 0.8289615522817104, + "train_speed(iter/s)": 0.096058 + }, + { + "epoch": 0.3617984124815802, + "grad_norm": 7.535763263702393, + "learning_rate": 7.5695964385904255e-06, + "loss": 0.6829229354858398, + "memory(GiB)": 36.53, + "step": 7780, + "token_acc": 0.8269992082343627, + "train_speed(iter/s)": 0.096093 + }, + { + "epoch": 0.3620309307415298, + "grad_norm": 7.163476943969727, + "learning_rate": 7.566297373911517e-06, + "loss": 0.8164946556091308, + "memory(GiB)": 36.53, + "step": 7785, + "token_acc": 0.8013176144244105, + "train_speed(iter/s)": 0.096128 + }, + { + "epoch": 0.3622634490014794, + "grad_norm": 7.023055553436279, + "learning_rate": 7.562996791701739e-06, + "loss": 0.6314420223236084, + "memory(GiB)": 36.53, + "step": 7790, + "token_acc": 0.8464187327823691, + "train_speed(iter/s)": 0.096163 + }, + { + "epoch": 0.362495967261429, + "grad_norm": 6.3728928565979, + "learning_rate": 7.559694693912827e-06, + "loss": 0.7461518764495849, + "memory(GiB)": 36.53, + "step": 7795, + "token_acc": 0.8165374677002584, + "train_speed(iter/s)": 0.096198 + }, + { + "epoch": 0.3627284855213786, + "grad_norm": 5.9811201095581055, + "learning_rate": 7.5563910824974114e-06, + "loss": 0.7891818046569824, + "memory(GiB)": 36.53, + "step": 7800, + "token_acc": 0.8100607111882047, + "train_speed(iter/s)": 0.096233 + }, + { + "epoch": 0.3627284855213786, + "eval_loss": 0.6325167417526245, + "eval_runtime": 294.4563, + "eval_samples_per_second": 11.801, + "eval_steps_per_second": 11.801, + "step": 7800 + }, + { + "epoch": 0.36296100378132823, + "grad_norm": 6.407812118530273, + "learning_rate": 7.553085959409023e-06, + "loss": 0.526038122177124, + "memory(GiB)": 36.53, + "step": 7805, + "token_acc": 0.8190200902988461, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.3631935220412778, + "grad_norm": 7.23723840713501, + "learning_rate": 7.549779326602083e-06, + "loss": 0.7444475650787353, + "memory(GiB)": 36.53, + "step": 7810, + "token_acc": 0.823447313328681, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.3634260403012274, + "grad_norm": 5.919946193695068, + "learning_rate": 7.546471186031903e-06, + "loss": 0.8032725334167481, + "memory(GiB)": 36.53, + "step": 7815, + "token_acc": 0.7879730430274754, + "train_speed(iter/s)": 0.095985 + }, + { + "epoch": 0.363658558561177, + "grad_norm": 7.535763263702393, + "learning_rate": 7.543161539654688e-06, + "loss": 0.6441785812377929, + "memory(GiB)": 36.53, + "step": 7820, + "token_acc": 0.8508461235733963, + "train_speed(iter/s)": 0.09602 + }, + { + "epoch": 0.3638910768211266, + "grad_norm": 8.509560585021973, + "learning_rate": 7.539850389427539e-06, + "loss": 0.6608382225036621, + "memory(GiB)": 36.53, + "step": 7825, + "token_acc": 0.8400160384923817, + "train_speed(iter/s)": 0.096055 + }, + { + "epoch": 0.3641235950810762, + "grad_norm": 6.910362243652344, + "learning_rate": 7.536537737308437e-06, + "loss": 0.669133996963501, + "memory(GiB)": 36.53, + "step": 7830, + "token_acc": 0.8291802094618996, + "train_speed(iter/s)": 0.09609 + }, + { + "epoch": 0.3643561133410258, + "grad_norm": 6.624199390411377, + "learning_rate": 7.533223585256255e-06, + "loss": 0.7464718341827392, + "memory(GiB)": 36.53, + "step": 7835, + "token_acc": 0.8115384615384615, + "train_speed(iter/s)": 0.096124 + }, + { + "epoch": 0.36458863160097543, + "grad_norm": 6.117915630340576, + "learning_rate": 7.529907935230758e-06, + "loss": 0.7008463859558105, + "memory(GiB)": 36.53, + "step": 7840, + "token_acc": 0.8128292531763247, + "train_speed(iter/s)": 0.096159 + }, + { + "epoch": 0.364821149860925, + "grad_norm": 6.617120742797852, + "learning_rate": 7.5265907891925895e-06, + "loss": 0.6590275287628173, + "memory(GiB)": 36.53, + "step": 7845, + "token_acc": 0.8379475821336286, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.36505366812087464, + "grad_norm": 6.689937591552734, + "learning_rate": 7.52327214910328e-06, + "loss": 0.7318082809448242, + "memory(GiB)": 36.53, + "step": 7850, + "token_acc": 0.8155797101449276, + "train_speed(iter/s)": 0.096227 + }, + { + "epoch": 0.36505366812087464, + "eval_loss": 0.6311256289482117, + "eval_runtime": 292.5386, + "eval_samples_per_second": 11.879, + "eval_steps_per_second": 11.879, + "step": 7850 + }, + { + "epoch": 0.3652861863808242, + "grad_norm": 10.915748596191406, + "learning_rate": 7.5199520169252425e-06, + "loss": 0.702227783203125, + "memory(GiB)": 36.53, + "step": 7855, + "token_acc": 0.8179930906788287, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.3655187046407738, + "grad_norm": 6.974138259887695, + "learning_rate": 7.5166303946217765e-06, + "loss": 0.7374119281768798, + "memory(GiB)": 36.53, + "step": 7860, + "token_acc": 0.8254620123203286, + "train_speed(iter/s)": 0.095953 + }, + { + "epoch": 0.3657512229007234, + "grad_norm": 6.728903293609619, + "learning_rate": 7.513307284157059e-06, + "loss": 0.8151761054992676, + "memory(GiB)": 36.53, + "step": 7865, + "token_acc": 0.803594351732991, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.365983741160673, + "grad_norm": 6.212688446044922, + "learning_rate": 7.509982687496147e-06, + "loss": 0.6229990482330322, + "memory(GiB)": 36.53, + "step": 7870, + "token_acc": 0.8375451263537906, + "train_speed(iter/s)": 0.096022 + }, + { + "epoch": 0.36621625942062264, + "grad_norm": 6.141170501708984, + "learning_rate": 7.506656606604977e-06, + "loss": 0.6170244216918945, + "memory(GiB)": 36.53, + "step": 7875, + "token_acc": 0.8408247422680413, + "train_speed(iter/s)": 0.096056 + }, + { + "epoch": 0.3664487776805722, + "grad_norm": 11.874704360961914, + "learning_rate": 7.503329043450365e-06, + "loss": 0.7883286476135254, + "memory(GiB)": 36.53, + "step": 7880, + "token_acc": 0.8000757288905718, + "train_speed(iter/s)": 0.096091 + }, + { + "epoch": 0.36668129594052185, + "grad_norm": 6.312220096588135, + "learning_rate": 7.500000000000001e-06, + "loss": 0.7715856075286865, + "memory(GiB)": 36.53, + "step": 7885, + "token_acc": 0.8031995170540296, + "train_speed(iter/s)": 0.096125 + }, + { + "epoch": 0.3669138142004714, + "grad_norm": 6.496086597442627, + "learning_rate": 7.496669478222451e-06, + "loss": 0.7942769527435303, + "memory(GiB)": 36.53, + "step": 7890, + "token_acc": 0.8106926698049765, + "train_speed(iter/s)": 0.096159 + }, + { + "epoch": 0.36714633246042105, + "grad_norm": 7.234003067016602, + "learning_rate": 7.493337480087154e-06, + "loss": 0.6148253917694092, + "memory(GiB)": 36.53, + "step": 7895, + "token_acc": 0.8405434393993565, + "train_speed(iter/s)": 0.096194 + }, + { + "epoch": 0.36737885072037063, + "grad_norm": 4.3299784660339355, + "learning_rate": 7.490004007564426e-06, + "loss": 0.6379117965698242, + "memory(GiB)": 36.53, + "step": 7900, + "token_acc": 0.8427672955974843, + "train_speed(iter/s)": 0.096228 + }, + { + "epoch": 0.36737885072037063, + "eval_loss": 0.631164014339447, + "eval_runtime": 292.1888, + "eval_samples_per_second": 11.893, + "eval_steps_per_second": 11.893, + "step": 7900 + }, + { + "epoch": 0.36761136898032026, + "grad_norm": 7.93583345413208, + "learning_rate": 7.4866690626254504e-06, + "loss": 0.7415062427520752, + "memory(GiB)": 36.53, + "step": 7905, + "token_acc": 0.8179509025615709, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.36784388724026984, + "grad_norm": 6.743467330932617, + "learning_rate": 7.483332647242283e-06, + "loss": 0.6835087299346924, + "memory(GiB)": 36.53, + "step": 7910, + "token_acc": 0.8301818181818181, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.3680764055002194, + "grad_norm": 5.77875280380249, + "learning_rate": 7.47999476338785e-06, + "loss": 0.7879226207733154, + "memory(GiB)": 36.53, + "step": 7915, + "token_acc": 0.8026151930261519, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.36830892376016905, + "grad_norm": 6.8071112632751465, + "learning_rate": 7.4766554130359446e-06, + "loss": 0.7319873809814453, + "memory(GiB)": 36.53, + "step": 7920, + "token_acc": 0.815610454708199, + "train_speed(iter/s)": 0.096022 + }, + { + "epoch": 0.3685414420201186, + "grad_norm": 7.000846862792969, + "learning_rate": 7.47331459816123e-06, + "loss": 0.6864665508270263, + "memory(GiB)": 36.53, + "step": 7925, + "token_acc": 0.8173270651443922, + "train_speed(iter/s)": 0.096055 + }, + { + "epoch": 0.36877396028006826, + "grad_norm": 3.548530101776123, + "learning_rate": 7.46997232073923e-06, + "loss": 0.8144044876098633, + "memory(GiB)": 36.53, + "step": 7930, + "token_acc": 0.7679455445544554, + "train_speed(iter/s)": 0.096088 + }, + { + "epoch": 0.36900647854001783, + "grad_norm": 6.005768775939941, + "learning_rate": 7.466628582746339e-06, + "loss": 0.699169111251831, + "memory(GiB)": 36.53, + "step": 7935, + "token_acc": 0.823170731707317, + "train_speed(iter/s)": 0.096121 + }, + { + "epoch": 0.36923899679996747, + "grad_norm": 7.976009368896484, + "learning_rate": 7.4632833861598096e-06, + "loss": 0.6825541496276856, + "memory(GiB)": 36.53, + "step": 7940, + "token_acc": 0.81710615280595, + "train_speed(iter/s)": 0.096155 + }, + { + "epoch": 0.36947151505991704, + "grad_norm": 9.505805969238281, + "learning_rate": 7.459936732957762e-06, + "loss": 0.7014679908752441, + "memory(GiB)": 36.53, + "step": 7945, + "token_acc": 0.8235664776307499, + "train_speed(iter/s)": 0.096189 + }, + { + "epoch": 0.3697040333198667, + "grad_norm": 6.265750885009766, + "learning_rate": 7.456588625119176e-06, + "loss": 0.5671255111694335, + "memory(GiB)": 36.53, + "step": 7950, + "token_acc": 0.8687304075235109, + "train_speed(iter/s)": 0.096223 + }, + { + "epoch": 0.3697040333198667, + "eval_loss": 0.6289050579071045, + "eval_runtime": 293.2764, + "eval_samples_per_second": 11.849, + "eval_steps_per_second": 11.849, + "step": 7950 + }, + { + "epoch": 0.36993655157981625, + "grad_norm": 7.854147911071777, + "learning_rate": 7.453239064623891e-06, + "loss": 0.602921724319458, + "memory(GiB)": 36.53, + "step": 7955, + "token_acc": 0.8187371239388522, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.37016906983976583, + "grad_norm": 6.244097709655762, + "learning_rate": 7.449888053452602e-06, + "loss": 0.718879508972168, + "memory(GiB)": 36.53, + "step": 7960, + "token_acc": 0.8064516129032258, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.37040158809971546, + "grad_norm": 8.784631729125977, + "learning_rate": 7.44653559358687e-06, + "loss": 0.8118132591247559, + "memory(GiB)": 36.53, + "step": 7965, + "token_acc": 0.8013925729442971, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.37063410635966504, + "grad_norm": 5.966042518615723, + "learning_rate": 7.443181687009107e-06, + "loss": 0.5720973014831543, + "memory(GiB)": 36.53, + "step": 7970, + "token_acc": 0.8402120408936009, + "train_speed(iter/s)": 0.096017 + }, + { + "epoch": 0.37086662461961467, + "grad_norm": 4.4571003913879395, + "learning_rate": 7.439826335702579e-06, + "loss": 0.7941804885864258, + "memory(GiB)": 36.53, + "step": 7975, + "token_acc": 0.8104440789473685, + "train_speed(iter/s)": 0.096051 + }, + { + "epoch": 0.37109914287956425, + "grad_norm": 6.233658313751221, + "learning_rate": 7.43646954165141e-06, + "loss": 0.61534104347229, + "memory(GiB)": 36.53, + "step": 7980, + "token_acc": 0.854672704816797, + "train_speed(iter/s)": 0.096085 + }, + { + "epoch": 0.3713316611395139, + "grad_norm": 5.3049492835998535, + "learning_rate": 7.433111306840578e-06, + "loss": 0.7509316444396973, + "memory(GiB)": 36.53, + "step": 7985, + "token_acc": 0.8122314885013899, + "train_speed(iter/s)": 0.096118 + }, + { + "epoch": 0.37156417939946346, + "grad_norm": 6.574211120605469, + "learning_rate": 7.429751633255908e-06, + "loss": 0.6532687187194824, + "memory(GiB)": 36.53, + "step": 7990, + "token_acc": 0.8419689119170984, + "train_speed(iter/s)": 0.096152 + }, + { + "epoch": 0.3717966976594131, + "grad_norm": 7.246621131896973, + "learning_rate": 7.426390522884081e-06, + "loss": 0.6897338390350342, + "memory(GiB)": 36.53, + "step": 7995, + "token_acc": 0.8131868131868132, + "train_speed(iter/s)": 0.096187 + }, + { + "epoch": 0.37202921591936267, + "grad_norm": 7.290582656860352, + "learning_rate": 7.423027977712625e-06, + "loss": 0.6523595809936523, + "memory(GiB)": 36.53, + "step": 8000, + "token_acc": 0.8341408870667164, + "train_speed(iter/s)": 0.096221 + }, + { + "epoch": 0.37202921591936267, + "eval_loss": 0.6307212710380554, + "eval_runtime": 292.7948, + "eval_samples_per_second": 11.868, + "eval_steps_per_second": 11.868, + "step": 8000 + }, + { + "epoch": 0.37226173417931224, + "grad_norm": 6.008643627166748, + "learning_rate": 7.419663999729914e-06, + "loss": 0.6222623348236084, + "memory(GiB)": 36.53, + "step": 8005, + "token_acc": 0.8188679547309952, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.3724942524392619, + "grad_norm": 6.651423454284668, + "learning_rate": 7.4162985909251775e-06, + "loss": 0.5432338237762451, + "memory(GiB)": 36.53, + "step": 8010, + "token_acc": 0.8626045400238949, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.37272677069921145, + "grad_norm": 5.912210941314697, + "learning_rate": 7.412931753288479e-06, + "loss": 0.6824829578399658, + "memory(GiB)": 36.53, + "step": 8015, + "token_acc": 0.8359683794466403, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.3729592889591611, + "grad_norm": 7.724246025085449, + "learning_rate": 7.409563488810739e-06, + "loss": 0.6164308071136475, + "memory(GiB)": 36.53, + "step": 8020, + "token_acc": 0.8528896672504378, + "train_speed(iter/s)": 0.096017 + }, + { + "epoch": 0.37319180721911066, + "grad_norm": 7.594185829162598, + "learning_rate": 7.406193799483714e-06, + "loss": 0.6749426841735839, + "memory(GiB)": 36.53, + "step": 8025, + "token_acc": 0.8529159519725558, + "train_speed(iter/s)": 0.096051 + }, + { + "epoch": 0.3734243254790603, + "grad_norm": 6.529979705810547, + "learning_rate": 7.402822687300005e-06, + "loss": 0.6923796653747558, + "memory(GiB)": 36.53, + "step": 8030, + "token_acc": 0.826313957535677, + "train_speed(iter/s)": 0.096085 + }, + { + "epoch": 0.37365684373900987, + "grad_norm": 6.124407768249512, + "learning_rate": 7.399450154253055e-06, + "loss": 0.845398998260498, + "memory(GiB)": 36.53, + "step": 8035, + "token_acc": 0.7821681864235056, + "train_speed(iter/s)": 0.096119 + }, + { + "epoch": 0.3738893619989595, + "grad_norm": 7.790156364440918, + "learning_rate": 7.396076202337148e-06, + "loss": 0.7758492469787598, + "memory(GiB)": 36.53, + "step": 8040, + "token_acc": 0.8095768374164811, + "train_speed(iter/s)": 0.096153 + }, + { + "epoch": 0.3741218802589091, + "grad_norm": 6.442034721374512, + "learning_rate": 7.392700833547404e-06, + "loss": 0.6660655498504638, + "memory(GiB)": 36.53, + "step": 8045, + "token_acc": 0.8396190476190476, + "train_speed(iter/s)": 0.096186 + }, + { + "epoch": 0.3743543985188587, + "grad_norm": 5.838046073913574, + "learning_rate": 7.389324049879784e-06, + "loss": 0.6552052974700928, + "memory(GiB)": 36.53, + "step": 8050, + "token_acc": 0.8436286621955524, + "train_speed(iter/s)": 0.096218 + }, + { + "epoch": 0.3743543985188587, + "eval_loss": 0.6274383068084717, + "eval_runtime": 295.4935, + "eval_samples_per_second": 11.76, + "eval_steps_per_second": 11.76, + "step": 8050 + }, + { + "epoch": 0.3745869167788083, + "grad_norm": 7.194561958312988, + "learning_rate": 7.385945853331087e-06, + "loss": 0.9055326461791993, + "memory(GiB)": 36.53, + "step": 8055, + "token_acc": 0.8169802095273211, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.37481943503875786, + "grad_norm": 5.424001693725586, + "learning_rate": 7.382566245898939e-06, + "loss": 0.6388668060302735, + "memory(GiB)": 36.53, + "step": 8060, + "token_acc": 0.8429508196721311, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.3750519532987075, + "grad_norm": 6.293416976928711, + "learning_rate": 7.379185229581811e-06, + "loss": 0.6606010437011719, + "memory(GiB)": 36.53, + "step": 8065, + "token_acc": 0.8308625336927223, + "train_speed(iter/s)": 0.09598 + }, + { + "epoch": 0.3752844715586571, + "grad_norm": 6.584990501403809, + "learning_rate": 7.375802806379001e-06, + "loss": 0.7164745330810547, + "memory(GiB)": 36.53, + "step": 8070, + "token_acc": 0.8291956305858987, + "train_speed(iter/s)": 0.096013 + }, + { + "epoch": 0.3755169898186067, + "grad_norm": 5.697750091552734, + "learning_rate": 7.37241897829064e-06, + "loss": 0.6898795604705811, + "memory(GiB)": 36.53, + "step": 8075, + "token_acc": 0.8306896551724138, + "train_speed(iter/s)": 0.096046 + }, + { + "epoch": 0.3757495080785563, + "grad_norm": 7.506107807159424, + "learning_rate": 7.369033747317689e-06, + "loss": 0.7770956039428711, + "memory(GiB)": 36.53, + "step": 8080, + "token_acc": 0.8004734528238079, + "train_speed(iter/s)": 0.096079 + }, + { + "epoch": 0.3759820263385059, + "grad_norm": 8.35728931427002, + "learning_rate": 7.3656471154619414e-06, + "loss": 0.7625340938568115, + "memory(GiB)": 36.53, + "step": 8085, + "token_acc": 0.801779359430605, + "train_speed(iter/s)": 0.096112 + }, + { + "epoch": 0.3762145445984555, + "grad_norm": 6.635538101196289, + "learning_rate": 7.362259084726016e-06, + "loss": 0.6521346092224121, + "memory(GiB)": 36.53, + "step": 8090, + "token_acc": 0.8464827050136028, + "train_speed(iter/s)": 0.096144 + }, + { + "epoch": 0.3764470628584051, + "grad_norm": 4.508878231048584, + "learning_rate": 7.358869657113361e-06, + "loss": 0.7203670978546143, + "memory(GiB)": 36.53, + "step": 8095, + "token_acc": 0.8256578947368421, + "train_speed(iter/s)": 0.096176 + }, + { + "epoch": 0.3766795811183547, + "grad_norm": 5.5791521072387695, + "learning_rate": 7.355478834628248e-06, + "loss": 0.7203432559967041, + "memory(GiB)": 36.53, + "step": 8100, + "token_acc": 0.8210862619808307, + "train_speed(iter/s)": 0.096208 + }, + { + "epoch": 0.3766795811183547, + "eval_loss": 0.6275553703308105, + "eval_runtime": 296.3077, + "eval_samples_per_second": 11.728, + "eval_steps_per_second": 11.728, + "step": 8100 + }, + { + "epoch": 0.3769120993783043, + "grad_norm": 5.211892604827881, + "learning_rate": 7.352086619275778e-06, + "loss": 0.6877071380615234, + "memory(GiB)": 36.53, + "step": 8105, + "token_acc": 0.8190062240829664, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.3771446176382539, + "grad_norm": 8.251355171203613, + "learning_rate": 7.348693013061869e-06, + "loss": 0.780007791519165, + "memory(GiB)": 36.53, + "step": 8110, + "token_acc": 0.8195804195804196, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.3773771358982035, + "grad_norm": 6.018730163574219, + "learning_rate": 7.345298017993268e-06, + "loss": 0.744899320602417, + "memory(GiB)": 36.53, + "step": 8115, + "token_acc": 0.8276515151515151, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.3776096541581531, + "grad_norm": 7.462071895599365, + "learning_rate": 7.341901636077538e-06, + "loss": 0.6954497337341309, + "memory(GiB)": 36.53, + "step": 8120, + "token_acc": 0.8262008733624454, + "train_speed(iter/s)": 0.096003 + }, + { + "epoch": 0.3778421724181027, + "grad_norm": 7.142657279968262, + "learning_rate": 7.338503869323066e-06, + "loss": 0.6354021072387696, + "memory(GiB)": 36.53, + "step": 8125, + "token_acc": 0.8485804416403786, + "train_speed(iter/s)": 0.096035 + }, + { + "epoch": 0.3780746906780523, + "grad_norm": 6.712642669677734, + "learning_rate": 7.335104719739057e-06, + "loss": 0.7543253898620605, + "memory(GiB)": 36.53, + "step": 8130, + "token_acc": 0.8300211416490486, + "train_speed(iter/s)": 0.096068 + }, + { + "epoch": 0.3783072089380019, + "grad_norm": 6.749781131744385, + "learning_rate": 7.331704189335532e-06, + "loss": 0.6483595848083497, + "memory(GiB)": 36.53, + "step": 8135, + "token_acc": 0.8318812520167796, + "train_speed(iter/s)": 0.096102 + }, + { + "epoch": 0.37853972719795154, + "grad_norm": 7.32687520980835, + "learning_rate": 7.328302280123329e-06, + "loss": 0.7506073951721192, + "memory(GiB)": 36.53, + "step": 8140, + "token_acc": 0.8157894736842105, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.3787722454579011, + "grad_norm": 8.008889198303223, + "learning_rate": 7.324898994114105e-06, + "loss": 0.7339870452880859, + "memory(GiB)": 36.53, + "step": 8145, + "token_acc": 0.8172205438066465, + "train_speed(iter/s)": 0.096168 + }, + { + "epoch": 0.3790047637178507, + "grad_norm": 5.6713972091674805, + "learning_rate": 7.321494333320324e-06, + "loss": 0.7103267669677734, + "memory(GiB)": 36.53, + "step": 8150, + "token_acc": 0.8242909987669543, + "train_speed(iter/s)": 0.096201 + }, + { + "epoch": 0.3790047637178507, + "eval_loss": 0.6278688907623291, + "eval_runtime": 289.0966, + "eval_samples_per_second": 12.02, + "eval_steps_per_second": 12.02, + "step": 8150 + }, + { + "epoch": 0.3792372819778003, + "grad_norm": 6.218601703643799, + "learning_rate": 7.318088299755269e-06, + "loss": 0.713615608215332, + "memory(GiB)": 36.53, + "step": 8155, + "token_acc": 0.8185400083117548, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.3794698002377499, + "grad_norm": 7.3420538902282715, + "learning_rate": 7.314680895433033e-06, + "loss": 0.6775365829467773, + "memory(GiB)": 36.53, + "step": 8160, + "token_acc": 0.8393574297188755, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.37970231849769953, + "grad_norm": 4.985617637634277, + "learning_rate": 7.311272122368518e-06, + "loss": 0.6389101505279541, + "memory(GiB)": 36.53, + "step": 8165, + "token_acc": 0.851161369193154, + "train_speed(iter/s)": 0.095975 + }, + { + "epoch": 0.3799348367576491, + "grad_norm": 7.136925220489502, + "learning_rate": 7.30786198257744e-06, + "loss": 0.7170490264892578, + "memory(GiB)": 36.53, + "step": 8170, + "token_acc": 0.8263403263403264, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.38016735501759874, + "grad_norm": 7.366158962249756, + "learning_rate": 7.304450478076316e-06, + "loss": 0.6841135025024414, + "memory(GiB)": 36.53, + "step": 8175, + "token_acc": 0.8237410071942446, + "train_speed(iter/s)": 0.096042 + }, + { + "epoch": 0.3803998732775483, + "grad_norm": 8.012834548950195, + "learning_rate": 7.301037610882475e-06, + "loss": 0.7514279842376709, + "memory(GiB)": 36.53, + "step": 8180, + "token_acc": 0.8040621266427718, + "train_speed(iter/s)": 0.096076 + }, + { + "epoch": 0.38063239153749795, + "grad_norm": 7.065184116363525, + "learning_rate": 7.297623383014054e-06, + "loss": 0.7410264015197754, + "memory(GiB)": 36.53, + "step": 8185, + "token_acc": 0.829802513464991, + "train_speed(iter/s)": 0.096109 + }, + { + "epoch": 0.3808649097974475, + "grad_norm": 6.61446475982666, + "learning_rate": 7.2942077964899885e-06, + "loss": 0.8626726150512696, + "memory(GiB)": 36.53, + "step": 8190, + "token_acc": 0.7982254354255669, + "train_speed(iter/s)": 0.096141 + }, + { + "epoch": 0.38109742805739716, + "grad_norm": 9.516047477722168, + "learning_rate": 7.29079085333002e-06, + "loss": 0.7314748764038086, + "memory(GiB)": 36.53, + "step": 8195, + "token_acc": 0.845380564863571, + "train_speed(iter/s)": 0.096174 + }, + { + "epoch": 0.38132994631734674, + "grad_norm": 6.8897318840026855, + "learning_rate": 7.287372555554692e-06, + "loss": 0.6762599468231201, + "memory(GiB)": 36.53, + "step": 8200, + "token_acc": 0.8329145728643216, + "train_speed(iter/s)": 0.096207 + }, + { + "epoch": 0.38132994631734674, + "eval_loss": 0.6261598467826843, + "eval_runtime": 290.8816, + "eval_samples_per_second": 11.946, + "eval_steps_per_second": 11.946, + "step": 8200 + }, + { + "epoch": 0.3815624645772963, + "grad_norm": 6.636117458343506, + "learning_rate": 7.283952905185352e-06, + "loss": 0.7347008228302002, + "memory(GiB)": 36.53, + "step": 8205, + "token_acc": 0.8183973182217256, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.38179498283724594, + "grad_norm": 7.701080322265625, + "learning_rate": 7.280531904244143e-06, + "loss": 0.7064132213592529, + "memory(GiB)": 36.53, + "step": 8210, + "token_acc": 0.8228217280349982, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.3820275010971955, + "grad_norm": 6.672091960906982, + "learning_rate": 7.277109554754009e-06, + "loss": 0.691359281539917, + "memory(GiB)": 36.53, + "step": 8215, + "token_acc": 0.8353398058252427, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.38226001935714515, + "grad_norm": 7.158807277679443, + "learning_rate": 7.27368585873869e-06, + "loss": 0.656017541885376, + "memory(GiB)": 36.53, + "step": 8220, + "token_acc": 0.8263909117390038, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.38249253761709473, + "grad_norm": 6.932867527008057, + "learning_rate": 7.270260818222725e-06, + "loss": 0.6323503971099853, + "memory(GiB)": 36.53, + "step": 8225, + "token_acc": 0.8400509121764955, + "train_speed(iter/s)": 0.096042 + }, + { + "epoch": 0.38272505587704436, + "grad_norm": 6.20552396774292, + "learning_rate": 7.266834435231446e-06, + "loss": 0.724024772644043, + "memory(GiB)": 36.53, + "step": 8230, + "token_acc": 0.8196887686062246, + "train_speed(iter/s)": 0.096074 + }, + { + "epoch": 0.38295757413699394, + "grad_norm": 6.925886154174805, + "learning_rate": 7.263406711790978e-06, + "loss": 0.6515414237976074, + "memory(GiB)": 36.53, + "step": 8235, + "token_acc": 0.8424753867791842, + "train_speed(iter/s)": 0.096108 + }, + { + "epoch": 0.38319009239694357, + "grad_norm": 8.428985595703125, + "learning_rate": 7.2599776499282385e-06, + "loss": 0.8670886039733887, + "memory(GiB)": 36.53, + "step": 8240, + "token_acc": 0.7900763358778626, + "train_speed(iter/s)": 0.09614 + }, + { + "epoch": 0.38342261065689315, + "grad_norm": 7.204659461975098, + "learning_rate": 7.25654725167094e-06, + "loss": 0.7875346183776856, + "memory(GiB)": 36.53, + "step": 8245, + "token_acc": 0.8029728020240354, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.3836551289168427, + "grad_norm": 8.074034690856934, + "learning_rate": 7.253115519047582e-06, + "loss": 0.6254090785980224, + "memory(GiB)": 36.53, + "step": 8250, + "token_acc": 0.8534579439252337, + "train_speed(iter/s)": 0.096205 + }, + { + "epoch": 0.3836551289168427, + "eval_loss": 0.6289153695106506, + "eval_runtime": 292.7668, + "eval_samples_per_second": 11.87, + "eval_steps_per_second": 11.87, + "step": 8250 + }, + { + "epoch": 0.38388764717679236, + "grad_norm": 6.093391418457031, + "learning_rate": 7.249682454087455e-06, + "loss": 0.6864508628845215, + "memory(GiB)": 36.53, + "step": 8255, + "token_acc": 0.8191242209155144, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.38412016543674193, + "grad_norm": 7.866343975067139, + "learning_rate": 7.246248058820633e-06, + "loss": 0.7101897716522216, + "memory(GiB)": 36.53, + "step": 8260, + "token_acc": 0.8233898305084746, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.38435268369669157, + "grad_norm": 6.162656307220459, + "learning_rate": 7.242812335277983e-06, + "loss": 0.6610394477844238, + "memory(GiB)": 36.53, + "step": 8265, + "token_acc": 0.835983785469286, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.38458520195664114, + "grad_norm": 6.9368367195129395, + "learning_rate": 7.239375285491155e-06, + "loss": 0.7751856327056885, + "memory(GiB)": 36.53, + "step": 8270, + "token_acc": 0.8140916808149405, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.3848177202165908, + "grad_norm": 7.226904392242432, + "learning_rate": 7.23593691149258e-06, + "loss": 0.6185824394226074, + "memory(GiB)": 36.53, + "step": 8275, + "token_acc": 0.8502024291497976, + "train_speed(iter/s)": 0.096041 + }, + { + "epoch": 0.38505023847654035, + "grad_norm": 7.314187049865723, + "learning_rate": 7.232497215315475e-06, + "loss": 0.7929863929748535, + "memory(GiB)": 36.53, + "step": 8280, + "token_acc": 0.8059055118110237, + "train_speed(iter/s)": 0.096074 + }, + { + "epoch": 0.38528275673649, + "grad_norm": 6.342879772186279, + "learning_rate": 7.229056198993841e-06, + "loss": 0.641834306716919, + "memory(GiB)": 36.53, + "step": 8285, + "token_acc": 0.8361940298507463, + "train_speed(iter/s)": 0.096107 + }, + { + "epoch": 0.38551527499643956, + "grad_norm": 8.272603034973145, + "learning_rate": 7.225613864562456e-06, + "loss": 0.715467882156372, + "memory(GiB)": 36.53, + "step": 8290, + "token_acc": 0.8329686360320934, + "train_speed(iter/s)": 0.096139 + }, + { + "epoch": 0.3857477932563892, + "grad_norm": 8.183306694030762, + "learning_rate": 7.222170214056878e-06, + "loss": 0.7441752433776856, + "memory(GiB)": 36.53, + "step": 8295, + "token_acc": 0.8247078464106845, + "train_speed(iter/s)": 0.096171 + }, + { + "epoch": 0.38598031151633877, + "grad_norm": 6.751532077789307, + "learning_rate": 7.218725249513444e-06, + "loss": 0.7253667831420898, + "memory(GiB)": 36.53, + "step": 8300, + "token_acc": 0.8165555945282357, + "train_speed(iter/s)": 0.096204 + }, + { + "epoch": 0.38598031151633877, + "eval_loss": 0.6258677840232849, + "eval_runtime": 290.9616, + "eval_samples_per_second": 11.943, + "eval_steps_per_second": 11.943, + "step": 8300 + }, + { + "epoch": 0.38621282977628835, + "grad_norm": 6.383945465087891, + "learning_rate": 7.215278972969267e-06, + "loss": 0.6705090999603271, + "memory(GiB)": 36.53, + "step": 8305, + "token_acc": 0.8193674339710201, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.386445348036238, + "grad_norm": 6.654679298400879, + "learning_rate": 7.211831386462239e-06, + "loss": 0.7232837677001953, + "memory(GiB)": 36.53, + "step": 8310, + "token_acc": 0.8157894736842105, + "train_speed(iter/s)": 0.095946 + }, + { + "epoch": 0.38667786629618756, + "grad_norm": 7.329780578613281, + "learning_rate": 7.208382492031024e-06, + "loss": 0.681901216506958, + "memory(GiB)": 36.53, + "step": 8315, + "token_acc": 0.8424460431654677, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.3869103845561372, + "grad_norm": 5.073947429656982, + "learning_rate": 7.204932291715059e-06, + "loss": 0.7335203647613525, + "memory(GiB)": 36.53, + "step": 8320, + "token_acc": 0.8240954580446497, + "train_speed(iter/s)": 0.096011 + }, + { + "epoch": 0.38714290281608676, + "grad_norm": 8.371402740478516, + "learning_rate": 7.201480787554551e-06, + "loss": 0.7167182922363281, + "memory(GiB)": 36.53, + "step": 8325, + "token_acc": 0.8266949152542373, + "train_speed(iter/s)": 0.096043 + }, + { + "epoch": 0.3873754210760364, + "grad_norm": 6.6527180671691895, + "learning_rate": 7.198027981590487e-06, + "loss": 0.6592539310455322, + "memory(GiB)": 36.53, + "step": 8330, + "token_acc": 0.8371653543307087, + "train_speed(iter/s)": 0.096076 + }, + { + "epoch": 0.387607939335986, + "grad_norm": 5.508927822113037, + "learning_rate": 7.194573875864615e-06, + "loss": 0.7026764869689941, + "memory(GiB)": 36.53, + "step": 8335, + "token_acc": 0.8083706238483811, + "train_speed(iter/s)": 0.096108 + }, + { + "epoch": 0.3878404575959356, + "grad_norm": 7.605576038360596, + "learning_rate": 7.1911184724194504e-06, + "loss": 0.603968620300293, + "memory(GiB)": 36.53, + "step": 8340, + "token_acc": 0.851528384279476, + "train_speed(iter/s)": 0.096141 + }, + { + "epoch": 0.3880729758558852, + "grad_norm": 6.188586711883545, + "learning_rate": 7.187661773298287e-06, + "loss": 0.6946659088134766, + "memory(GiB)": 36.53, + "step": 8345, + "token_acc": 0.834151979196764, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.38830549411583476, + "grad_norm": 7.38683557510376, + "learning_rate": 7.184203780545173e-06, + "loss": 0.6234054565429688, + "memory(GiB)": 36.53, + "step": 8350, + "token_acc": 0.8553169734151329, + "train_speed(iter/s)": 0.096205 + }, + { + "epoch": 0.38830549411583476, + "eval_loss": 0.6242329478263855, + "eval_runtime": 291.5772, + "eval_samples_per_second": 11.918, + "eval_steps_per_second": 11.918, + "step": 8350 + }, + { + "epoch": 0.3885380123757844, + "grad_norm": 7.963403224945068, + "learning_rate": 7.180744496204928e-06, + "loss": 0.8015275955200195, + "memory(GiB)": 36.53, + "step": 8355, + "token_acc": 0.8184321983010544, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.38877053063573397, + "grad_norm": 9.448433876037598, + "learning_rate": 7.177283922323132e-06, + "loss": 0.8290170669555664, + "memory(GiB)": 36.53, + "step": 8360, + "token_acc": 0.7876712328767124, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.3890030488956836, + "grad_norm": 6.719177722930908, + "learning_rate": 7.173822060946131e-06, + "loss": 0.8085485458374023, + "memory(GiB)": 36.53, + "step": 8365, + "token_acc": 0.7998301245753114, + "train_speed(iter/s)": 0.095977 + }, + { + "epoch": 0.3892355671556332, + "grad_norm": 8.454413414001465, + "learning_rate": 7.170358914121031e-06, + "loss": 0.6984954833984375, + "memory(GiB)": 36.53, + "step": 8370, + "token_acc": 0.8300073909830007, + "train_speed(iter/s)": 0.09601 + }, + { + "epoch": 0.3894680854155828, + "grad_norm": 6.720226764678955, + "learning_rate": 7.166894483895695e-06, + "loss": 0.6794505596160889, + "memory(GiB)": 36.53, + "step": 8375, + "token_acc": 0.8185955786736021, + "train_speed(iter/s)": 0.096041 + }, + { + "epoch": 0.3897006036755324, + "grad_norm": 5.379835605621338, + "learning_rate": 7.163428772318749e-06, + "loss": 0.7329598903656006, + "memory(GiB)": 36.53, + "step": 8380, + "token_acc": 0.8256791720569211, + "train_speed(iter/s)": 0.096072 + }, + { + "epoch": 0.389933121935482, + "grad_norm": 5.505593776702881, + "learning_rate": 7.1599617814395764e-06, + "loss": 0.6776937007904053, + "memory(GiB)": 36.53, + "step": 8385, + "token_acc": 0.8378680581438688, + "train_speed(iter/s)": 0.096103 + }, + { + "epoch": 0.3901656401954316, + "grad_norm": 6.174607276916504, + "learning_rate": 7.1564935133083146e-06, + "loss": 0.6524590492248535, + "memory(GiB)": 36.53, + "step": 8390, + "token_acc": 0.8298722044728435, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.3903981584553812, + "grad_norm": 8.53377914428711, + "learning_rate": 7.153023969975858e-06, + "loss": 0.7605315685272217, + "memory(GiB)": 36.53, + "step": 8395, + "token_acc": 0.8234060402684564, + "train_speed(iter/s)": 0.096167 + }, + { + "epoch": 0.3906306767153308, + "grad_norm": 5.26388692855835, + "learning_rate": 7.149553153493853e-06, + "loss": 0.6725038051605224, + "memory(GiB)": 36.53, + "step": 8400, + "token_acc": 0.8365276211950394, + "train_speed(iter/s)": 0.096199 + }, + { + "epoch": 0.3906306767153308, + "eval_loss": 0.6255432963371277, + "eval_runtime": 290.2824, + "eval_samples_per_second": 11.971, + "eval_steps_per_second": 11.971, + "step": 8400 + }, + { + "epoch": 0.3908631949752804, + "grad_norm": 7.3965044021606445, + "learning_rate": 7.1460810659147036e-06, + "loss": 0.6825075626373291, + "memory(GiB)": 36.53, + "step": 8405, + "token_acc": 0.8196063733073893, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.39109571323523, + "grad_norm": 5.887383937835693, + "learning_rate": 7.142607709291561e-06, + "loss": 0.5826900959014892, + "memory(GiB)": 36.53, + "step": 8410, + "token_acc": 0.8551888195063931, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.3913282314951796, + "grad_norm": 6.264004707336426, + "learning_rate": 7.139133085678329e-06, + "loss": 0.6516207218170166, + "memory(GiB)": 36.53, + "step": 8415, + "token_acc": 0.8345011678345011, + "train_speed(iter/s)": 0.095975 + }, + { + "epoch": 0.3915607497551292, + "grad_norm": 7.808225154876709, + "learning_rate": 7.135657197129658e-06, + "loss": 0.7851691246032715, + "memory(GiB)": 36.53, + "step": 8420, + "token_acc": 0.7870065789473685, + "train_speed(iter/s)": 0.096008 + }, + { + "epoch": 0.3917932680150788, + "grad_norm": 7.2982563972473145, + "learning_rate": 7.132180045700948e-06, + "loss": 0.6950534343719482, + "memory(GiB)": 36.53, + "step": 8425, + "token_acc": 0.8203883495145631, + "train_speed(iter/s)": 0.09604 + }, + { + "epoch": 0.39202578627502843, + "grad_norm": 6.782866477966309, + "learning_rate": 7.128701633448349e-06, + "loss": 0.7969116687774658, + "memory(GiB)": 36.53, + "step": 8430, + "token_acc": 0.7997275204359673, + "train_speed(iter/s)": 0.096072 + }, + { + "epoch": 0.392258304534978, + "grad_norm": 6.884278297424316, + "learning_rate": 7.125221962428751e-06, + "loss": 0.7231873035430908, + "memory(GiB)": 36.53, + "step": 8435, + "token_acc": 0.8266978922716628, + "train_speed(iter/s)": 0.096104 + }, + { + "epoch": 0.39249082279492764, + "grad_norm": 6.458530902862549, + "learning_rate": 7.121741034699791e-06, + "loss": 0.685096549987793, + "memory(GiB)": 36.53, + "step": 8440, + "token_acc": 0.8211323476379373, + "train_speed(iter/s)": 0.096136 + }, + { + "epoch": 0.3927233410548772, + "grad_norm": 6.654186725616455, + "learning_rate": 7.118258852319849e-06, + "loss": 0.7162202358245849, + "memory(GiB)": 36.53, + "step": 8445, + "token_acc": 0.8270702853166318, + "train_speed(iter/s)": 0.096167 + }, + { + "epoch": 0.3929558593148268, + "grad_norm": 6.901993274688721, + "learning_rate": 7.11477541734805e-06, + "loss": 0.7216857433319092, + "memory(GiB)": 36.53, + "step": 8450, + "token_acc": 0.8303501945525292, + "train_speed(iter/s)": 0.0962 + }, + { + "epoch": 0.3929558593148268, + "eval_loss": 0.6227236986160278, + "eval_runtime": 293.0838, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 11.857, + "step": 8450 + }, + { + "epoch": 0.3931883775747764, + "grad_norm": 5.063840866088867, + "learning_rate": 7.1112907318442525e-06, + "loss": 0.8187254905700684, + "memory(GiB)": 36.53, + "step": 8455, + "token_acc": 0.8182777610258526, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.393420895834726, + "grad_norm": 6.54954195022583, + "learning_rate": 7.107804797869061e-06, + "loss": 0.6639408111572266, + "memory(GiB)": 36.53, + "step": 8460, + "token_acc": 0.8401349072512647, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.39365341409467564, + "grad_norm": 7.679318904876709, + "learning_rate": 7.104317617483815e-06, + "loss": 0.6447121143341065, + "memory(GiB)": 36.53, + "step": 8465, + "token_acc": 0.8449675324675324, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.3938859323546252, + "grad_norm": 6.481260776519775, + "learning_rate": 7.100829192750592e-06, + "loss": 0.6538030624389648, + "memory(GiB)": 36.53, + "step": 8470, + "token_acc": 0.8317723342939481, + "train_speed(iter/s)": 0.096003 + }, + { + "epoch": 0.39411845061457484, + "grad_norm": 4.740011692047119, + "learning_rate": 7.097339525732207e-06, + "loss": 0.6647510528564453, + "memory(GiB)": 36.53, + "step": 8475, + "token_acc": 0.8279646017699115, + "train_speed(iter/s)": 0.096034 + }, + { + "epoch": 0.3943509688745244, + "grad_norm": 8.062383651733398, + "learning_rate": 7.0938486184922055e-06, + "loss": 0.7419010162353515, + "memory(GiB)": 36.53, + "step": 8480, + "token_acc": 0.8148614609571788, + "train_speed(iter/s)": 0.096066 + }, + { + "epoch": 0.39458348713447405, + "grad_norm": 6.529599189758301, + "learning_rate": 7.090356473094868e-06, + "loss": 0.6296727180480957, + "memory(GiB)": 36.53, + "step": 8485, + "token_acc": 0.8369491525423729, + "train_speed(iter/s)": 0.096098 + }, + { + "epoch": 0.39481600539442363, + "grad_norm": 6.894472122192383, + "learning_rate": 7.086863091605212e-06, + "loss": 0.7523578643798828, + "memory(GiB)": 36.53, + "step": 8490, + "token_acc": 0.8076321551454488, + "train_speed(iter/s)": 0.09613 + }, + { + "epoch": 0.3950485236543732, + "grad_norm": 7.225290298461914, + "learning_rate": 7.083368476088978e-06, + "loss": 0.658946943283081, + "memory(GiB)": 36.53, + "step": 8495, + "token_acc": 0.8323272971160295, + "train_speed(iter/s)": 0.096161 + }, + { + "epoch": 0.39528104191432284, + "grad_norm": 5.5117926597595215, + "learning_rate": 7.07987262861264e-06, + "loss": 0.6642012119293212, + "memory(GiB)": 36.53, + "step": 8500, + "token_acc": 0.8328358208955224, + "train_speed(iter/s)": 0.096193 + }, + { + "epoch": 0.39528104191432284, + "eval_loss": 0.6217544674873352, + "eval_runtime": 291.8932, + "eval_samples_per_second": 11.905, + "eval_steps_per_second": 11.905, + "step": 8500 + }, + { + "epoch": 0.3955135601742724, + "grad_norm": 8.180193901062012, + "learning_rate": 7.076375551243404e-06, + "loss": 0.8467119216918946, + "memory(GiB)": 36.53, + "step": 8505, + "token_acc": 0.8187991678821644, + "train_speed(iter/s)": 0.095906 + }, + { + "epoch": 0.39574607843422205, + "grad_norm": 9.231237411499023, + "learning_rate": 7.072877246049197e-06, + "loss": 0.7153133869171142, + "memory(GiB)": 36.53, + "step": 8510, + "token_acc": 0.8269018743109151, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.3959785966941716, + "grad_norm": 8.131089210510254, + "learning_rate": 7.069377715098675e-06, + "loss": 0.5359804630279541, + "memory(GiB)": 36.53, + "step": 8515, + "token_acc": 0.8640939597315436, + "train_speed(iter/s)": 0.09597 + }, + { + "epoch": 0.39621111495412126, + "grad_norm": 7.11887264251709, + "learning_rate": 7.065876960461219e-06, + "loss": 0.7117724418640137, + "memory(GiB)": 36.53, + "step": 8520, + "token_acc": 0.8228682170542636, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.39644363321407083, + "grad_norm": 5.5428667068481445, + "learning_rate": 7.062374984206935e-06, + "loss": 0.806981086730957, + "memory(GiB)": 36.53, + "step": 8525, + "token_acc": 0.807551766138855, + "train_speed(iter/s)": 0.096033 + }, + { + "epoch": 0.39667615147402047, + "grad_norm": 5.641619682312012, + "learning_rate": 7.058871788406647e-06, + "loss": 0.7675019264221191, + "memory(GiB)": 36.53, + "step": 8530, + "token_acc": 0.8024366150806718, + "train_speed(iter/s)": 0.096065 + }, + { + "epoch": 0.39690866973397004, + "grad_norm": 6.307408332824707, + "learning_rate": 7.055367375131904e-06, + "loss": 0.6659773349761963, + "memory(GiB)": 36.53, + "step": 8535, + "token_acc": 0.8338645418326693, + "train_speed(iter/s)": 0.096097 + }, + { + "epoch": 0.3971411879939196, + "grad_norm": 6.317528247833252, + "learning_rate": 7.051861746454973e-06, + "loss": 0.6953274250030518, + "memory(GiB)": 36.53, + "step": 8540, + "token_acc": 0.8302925989672978, + "train_speed(iter/s)": 0.096129 + }, + { + "epoch": 0.39737370625386925, + "grad_norm": 7.031256675720215, + "learning_rate": 7.048354904448843e-06, + "loss": 0.7554344177246094, + "memory(GiB)": 36.53, + "step": 8545, + "token_acc": 0.7995824634655533, + "train_speed(iter/s)": 0.09616 + }, + { + "epoch": 0.39760622451381883, + "grad_norm": 9.937764167785645, + "learning_rate": 7.044846851187216e-06, + "loss": 0.7292638778686523, + "memory(GiB)": 36.53, + "step": 8550, + "token_acc": 0.8106201262532492, + "train_speed(iter/s)": 0.096192 + }, + { + "epoch": 0.39760622451381883, + "eval_loss": 0.6203534007072449, + "eval_runtime": 294.7863, + "eval_samples_per_second": 11.788, + "eval_steps_per_second": 11.788, + "step": 8550 + }, + { + "epoch": 0.39783874277376846, + "grad_norm": 7.463900566101074, + "learning_rate": 7.0413375887445125e-06, + "loss": 0.7390836238861084, + "memory(GiB)": 36.53, + "step": 8555, + "token_acc": 0.8197846190800272, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.39807126103371804, + "grad_norm": 5.609832286834717, + "learning_rate": 7.037827119195867e-06, + "loss": 0.6625056743621827, + "memory(GiB)": 36.53, + "step": 8560, + "token_acc": 0.8261359369132557, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.39830377929366767, + "grad_norm": 5.434301376342773, + "learning_rate": 7.034315444617129e-06, + "loss": 0.5975072860717774, + "memory(GiB)": 36.53, + "step": 8565, + "token_acc": 0.848990953375087, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.39853629755361725, + "grad_norm": 6.72133731842041, + "learning_rate": 7.03080256708486e-06, + "loss": 0.65772385597229, + "memory(GiB)": 36.53, + "step": 8570, + "token_acc": 0.8415803605677024, + "train_speed(iter/s)": 0.095998 + }, + { + "epoch": 0.3987688158135669, + "grad_norm": 6.663748741149902, + "learning_rate": 7.027288488676335e-06, + "loss": 0.7755102634429931, + "memory(GiB)": 36.53, + "step": 8575, + "token_acc": 0.7967137944210928, + "train_speed(iter/s)": 0.096029 + }, + { + "epoch": 0.39900133407351646, + "grad_norm": 8.37984561920166, + "learning_rate": 7.023773211469535e-06, + "loss": 0.7156404495239258, + "memory(GiB)": 36.53, + "step": 8580, + "token_acc": 0.8284989122552574, + "train_speed(iter/s)": 0.096061 + }, + { + "epoch": 0.3992338523334661, + "grad_norm": 6.5237908363342285, + "learning_rate": 7.020256737543149e-06, + "loss": 0.6762457847595215, + "memory(GiB)": 36.53, + "step": 8585, + "token_acc": 0.8411397345823576, + "train_speed(iter/s)": 0.096092 + }, + { + "epoch": 0.39946637059341567, + "grad_norm": 5.718166828155518, + "learning_rate": 7.016739068976583e-06, + "loss": 0.7685590744018554, + "memory(GiB)": 36.53, + "step": 8590, + "token_acc": 0.8261183261183261, + "train_speed(iter/s)": 0.096122 + }, + { + "epoch": 0.39969888885336524, + "grad_norm": 5.6232008934021, + "learning_rate": 7.01322020784994e-06, + "loss": 0.6538674354553222, + "memory(GiB)": 36.53, + "step": 8595, + "token_acc": 0.8406889128094726, + "train_speed(iter/s)": 0.096154 + }, + { + "epoch": 0.3999314071133149, + "grad_norm": 8.361418724060059, + "learning_rate": 7.00970015624403e-06, + "loss": 0.7033608913421631, + "memory(GiB)": 36.53, + "step": 8600, + "token_acc": 0.8283450704225352, + "train_speed(iter/s)": 0.096185 + }, + { + "epoch": 0.3999314071133149, + "eval_loss": 0.6218823194503784, + "eval_runtime": 292.7751, + "eval_samples_per_second": 11.869, + "eval_steps_per_second": 11.869, + "step": 8600 + }, + { + "epoch": 0.40016392537326445, + "grad_norm": 7.39115047454834, + "learning_rate": 7.0061789162403694e-06, + "loss": 0.740311861038208, + "memory(GiB)": 36.53, + "step": 8605, + "token_acc": 0.8190235284670361, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.4003964436332141, + "grad_norm": 7.320059299468994, + "learning_rate": 7.002656489921177e-06, + "loss": 0.6501819133758545, + "memory(GiB)": 36.53, + "step": 8610, + "token_acc": 0.8309228650137741, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.40062896189316366, + "grad_norm": 5.359230995178223, + "learning_rate": 6.99913287936937e-06, + "loss": 0.6049031257629395, + "memory(GiB)": 36.53, + "step": 8615, + "token_acc": 0.8495334370139969, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.4008614801531133, + "grad_norm": 7.4673237800598145, + "learning_rate": 6.99560808666857e-06, + "loss": 0.7724496841430664, + "memory(GiB)": 36.53, + "step": 8620, + "token_acc": 0.8127490039840638, + "train_speed(iter/s)": 0.095996 + }, + { + "epoch": 0.40109399841306287, + "grad_norm": 7.398331165313721, + "learning_rate": 6.992082113903096e-06, + "loss": 0.6779186248779296, + "memory(GiB)": 36.53, + "step": 8625, + "token_acc": 0.8232695139911634, + "train_speed(iter/s)": 0.096027 + }, + { + "epoch": 0.4013265166730125, + "grad_norm": 6.002816677093506, + "learning_rate": 6.988554963157962e-06, + "loss": 0.8541918754577636, + "memory(GiB)": 36.53, + "step": 8630, + "token_acc": 0.7821246819338422, + "train_speed(iter/s)": 0.096057 + }, + { + "epoch": 0.4015590349329621, + "grad_norm": 5.535634994506836, + "learning_rate": 6.985026636518884e-06, + "loss": 0.7069508552551269, + "memory(GiB)": 36.53, + "step": 8635, + "token_acc": 0.8242740134028295, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.40179155319291165, + "grad_norm": 9.848203659057617, + "learning_rate": 6.9814971360722695e-06, + "loss": 0.7899524211883545, + "memory(GiB)": 36.53, + "step": 8640, + "token_acc": 0.8144616607071911, + "train_speed(iter/s)": 0.09612 + }, + { + "epoch": 0.4020240714528613, + "grad_norm": 4.755890369415283, + "learning_rate": 6.977966463905219e-06, + "loss": 0.691878080368042, + "memory(GiB)": 36.53, + "step": 8645, + "token_acc": 0.835621387283237, + "train_speed(iter/s)": 0.096151 + }, + { + "epoch": 0.40225658971281086, + "grad_norm": 7.151211261749268, + "learning_rate": 6.974434622105531e-06, + "loss": 0.7025826930999756, + "memory(GiB)": 36.53, + "step": 8650, + "token_acc": 0.8290136789056876, + "train_speed(iter/s)": 0.096183 + }, + { + "epoch": 0.40225658971281086, + "eval_loss": 0.6199798583984375, + "eval_runtime": 293.9287, + "eval_samples_per_second": 11.823, + "eval_steps_per_second": 11.823, + "step": 8650 + }, + { + "epoch": 0.4024891079727605, + "grad_norm": 5.387206077575684, + "learning_rate": 6.970901612761693e-06, + "loss": 0.714640760421753, + "memory(GiB)": 36.53, + "step": 8655, + "token_acc": 0.8199699723681899, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.4027216262327101, + "grad_norm": 10.001330375671387, + "learning_rate": 6.967367437962879e-06, + "loss": 0.7553164005279541, + "memory(GiB)": 36.53, + "step": 8660, + "token_acc": 0.8316412859560067, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.4029541444926597, + "grad_norm": 4.902223587036133, + "learning_rate": 6.963832099798957e-06, + "loss": 0.7956992626190186, + "memory(GiB)": 36.53, + "step": 8665, + "token_acc": 0.8115117014547755, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.4031866627526093, + "grad_norm": 8.351801872253418, + "learning_rate": 6.960295600360484e-06, + "loss": 0.7541163921356201, + "memory(GiB)": 36.53, + "step": 8670, + "token_acc": 0.8208117443868739, + "train_speed(iter/s)": 0.095994 + }, + { + "epoch": 0.4034191810125589, + "grad_norm": 5.438088893890381, + "learning_rate": 6.956757941738699e-06, + "loss": 0.6900172710418702, + "memory(GiB)": 36.53, + "step": 8675, + "token_acc": 0.8305369127516778, + "train_speed(iter/s)": 0.096026 + }, + { + "epoch": 0.4036516992725085, + "grad_norm": 6.124330043792725, + "learning_rate": 6.953219126025529e-06, + "loss": 0.7927371978759765, + "memory(GiB)": 36.53, + "step": 8680, + "token_acc": 0.8033503277494537, + "train_speed(iter/s)": 0.096057 + }, + { + "epoch": 0.4038842175324581, + "grad_norm": 8.318258285522461, + "learning_rate": 6.949679155313585e-06, + "loss": 0.7160331726074218, + "memory(GiB)": 36.53, + "step": 8685, + "token_acc": 0.8290909090909091, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.4041167357924077, + "grad_norm": 5.619810581207275, + "learning_rate": 6.946138031696161e-06, + "loss": 0.6082026958465576, + "memory(GiB)": 36.53, + "step": 8690, + "token_acc": 0.8539733763614361, + "train_speed(iter/s)": 0.09612 + }, + { + "epoch": 0.4043492540523573, + "grad_norm": 6.684939861297607, + "learning_rate": 6.942595757267234e-06, + "loss": 0.6005841255187988, + "memory(GiB)": 36.53, + "step": 8695, + "token_acc": 0.8429752066115702, + "train_speed(iter/s)": 0.09615 + }, + { + "epoch": 0.4045817723123069, + "grad_norm": 6.462775707244873, + "learning_rate": 6.939052334121458e-06, + "loss": 0.6406507968902588, + "memory(GiB)": 36.53, + "step": 8700, + "token_acc": 0.8348567946374162, + "train_speed(iter/s)": 0.09618 + }, + { + "epoch": 0.4045817723123069, + "eval_loss": 0.6194509267807007, + "eval_runtime": 291.2015, + "eval_samples_per_second": 11.933, + "eval_steps_per_second": 11.933, + "step": 8700 + }, + { + "epoch": 0.4048142905722565, + "grad_norm": 7.093710422515869, + "learning_rate": 6.9355077643541704e-06, + "loss": 0.7701839923858642, + "memory(GiB)": 36.53, + "step": 8705, + "token_acc": 0.8190435191607076, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.4050468088322061, + "grad_norm": 9.275561332702637, + "learning_rate": 6.931962050061384e-06, + "loss": 0.7341086864471436, + "memory(GiB)": 36.53, + "step": 8710, + "token_acc": 0.8329238329238329, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.4052793270921557, + "grad_norm": 6.438332557678223, + "learning_rate": 6.928415193339789e-06, + "loss": 0.6380467891693116, + "memory(GiB)": 36.53, + "step": 8715, + "token_acc": 0.8350694444444444, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.4055118453521053, + "grad_norm": 6.661995887756348, + "learning_rate": 6.924867196286753e-06, + "loss": 0.6209036350250244, + "memory(GiB)": 36.53, + "step": 8720, + "token_acc": 0.8397932816537468, + "train_speed(iter/s)": 0.095995 + }, + { + "epoch": 0.4057443636120549, + "grad_norm": 7.798610687255859, + "learning_rate": 6.921318061000313e-06, + "loss": 0.6994572639465332, + "memory(GiB)": 36.53, + "step": 8725, + "token_acc": 0.834107498341075, + "train_speed(iter/s)": 0.096025 + }, + { + "epoch": 0.40597688187200454, + "grad_norm": 8.071057319641113, + "learning_rate": 6.917767789579184e-06, + "loss": 0.6899877548217773, + "memory(GiB)": 36.53, + "step": 8730, + "token_acc": 0.819412347309343, + "train_speed(iter/s)": 0.096055 + }, + { + "epoch": 0.4062094001319541, + "grad_norm": 9.451443672180176, + "learning_rate": 6.914216384122752e-06, + "loss": 0.6809048652648926, + "memory(GiB)": 36.53, + "step": 8735, + "token_acc": 0.839418924224578, + "train_speed(iter/s)": 0.096087 + }, + { + "epoch": 0.4064419183919037, + "grad_norm": 8.285967826843262, + "learning_rate": 6.910663846731072e-06, + "loss": 0.6227863311767579, + "memory(GiB)": 36.53, + "step": 8740, + "token_acc": 0.8486526393503138, + "train_speed(iter/s)": 0.096117 + }, + { + "epoch": 0.4066744366518533, + "grad_norm": 6.145111083984375, + "learning_rate": 6.9071101795048665e-06, + "loss": 0.8297652244567871, + "memory(GiB)": 36.53, + "step": 8745, + "token_acc": 0.8068181818181818, + "train_speed(iter/s)": 0.096148 + }, + { + "epoch": 0.4069069549118029, + "grad_norm": 7.013164043426514, + "learning_rate": 6.903555384545533e-06, + "loss": 0.665109920501709, + "memory(GiB)": 36.53, + "step": 8750, + "token_acc": 0.8276580958999306, + "train_speed(iter/s)": 0.096179 + }, + { + "epoch": 0.4069069549118029, + "eval_loss": 0.6221299767494202, + "eval_runtime": 292.4864, + "eval_samples_per_second": 11.881, + "eval_steps_per_second": 11.881, + "step": 8750 + }, + { + "epoch": 0.40713947317175253, + "grad_norm": 7.368844032287598, + "learning_rate": 6.899999463955129e-06, + "loss": 0.771725845336914, + "memory(GiB)": 36.53, + "step": 8755, + "token_acc": 0.8200054410446805, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.4073719914317021, + "grad_norm": 8.730157852172852, + "learning_rate": 6.896442419836381e-06, + "loss": 0.7538277626037597, + "memory(GiB)": 36.53, + "step": 8760, + "token_acc": 0.8295061340044039, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.40760450969165174, + "grad_norm": 5.8263630867004395, + "learning_rate": 6.892884254292677e-06, + "loss": 0.7268019676208496, + "memory(GiB)": 36.53, + "step": 8765, + "token_acc": 0.8250991705733862, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.4078370279516013, + "grad_norm": 7.256455421447754, + "learning_rate": 6.889324969428074e-06, + "loss": 0.7247865676879883, + "memory(GiB)": 36.53, + "step": 8770, + "token_acc": 0.8247863247863247, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.40806954621155095, + "grad_norm": 5.477489471435547, + "learning_rate": 6.885764567347284e-06, + "loss": 0.6486649990081788, + "memory(GiB)": 36.53, + "step": 8775, + "token_acc": 0.8216751726208346, + "train_speed(iter/s)": 0.096022 + }, + { + "epoch": 0.4083020644715005, + "grad_norm": 8.233301162719727, + "learning_rate": 6.882203050155684e-06, + "loss": 0.6443092346191406, + "memory(GiB)": 36.53, + "step": 8780, + "token_acc": 0.8434589800443459, + "train_speed(iter/s)": 0.096053 + }, + { + "epoch": 0.4085345827314501, + "grad_norm": 9.864056587219238, + "learning_rate": 6.878640419959306e-06, + "loss": 0.8140290260314942, + "memory(GiB)": 36.53, + "step": 8785, + "token_acc": 0.8026589115081014, + "train_speed(iter/s)": 0.096083 + }, + { + "epoch": 0.40876710099139973, + "grad_norm": 7.310309886932373, + "learning_rate": 6.875076678864847e-06, + "loss": 0.6930231094360352, + "memory(GiB)": 36.53, + "step": 8790, + "token_acc": 0.832723644828733, + "train_speed(iter/s)": 0.096114 + }, + { + "epoch": 0.4089996192513493, + "grad_norm": 7.788050174713135, + "learning_rate": 6.8715118289796575e-06, + "loss": 0.6872176170349121, + "memory(GiB)": 36.53, + "step": 8795, + "token_acc": 0.8275215598050244, + "train_speed(iter/s)": 0.096144 + }, + { + "epoch": 0.40923213751129894, + "grad_norm": 6.276396751403809, + "learning_rate": 6.867945872411741e-06, + "loss": 0.6261724948883056, + "memory(GiB)": 36.53, + "step": 8800, + "token_acc": 0.8405044510385756, + "train_speed(iter/s)": 0.096175 + }, + { + "epoch": 0.40923213751129894, + "eval_loss": 0.6192914247512817, + "eval_runtime": 291.4338, + "eval_samples_per_second": 11.924, + "eval_steps_per_second": 11.924, + "step": 8800 + }, + { + "epoch": 0.4094646557712485, + "grad_norm": 6.184392929077148, + "learning_rate": 6.8643788112697565e-06, + "loss": 0.5885149002075195, + "memory(GiB)": 36.53, + "step": 8805, + "token_acc": 0.8207722865661143, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.40969717403119815, + "grad_norm": 7.027329444885254, + "learning_rate": 6.860810647663021e-06, + "loss": 0.6702795028686523, + "memory(GiB)": 36.53, + "step": 8810, + "token_acc": 0.8245682888540031, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.40992969229114773, + "grad_norm": 7.6430983543396, + "learning_rate": 6.857241383701498e-06, + "loss": 0.7755990028381348, + "memory(GiB)": 36.53, + "step": 8815, + "token_acc": 0.8186856690419636, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.41016221055109736, + "grad_norm": 7.117034435272217, + "learning_rate": 6.853671021495804e-06, + "loss": 0.7607792854309082, + "memory(GiB)": 36.53, + "step": 8820, + "token_acc": 0.8068219088225648, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.41039472881104694, + "grad_norm": 5.775667190551758, + "learning_rate": 6.850099563157202e-06, + "loss": 0.6519227981567383, + "memory(GiB)": 36.53, + "step": 8825, + "token_acc": 0.8289615522817104, + "train_speed(iter/s)": 0.096022 + }, + { + "epoch": 0.41062724707099657, + "grad_norm": 7.273473739624023, + "learning_rate": 6.84652701079761e-06, + "loss": 0.7388500690460205, + "memory(GiB)": 36.53, + "step": 8830, + "token_acc": 0.8183768323203432, + "train_speed(iter/s)": 0.096052 + }, + { + "epoch": 0.41085976533094615, + "grad_norm": 5.887622833251953, + "learning_rate": 6.842953366529584e-06, + "loss": 0.5621285438537598, + "memory(GiB)": 36.53, + "step": 8835, + "token_acc": 0.8547868061142397, + "train_speed(iter/s)": 0.096083 + }, + { + "epoch": 0.4110922835908957, + "grad_norm": 7.271026611328125, + "learning_rate": 6.839378632466334e-06, + "loss": 0.692125940322876, + "memory(GiB)": 36.53, + "step": 8840, + "token_acc": 0.8310536044362292, + "train_speed(iter/s)": 0.096114 + }, + { + "epoch": 0.41132480185084536, + "grad_norm": 8.459503173828125, + "learning_rate": 6.8358028107217065e-06, + "loss": 0.7182388305664062, + "memory(GiB)": 36.53, + "step": 8845, + "token_acc": 0.8347509113001215, + "train_speed(iter/s)": 0.096145 + }, + { + "epoch": 0.41155732011079493, + "grad_norm": 7.945958137512207, + "learning_rate": 6.8322259034102e-06, + "loss": 0.7506031036376953, + "memory(GiB)": 36.53, + "step": 8850, + "token_acc": 0.819006309148265, + "train_speed(iter/s)": 0.096175 + }, + { + "epoch": 0.41155732011079493, + "eval_loss": 0.6183744668960571, + "eval_runtime": 293.1435, + "eval_samples_per_second": 11.854, + "eval_steps_per_second": 11.854, + "step": 8850 + }, + { + "epoch": 0.41178983837074457, + "grad_norm": 7.761203289031982, + "learning_rate": 6.828647912646947e-06, + "loss": 0.6559175014495849, + "memory(GiB)": 36.53, + "step": 8855, + "token_acc": 0.8202479338842975, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.41202235663069414, + "grad_norm": 6.704031944274902, + "learning_rate": 6.825068840547726e-06, + "loss": 0.7581852912902832, + "memory(GiB)": 36.53, + "step": 8860, + "token_acc": 0.8106565176022835, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.4122548748906438, + "grad_norm": 6.423466205596924, + "learning_rate": 6.82148868922895e-06, + "loss": 0.7496987342834472, + "memory(GiB)": 36.53, + "step": 8865, + "token_acc": 0.816932208684786, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.41248739315059335, + "grad_norm": 5.8876953125, + "learning_rate": 6.8179074608076755e-06, + "loss": 0.6948210716247558, + "memory(GiB)": 36.53, + "step": 8870, + "token_acc": 0.8260869565217391, + "train_speed(iter/s)": 0.095991 + }, + { + "epoch": 0.412719911410543, + "grad_norm": 7.565408706665039, + "learning_rate": 6.8143251574015925e-06, + "loss": 0.6806984901428222, + "memory(GiB)": 36.53, + "step": 8875, + "token_acc": 0.8140625, + "train_speed(iter/s)": 0.096021 + }, + { + "epoch": 0.41295242967049256, + "grad_norm": 9.712871551513672, + "learning_rate": 6.810741781129027e-06, + "loss": 0.7311966419219971, + "memory(GiB)": 36.53, + "step": 8880, + "token_acc": 0.8348817567567568, + "train_speed(iter/s)": 0.096051 + }, + { + "epoch": 0.41318494793044214, + "grad_norm": 6.433829307556152, + "learning_rate": 6.807157334108941e-06, + "loss": 0.7440563678741455, + "memory(GiB)": 36.53, + "step": 8885, + "token_acc": 0.8127053669222344, + "train_speed(iter/s)": 0.09608 + }, + { + "epoch": 0.41341746619039177, + "grad_norm": 4.980173587799072, + "learning_rate": 6.803571818460929e-06, + "loss": 0.772585391998291, + "memory(GiB)": 36.53, + "step": 8890, + "token_acc": 0.8182640144665461, + "train_speed(iter/s)": 0.09611 + }, + { + "epoch": 0.41364998445034135, + "grad_norm": 7.035939693450928, + "learning_rate": 6.799985236305217e-06, + "loss": 0.6235956192016602, + "memory(GiB)": 36.53, + "step": 8895, + "token_acc": 0.8394070413835701, + "train_speed(iter/s)": 0.096141 + }, + { + "epoch": 0.413882502710291, + "grad_norm": 8.031867027282715, + "learning_rate": 6.796397589762661e-06, + "loss": 0.6124022006988525, + "memory(GiB)": 36.53, + "step": 8900, + "token_acc": 0.846942650968477, + "train_speed(iter/s)": 0.096172 + }, + { + "epoch": 0.413882502710291, + "eval_loss": 0.6191074848175049, + "eval_runtime": 295.2805, + "eval_samples_per_second": 11.768, + "eval_steps_per_second": 11.768, + "step": 8900 + }, + { + "epoch": 0.41411502097024055, + "grad_norm": 7.602524280548096, + "learning_rate": 6.792808880954746e-06, + "loss": 0.6715277671813965, + "memory(GiB)": 36.53, + "step": 8905, + "token_acc": 0.8206718594736674, + "train_speed(iter/s)": 0.095895 + }, + { + "epoch": 0.4143475392301902, + "grad_norm": 6.761972427368164, + "learning_rate": 6.789219112003589e-06, + "loss": 0.77059907913208, + "memory(GiB)": 36.53, + "step": 8910, + "token_acc": 0.8076407506702413, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.41458005749013976, + "grad_norm": 6.945693016052246, + "learning_rate": 6.78562828503193e-06, + "loss": 0.6703316688537597, + "memory(GiB)": 36.53, + "step": 8915, + "token_acc": 0.8321584424303458, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.4148125757500894, + "grad_norm": 6.604681968688965, + "learning_rate": 6.782036402163136e-06, + "loss": 0.6581210613250732, + "memory(GiB)": 36.53, + "step": 8920, + "token_acc": 0.8386841062227507, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.415045094010039, + "grad_norm": 6.909808158874512, + "learning_rate": 6.778443465521196e-06, + "loss": 0.7041978359222412, + "memory(GiB)": 36.53, + "step": 8925, + "token_acc": 0.8179746011071313, + "train_speed(iter/s)": 0.096015 + }, + { + "epoch": 0.4152776122699886, + "grad_norm": 8.918962478637695, + "learning_rate": 6.77484947723073e-06, + "loss": 0.6340020179748536, + "memory(GiB)": 36.53, + "step": 8930, + "token_acc": 0.8308794269797055, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.4155101305299382, + "grad_norm": 8.383973121643066, + "learning_rate": 6.7712544394169675e-06, + "loss": 0.6914999961853028, + "memory(GiB)": 36.53, + "step": 8935, + "token_acc": 0.8301647655259823, + "train_speed(iter/s)": 0.096075 + }, + { + "epoch": 0.41574264878988776, + "grad_norm": 7.2458038330078125, + "learning_rate": 6.7676583542057705e-06, + "loss": 0.701669979095459, + "memory(GiB)": 36.53, + "step": 8940, + "token_acc": 0.826133909287257, + "train_speed(iter/s)": 0.096106 + }, + { + "epoch": 0.4159751670498374, + "grad_norm": 7.525779724121094, + "learning_rate": 6.764061223723612e-06, + "loss": 0.8554682731628418, + "memory(GiB)": 36.53, + "step": 8945, + "token_acc": 0.7920922570016474, + "train_speed(iter/s)": 0.096135 + }, + { + "epoch": 0.41620768530978697, + "grad_norm": 6.9160566329956055, + "learning_rate": 6.760463050097588e-06, + "loss": 0.7539933681488037, + "memory(GiB)": 36.53, + "step": 8950, + "token_acc": 0.8097359210944957, + "train_speed(iter/s)": 0.096165 + }, + { + "epoch": 0.41620768530978697, + "eval_loss": 0.6194281578063965, + "eval_runtime": 293.896, + "eval_samples_per_second": 11.824, + "eval_steps_per_second": 11.824, + "step": 8950 + }, + { + "epoch": 0.4164402035697366, + "grad_norm": 6.317086696624756, + "learning_rate": 6.75686383545541e-06, + "loss": 0.6532005786895752, + "memory(GiB)": 36.53, + "step": 8955, + "token_acc": 0.8211216897351789, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.4166727218296862, + "grad_norm": 7.509308815002441, + "learning_rate": 6.753263581925403e-06, + "loss": 0.7630683898925781, + "memory(GiB)": 36.53, + "step": 8960, + "token_acc": 0.8083182640144665, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.4169052400896358, + "grad_norm": 8.196243286132812, + "learning_rate": 6.7496622916365085e-06, + "loss": 0.6224451541900635, + "memory(GiB)": 36.53, + "step": 8965, + "token_acc": 0.8581512605042017, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.4171377583495854, + "grad_norm": 7.792494297027588, + "learning_rate": 6.746059966718282e-06, + "loss": 0.7187991619110108, + "memory(GiB)": 36.53, + "step": 8970, + "token_acc": 0.8277800080289041, + "train_speed(iter/s)": 0.09598 + }, + { + "epoch": 0.417370276609535, + "grad_norm": 5.681851387023926, + "learning_rate": 6.742456609300888e-06, + "loss": 0.7289433479309082, + "memory(GiB)": 36.53, + "step": 8975, + "token_acc": 0.8181182231549493, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.4176027948694846, + "grad_norm": 12.690560340881348, + "learning_rate": 6.738852221515104e-06, + "loss": 0.6835853099822998, + "memory(GiB)": 36.53, + "step": 8980, + "token_acc": 0.8417593528816987, + "train_speed(iter/s)": 0.09604 + }, + { + "epoch": 0.41783531312943417, + "grad_norm": 6.0064215660095215, + "learning_rate": 6.735246805492316e-06, + "loss": 0.7906692028045654, + "memory(GiB)": 36.53, + "step": 8985, + "token_acc": 0.8182476466328747, + "train_speed(iter/s)": 0.09607 + }, + { + "epoch": 0.4180678313893838, + "grad_norm": 6.358526706695557, + "learning_rate": 6.731640363364516e-06, + "loss": 0.7464702606201172, + "memory(GiB)": 36.53, + "step": 8990, + "token_acc": 0.8180026281208935, + "train_speed(iter/s)": 0.0961 + }, + { + "epoch": 0.4183003496493334, + "grad_norm": 9.47359561920166, + "learning_rate": 6.728032897264307e-06, + "loss": 0.7647457122802734, + "memory(GiB)": 36.53, + "step": 8995, + "token_acc": 0.8158473954512105, + "train_speed(iter/s)": 0.09613 + }, + { + "epoch": 0.418532867909283, + "grad_norm": 5.881313323974609, + "learning_rate": 6.724424409324893e-06, + "loss": 0.7446362495422363, + "memory(GiB)": 36.53, + "step": 9000, + "token_acc": 0.80649436713055, + "train_speed(iter/s)": 0.09616 + }, + { + "epoch": 0.418532867909283, + "eval_loss": 0.6171696782112122, + "eval_runtime": 297.804, + "eval_samples_per_second": 11.669, + "eval_steps_per_second": 11.669, + "step": 9000 + }, + { + "epoch": 0.4187653861692326, + "grad_norm": 10.644405364990234, + "learning_rate": 6.720814901680086e-06, + "loss": 0.627127742767334, + "memory(GiB)": 36.53, + "step": 9005, + "token_acc": 0.82036708016111, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.4189979044291822, + "grad_norm": 5.751759052276611, + "learning_rate": 6.717204376464297e-06, + "loss": 0.6266797542572021, + "memory(GiB)": 36.53, + "step": 9010, + "token_acc": 0.8363697705802969, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.4192304226891318, + "grad_norm": 7.086824893951416, + "learning_rate": 6.713592835812543e-06, + "loss": 0.6686243057250977, + "memory(GiB)": 36.53, + "step": 9015, + "token_acc": 0.8332337118947998, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.41946294094908143, + "grad_norm": 6.7311530113220215, + "learning_rate": 6.70998028186044e-06, + "loss": 0.6335064888000488, + "memory(GiB)": 36.53, + "step": 9020, + "token_acc": 0.8478342749529191, + "train_speed(iter/s)": 0.095972 + }, + { + "epoch": 0.419695459209031, + "grad_norm": 6.627511024475098, + "learning_rate": 6.706366716744201e-06, + "loss": 0.7964089870452881, + "memory(GiB)": 36.53, + "step": 9025, + "token_acc": 0.8168709444844989, + "train_speed(iter/s)": 0.096002 + }, + { + "epoch": 0.4199279774689806, + "grad_norm": 6.411988258361816, + "learning_rate": 6.702752142600639e-06, + "loss": 0.691087007522583, + "memory(GiB)": 36.53, + "step": 9030, + "token_acc": 0.8316246056782335, + "train_speed(iter/s)": 0.096031 + }, + { + "epoch": 0.4201604957289302, + "grad_norm": 5.192274570465088, + "learning_rate": 6.699136561567165e-06, + "loss": 0.6185484886169433, + "memory(GiB)": 36.53, + "step": 9035, + "token_acc": 0.8502076014053018, + "train_speed(iter/s)": 0.09606 + }, + { + "epoch": 0.4203930139888798, + "grad_norm": 9.895514488220215, + "learning_rate": 6.695519975781782e-06, + "loss": 0.8439302444458008, + "memory(GiB)": 36.53, + "step": 9040, + "token_acc": 0.8051863857374392, + "train_speed(iter/s)": 0.09609 + }, + { + "epoch": 0.4206255322488294, + "grad_norm": 7.7914228439331055, + "learning_rate": 6.6919023873830864e-06, + "loss": 0.7071369647979736, + "memory(GiB)": 36.53, + "step": 9045, + "token_acc": 0.8248201438848921, + "train_speed(iter/s)": 0.096119 + }, + { + "epoch": 0.420858050508779, + "grad_norm": 7.353464603424072, + "learning_rate": 6.688283798510275e-06, + "loss": 0.7173079967498779, + "memory(GiB)": 36.53, + "step": 9050, + "token_acc": 0.8235796668090559, + "train_speed(iter/s)": 0.096149 + }, + { + "epoch": 0.420858050508779, + "eval_loss": 0.6170545220375061, + "eval_runtime": 295.4839, + "eval_samples_per_second": 11.76, + "eval_steps_per_second": 11.76, + "step": 9050 + }, + { + "epoch": 0.42109056876872863, + "grad_norm": 6.842942237854004, + "learning_rate": 6.684664211303129e-06, + "loss": 0.6946396827697754, + "memory(GiB)": 36.53, + "step": 9055, + "token_acc": 0.8207173862646191, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.4213230870286782, + "grad_norm": 8.142498016357422, + "learning_rate": 6.6810436279020215e-06, + "loss": 0.7505970954895019, + "memory(GiB)": 36.53, + "step": 9060, + "token_acc": 0.8142857142857143, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.42155560528862784, + "grad_norm": 8.180495262145996, + "learning_rate": 6.677422050447915e-06, + "loss": 0.699577522277832, + "memory(GiB)": 36.53, + "step": 9065, + "token_acc": 0.8255624388653408, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.4217881235485774, + "grad_norm": 9.911946296691895, + "learning_rate": 6.673799481082362e-06, + "loss": 0.6910494327545166, + "memory(GiB)": 36.53, + "step": 9070, + "token_acc": 0.8324185876326381, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.42202064180852705, + "grad_norm": 7.65514612197876, + "learning_rate": 6.670175921947497e-06, + "loss": 0.6632385730743409, + "memory(GiB)": 36.53, + "step": 9075, + "token_acc": 0.8326345213137666, + "train_speed(iter/s)": 0.095996 + }, + { + "epoch": 0.42225316006847663, + "grad_norm": 9.681537628173828, + "learning_rate": 6.666551375186043e-06, + "loss": 0.7314382553100586, + "memory(GiB)": 36.53, + "step": 9080, + "token_acc": 0.8132972555083108, + "train_speed(iter/s)": 0.096026 + }, + { + "epoch": 0.4224856783284262, + "grad_norm": 6.742770195007324, + "learning_rate": 6.662925842941308e-06, + "loss": 0.6615162849426269, + "memory(GiB)": 36.53, + "step": 9085, + "token_acc": 0.8341759352881699, + "train_speed(iter/s)": 0.096056 + }, + { + "epoch": 0.42271819658837584, + "grad_norm": 7.636063575744629, + "learning_rate": 6.659299327357181e-06, + "loss": 0.6485334873199463, + "memory(GiB)": 36.53, + "step": 9090, + "token_acc": 0.8355714712026261, + "train_speed(iter/s)": 0.096085 + }, + { + "epoch": 0.4229507148483254, + "grad_norm": 7.497421741485596, + "learning_rate": 6.655671830578131e-06, + "loss": 0.6972911357879639, + "memory(GiB)": 36.53, + "step": 9095, + "token_acc": 0.8263677811550152, + "train_speed(iter/s)": 0.096114 + }, + { + "epoch": 0.42318323310827505, + "grad_norm": 8.554731369018555, + "learning_rate": 6.6520433547492095e-06, + "loss": 0.7575594902038574, + "memory(GiB)": 36.53, + "step": 9100, + "token_acc": 0.8071625344352618, + "train_speed(iter/s)": 0.096143 + }, + { + "epoch": 0.42318323310827505, + "eval_loss": 0.6152464747428894, + "eval_runtime": 294.8661, + "eval_samples_per_second": 11.785, + "eval_steps_per_second": 11.785, + "step": 9100 + }, + { + "epoch": 0.4234157513682246, + "grad_norm": 5.9736738204956055, + "learning_rate": 6.648413902016047e-06, + "loss": 0.7907323360443115, + "memory(GiB)": 36.53, + "step": 9105, + "token_acc": 0.8208651317899057, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.42364826962817426, + "grad_norm": 6.825490951538086, + "learning_rate": 6.644783474524848e-06, + "loss": 0.8026031494140625, + "memory(GiB)": 36.53, + "step": 9110, + "token_acc": 0.8043887147335423, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.42388078788812383, + "grad_norm": 7.025318622589111, + "learning_rate": 6.641152074422401e-06, + "loss": 0.6262815475463868, + "memory(GiB)": 36.53, + "step": 9115, + "token_acc": 0.8432312799704906, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.42411330614807347, + "grad_norm": 7.372581481933594, + "learning_rate": 6.6375197038560636e-06, + "loss": 0.6927440643310547, + "memory(GiB)": 36.53, + "step": 9120, + "token_acc": 0.8218195545488863, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.42434582440802304, + "grad_norm": 7.317347526550293, + "learning_rate": 6.633886364973767e-06, + "loss": 0.7257655143737793, + "memory(GiB)": 36.53, + "step": 9125, + "token_acc": 0.8196777511141584, + "train_speed(iter/s)": 0.09599 + }, + { + "epoch": 0.4245783426679726, + "grad_norm": 7.706878185272217, + "learning_rate": 6.630252059924016e-06, + "loss": 0.6121723651885986, + "memory(GiB)": 36.53, + "step": 9130, + "token_acc": 0.8564425770308123, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.42481086092792225, + "grad_norm": 7.673420429229736, + "learning_rate": 6.626616790855891e-06, + "loss": 0.7046977519989014, + "memory(GiB)": 36.53, + "step": 9135, + "token_acc": 0.8263234227701233, + "train_speed(iter/s)": 0.096048 + }, + { + "epoch": 0.42504337918787183, + "grad_norm": 7.619455814361572, + "learning_rate": 6.622980559919037e-06, + "loss": 0.7457279682159423, + "memory(GiB)": 36.53, + "step": 9140, + "token_acc": 0.8259125551544324, + "train_speed(iter/s)": 0.096077 + }, + { + "epoch": 0.42527589744782146, + "grad_norm": 5.783131122589111, + "learning_rate": 6.619343369263667e-06, + "loss": 0.6203608989715577, + "memory(GiB)": 36.53, + "step": 9145, + "token_acc": 0.8454415954415955, + "train_speed(iter/s)": 0.096107 + }, + { + "epoch": 0.42550841570777104, + "grad_norm": 6.525480270385742, + "learning_rate": 6.615705221040568e-06, + "loss": 0.7471608638763427, + "memory(GiB)": 36.53, + "step": 9150, + "token_acc": 0.8127881955118352, + "train_speed(iter/s)": 0.096136 + }, + { + "epoch": 0.42550841570777104, + "eval_loss": 0.6166175007820129, + "eval_runtime": 291.944, + "eval_samples_per_second": 11.903, + "eval_steps_per_second": 11.903, + "step": 9150 + }, + { + "epoch": 0.42574093396772067, + "grad_norm": 6.973150253295898, + "learning_rate": 6.612066117401088e-06, + "loss": 0.6104370594024658, + "memory(GiB)": 36.53, + "step": 9155, + "token_acc": 0.8215181052918232, + "train_speed(iter/s)": 0.095871 + }, + { + "epoch": 0.42597345222767025, + "grad_norm": 8.265861511230469, + "learning_rate": 6.608426060497141e-06, + "loss": 0.6862789154052734, + "memory(GiB)": 36.53, + "step": 9160, + "token_acc": 0.8292282430213465, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.4262059704876199, + "grad_norm": 8.376321792602539, + "learning_rate": 6.604785052481205e-06, + "loss": 0.7466615200042724, + "memory(GiB)": 36.53, + "step": 9165, + "token_acc": 0.8263552225650066, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.42643848874756946, + "grad_norm": 8.683837890625, + "learning_rate": 6.60114309550632e-06, + "loss": 0.5966279983520508, + "memory(GiB)": 36.53, + "step": 9170, + "token_acc": 0.8487690504103166, + "train_speed(iter/s)": 0.095958 + }, + { + "epoch": 0.42667100700751903, + "grad_norm": 6.879874229431152, + "learning_rate": 6.59750019172609e-06, + "loss": 0.6662249088287353, + "memory(GiB)": 36.53, + "step": 9175, + "token_acc": 0.8512064343163539, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.42690352526746866, + "grad_norm": 7.139492034912109, + "learning_rate": 6.593856343294674e-06, + "loss": 0.6416230201721191, + "memory(GiB)": 36.53, + "step": 9180, + "token_acc": 0.8420463032113518, + "train_speed(iter/s)": 0.096016 + }, + { + "epoch": 0.42713604352741824, + "grad_norm": 8.100975036621094, + "learning_rate": 6.590211552366792e-06, + "loss": 0.681126880645752, + "memory(GiB)": 36.53, + "step": 9185, + "token_acc": 0.8341675008341675, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.4273685617873679, + "grad_norm": 5.829564094543457, + "learning_rate": 6.586565821097722e-06, + "loss": 0.67999267578125, + "memory(GiB)": 36.53, + "step": 9190, + "token_acc": 0.8355945730247406, + "train_speed(iter/s)": 0.096074 + }, + { + "epoch": 0.42760108004731745, + "grad_norm": 8.35464096069336, + "learning_rate": 6.5829191516432985e-06, + "loss": 0.7434378147125245, + "memory(GiB)": 36.53, + "step": 9195, + "token_acc": 0.8255695341720504, + "train_speed(iter/s)": 0.096103 + }, + { + "epoch": 0.4278335983072671, + "grad_norm": 6.569089889526367, + "learning_rate": 6.57927154615991e-06, + "loss": 0.7322329521179199, + "memory(GiB)": 36.53, + "step": 9200, + "token_acc": 0.82040953340047, + "train_speed(iter/s)": 0.096132 + }, + { + "epoch": 0.4278335983072671, + "eval_loss": 0.6154603958129883, + "eval_runtime": 292.0196, + "eval_samples_per_second": 11.9, + "eval_steps_per_second": 11.9, + "step": 9200 + }, + { + "epoch": 0.42806611656721666, + "grad_norm": 6.184996128082275, + "learning_rate": 6.575623006804495e-06, + "loss": 0.7064523696899414, + "memory(GiB)": 36.53, + "step": 9205, + "token_acc": 0.8213706374572378, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.4282986348271663, + "grad_norm": 6.941103935241699, + "learning_rate": 6.5719735357345505e-06, + "loss": 0.6321978092193603, + "memory(GiB)": 36.53, + "step": 9210, + "token_acc": 0.8569169960474309, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.42853115308711587, + "grad_norm": 7.983799934387207, + "learning_rate": 6.568323135108121e-06, + "loss": 0.7142380237579345, + "memory(GiB)": 36.53, + "step": 9215, + "token_acc": 0.825369978858351, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.4287636713470655, + "grad_norm": 7.8501482009887695, + "learning_rate": 6.564671807083801e-06, + "loss": 0.7349348545074463, + "memory(GiB)": 36.53, + "step": 9220, + "token_acc": 0.8120401337792642, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.4289961896070151, + "grad_norm": 8.24466323852539, + "learning_rate": 6.561019553820732e-06, + "loss": 0.6588833808898926, + "memory(GiB)": 36.53, + "step": 9225, + "token_acc": 0.8419503993274485, + "train_speed(iter/s)": 0.095985 + }, + { + "epoch": 0.42922870786696465, + "grad_norm": 8.526206970214844, + "learning_rate": 6.5573663774786045e-06, + "loss": 0.7232032775878906, + "memory(GiB)": 36.53, + "step": 9230, + "token_acc": 0.8263358778625954, + "train_speed(iter/s)": 0.096015 + }, + { + "epoch": 0.4294612261269143, + "grad_norm": 7.539074420928955, + "learning_rate": 6.553712280217657e-06, + "loss": 0.5962289333343506, + "memory(GiB)": 36.53, + "step": 9235, + "token_acc": 0.8554294975688816, + "train_speed(iter/s)": 0.096043 + }, + { + "epoch": 0.42969374438686386, + "grad_norm": 7.943596363067627, + "learning_rate": 6.550057264198668e-06, + "loss": 0.7809437274932861, + "memory(GiB)": 36.53, + "step": 9240, + "token_acc": 0.8077663671373556, + "train_speed(iter/s)": 0.096072 + }, + { + "epoch": 0.4299262626468135, + "grad_norm": 6.926512718200684, + "learning_rate": 6.546401331582962e-06, + "loss": 0.7709768295288086, + "memory(GiB)": 36.53, + "step": 9245, + "token_acc": 0.8153540701522171, + "train_speed(iter/s)": 0.096101 + }, + { + "epoch": 0.43015878090676307, + "grad_norm": 5.472071647644043, + "learning_rate": 6.542744484532403e-06, + "loss": 0.7952607631683349, + "memory(GiB)": 36.53, + "step": 9250, + "token_acc": 0.8125806451612904, + "train_speed(iter/s)": 0.09613 + }, + { + "epoch": 0.43015878090676307, + "eval_loss": 0.619757890701294, + "eval_runtime": 296.7723, + "eval_samples_per_second": 11.709, + "eval_steps_per_second": 11.709, + "step": 9250 + }, + { + "epoch": 0.4303912991667127, + "grad_norm": 7.531529903411865, + "learning_rate": 6.539086725209401e-06, + "loss": 0.6834822177886963, + "memory(GiB)": 36.53, + "step": 9255, + "token_acc": 0.821519068544119, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.4306238174266623, + "grad_norm": 7.340893745422363, + "learning_rate": 6.535428055776898e-06, + "loss": 0.6262123107910156, + "memory(GiB)": 36.53, + "step": 9260, + "token_acc": 0.8447067502766507, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.4308563356866119, + "grad_norm": 7.021424293518066, + "learning_rate": 6.531768478398382e-06, + "loss": 0.7499904632568359, + "memory(GiB)": 36.53, + "step": 9265, + "token_acc": 0.8159286186384667, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.4310888539465615, + "grad_norm": 6.474427223205566, + "learning_rate": 6.5281079952378686e-06, + "loss": 0.6529985904693604, + "memory(GiB)": 36.53, + "step": 9270, + "token_acc": 0.8382250174703005, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.43132137220651107, + "grad_norm": 8.948592185974121, + "learning_rate": 6.524446608459922e-06, + "loss": 0.5798418521881104, + "memory(GiB)": 36.53, + "step": 9275, + "token_acc": 0.8486916951080774, + "train_speed(iter/s)": 0.09598 + }, + { + "epoch": 0.4315538904664607, + "grad_norm": 6.600849628448486, + "learning_rate": 6.520784320229628e-06, + "loss": 0.6643566608428955, + "memory(GiB)": 36.53, + "step": 9280, + "token_acc": 0.825115562403698, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.4317864087264103, + "grad_norm": 6.3031206130981445, + "learning_rate": 6.517121132712613e-06, + "loss": 0.5986540794372559, + "memory(GiB)": 36.53, + "step": 9285, + "token_acc": 0.8465489566613162, + "train_speed(iter/s)": 0.096037 + }, + { + "epoch": 0.4320189269863599, + "grad_norm": 6.503426551818848, + "learning_rate": 6.513457048075031e-06, + "loss": 0.6698907375335693, + "memory(GiB)": 36.53, + "step": 9290, + "token_acc": 0.8272375854891466, + "train_speed(iter/s)": 0.096067 + }, + { + "epoch": 0.4322514452463095, + "grad_norm": 6.880972385406494, + "learning_rate": 6.509792068483569e-06, + "loss": 0.5666281700134277, + "memory(GiB)": 36.53, + "step": 9295, + "token_acc": 0.8612693246541904, + "train_speed(iter/s)": 0.096096 + }, + { + "epoch": 0.4324839635062591, + "grad_norm": 8.267417907714844, + "learning_rate": 6.506126196105444e-06, + "loss": 0.7473461627960205, + "memory(GiB)": 36.53, + "step": 9300, + "token_acc": 0.810893098782138, + "train_speed(iter/s)": 0.096124 + }, + { + "epoch": 0.4324839635062591, + "eval_loss": 0.6146489977836609, + "eval_runtime": 294.9014, + "eval_samples_per_second": 11.784, + "eval_steps_per_second": 11.784, + "step": 9300 + }, + { + "epoch": 0.4327164817662087, + "grad_norm": 7.167984485626221, + "learning_rate": 6.502459433108398e-06, + "loss": 0.6427381992340088, + "memory(GiB)": 36.53, + "step": 9305, + "token_acc": 0.8212586271702882, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.4329490000261583, + "grad_norm": 3.9880833625793457, + "learning_rate": 6.4987917816607e-06, + "loss": 0.8922223091125489, + "memory(GiB)": 36.53, + "step": 9310, + "token_acc": 0.7608077360637088, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.4331815182861079, + "grad_norm": 6.413315296173096, + "learning_rate": 6.49512324393115e-06, + "loss": 0.583865737915039, + "memory(GiB)": 36.53, + "step": 9315, + "token_acc": 0.8495801387367652, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.43341403654605754, + "grad_norm": 6.642376899719238, + "learning_rate": 6.491453822089065e-06, + "loss": 0.6838852405548096, + "memory(GiB)": 36.53, + "step": 9320, + "token_acc": 0.8288527073078379, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.4336465548060071, + "grad_norm": 5.483030796051025, + "learning_rate": 6.487783518304284e-06, + "loss": 0.7265839099884033, + "memory(GiB)": 36.53, + "step": 9325, + "token_acc": 0.8278023598820059, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.4338790730659567, + "grad_norm": 7.246890544891357, + "learning_rate": 6.484112334747177e-06, + "loss": 0.6972790241241456, + "memory(GiB)": 36.53, + "step": 9330, + "token_acc": 0.8220140515222483, + "train_speed(iter/s)": 0.096002 + }, + { + "epoch": 0.4341115913259063, + "grad_norm": 7.137838840484619, + "learning_rate": 6.480440273588624e-06, + "loss": 0.5969900131225586, + "memory(GiB)": 36.53, + "step": 9335, + "token_acc": 0.8442120447169131, + "train_speed(iter/s)": 0.096031 + }, + { + "epoch": 0.4343441095858559, + "grad_norm": 5.954293251037598, + "learning_rate": 6.4767673370000305e-06, + "loss": 0.7084389209747315, + "memory(GiB)": 36.53, + "step": 9340, + "token_acc": 0.831023102310231, + "train_speed(iter/s)": 0.09606 + }, + { + "epoch": 0.43457662784580553, + "grad_norm": 8.764803886413574, + "learning_rate": 6.473093527153315e-06, + "loss": 0.6675600528717041, + "memory(GiB)": 36.53, + "step": 9345, + "token_acc": 0.8438982319965502, + "train_speed(iter/s)": 0.096089 + }, + { + "epoch": 0.4348091461057551, + "grad_norm": 6.77636194229126, + "learning_rate": 6.4694188462209174e-06, + "loss": 0.7175776481628418, + "memory(GiB)": 36.53, + "step": 9350, + "token_acc": 0.8179903730445247, + "train_speed(iter/s)": 0.096117 + }, + { + "epoch": 0.4348091461057551, + "eval_loss": 0.6128469705581665, + "eval_runtime": 296.4157, + "eval_samples_per_second": 11.723, + "eval_steps_per_second": 11.723, + "step": 9350 + }, + { + "epoch": 0.43504166436570474, + "grad_norm": 8.777336120605469, + "learning_rate": 6.465743296375788e-06, + "loss": 0.8006362915039062, + "memory(GiB)": 36.53, + "step": 9355, + "token_acc": 0.8213751808712197, + "train_speed(iter/s)": 0.095853 + }, + { + "epoch": 0.4352741826256543, + "grad_norm": 6.73029899597168, + "learning_rate": 6.462066879791393e-06, + "loss": 0.5721518039703369, + "memory(GiB)": 36.53, + "step": 9360, + "token_acc": 0.8447519406007425, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.43550670088560395, + "grad_norm": 10.870583534240723, + "learning_rate": 6.458389598641711e-06, + "loss": 0.611840009689331, + "memory(GiB)": 36.53, + "step": 9365, + "token_acc": 0.8376413570274637, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.4357392191455535, + "grad_norm": 6.957264423370361, + "learning_rate": 6.454711455101232e-06, + "loss": 0.6748052597045898, + "memory(GiB)": 36.53, + "step": 9370, + "token_acc": 0.8365800865800865, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.4359717374055031, + "grad_norm": 5.895677089691162, + "learning_rate": 6.451032451344958e-06, + "loss": 0.6640129566192627, + "memory(GiB)": 36.53, + "step": 9375, + "token_acc": 0.8376369327073553, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.43620425566545273, + "grad_norm": 6.726668357849121, + "learning_rate": 6.447352589548396e-06, + "loss": 0.6839561462402344, + "memory(GiB)": 36.53, + "step": 9380, + "token_acc": 0.8363384188626907, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.4364367739254023, + "grad_norm": 5.2225775718688965, + "learning_rate": 6.443671871887561e-06, + "loss": 0.7582714080810546, + "memory(GiB)": 36.53, + "step": 9385, + "token_acc": 0.8162845385067607, + "train_speed(iter/s)": 0.096025 + }, + { + "epoch": 0.43666929218535194, + "grad_norm": 6.472532749176025, + "learning_rate": 6.439990300538975e-06, + "loss": 0.6851258754730225, + "memory(GiB)": 36.53, + "step": 9390, + "token_acc": 0.8372978116079924, + "train_speed(iter/s)": 0.096053 + }, + { + "epoch": 0.4369018104453015, + "grad_norm": 8.415928840637207, + "learning_rate": 6.436307877679666e-06, + "loss": 0.6269676685333252, + "memory(GiB)": 36.53, + "step": 9395, + "token_acc": 0.854813046937152, + "train_speed(iter/s)": 0.096081 + }, + { + "epoch": 0.43713432870525115, + "grad_norm": 7.9225687980651855, + "learning_rate": 6.4326246054871645e-06, + "loss": 0.6694862842559814, + "memory(GiB)": 36.53, + "step": 9400, + "token_acc": 0.8336909871244635, + "train_speed(iter/s)": 0.096111 + }, + { + "epoch": 0.43713432870525115, + "eval_loss": 0.613532543182373, + "eval_runtime": 294.7694, + "eval_samples_per_second": 11.789, + "eval_steps_per_second": 11.789, + "step": 9400 + }, + { + "epoch": 0.43736684696520073, + "grad_norm": 6.317355632781982, + "learning_rate": 6.428940486139502e-06, + "loss": 0.8399629592895508, + "memory(GiB)": 36.53, + "step": 9405, + "token_acc": 0.821104738713915, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.43759936522515036, + "grad_norm": 6.062745571136475, + "learning_rate": 6.425255521815212e-06, + "loss": 0.7100383758544921, + "memory(GiB)": 36.53, + "step": 9410, + "token_acc": 0.8216726326065734, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.43783188348509994, + "grad_norm": 9.153746604919434, + "learning_rate": 6.4215697146933275e-06, + "loss": 0.8023602485656738, + "memory(GiB)": 36.53, + "step": 9415, + "token_acc": 0.8047173083593478, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.4380644017450495, + "grad_norm": 6.360644340515137, + "learning_rate": 6.417883066953381e-06, + "loss": 0.650984811782837, + "memory(GiB)": 36.53, + "step": 9420, + "token_acc": 0.8299910206524993, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.43829692000499915, + "grad_norm": 6.684108734130859, + "learning_rate": 6.414195580775401e-06, + "loss": 0.7176222801208496, + "memory(GiB)": 36.53, + "step": 9425, + "token_acc": 0.8133476088508208, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.4385294382649487, + "grad_norm": 6.66787052154541, + "learning_rate": 6.410507258339911e-06, + "loss": 0.6233195304870606, + "memory(GiB)": 36.53, + "step": 9430, + "token_acc": 0.8431514275388508, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.43876195652489836, + "grad_norm": 8.708562850952148, + "learning_rate": 6.40681810182793e-06, + "loss": 0.6892944812774658, + "memory(GiB)": 36.53, + "step": 9435, + "token_acc": 0.8342415985467757, + "train_speed(iter/s)": 0.09602 + }, + { + "epoch": 0.43899447478484793, + "grad_norm": 8.571969032287598, + "learning_rate": 6.403128113420973e-06, + "loss": 0.6714536190032959, + "memory(GiB)": 36.53, + "step": 9440, + "token_acc": 0.8265379113018598, + "train_speed(iter/s)": 0.096049 + }, + { + "epoch": 0.43922699304479756, + "grad_norm": 5.261947154998779, + "learning_rate": 6.399437295301041e-06, + "loss": 0.7917817115783692, + "memory(GiB)": 36.53, + "step": 9445, + "token_acc": 0.7919876733436055, + "train_speed(iter/s)": 0.096077 + }, + { + "epoch": 0.43945951130474714, + "grad_norm": 7.901933670043945, + "learning_rate": 6.3957456496506275e-06, + "loss": 0.8872608184814453, + "memory(GiB)": 36.53, + "step": 9450, + "token_acc": 0.7660295930949446, + "train_speed(iter/s)": 0.096105 + }, + { + "epoch": 0.43945951130474714, + "eval_loss": 0.6135982871055603, + "eval_runtime": 298.3506, + "eval_samples_per_second": 11.647, + "eval_steps_per_second": 11.647, + "step": 9450 + }, + { + "epoch": 0.4396920295646968, + "grad_norm": 6.660861968994141, + "learning_rate": 6.39205317865272e-06, + "loss": 0.6729079246520996, + "memory(GiB)": 36.53, + "step": 9455, + "token_acc": 0.8219437903348952, + "train_speed(iter/s)": 0.095841 + }, + { + "epoch": 0.43992454782464635, + "grad_norm": 9.15910530090332, + "learning_rate": 6.388359884490789e-06, + "loss": 0.7558378219604492, + "memory(GiB)": 36.53, + "step": 9460, + "token_acc": 0.8095046314941603, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.440157066084596, + "grad_norm": 8.168127059936523, + "learning_rate": 6.3846657693487945e-06, + "loss": 0.6436011314392089, + "memory(GiB)": 36.53, + "step": 9465, + "token_acc": 0.8427491903562433, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.44038958434454556, + "grad_norm": 7.540072917938232, + "learning_rate": 6.3809708354111775e-06, + "loss": 0.6013598442077637, + "memory(GiB)": 36.53, + "step": 9470, + "token_acc": 0.8489921421250427, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.44062210260449514, + "grad_norm": 9.310900688171387, + "learning_rate": 6.3772750848628705e-06, + "loss": 0.7716301441192627, + "memory(GiB)": 36.53, + "step": 9475, + "token_acc": 0.8114617940199336, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.44085462086444477, + "grad_norm": 7.388217449188232, + "learning_rate": 6.373578519889283e-06, + "loss": 0.6690125942230225, + "memory(GiB)": 36.53, + "step": 9480, + "token_acc": 0.8285063455906281, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.44108713912439435, + "grad_norm": 6.658295631408691, + "learning_rate": 6.3698811426763086e-06, + "loss": 0.7387238502502441, + "memory(GiB)": 36.53, + "step": 9485, + "token_acc": 0.8200382897255903, + "train_speed(iter/s)": 0.096011 + }, + { + "epoch": 0.441319657384344, + "grad_norm": 8.474617004394531, + "learning_rate": 6.366182955410319e-06, + "loss": 0.8051560401916504, + "memory(GiB)": 36.53, + "step": 9490, + "token_acc": 0.7984709480122324, + "train_speed(iter/s)": 0.096039 + }, + { + "epoch": 0.44155217564429355, + "grad_norm": 6.576582431793213, + "learning_rate": 6.362483960278167e-06, + "loss": 0.6635289669036866, + "memory(GiB)": 36.53, + "step": 9495, + "token_acc": 0.8355202929508697, + "train_speed(iter/s)": 0.096068 + }, + { + "epoch": 0.4417846939042432, + "grad_norm": 7.216211318969727, + "learning_rate": 6.358784159467186e-06, + "loss": 0.5488409996032715, + "memory(GiB)": 36.53, + "step": 9500, + "token_acc": 0.8590710599444223, + "train_speed(iter/s)": 0.096096 + }, + { + "epoch": 0.4417846939042432, + "eval_loss": 0.6116949915885925, + "eval_runtime": 297.3772, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 11.685, + "step": 9500 + }, + { + "epoch": 0.44201721216419276, + "grad_norm": 6.454580307006836, + "learning_rate": 6.355083555165179e-06, + "loss": 0.6398816585540772, + "memory(GiB)": 36.53, + "step": 9505, + "token_acc": 0.8225794806382518, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.4422497304241424, + "grad_norm": 7.4101057052612305, + "learning_rate": 6.3513821495604286e-06, + "loss": 0.629861307144165, + "memory(GiB)": 36.53, + "step": 9510, + "token_acc": 0.8217360114777619, + "train_speed(iter/s)": 0.095864 + }, + { + "epoch": 0.44248224868409197, + "grad_norm": 7.9802632331848145, + "learning_rate": 6.347679944841689e-06, + "loss": 0.6410726070404053, + "memory(GiB)": 36.53, + "step": 9515, + "token_acc": 0.844579226686884, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.44271476694404155, + "grad_norm": 7.573753356933594, + "learning_rate": 6.34397694319819e-06, + "loss": 0.7023254871368408, + "memory(GiB)": 36.53, + "step": 9520, + "token_acc": 0.8184658104824714, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.4429472852039912, + "grad_norm": 8.315315246582031, + "learning_rate": 6.340273146819631e-06, + "loss": 0.6772781848907471, + "memory(GiB)": 36.53, + "step": 9525, + "token_acc": 0.8359739049394221, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.44317980346394076, + "grad_norm": 6.7924628257751465, + "learning_rate": 6.336568557896178e-06, + "loss": 0.8196972846984864, + "memory(GiB)": 36.53, + "step": 9530, + "token_acc": 0.7956465237166992, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.4434123217238904, + "grad_norm": 8.114082336425781, + "learning_rate": 6.332863178618471e-06, + "loss": 0.7693780422210693, + "memory(GiB)": 36.53, + "step": 9535, + "token_acc": 0.8112975849365535, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.44364483998383997, + "grad_norm": 9.550496101379395, + "learning_rate": 6.329157011177617e-06, + "loss": 0.7559969425201416, + "memory(GiB)": 36.53, + "step": 9540, + "token_acc": 0.8270291568163909, + "train_speed(iter/s)": 0.096033 + }, + { + "epoch": 0.4438773582437896, + "grad_norm": 6.725862979888916, + "learning_rate": 6.325450057765184e-06, + "loss": 0.7191961288452149, + "memory(GiB)": 36.53, + "step": 9545, + "token_acc": 0.8149694828140058, + "train_speed(iter/s)": 0.096061 + }, + { + "epoch": 0.4441098765037392, + "grad_norm": 7.213817596435547, + "learning_rate": 6.321742320573209e-06, + "loss": 0.6995858669281005, + "memory(GiB)": 36.53, + "step": 9550, + "token_acc": 0.83427071616048, + "train_speed(iter/s)": 0.09609 + }, + { + "epoch": 0.4441098765037392, + "eval_loss": 0.6096996068954468, + "eval_runtime": 297.0553, + "eval_samples_per_second": 11.698, + "eval_steps_per_second": 11.698, + "step": 9550 + }, + { + "epoch": 0.4443423947636888, + "grad_norm": 7.757943630218506, + "learning_rate": 6.318033801794193e-06, + "loss": 0.7151656150817871, + "memory(GiB)": 36.53, + "step": 9555, + "token_acc": 0.8218591052905603, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.4445749130236384, + "grad_norm": 5.580169200897217, + "learning_rate": 6.3143245036210965e-06, + "loss": 0.5690258979797364, + "memory(GiB)": 36.53, + "step": 9560, + "token_acc": 0.8539325842696629, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.44480743128358796, + "grad_norm": 8.343439102172852, + "learning_rate": 6.3106144282473425e-06, + "loss": 0.8035446166992187, + "memory(GiB)": 36.53, + "step": 9565, + "token_acc": 0.7925283522348232, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.4450399495435376, + "grad_norm": 7.245533466339111, + "learning_rate": 6.306903577866811e-06, + "loss": 0.7285247325897217, + "memory(GiB)": 36.53, + "step": 9570, + "token_acc": 0.8194254445964432, + "train_speed(iter/s)": 0.095915 + }, + { + "epoch": 0.44527246780348717, + "grad_norm": 7.128092288970947, + "learning_rate": 6.303191954673844e-06, + "loss": 0.7278432846069336, + "memory(GiB)": 36.53, + "step": 9575, + "token_acc": 0.8139937651541392, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.4455049860634368, + "grad_norm": 7.35676383972168, + "learning_rate": 6.29947956086324e-06, + "loss": 0.8199946403503418, + "memory(GiB)": 36.53, + "step": 9580, + "token_acc": 0.8070937386898299, + "train_speed(iter/s)": 0.095972 + }, + { + "epoch": 0.4457375043233864, + "grad_norm": 5.832348346710205, + "learning_rate": 6.295766398630251e-06, + "loss": 0.6157866477966308, + "memory(GiB)": 36.53, + "step": 9585, + "token_acc": 0.83373063170441, + "train_speed(iter/s)": 0.096 + }, + { + "epoch": 0.445970022583336, + "grad_norm": 8.000378608703613, + "learning_rate": 6.292052470170583e-06, + "loss": 0.6848884105682373, + "memory(GiB)": 36.53, + "step": 9590, + "token_acc": 0.8265479219677693, + "train_speed(iter/s)": 0.096029 + }, + { + "epoch": 0.4462025408432856, + "grad_norm": 9.083320617675781, + "learning_rate": 6.2883377776804e-06, + "loss": 0.8012693405151368, + "memory(GiB)": 36.53, + "step": 9595, + "token_acc": 0.7966036279428792, + "train_speed(iter/s)": 0.096056 + }, + { + "epoch": 0.4464350591032352, + "grad_norm": 8.783888816833496, + "learning_rate": 6.284622323356312e-06, + "loss": 0.7423254489898682, + "memory(GiB)": 36.53, + "step": 9600, + "token_acc": 0.8229587712206953, + "train_speed(iter/s)": 0.096083 + }, + { + "epoch": 0.4464350591032352, + "eval_loss": 0.6070340871810913, + "eval_runtime": 293.5483, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 11.838, + "step": 9600 + }, + { + "epoch": 0.4466675773631848, + "grad_norm": 8.892648696899414, + "learning_rate": 6.280906109395382e-06, + "loss": 0.6096343040466309, + "memory(GiB)": 36.53, + "step": 9605, + "token_acc": 0.8230499744049142, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.44690009562313443, + "grad_norm": 6.9500732421875, + "learning_rate": 6.277189137995121e-06, + "loss": 0.6724872589111328, + "memory(GiB)": 36.53, + "step": 9610, + "token_acc": 0.833088018840153, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.447132613883084, + "grad_norm": 6.852596759796143, + "learning_rate": 6.273471411353491e-06, + "loss": 0.6281951427459717, + "memory(GiB)": 36.53, + "step": 9615, + "token_acc": 0.8316430020283976, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.4473651321430336, + "grad_norm": 7.1367950439453125, + "learning_rate": 6.269752931668899e-06, + "loss": 0.7241427898406982, + "memory(GiB)": 36.53, + "step": 9620, + "token_acc": 0.8250350631136045, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.4475976504029832, + "grad_norm": 6.101066589355469, + "learning_rate": 6.266033701140193e-06, + "loss": 0.6471002101898193, + "memory(GiB)": 36.53, + "step": 9625, + "token_acc": 0.8366453351296733, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.4478301686629328, + "grad_norm": 7.338571548461914, + "learning_rate": 6.262313721966673e-06, + "loss": 0.7227649211883544, + "memory(GiB)": 36.53, + "step": 9630, + "token_acc": 0.8344453711426189, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.4480626869228824, + "grad_norm": 7.873856067657471, + "learning_rate": 6.2585929963480764e-06, + "loss": 0.7752127647399902, + "memory(GiB)": 36.53, + "step": 9635, + "token_acc": 0.8141263940520446, + "train_speed(iter/s)": 0.095998 + }, + { + "epoch": 0.448295205182832, + "grad_norm": 7.643828392028809, + "learning_rate": 6.254871526484583e-06, + "loss": 0.809475040435791, + "memory(GiB)": 36.53, + "step": 9640, + "token_acc": 0.8117283950617284, + "train_speed(iter/s)": 0.096026 + }, + { + "epoch": 0.44852772344278163, + "grad_norm": 6.188126087188721, + "learning_rate": 6.251149314576812e-06, + "loss": 0.7115334510803223, + "memory(GiB)": 36.53, + "step": 9645, + "token_acc": 0.8103073579633654, + "train_speed(iter/s)": 0.096054 + }, + { + "epoch": 0.4487602417027312, + "grad_norm": 6.334682464599609, + "learning_rate": 6.247426362825823e-06, + "loss": 0.5974188804626465, + "memory(GiB)": 36.53, + "step": 9650, + "token_acc": 0.8622777147181233, + "train_speed(iter/s)": 0.096083 + }, + { + "epoch": 0.4487602417027312, + "eval_loss": 0.6079947352409363, + "eval_runtime": 293.8957, + "eval_samples_per_second": 11.824, + "eval_steps_per_second": 11.824, + "step": 9650 + }, + { + "epoch": 0.44899275996268084, + "grad_norm": 6.791869163513184, + "learning_rate": 6.243702673433111e-06, + "loss": 0.681541919708252, + "memory(GiB)": 36.53, + "step": 9655, + "token_acc": 0.8224100972613259, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.4492252782226304, + "grad_norm": 6.402570724487305, + "learning_rate": 6.23997824860061e-06, + "loss": 0.527205514907837, + "memory(GiB)": 36.53, + "step": 9660, + "token_acc": 0.876510067114094, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.44945779648258, + "grad_norm": 7.076948165893555, + "learning_rate": 6.236253090530689e-06, + "loss": 0.7284140110015869, + "memory(GiB)": 36.53, + "step": 9665, + "token_acc": 0.8071654373024236, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.44969031474252963, + "grad_norm": 8.33540153503418, + "learning_rate": 6.232527201426145e-06, + "loss": 0.6018318176269531, + "memory(GiB)": 36.53, + "step": 9670, + "token_acc": 0.8495754144763445, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.4499228330024792, + "grad_norm": 5.779272079467773, + "learning_rate": 6.228800583490213e-06, + "loss": 0.6930430412292481, + "memory(GiB)": 36.53, + "step": 9675, + "token_acc": 0.8281002220107834, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.45015535126242884, + "grad_norm": 6.199838161468506, + "learning_rate": 6.225073238926558e-06, + "loss": 0.6994569301605225, + "memory(GiB)": 36.53, + "step": 9680, + "token_acc": 0.8282828282828283, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.4503878695223784, + "grad_norm": 7.950428009033203, + "learning_rate": 6.221345169939274e-06, + "loss": 0.705945348739624, + "memory(GiB)": 36.53, + "step": 9685, + "token_acc": 0.8452830188679246, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.45062038778232805, + "grad_norm": 6.8860249519348145, + "learning_rate": 6.217616378732883e-06, + "loss": 0.6749707221984863, + "memory(GiB)": 36.53, + "step": 9690, + "token_acc": 0.8228431904503527, + "train_speed(iter/s)": 0.096023 + }, + { + "epoch": 0.4508529060422776, + "grad_norm": 7.688700199127197, + "learning_rate": 6.213886867512332e-06, + "loss": 0.6652534484863282, + "memory(GiB)": 36.53, + "step": 9695, + "token_acc": 0.8338132455779514, + "train_speed(iter/s)": 0.096051 + }, + { + "epoch": 0.45108542430222726, + "grad_norm": 8.237911224365234, + "learning_rate": 6.210156638483e-06, + "loss": 0.5874074459075928, + "memory(GiB)": 36.53, + "step": 9700, + "token_acc": 0.8490930142802007, + "train_speed(iter/s)": 0.096079 + }, + { + "epoch": 0.45108542430222726, + "eval_loss": 0.608859121799469, + "eval_runtime": 293.3921, + "eval_samples_per_second": 11.844, + "eval_steps_per_second": 11.844, + "step": 9700 + }, + { + "epoch": 0.45131794256217683, + "grad_norm": 7.017812252044678, + "learning_rate": 6.206425693850684e-06, + "loss": 0.5841714859008789, + "memory(GiB)": 36.53, + "step": 9705, + "token_acc": 0.8230711454870877, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.45155046082212646, + "grad_norm": 7.0842719078063965, + "learning_rate": 6.202694035821607e-06, + "loss": 0.7316049575805664, + "memory(GiB)": 36.53, + "step": 9710, + "token_acc": 0.8187274909963985, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.45178297908207604, + "grad_norm": 7.328638076782227, + "learning_rate": 6.198961666602416e-06, + "loss": 0.7782143592834473, + "memory(GiB)": 36.53, + "step": 9715, + "token_acc": 0.8079444658696491, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.4520154973420256, + "grad_norm": 9.605910301208496, + "learning_rate": 6.195228588400173e-06, + "loss": 0.6689083099365234, + "memory(GiB)": 36.53, + "step": 9720, + "token_acc": 0.8285105086810843, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.45224801560197525, + "grad_norm": 9.341135025024414, + "learning_rate": 6.191494803422364e-06, + "loss": 0.6024797916412353, + "memory(GiB)": 36.53, + "step": 9725, + "token_acc": 0.8464150943396226, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.4524805338619248, + "grad_norm": 9.248586654663086, + "learning_rate": 6.187760313876891e-06, + "loss": 0.7038826942443848, + "memory(GiB)": 36.53, + "step": 9730, + "token_acc": 0.8426349496797805, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.45271305212187446, + "grad_norm": 7.3656487464904785, + "learning_rate": 6.184025121972072e-06, + "loss": 0.7399398803710937, + "memory(GiB)": 36.53, + "step": 9735, + "token_acc": 0.816057293629853, + "train_speed(iter/s)": 0.095993 + }, + { + "epoch": 0.45294557038182404, + "grad_norm": 7.461027145385742, + "learning_rate": 6.180289229916645e-06, + "loss": 0.6084781169891358, + "memory(GiB)": 36.53, + "step": 9740, + "token_acc": 0.8564412542500944, + "train_speed(iter/s)": 0.096021 + }, + { + "epoch": 0.45317808864177367, + "grad_norm": 6.418759822845459, + "learning_rate": 6.176552639919754e-06, + "loss": 0.7723904609680176, + "memory(GiB)": 36.53, + "step": 9745, + "token_acc": 0.8285714285714286, + "train_speed(iter/s)": 0.096048 + }, + { + "epoch": 0.45341060690172325, + "grad_norm": 7.4962921142578125, + "learning_rate": 6.172815354190961e-06, + "loss": 0.5903301239013672, + "memory(GiB)": 36.53, + "step": 9750, + "token_acc": 0.8578680203045685, + "train_speed(iter/s)": 0.096076 + }, + { + "epoch": 0.45341060690172325, + "eval_loss": 0.6094077825546265, + "eval_runtime": 294.2374, + "eval_samples_per_second": 11.81, + "eval_steps_per_second": 11.81, + "step": 9750 + }, + { + "epoch": 0.4536431251616729, + "grad_norm": 8.316295623779297, + "learning_rate": 6.169077374940239e-06, + "loss": 0.704967737197876, + "memory(GiB)": 36.53, + "step": 9755, + "token_acc": 0.8228508585327479, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.45387564342162245, + "grad_norm": 7.59016227722168, + "learning_rate": 6.165338704377971e-06, + "loss": 0.709078311920166, + "memory(GiB)": 36.53, + "step": 9760, + "token_acc": 0.8235915492957746, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.45410816168157203, + "grad_norm": 5.767019748687744, + "learning_rate": 6.161599344714948e-06, + "loss": 0.6274002075195313, + "memory(GiB)": 36.53, + "step": 9765, + "token_acc": 0.8305860805860806, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.45434067994152166, + "grad_norm": 7.16868782043457, + "learning_rate": 6.15785929816237e-06, + "loss": 0.7391060829162598, + "memory(GiB)": 36.53, + "step": 9770, + "token_acc": 0.8283212790255043, + "train_speed(iter/s)": 0.095906 + }, + { + "epoch": 0.45457319820147124, + "grad_norm": 6.7341694831848145, + "learning_rate": 6.154118566931838e-06, + "loss": 0.6822587490081787, + "memory(GiB)": 36.53, + "step": 9775, + "token_acc": 0.8355828220858895, + "train_speed(iter/s)": 0.095934 + }, + { + "epoch": 0.4548057164614209, + "grad_norm": 9.300615310668945, + "learning_rate": 6.150377153235368e-06, + "loss": 0.626803970336914, + "memory(GiB)": 36.53, + "step": 9780, + "token_acc": 0.8419951168468782, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.45503823472137045, + "grad_norm": 5.771785736083984, + "learning_rate": 6.146635059285367e-06, + "loss": 0.6426148414611816, + "memory(GiB)": 36.53, + "step": 9785, + "token_acc": 0.8419601837672281, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.4552707529813201, + "grad_norm": 8.090389251708984, + "learning_rate": 6.142892287294656e-06, + "loss": 0.7309681892395019, + "memory(GiB)": 36.53, + "step": 9790, + "token_acc": 0.8210048848569435, + "train_speed(iter/s)": 0.096017 + }, + { + "epoch": 0.45550327124126966, + "grad_norm": 7.393229007720947, + "learning_rate": 6.139148839476448e-06, + "loss": 0.7353736400604248, + "memory(GiB)": 36.53, + "step": 9795, + "token_acc": 0.8155509065550907, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.4557357895012193, + "grad_norm": 6.2574920654296875, + "learning_rate": 6.135404718044361e-06, + "loss": 0.6523962497711182, + "memory(GiB)": 36.53, + "step": 9800, + "token_acc": 0.8372361954322058, + "train_speed(iter/s)": 0.096073 + }, + { + "epoch": 0.4557357895012193, + "eval_loss": 0.6095502376556396, + "eval_runtime": 292.7828, + "eval_samples_per_second": 11.869, + "eval_steps_per_second": 11.869, + "step": 9800 + }, + { + "epoch": 0.45596830776116887, + "grad_norm": 6.883238315582275, + "learning_rate": 6.13165992521241e-06, + "loss": 0.6746381759643555, + "memory(GiB)": 36.53, + "step": 9805, + "token_acc": 0.8226793375837043, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.45620082602111844, + "grad_norm": 7.616454124450684, + "learning_rate": 6.127914463195006e-06, + "loss": 0.7246517658233642, + "memory(GiB)": 36.53, + "step": 9810, + "token_acc": 0.8232161874334398, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.4564333442810681, + "grad_norm": 7.244081974029541, + "learning_rate": 6.124168334206955e-06, + "loss": 0.7134342193603516, + "memory(GiB)": 36.53, + "step": 9815, + "token_acc": 0.830820770519263, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.45666586254101765, + "grad_norm": 7.020895957946777, + "learning_rate": 6.1204215404634605e-06, + "loss": 0.6642593383789063, + "memory(GiB)": 36.53, + "step": 9820, + "token_acc": 0.8396159317211949, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.4568983808009673, + "grad_norm": 8.283989906311035, + "learning_rate": 6.116674084180116e-06, + "loss": 0.7088629245758057, + "memory(GiB)": 36.53, + "step": 9825, + "token_acc": 0.83529890199268, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.45713089906091686, + "grad_norm": 7.5810546875, + "learning_rate": 6.112925967572911e-06, + "loss": 0.6322180271148682, + "memory(GiB)": 36.53, + "step": 9830, + "token_acc": 0.8513738551207327, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.4573634173208665, + "grad_norm": 7.434423446655273, + "learning_rate": 6.109177192858218e-06, + "loss": 0.7602972984313965, + "memory(GiB)": 36.53, + "step": 9835, + "token_acc": 0.8070408502158751, + "train_speed(iter/s)": 0.095991 + }, + { + "epoch": 0.45759593558081607, + "grad_norm": 7.4128217697143555, + "learning_rate": 6.105427762252807e-06, + "loss": 0.6599167823791504, + "memory(GiB)": 36.53, + "step": 9840, + "token_acc": 0.8335649756775538, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.4578284538407657, + "grad_norm": 5.756119728088379, + "learning_rate": 6.101677677973831e-06, + "loss": 0.6275468349456788, + "memory(GiB)": 36.53, + "step": 9845, + "token_acc": 0.8496710526315789, + "train_speed(iter/s)": 0.096047 + }, + { + "epoch": 0.4580609721007153, + "grad_norm": 7.097313404083252, + "learning_rate": 6.09792694223883e-06, + "loss": 0.6755487442016601, + "memory(GiB)": 36.53, + "step": 9850, + "token_acc": 0.8331877729257642, + "train_speed(iter/s)": 0.096075 + }, + { + "epoch": 0.4580609721007153, + "eval_loss": 0.6061334013938904, + "eval_runtime": 293.067, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 11.857, + "step": 9850 + }, + { + "epoch": 0.4582934903606649, + "grad_norm": 6.993188381195068, + "learning_rate": 6.094175557265729e-06, + "loss": 0.6865869998931885, + "memory(GiB)": 36.53, + "step": 9855, + "token_acc": 0.8228351826445551, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.4585260086206145, + "grad_norm": 5.587054252624512, + "learning_rate": 6.09042352527284e-06, + "loss": 0.605366563796997, + "memory(GiB)": 36.53, + "step": 9860, + "token_acc": 0.8427457098283931, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.45875852688056407, + "grad_norm": 7.5531721115112305, + "learning_rate": 6.086670848478855e-06, + "loss": 0.7458683967590332, + "memory(GiB)": 36.53, + "step": 9865, + "token_acc": 0.8089076136021667, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.4589910451405137, + "grad_norm": 5.862915515899658, + "learning_rate": 6.082917529102846e-06, + "loss": 0.7659440040588379, + "memory(GiB)": 36.53, + "step": 9870, + "token_acc": 0.8184658104824714, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.4592235634004633, + "grad_norm": 8.969634056091309, + "learning_rate": 6.079163569364268e-06, + "loss": 0.6593055725097656, + "memory(GiB)": 36.53, + "step": 9875, + "token_acc": 0.831306990881459, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.4594560816604129, + "grad_norm": 7.298051357269287, + "learning_rate": 6.0754089714829535e-06, + "loss": 0.6426519393920899, + "memory(GiB)": 36.53, + "step": 9880, + "token_acc": 0.8413323782234957, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.4596885999203625, + "grad_norm": 5.749274253845215, + "learning_rate": 6.07165373767911e-06, + "loss": 0.7200953960418701, + "memory(GiB)": 36.53, + "step": 9885, + "token_acc": 0.8264546684709067, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.4599211181803121, + "grad_norm": 8.445324897766113, + "learning_rate": 6.067897870173325e-06, + "loss": 0.686259412765503, + "memory(GiB)": 36.53, + "step": 9890, + "token_acc": 0.8217406501223349, + "train_speed(iter/s)": 0.096016 + }, + { + "epoch": 0.4601536364402617, + "grad_norm": 6.035857677459717, + "learning_rate": 6.0641413711865585e-06, + "loss": 0.5487040042877197, + "memory(GiB)": 36.53, + "step": 9895, + "token_acc": 0.8626220362622036, + "train_speed(iter/s)": 0.096043 + }, + { + "epoch": 0.4603861547002113, + "grad_norm": 6.481517791748047, + "learning_rate": 6.060384242940146e-06, + "loss": 0.7927393436431884, + "memory(GiB)": 36.53, + "step": 9900, + "token_acc": 0.7991266375545851, + "train_speed(iter/s)": 0.09607 + }, + { + "epoch": 0.4603861547002113, + "eval_loss": 0.6078546643257141, + "eval_runtime": 294.4453, + "eval_samples_per_second": 11.802, + "eval_steps_per_second": 11.802, + "step": 9900 + }, + { + "epoch": 0.4606186729601609, + "grad_norm": 6.822314739227295, + "learning_rate": 6.056626487655791e-06, + "loss": 0.6223538398742676, + "memory(GiB)": 36.53, + "step": 9905, + "token_acc": 0.8235760952342779, + "train_speed(iter/s)": 0.095823 + }, + { + "epoch": 0.4608511912201105, + "grad_norm": 8.354050636291504, + "learning_rate": 6.052868107555572e-06, + "loss": 0.7600676059722901, + "memory(GiB)": 36.53, + "step": 9910, + "token_acc": 0.8175775480059084, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.4610837094800601, + "grad_norm": 8.266427040100098, + "learning_rate": 6.0491091048619325e-06, + "loss": 0.7591500282287598, + "memory(GiB)": 36.53, + "step": 9915, + "token_acc": 0.824430823117338, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.4613162277400097, + "grad_norm": 6.544285774230957, + "learning_rate": 6.04534948179769e-06, + "loss": 0.7405786991119385, + "memory(GiB)": 36.53, + "step": 9920, + "token_acc": 0.8334106728538283, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.4615487459999593, + "grad_norm": 6.957915306091309, + "learning_rate": 6.041589240586025e-06, + "loss": 0.6144753932952881, + "memory(GiB)": 36.53, + "step": 9925, + "token_acc": 0.8433460076045627, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.4617812642599089, + "grad_norm": 7.508825778961182, + "learning_rate": 6.037828383450481e-06, + "loss": 0.597527551651001, + "memory(GiB)": 36.53, + "step": 9930, + "token_acc": 0.8426294820717132, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.46201378251985853, + "grad_norm": 7.72009801864624, + "learning_rate": 6.034066912614973e-06, + "loss": 0.6360546112060547, + "memory(GiB)": 36.53, + "step": 9935, + "token_acc": 0.8471028037383178, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.4622463007798081, + "grad_norm": 8.508076667785645, + "learning_rate": 6.030304830303774e-06, + "loss": 0.6973794460296631, + "memory(GiB)": 36.53, + "step": 9940, + "token_acc": 0.8105059619722849, + "train_speed(iter/s)": 0.096013 + }, + { + "epoch": 0.46247881903975774, + "grad_norm": 8.080486297607422, + "learning_rate": 6.026542138741518e-06, + "loss": 0.7362306594848633, + "memory(GiB)": 36.53, + "step": 9945, + "token_acc": 0.8207620528771384, + "train_speed(iter/s)": 0.09604 + }, + { + "epoch": 0.4627113372997073, + "grad_norm": 6.59077262878418, + "learning_rate": 6.0227788401532025e-06, + "loss": 0.6460929870605469, + "memory(GiB)": 36.53, + "step": 9950, + "token_acc": 0.8428842504743833, + "train_speed(iter/s)": 0.096068 + }, + { + "epoch": 0.4627113372997073, + "eval_loss": 0.6045697331428528, + "eval_runtime": 292.8994, + "eval_samples_per_second": 11.864, + "eval_steps_per_second": 11.864, + "step": 9950 + }, + { + "epoch": 0.4629438555596569, + "grad_norm": 9.889881134033203, + "learning_rate": 6.019014936764179e-06, + "loss": 0.6179000854492187, + "memory(GiB)": 36.53, + "step": 9955, + "token_acc": 0.8231968779610388, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.4631763738196065, + "grad_norm": 7.871378421783447, + "learning_rate": 6.015250430800164e-06, + "loss": 0.7012851715087891, + "memory(GiB)": 36.53, + "step": 9960, + "token_acc": 0.8371753720455208, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.4634088920795561, + "grad_norm": 6.383567810058594, + "learning_rate": 6.011485324487224e-06, + "loss": 0.5643723487854004, + "memory(GiB)": 36.53, + "step": 9965, + "token_acc": 0.8545157335512873, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.46364141033950573, + "grad_norm": 8.291006088256836, + "learning_rate": 6.007719620051781e-06, + "loss": 0.6677682399749756, + "memory(GiB)": 36.53, + "step": 9970, + "token_acc": 0.8361702127659575, + "train_speed(iter/s)": 0.095906 + }, + { + "epoch": 0.4638739285994553, + "grad_norm": 8.129223823547363, + "learning_rate": 6.003953319720614e-06, + "loss": 0.6844709396362305, + "memory(GiB)": 36.53, + "step": 9975, + "token_acc": 0.8373146622734761, + "train_speed(iter/s)": 0.095934 + }, + { + "epoch": 0.46410644685940494, + "grad_norm": 5.145586967468262, + "learning_rate": 6.000186425720854e-06, + "loss": 0.7600801944732666, + "memory(GiB)": 36.53, + "step": 9980, + "token_acc": 0.8042265923099501, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.4643389651193545, + "grad_norm": 9.662071228027344, + "learning_rate": 5.99641894027998e-06, + "loss": 0.835693359375, + "memory(GiB)": 36.53, + "step": 9985, + "token_acc": 0.7827635327635327, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.46457148337930415, + "grad_norm": 10.052090644836426, + "learning_rate": 5.992650865625823e-06, + "loss": 0.6995959281921387, + "memory(GiB)": 36.53, + "step": 9990, + "token_acc": 0.8277608915906788, + "train_speed(iter/s)": 0.096015 + }, + { + "epoch": 0.4648040016392537, + "grad_norm": 6.37227725982666, + "learning_rate": 5.98888220398656e-06, + "loss": 0.7162473678588868, + "memory(GiB)": 36.53, + "step": 9995, + "token_acc": 0.8284432171211599, + "train_speed(iter/s)": 0.096043 + }, + { + "epoch": 0.46503651989920336, + "grad_norm": 6.917495250701904, + "learning_rate": 5.985112957590721e-06, + "loss": 0.6338780879974365, + "memory(GiB)": 36.53, + "step": 10000, + "token_acc": 0.8304964539007093, + "train_speed(iter/s)": 0.09607 + }, + { + "epoch": 0.46503651989920336, + "eval_loss": 0.6057087182998657, + "eval_runtime": 291.3624, + "eval_samples_per_second": 11.927, + "eval_steps_per_second": 11.927, + "step": 10000 + }, + { + "epoch": 0.46526903815915294, + "grad_norm": 7.6043171882629395, + "learning_rate": 5.981343128667174e-06, + "loss": 0.7458448886871338, + "memory(GiB)": 36.53, + "step": 10005, + "token_acc": 0.8233181843685126, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.4655015564191025, + "grad_norm": 7.113598823547363, + "learning_rate": 5.977572719445137e-06, + "loss": 0.7033159255981445, + "memory(GiB)": 36.53, + "step": 10010, + "token_acc": 0.8245614035087719, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.46573407467905215, + "grad_norm": 9.22055721282959, + "learning_rate": 5.973801732154168e-06, + "loss": 0.5509349822998046, + "memory(GiB)": 36.53, + "step": 10015, + "token_acc": 0.865392965696917, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.4659665929390017, + "grad_norm": 5.772355079650879, + "learning_rate": 5.97003016902417e-06, + "loss": 0.7141910552978515, + "memory(GiB)": 36.53, + "step": 10020, + "token_acc": 0.828882833787466, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.46619911119895135, + "grad_norm": 7.265198230743408, + "learning_rate": 5.9662580322853825e-06, + "loss": 0.6663059711456298, + "memory(GiB)": 36.53, + "step": 10025, + "token_acc": 0.8441601049868767, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.46643162945890093, + "grad_norm": 7.731073379516602, + "learning_rate": 5.96248532416839e-06, + "loss": 0.69374098777771, + "memory(GiB)": 36.53, + "step": 10030, + "token_acc": 0.8324854651162791, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.46666414771885056, + "grad_norm": 6.640981197357178, + "learning_rate": 5.958712046904107e-06, + "loss": 0.7169713020324707, + "memory(GiB)": 36.53, + "step": 10035, + "token_acc": 0.8095913734392736, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.46689666597880014, + "grad_norm": 6.7847819328308105, + "learning_rate": 5.954938202723794e-06, + "loss": 0.7144430160522461, + "memory(GiB)": 36.53, + "step": 10040, + "token_acc": 0.8301005747126436, + "train_speed(iter/s)": 0.096013 + }, + { + "epoch": 0.4671291842387498, + "grad_norm": 7.647963047027588, + "learning_rate": 5.951163793859036e-06, + "loss": 0.6398671627044678, + "memory(GiB)": 36.53, + "step": 10045, + "token_acc": 0.8311781609195402, + "train_speed(iter/s)": 0.09604 + }, + { + "epoch": 0.46736170249869935, + "grad_norm": 4.547146797180176, + "learning_rate": 5.947388822541762e-06, + "loss": 0.7039501190185546, + "memory(GiB)": 36.53, + "step": 10050, + "token_acc": 0.8191751850546352, + "train_speed(iter/s)": 0.096067 + }, + { + "epoch": 0.46736170249869935, + "eval_loss": 0.6045349836349487, + "eval_runtime": 291.4717, + "eval_samples_per_second": 11.922, + "eval_steps_per_second": 11.922, + "step": 10050 + }, + { + "epoch": 0.4675942207586489, + "grad_norm": 9.998385429382324, + "learning_rate": 5.943613291004224e-06, + "loss": 0.692030668258667, + "memory(GiB)": 36.53, + "step": 10055, + "token_acc": 0.8235185852038571, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.46782673901859856, + "grad_norm": 5.997773170471191, + "learning_rate": 5.9398372014790175e-06, + "loss": 0.8196736335754394, + "memory(GiB)": 36.53, + "step": 10060, + "token_acc": 0.7867219917012448, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.46805925727854814, + "grad_norm": 7.602710723876953, + "learning_rate": 5.936060556199055e-06, + "loss": 0.6392131805419922, + "memory(GiB)": 36.53, + "step": 10065, + "token_acc": 0.8540405838376647, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.46829177553849777, + "grad_norm": 5.817196369171143, + "learning_rate": 5.932283357397586e-06, + "loss": 0.6863467693328857, + "memory(GiB)": 36.53, + "step": 10070, + "token_acc": 0.8298807281858129, + "train_speed(iter/s)": 0.095906 + }, + { + "epoch": 0.46852429379844734, + "grad_norm": 6.113554000854492, + "learning_rate": 5.928505607308182e-06, + "loss": 0.6544069766998291, + "memory(GiB)": 36.53, + "step": 10075, + "token_acc": 0.8291605301914581, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.468756812058397, + "grad_norm": 7.657655239105225, + "learning_rate": 5.924727308164747e-06, + "loss": 0.7510591506958008, + "memory(GiB)": 36.53, + "step": 10080, + "token_acc": 0.8130738156445098, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.46898933031834655, + "grad_norm": 5.734485626220703, + "learning_rate": 5.920948462201503e-06, + "loss": 0.7152635097503662, + "memory(GiB)": 36.53, + "step": 10085, + "token_acc": 0.8170918367346939, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.4692218485782962, + "grad_norm": 7.42374324798584, + "learning_rate": 5.917169071653001e-06, + "loss": 0.7044946193695069, + "memory(GiB)": 36.53, + "step": 10090, + "token_acc": 0.8162097017173847, + "train_speed(iter/s)": 0.09601 + }, + { + "epoch": 0.46945436683824576, + "grad_norm": 6.3649373054504395, + "learning_rate": 5.913389138754109e-06, + "loss": 0.65935378074646, + "memory(GiB)": 36.53, + "step": 10095, + "token_acc": 0.8460063897763578, + "train_speed(iter/s)": 0.096037 + }, + { + "epoch": 0.4696868850981954, + "grad_norm": 6.768080234527588, + "learning_rate": 5.90960866574002e-06, + "loss": 0.6156496047973633, + "memory(GiB)": 36.53, + "step": 10100, + "token_acc": 0.8520084566596194, + "train_speed(iter/s)": 0.096063 + }, + { + "epoch": 0.4696868850981954, + "eval_loss": 0.6128177046775818, + "eval_runtime": 291.3128, + "eval_samples_per_second": 11.929, + "eval_steps_per_second": 11.929, + "step": 10100 + }, + { + "epoch": 0.46991940335814497, + "grad_norm": 5.990058422088623, + "learning_rate": 5.9058276548462435e-06, + "loss": 0.7458582878112793, + "memory(GiB)": 36.53, + "step": 10105, + "token_acc": 0.8227818872371427, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.47015192161809455, + "grad_norm": 9.34148120880127, + "learning_rate": 5.902046108308607e-06, + "loss": 0.7696472644805908, + "memory(GiB)": 36.53, + "step": 10110, + "token_acc": 0.8111810440577564, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.4703844398780442, + "grad_norm": 6.020503520965576, + "learning_rate": 5.8982640283632555e-06, + "loss": 0.7021479606628418, + "memory(GiB)": 36.53, + "step": 10115, + "token_acc": 0.8248538011695906, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.47061695813799376, + "grad_norm": 6.6600775718688965, + "learning_rate": 5.894481417246652e-06, + "loss": 0.6542953491210938, + "memory(GiB)": 36.53, + "step": 10120, + "token_acc": 0.8318471337579618, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.4708494763979434, + "grad_norm": 7.627831935882568, + "learning_rate": 5.890698277195569e-06, + "loss": 0.7104983329772949, + "memory(GiB)": 36.53, + "step": 10125, + "token_acc": 0.8335798816568047, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.47108199465789297, + "grad_norm": 6.605009078979492, + "learning_rate": 5.886914610447097e-06, + "loss": 0.6252718925476074, + "memory(GiB)": 36.53, + "step": 10130, + "token_acc": 0.842741935483871, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.4713145129178426, + "grad_norm": 7.564285755157471, + "learning_rate": 5.8831304192386295e-06, + "loss": 0.6758628368377686, + "memory(GiB)": 36.53, + "step": 10135, + "token_acc": 0.8410499453153482, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.4715470311777922, + "grad_norm": 6.068528175354004, + "learning_rate": 5.87934570580788e-06, + "loss": 0.6484007835388184, + "memory(GiB)": 36.53, + "step": 10140, + "token_acc": 0.8354826103946854, + "train_speed(iter/s)": 0.096009 + }, + { + "epoch": 0.4717795494377418, + "grad_norm": 5.984204292297363, + "learning_rate": 5.875560472392867e-06, + "loss": 0.6442727088928223, + "memory(GiB)": 36.53, + "step": 10145, + "token_acc": 0.8396540252827678, + "train_speed(iter/s)": 0.096035 + }, + { + "epoch": 0.4720120676976914, + "grad_norm": 8.497846603393555, + "learning_rate": 5.871774721231913e-06, + "loss": 0.6294188499450684, + "memory(GiB)": 36.53, + "step": 10150, + "token_acc": 0.8486238532110092, + "train_speed(iter/s)": 0.096062 + }, + { + "epoch": 0.4720120676976914, + "eval_loss": 0.6052373051643372, + "eval_runtime": 292.5147, + "eval_samples_per_second": 11.88, + "eval_steps_per_second": 11.88, + "step": 10150 + }, + { + "epoch": 0.47224458595764096, + "grad_norm": 8.094533920288086, + "learning_rate": 5.8679884545636515e-06, + "loss": 0.6738255023956299, + "memory(GiB)": 36.53, + "step": 10155, + "token_acc": 0.8238933230759368, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.4724771042175906, + "grad_norm": 8.374753952026367, + "learning_rate": 5.864201674627017e-06, + "loss": 0.7145820617675781, + "memory(GiB)": 36.53, + "step": 10160, + "token_acc": 0.8257487359004279, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.47270962247754017, + "grad_norm": 6.898515701293945, + "learning_rate": 5.8604143836612515e-06, + "loss": 0.6502760887145996, + "memory(GiB)": 36.53, + "step": 10165, + "token_acc": 0.8404459823144944, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.4729421407374898, + "grad_norm": 7.775455951690674, + "learning_rate": 5.856626583905895e-06, + "loss": 0.6345690250396728, + "memory(GiB)": 36.53, + "step": 10170, + "token_acc": 0.8561292865589278, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.4731746589974394, + "grad_norm": 7.383172035217285, + "learning_rate": 5.8528382776007945e-06, + "loss": 0.6680724143981933, + "memory(GiB)": 36.53, + "step": 10175, + "token_acc": 0.830135039090263, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.473407177257389, + "grad_norm": 7.9133620262146, + "learning_rate": 5.849049466986087e-06, + "loss": 0.7379461288452148, + "memory(GiB)": 36.53, + "step": 10180, + "token_acc": 0.8111346018322763, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.4736396955173386, + "grad_norm": 10.533130645751953, + "learning_rate": 5.845260154302216e-06, + "loss": 0.7678250789642334, + "memory(GiB)": 36.53, + "step": 10185, + "token_acc": 0.8148893360160966, + "train_speed(iter/s)": 0.09598 + }, + { + "epoch": 0.4738722137772882, + "grad_norm": 8.054941177368164, + "learning_rate": 5.84147034178992e-06, + "loss": 0.6807512760162353, + "memory(GiB)": 36.53, + "step": 10190, + "token_acc": 0.8333333333333334, + "train_speed(iter/s)": 0.096006 + }, + { + "epoch": 0.4741047320372378, + "grad_norm": 6.960943698883057, + "learning_rate": 5.83768003169023e-06, + "loss": 0.6838326454162598, + "memory(GiB)": 36.53, + "step": 10195, + "token_acc": 0.8228460793804453, + "train_speed(iter/s)": 0.096032 + }, + { + "epoch": 0.4743372502971874, + "grad_norm": 8.947465896606445, + "learning_rate": 5.833889226244474e-06, + "loss": 0.7164243698120117, + "memory(GiB)": 36.53, + "step": 10200, + "token_acc": 0.8165033911077618, + "train_speed(iter/s)": 0.096059 + }, + { + "epoch": 0.4743372502971874, + "eval_loss": 0.6035026907920837, + "eval_runtime": 291.2419, + "eval_samples_per_second": 11.932, + "eval_steps_per_second": 11.932, + "step": 10200 + }, + { + "epoch": 0.474569768557137, + "grad_norm": 6.6514081954956055, + "learning_rate": 5.830097927694274e-06, + "loss": 0.682344388961792, + "memory(GiB)": 36.53, + "step": 10205, + "token_acc": 0.8235237622652987, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.4748022868170866, + "grad_norm": 8.357755661010742, + "learning_rate": 5.82630613828154e-06, + "loss": 0.6868386745452881, + "memory(GiB)": 36.53, + "step": 10210, + "token_acc": 0.8239069394304052, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.4750348050770362, + "grad_norm": 5.058123588562012, + "learning_rate": 5.822513860248473e-06, + "loss": 0.6764467716217041, + "memory(GiB)": 36.53, + "step": 10215, + "token_acc": 0.8304372197309418, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.4752673233369858, + "grad_norm": 6.218808174133301, + "learning_rate": 5.818721095837568e-06, + "loss": 0.6577483654022217, + "memory(GiB)": 36.53, + "step": 10220, + "token_acc": 0.8478093774019985, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.4754998415969354, + "grad_norm": 5.844109535217285, + "learning_rate": 5.814927847291601e-06, + "loss": 0.8146178245544433, + "memory(GiB)": 36.53, + "step": 10225, + "token_acc": 0.8012820512820513, + "train_speed(iter/s)": 0.095928 + }, + { + "epoch": 0.475732359856885, + "grad_norm": 6.904383182525635, + "learning_rate": 5.811134116853639e-06, + "loss": 0.6491562366485596, + "memory(GiB)": 36.53, + "step": 10230, + "token_acc": 0.8438868976503385, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.47596487811683463, + "grad_norm": 8.637255668640137, + "learning_rate": 5.8073399067670264e-06, + "loss": 0.7037137031555176, + "memory(GiB)": 36.53, + "step": 10235, + "token_acc": 0.8274404304381245, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.4761973963767842, + "grad_norm": 6.772544860839844, + "learning_rate": 5.803545219275404e-06, + "loss": 0.6280532360076905, + "memory(GiB)": 36.53, + "step": 10240, + "token_acc": 0.8492176386913229, + "train_speed(iter/s)": 0.096007 + }, + { + "epoch": 0.47642991463673384, + "grad_norm": 9.061598777770996, + "learning_rate": 5.799750056622684e-06, + "loss": 0.667292594909668, + "memory(GiB)": 36.53, + "step": 10245, + "token_acc": 0.8214404248257551, + "train_speed(iter/s)": 0.096033 + }, + { + "epoch": 0.4766624328966834, + "grad_norm": 7.211753845214844, + "learning_rate": 5.795954421053064e-06, + "loss": 0.7735927104949951, + "memory(GiB)": 36.53, + "step": 10250, + "token_acc": 0.8092586146884789, + "train_speed(iter/s)": 0.096059 + }, + { + "epoch": 0.4766624328966834, + "eval_loss": 0.6029525995254517, + "eval_runtime": 293.6988, + "eval_samples_per_second": 11.832, + "eval_steps_per_second": 11.832, + "step": 10250 + }, + { + "epoch": 0.476894951156633, + "grad_norm": 8.002875328063965, + "learning_rate": 5.792158314811018e-06, + "loss": 0.6148755073547363, + "memory(GiB)": 36.53, + "step": 10255, + "token_acc": 0.8244646359524477, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.47712746941658263, + "grad_norm": 8.440995216369629, + "learning_rate": 5.788361740141305e-06, + "loss": 0.6819697856903076, + "memory(GiB)": 36.53, + "step": 10260, + "token_acc": 0.8247137781287012, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.4773599876765322, + "grad_norm": 7.9807844161987305, + "learning_rate": 5.784564699288955e-06, + "loss": 0.7811295032501221, + "memory(GiB)": 36.53, + "step": 10265, + "token_acc": 0.7987358616101131, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.47759250593648184, + "grad_norm": 8.874316215515137, + "learning_rate": 5.780767194499275e-06, + "loss": 0.7386106014251709, + "memory(GiB)": 36.53, + "step": 10270, + "token_acc": 0.817530695770805, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.4778250241964314, + "grad_norm": 7.540121078491211, + "learning_rate": 5.776969228017846e-06, + "loss": 0.6754724502563476, + "memory(GiB)": 36.53, + "step": 10275, + "token_acc": 0.8390589992531741, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.47805754245638105, + "grad_norm": 9.226720809936523, + "learning_rate": 5.773170802090526e-06, + "loss": 0.6454158306121827, + "memory(GiB)": 36.53, + "step": 10280, + "token_acc": 0.8572533849129593, + "train_speed(iter/s)": 0.095952 + }, + { + "epoch": 0.4782900607163306, + "grad_norm": 8.085728645324707, + "learning_rate": 5.7693719189634375e-06, + "loss": 0.6315486431121826, + "memory(GiB)": 36.53, + "step": 10285, + "token_acc": 0.843432289548597, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.47852257897628026, + "grad_norm": 9.334856986999512, + "learning_rate": 5.76557258088298e-06, + "loss": 0.7062033653259278, + "memory(GiB)": 36.53, + "step": 10290, + "token_acc": 0.8273440726972325, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.47875509723622983, + "grad_norm": 9.930914878845215, + "learning_rate": 5.76177279009582e-06, + "loss": 0.693555498123169, + "memory(GiB)": 36.53, + "step": 10295, + "token_acc": 0.8392219134577213, + "train_speed(iter/s)": 0.09603 + }, + { + "epoch": 0.4789876154961794, + "grad_norm": 6.257607460021973, + "learning_rate": 5.757972548848888e-06, + "loss": 0.5264789581298828, + "memory(GiB)": 36.53, + "step": 10300, + "token_acc": 0.8754208754208754, + "train_speed(iter/s)": 0.096056 + }, + { + "epoch": 0.4789876154961794, + "eval_loss": 0.6043440699577332, + "eval_runtime": 292.2265, + "eval_samples_per_second": 11.891, + "eval_steps_per_second": 11.891, + "step": 10300 + }, + { + "epoch": 0.47922013375612904, + "grad_norm": 6.7262420654296875, + "learning_rate": 5.7541718593893865e-06, + "loss": 0.6431325912475586, + "memory(GiB)": 36.53, + "step": 10305, + "token_acc": 0.8243854851348974, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.4794526520160786, + "grad_norm": 6.871412754058838, + "learning_rate": 5.750370723964781e-06, + "loss": 0.6176501274108886, + "memory(GiB)": 36.53, + "step": 10310, + "token_acc": 0.8424317617866005, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.47968517027602825, + "grad_norm": 7.732744216918945, + "learning_rate": 5.7465691448227985e-06, + "loss": 0.691460132598877, + "memory(GiB)": 36.53, + "step": 10315, + "token_acc": 0.8188585607940446, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.4799176885359778, + "grad_norm": 4.607907772064209, + "learning_rate": 5.7427671242114305e-06, + "loss": 0.6410884857177734, + "memory(GiB)": 36.53, + "step": 10320, + "token_acc": 0.8241157556270097, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.48015020679592746, + "grad_norm": 5.815286636352539, + "learning_rate": 5.73896466437893e-06, + "loss": 0.627712631225586, + "memory(GiB)": 36.53, + "step": 10325, + "token_acc": 0.848414539829853, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.48038272505587704, + "grad_norm": 7.110845565795898, + "learning_rate": 5.735161767573809e-06, + "loss": 0.8327849388122559, + "memory(GiB)": 36.53, + "step": 10330, + "token_acc": 0.7952969550798915, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.48061524331582667, + "grad_norm": 7.159823894500732, + "learning_rate": 5.731358436044836e-06, + "loss": 0.7731481075286866, + "memory(GiB)": 36.53, + "step": 10335, + "token_acc": 0.8158567774936062, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.48084776157577624, + "grad_norm": 5.621152400970459, + "learning_rate": 5.7275546720410395e-06, + "loss": 0.66044921875, + "memory(GiB)": 36.53, + "step": 10340, + "token_acc": 0.8278571428571428, + "train_speed(iter/s)": 0.096002 + }, + { + "epoch": 0.4810802798357259, + "grad_norm": 6.352847099304199, + "learning_rate": 5.723750477811702e-06, + "loss": 0.7206785678863525, + "memory(GiB)": 36.53, + "step": 10345, + "token_acc": 0.8181818181818182, + "train_speed(iter/s)": 0.096028 + }, + { + "epoch": 0.48131279809567545, + "grad_norm": 6.40288782119751, + "learning_rate": 5.719945855606364e-06, + "loss": 0.7612641811370849, + "memory(GiB)": 36.53, + "step": 10350, + "token_acc": 0.8156575395295025, + "train_speed(iter/s)": 0.096054 + }, + { + "epoch": 0.48131279809567545, + "eval_loss": 0.6041258573532104, + "eval_runtime": 292.4376, + "eval_samples_per_second": 11.883, + "eval_steps_per_second": 11.883, + "step": 10350 + }, + { + "epoch": 0.48154531635562503, + "grad_norm": 6.934345245361328, + "learning_rate": 5.716140807674812e-06, + "loss": 0.6299592971801757, + "memory(GiB)": 36.53, + "step": 10355, + "token_acc": 0.8244488015685647, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.48177783461557466, + "grad_norm": 5.559031963348389, + "learning_rate": 5.71233533626709e-06, + "loss": 0.7900909900665283, + "memory(GiB)": 36.53, + "step": 10360, + "token_acc": 0.7986577181208053, + "train_speed(iter/s)": 0.095845 + }, + { + "epoch": 0.48201035287552424, + "grad_norm": 6.343786716461182, + "learning_rate": 5.708529443633491e-06, + "loss": 0.781171464920044, + "memory(GiB)": 36.53, + "step": 10365, + "token_acc": 0.8162832929782082, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.48224287113547387, + "grad_norm": 8.414325714111328, + "learning_rate": 5.704723132024557e-06, + "loss": 0.6617238044738769, + "memory(GiB)": 36.53, + "step": 10370, + "token_acc": 0.8287714831317632, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.48247538939542345, + "grad_norm": 6.823397636413574, + "learning_rate": 5.700916403691077e-06, + "loss": 0.6314909934997559, + "memory(GiB)": 36.53, + "step": 10375, + "token_acc": 0.8461814270347795, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.4827079076553731, + "grad_norm": 6.531882286071777, + "learning_rate": 5.697109260884085e-06, + "loss": 0.5414093017578125, + "memory(GiB)": 36.53, + "step": 10380, + "token_acc": 0.8636543797066983, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.48294042591532266, + "grad_norm": 6.150195121765137, + "learning_rate": 5.693301705854867e-06, + "loss": 0.6772344589233399, + "memory(GiB)": 36.53, + "step": 10385, + "token_acc": 0.8308388444135051, + "train_speed(iter/s)": 0.095974 + }, + { + "epoch": 0.4831729441752723, + "grad_norm": 5.847650527954102, + "learning_rate": 5.6894937408549435e-06, + "loss": 0.7219397068023682, + "memory(GiB)": 36.53, + "step": 10390, + "token_acc": 0.8232174425456689, + "train_speed(iter/s)": 0.096 + }, + { + "epoch": 0.48340546243522187, + "grad_norm": 5.661290168762207, + "learning_rate": 5.6856853681360825e-06, + "loss": 0.7071576118469238, + "memory(GiB)": 36.53, + "step": 10395, + "token_acc": 0.8241574908241575, + "train_speed(iter/s)": 0.096025 + }, + { + "epoch": 0.48363798069517144, + "grad_norm": 7.167285442352295, + "learning_rate": 5.681876589950295e-06, + "loss": 0.6745022296905517, + "memory(GiB)": 36.53, + "step": 10400, + "token_acc": 0.8311345646437994, + "train_speed(iter/s)": 0.096051 + }, + { + "epoch": 0.48363798069517144, + "eval_loss": 0.604568600654602, + "eval_runtime": 291.1795, + "eval_samples_per_second": 11.934, + "eval_steps_per_second": 11.934, + "step": 10400 + }, + { + "epoch": 0.4838704989551211, + "grad_norm": 6.558994293212891, + "learning_rate": 5.678067408549828e-06, + "loss": 0.7601808547973633, + "memory(GiB)": 36.53, + "step": 10405, + "token_acc": 0.8237313086542231, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.48410301721507065, + "grad_norm": 9.058159828186035, + "learning_rate": 5.6742578261871665e-06, + "loss": 0.6190596580505371, + "memory(GiB)": 36.53, + "step": 10410, + "token_acc": 0.8485485906604964, + "train_speed(iter/s)": 0.095845 + }, + { + "epoch": 0.4843355354750203, + "grad_norm": 6.316933631896973, + "learning_rate": 5.670447845115033e-06, + "loss": 0.7364337921142579, + "memory(GiB)": 36.53, + "step": 10415, + "token_acc": 0.8257394084732215, + "train_speed(iter/s)": 0.095871 + }, + { + "epoch": 0.48456805373496986, + "grad_norm": 9.497570037841797, + "learning_rate": 5.66663746758639e-06, + "loss": 0.7649255752563476, + "memory(GiB)": 36.53, + "step": 10420, + "token_acc": 0.8071589809738794, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.4848005719949195, + "grad_norm": 7.515045166015625, + "learning_rate": 5.662826695854431e-06, + "loss": 0.6880429744720459, + "memory(GiB)": 36.53, + "step": 10425, + "token_acc": 0.8259047619047619, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.48503309025486907, + "grad_norm": 7.987893581390381, + "learning_rate": 5.6590155321725825e-06, + "loss": 0.8033793449401856, + "memory(GiB)": 36.53, + "step": 10430, + "token_acc": 0.8061657032755298, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.4852656085148187, + "grad_norm": 8.104069709777832, + "learning_rate": 5.655203978794504e-06, + "loss": 0.6667456150054931, + "memory(GiB)": 36.53, + "step": 10435, + "token_acc": 0.8448275862068966, + "train_speed(iter/s)": 0.095974 + }, + { + "epoch": 0.4854981267747683, + "grad_norm": 7.878139495849609, + "learning_rate": 5.6513920379740816e-06, + "loss": 0.6613424777984619, + "memory(GiB)": 36.53, + "step": 10440, + "token_acc": 0.8525703200775946, + "train_speed(iter/s)": 0.096 + }, + { + "epoch": 0.48573064503471786, + "grad_norm": 9.738919258117676, + "learning_rate": 5.647579711965438e-06, + "loss": 0.6458121299743652, + "memory(GiB)": 36.53, + "step": 10445, + "token_acc": 0.8279947345326898, + "train_speed(iter/s)": 0.096026 + }, + { + "epoch": 0.4859631632946675, + "grad_norm": 6.289546966552734, + "learning_rate": 5.6437670030229155e-06, + "loss": 0.6806787014007568, + "memory(GiB)": 36.53, + "step": 10450, + "token_acc": 0.8339513803049031, + "train_speed(iter/s)": 0.096052 + }, + { + "epoch": 0.4859631632946675, + "eval_loss": 0.6016459465026855, + "eval_runtime": 292.0162, + "eval_samples_per_second": 11.9, + "eval_steps_per_second": 11.9, + "step": 10450 + }, + { + "epoch": 0.48619568155461707, + "grad_norm": 8.368342399597168, + "learning_rate": 5.63995391340109e-06, + "loss": 0.5780067443847656, + "memory(GiB)": 36.53, + "step": 10455, + "token_acc": 0.8241071571627706, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.4864281998145667, + "grad_norm": 8.043057441711426, + "learning_rate": 5.6361404453547545e-06, + "loss": 0.7120685577392578, + "memory(GiB)": 36.53, + "step": 10460, + "token_acc": 0.8316590563165905, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.4866607180745163, + "grad_norm": 8.706648826599121, + "learning_rate": 5.632326601138935e-06, + "loss": 0.8613996505737305, + "memory(GiB)": 36.53, + "step": 10465, + "token_acc": 0.7830508474576271, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.4868932363344659, + "grad_norm": 7.210156440734863, + "learning_rate": 5.628512383008874e-06, + "loss": 0.668830156326294, + "memory(GiB)": 36.53, + "step": 10470, + "token_acc": 0.8363457114689451, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.4871257545944155, + "grad_norm": 6.831915855407715, + "learning_rate": 5.624697793220035e-06, + "loss": 0.7384651660919189, + "memory(GiB)": 36.53, + "step": 10475, + "token_acc": 0.8148029477731497, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.4873582728543651, + "grad_norm": 8.7930269241333, + "learning_rate": 5.620882834028103e-06, + "loss": 0.6526782989501954, + "memory(GiB)": 36.53, + "step": 10480, + "token_acc": 0.8457166057653268, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.4875907911143147, + "grad_norm": 6.194753170013428, + "learning_rate": 5.617067507688983e-06, + "loss": 0.7427937984466553, + "memory(GiB)": 36.53, + "step": 10485, + "token_acc": 0.8137285491419657, + "train_speed(iter/s)": 0.095977 + }, + { + "epoch": 0.4878233093742643, + "grad_norm": 7.289088726043701, + "learning_rate": 5.613251816458794e-06, + "loss": 0.7535356521606446, + "memory(GiB)": 36.53, + "step": 10490, + "token_acc": 0.8124249699879952, + "train_speed(iter/s)": 0.096002 + }, + { + "epoch": 0.4880558276342139, + "grad_norm": 8.472503662109375, + "learning_rate": 5.609435762593873e-06, + "loss": 0.8218043327331543, + "memory(GiB)": 36.53, + "step": 10495, + "token_acc": 0.7921367521367522, + "train_speed(iter/s)": 0.096028 + }, + { + "epoch": 0.4882883458941635, + "grad_norm": 6.736743450164795, + "learning_rate": 5.605619348350768e-06, + "loss": 0.668080186843872, + "memory(GiB)": 36.53, + "step": 10500, + "token_acc": 0.8355832467982001, + "train_speed(iter/s)": 0.096054 + }, + { + "epoch": 0.4882883458941635, + "eval_loss": 0.6022413372993469, + "eval_runtime": 293.5956, + "eval_samples_per_second": 11.836, + "eval_steps_per_second": 11.836, + "step": 10500 + }, + { + "epoch": 0.4885208641541131, + "grad_norm": 6.929119110107422, + "learning_rate": 5.6018025759862445e-06, + "loss": 0.7089277267456054, + "memory(GiB)": 36.53, + "step": 10505, + "token_acc": 0.8237999102736653, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.4887533824140627, + "grad_norm": 5.99254846572876, + "learning_rate": 5.597985447757278e-06, + "loss": 0.653064489364624, + "memory(GiB)": 36.53, + "step": 10510, + "token_acc": 0.8375241779497099, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.4889859006740123, + "grad_norm": 6.13014030456543, + "learning_rate": 5.594167965921055e-06, + "loss": 0.7110246658325196, + "memory(GiB)": 36.53, + "step": 10515, + "token_acc": 0.82035494386092, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.4892184189339619, + "grad_norm": 8.600988388061523, + "learning_rate": 5.590350132734966e-06, + "loss": 0.6460587024688721, + "memory(GiB)": 36.53, + "step": 10520, + "token_acc": 0.8511146496815286, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.48945093719391153, + "grad_norm": 7.910162925720215, + "learning_rate": 5.586531950456619e-06, + "loss": 0.7574851512908936, + "memory(GiB)": 36.53, + "step": 10525, + "token_acc": 0.8053097345132744, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.4896834554538611, + "grad_norm": 6.965766429901123, + "learning_rate": 5.582713421343822e-06, + "loss": 0.595002555847168, + "memory(GiB)": 36.53, + "step": 10530, + "token_acc": 0.8482737734706238, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.48991597371381074, + "grad_norm": 7.77105188369751, + "learning_rate": 5.578894547654586e-06, + "loss": 0.7004836082458497, + "memory(GiB)": 36.53, + "step": 10535, + "token_acc": 0.8149184149184149, + "train_speed(iter/s)": 0.095974 + }, + { + "epoch": 0.4901484919737603, + "grad_norm": 8.512481689453125, + "learning_rate": 5.57507533164713e-06, + "loss": 0.7674338340759277, + "memory(GiB)": 36.53, + "step": 10540, + "token_acc": 0.8025250890255746, + "train_speed(iter/s)": 0.095999 + }, + { + "epoch": 0.4903810102337099, + "grad_norm": 4.976019382476807, + "learning_rate": 5.571255775579878e-06, + "loss": 0.6822651386260986, + "memory(GiB)": 36.53, + "step": 10545, + "token_acc": 0.8155166249553093, + "train_speed(iter/s)": 0.096025 + }, + { + "epoch": 0.4906135284936595, + "grad_norm": 7.175931930541992, + "learning_rate": 5.567435881711446e-06, + "loss": 0.7229970932006836, + "memory(GiB)": 36.53, + "step": 10550, + "token_acc": 0.8246963562753037, + "train_speed(iter/s)": 0.09605 + }, + { + "epoch": 0.4906135284936595, + "eval_loss": 0.599609375, + "eval_runtime": 295.1161, + "eval_samples_per_second": 11.775, + "eval_steps_per_second": 11.775, + "step": 10550 + }, + { + "epoch": 0.4908460467536091, + "grad_norm": 6.382102012634277, + "learning_rate": 5.56361565230066e-06, + "loss": 0.6602955341339112, + "memory(GiB)": 36.53, + "step": 10555, + "token_acc": 0.8243287662446986, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.49107856501355873, + "grad_norm": 7.595702648162842, + "learning_rate": 5.559795089606536e-06, + "loss": 0.6507344722747803, + "memory(GiB)": 36.53, + "step": 10560, + "token_acc": 0.8347368421052631, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.4913110832735083, + "grad_norm": 8.050884246826172, + "learning_rate": 5.555974195888293e-06, + "loss": 0.6467938899993897, + "memory(GiB)": 36.53, + "step": 10565, + "token_acc": 0.8394875659382065, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.49154360153345794, + "grad_norm": 5.628944396972656, + "learning_rate": 5.552152973405343e-06, + "loss": 0.6498900890350342, + "memory(GiB)": 36.53, + "step": 10570, + "token_acc": 0.831989247311828, + "train_speed(iter/s)": 0.095893 + }, + { + "epoch": 0.4917761197934075, + "grad_norm": 7.6785454750061035, + "learning_rate": 5.548331424417293e-06, + "loss": 0.6663597106933594, + "memory(GiB)": 36.53, + "step": 10575, + "token_acc": 0.8286666666666667, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.49200863805335715, + "grad_norm": 6.159049987792969, + "learning_rate": 5.54450955118394e-06, + "loss": 0.5869782447814942, + "memory(GiB)": 36.53, + "step": 10580, + "token_acc": 0.858295334970186, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.4922411563133067, + "grad_norm": 5.802601337432861, + "learning_rate": 5.54068735596528e-06, + "loss": 0.6343412399291992, + "memory(GiB)": 36.53, + "step": 10585, + "token_acc": 0.8386798272671191, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.4924736745732563, + "grad_norm": 7.778030872344971, + "learning_rate": 5.536864841021492e-06, + "loss": 0.7568382740020752, + "memory(GiB)": 36.53, + "step": 10590, + "token_acc": 0.8063575386410032, + "train_speed(iter/s)": 0.095994 + }, + { + "epoch": 0.49270619283320594, + "grad_norm": 6.506865978240967, + "learning_rate": 5.533042008612949e-06, + "loss": 0.6708489418029785, + "memory(GiB)": 36.53, + "step": 10595, + "token_acc": 0.8332750786438309, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.4929387110931555, + "grad_norm": 7.4949493408203125, + "learning_rate": 5.529218861000208e-06, + "loss": 0.6150759220123291, + "memory(GiB)": 36.53, + "step": 10600, + "token_acc": 0.8408723747980614, + "train_speed(iter/s)": 0.096045 + }, + { + "epoch": 0.4929387110931555, + "eval_loss": 0.6005235910415649, + "eval_runtime": 295.1347, + "eval_samples_per_second": 11.774, + "eval_steps_per_second": 11.774, + "step": 10600 + }, + { + "epoch": 0.49317122935310514, + "grad_norm": 10.075146675109863, + "learning_rate": 5.5253954004440146e-06, + "loss": 0.6699877262115479, + "memory(GiB)": 36.53, + "step": 10605, + "token_acc": 0.824314389822924, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.4934037476130547, + "grad_norm": 7.288110256195068, + "learning_rate": 5.521571629205301e-06, + "loss": 0.6553449153900146, + "memory(GiB)": 36.53, + "step": 10610, + "token_acc": 0.8335005015045135, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.49363626587300435, + "grad_norm": 6.840569972991943, + "learning_rate": 5.517747549545179e-06, + "loss": 0.722406530380249, + "memory(GiB)": 36.53, + "step": 10615, + "token_acc": 0.8230827638572513, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.49386878413295393, + "grad_norm": 6.8756022453308105, + "learning_rate": 5.513923163724946e-06, + "loss": 0.6679422855377197, + "memory(GiB)": 36.53, + "step": 10620, + "token_acc": 0.8346281908990011, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.49410130239290356, + "grad_norm": 8.059365272521973, + "learning_rate": 5.510098474006079e-06, + "loss": 0.6776402473449707, + "memory(GiB)": 36.53, + "step": 10625, + "token_acc": 0.8284745762711865, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.49433382065285314, + "grad_norm": 8.242023468017578, + "learning_rate": 5.506273482650237e-06, + "loss": 0.7858617782592774, + "memory(GiB)": 36.53, + "step": 10630, + "token_acc": 0.8071585098612125, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.49456633891280277, + "grad_norm": 7.4410786628723145, + "learning_rate": 5.502448191919253e-06, + "loss": 0.5851879119873047, + "memory(GiB)": 36.53, + "step": 10635, + "token_acc": 0.859278518037049, + "train_speed(iter/s)": 0.095963 + }, + { + "epoch": 0.49479885717275235, + "grad_norm": 6.335245132446289, + "learning_rate": 5.498622604075139e-06, + "loss": 0.6306666374206543, + "memory(GiB)": 36.53, + "step": 10640, + "token_acc": 0.8379658875552748, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.4950313754327019, + "grad_norm": 9.677199363708496, + "learning_rate": 5.4947967213800855e-06, + "loss": 0.5713845252990722, + "memory(GiB)": 36.53, + "step": 10645, + "token_acc": 0.8544303797468354, + "train_speed(iter/s)": 0.096014 + }, + { + "epoch": 0.49526389369265156, + "grad_norm": 7.010994911193848, + "learning_rate": 5.490970546096454e-06, + "loss": 0.6432509899139405, + "memory(GiB)": 36.53, + "step": 10650, + "token_acc": 0.8430942687128092, + "train_speed(iter/s)": 0.096038 + }, + { + "epoch": 0.49526389369265156, + "eval_loss": 0.601109504699707, + "eval_runtime": 292.8707, + "eval_samples_per_second": 11.865, + "eval_steps_per_second": 11.865, + "step": 10650 + }, + { + "epoch": 0.49549641195260113, + "grad_norm": 9.233794212341309, + "learning_rate": 5.487144080486781e-06, + "loss": 0.7560394287109375, + "memory(GiB)": 36.53, + "step": 10655, + "token_acc": 0.8235558192108512, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.49572893021255077, + "grad_norm": 10.122901916503906, + "learning_rate": 5.483317326813771e-06, + "loss": 0.6702571868896484, + "memory(GiB)": 36.53, + "step": 10660, + "token_acc": 0.8401162790697675, + "train_speed(iter/s)": 0.095835 + }, + { + "epoch": 0.49596144847250034, + "grad_norm": 7.147398948669434, + "learning_rate": 5.479490287340305e-06, + "loss": 0.6192568302154541, + "memory(GiB)": 36.53, + "step": 10665, + "token_acc": 0.8434712084347121, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.49619396673245, + "grad_norm": 7.782430648803711, + "learning_rate": 5.475662964329428e-06, + "loss": 0.7436542510986328, + "memory(GiB)": 36.53, + "step": 10670, + "token_acc": 0.814753556070129, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.49642648499239955, + "grad_norm": 9.926347732543945, + "learning_rate": 5.471835360044354e-06, + "loss": 0.7577160358428955, + "memory(GiB)": 36.53, + "step": 10675, + "token_acc": 0.8254643962848297, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.4966590032523492, + "grad_norm": 6.964309215545654, + "learning_rate": 5.468007476748463e-06, + "loss": 0.68472318649292, + "memory(GiB)": 36.53, + "step": 10680, + "token_acc": 0.8404074702886248, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.49689152151229876, + "grad_norm": 8.409749031066895, + "learning_rate": 5.464179316705302e-06, + "loss": 0.8917485237121582, + "memory(GiB)": 36.53, + "step": 10685, + "token_acc": 0.7695954487989887, + "train_speed(iter/s)": 0.09596 + }, + { + "epoch": 0.49712403977224834, + "grad_norm": 8.865854263305664, + "learning_rate": 5.460350882178581e-06, + "loss": 0.704495906829834, + "memory(GiB)": 36.53, + "step": 10690, + "token_acc": 0.8437796771130105, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.49735655803219797, + "grad_norm": 8.874429702758789, + "learning_rate": 5.45652217543217e-06, + "loss": 0.704408597946167, + "memory(GiB)": 36.53, + "step": 10695, + "token_acc": 0.8195391106783512, + "train_speed(iter/s)": 0.096011 + }, + { + "epoch": 0.49758907629214755, + "grad_norm": 7.480283260345459, + "learning_rate": 5.452693198730101e-06, + "loss": 0.7443026542663574, + "memory(GiB)": 36.53, + "step": 10700, + "token_acc": 0.8162226878180185, + "train_speed(iter/s)": 0.096036 + }, + { + "epoch": 0.49758907629214755, + "eval_loss": 0.6013534665107727, + "eval_runtime": 297.2319, + "eval_samples_per_second": 11.691, + "eval_steps_per_second": 11.691, + "step": 10700 + }, + { + "epoch": 0.4978215945520972, + "grad_norm": 4.385597229003906, + "learning_rate": 5.448863954336568e-06, + "loss": 0.7800351142883301, + "memory(GiB)": 36.53, + "step": 10705, + "token_acc": 0.8231241158987237, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.49805411281204676, + "grad_norm": 7.6840291023254395, + "learning_rate": 5.445034444515919e-06, + "loss": 0.6394901752471924, + "memory(GiB)": 36.53, + "step": 10710, + "token_acc": 0.8491237677984665, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.4982866310719964, + "grad_norm": 8.534746170043945, + "learning_rate": 5.441204671532664e-06, + "loss": 0.5974376201629639, + "memory(GiB)": 36.53, + "step": 10715, + "token_acc": 0.8384531984098301, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.49851914933194597, + "grad_norm": 6.349865436553955, + "learning_rate": 5.437374637651463e-06, + "loss": 0.611474084854126, + "memory(GiB)": 36.53, + "step": 10720, + "token_acc": 0.8560975609756097, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.4987516675918956, + "grad_norm": 8.979010581970215, + "learning_rate": 5.433544345137137e-06, + "loss": 0.7590946197509766, + "memory(GiB)": 36.53, + "step": 10725, + "token_acc": 0.8189058171745153, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.4989841858518452, + "grad_norm": 6.776154518127441, + "learning_rate": 5.429713796254654e-06, + "loss": 0.6038641452789306, + "memory(GiB)": 40.03, + "step": 10730, + "token_acc": 0.8517397881996974, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.4992167041117948, + "grad_norm": 5.198429584503174, + "learning_rate": 5.425882993269136e-06, + "loss": 0.6112579345703125, + "memory(GiB)": 40.03, + "step": 10735, + "token_acc": 0.853542234332425, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.4994492223717444, + "grad_norm": 8.502971649169922, + "learning_rate": 5.4220519384458545e-06, + "loss": 0.6879833221435547, + "memory(GiB)": 40.03, + "step": 10740, + "token_acc": 0.8312799452429842, + "train_speed(iter/s)": 0.095975 + }, + { + "epoch": 0.49968174063169396, + "grad_norm": 8.349090576171875, + "learning_rate": 5.418220634050232e-06, + "loss": 0.7486342430114746, + "memory(GiB)": 40.03, + "step": 10745, + "token_acc": 0.8212837837837837, + "train_speed(iter/s)": 0.095999 + }, + { + "epoch": 0.4999142588916436, + "grad_norm": 5.16649866104126, + "learning_rate": 5.414389082347836e-06, + "loss": 0.6454334735870362, + "memory(GiB)": 40.03, + "step": 10750, + "token_acc": 0.8207745421795257, + "train_speed(iter/s)": 0.096024 + }, + { + "epoch": 0.4999142588916436, + "eval_loss": 0.6007801294326782, + "eval_runtime": 295.4874, + "eval_samples_per_second": 11.76, + "eval_steps_per_second": 11.76, + "step": 10750 + }, + { + "epoch": 0.5001467771515932, + "grad_norm": 6.761124134063721, + "learning_rate": 5.410557285604382e-06, + "loss": 0.6681477546691894, + "memory(GiB)": 40.03, + "step": 10755, + "token_acc": 0.8252858146515936, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.5003792954115428, + "grad_norm": 6.287301063537598, + "learning_rate": 5.406725246085728e-06, + "loss": 0.8242059707641601, + "memory(GiB)": 40.03, + "step": 10760, + "token_acc": 0.801002358490566, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.5006118136714924, + "grad_norm": 6.488293170928955, + "learning_rate": 5.40289296605788e-06, + "loss": 0.6869512557983398, + "memory(GiB)": 40.03, + "step": 10765, + "token_acc": 0.8194488438390878, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.500844331931442, + "grad_norm": 5.360726833343506, + "learning_rate": 5.399060447786982e-06, + "loss": 0.7333622932434082, + "memory(GiB)": 40.03, + "step": 10770, + "token_acc": 0.8263157894736842, + "train_speed(iter/s)": 0.095871 + }, + { + "epoch": 0.5010768501913916, + "grad_norm": 9.921550750732422, + "learning_rate": 5.39522769353932e-06, + "loss": 0.6461817741394043, + "memory(GiB)": 40.03, + "step": 10775, + "token_acc": 0.837942955920484, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.5013093684513412, + "grad_norm": 6.710071563720703, + "learning_rate": 5.39139470558132e-06, + "loss": 0.7817147254943848, + "memory(GiB)": 40.03, + "step": 10780, + "token_acc": 0.8170890188434048, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.5015418867112907, + "grad_norm": 8.08139419555664, + "learning_rate": 5.3875614861795466e-06, + "loss": 0.6603563308715821, + "memory(GiB)": 40.03, + "step": 10785, + "token_acc": 0.8272024729520866, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.5017744049712404, + "grad_norm": 6.168495178222656, + "learning_rate": 5.383728037600702e-06, + "loss": 0.670966386795044, + "memory(GiB)": 40.03, + "step": 10790, + "token_acc": 0.8350973028337316, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.50200692323119, + "grad_norm": 5.787970542907715, + "learning_rate": 5.379894362111621e-06, + "loss": 0.5740962982177734, + "memory(GiB)": 40.03, + "step": 10795, + "token_acc": 0.8688090737240075, + "train_speed(iter/s)": 0.095997 + }, + { + "epoch": 0.5022394414911396, + "grad_norm": 7.659470558166504, + "learning_rate": 5.376060461979272e-06, + "loss": 0.7049031257629395, + "memory(GiB)": 40.03, + "step": 10800, + "token_acc": 0.8216096324461344, + "train_speed(iter/s)": 0.096022 + }, + { + "epoch": 0.5022394414911396, + "eval_loss": 0.5980068445205688, + "eval_runtime": 292.8256, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 11.867, + "step": 10800 + }, + { + "epoch": 0.5024719597510892, + "grad_norm": 9.11786937713623, + "learning_rate": 5.372226339470764e-06, + "loss": 0.6317077159881592, + "memory(GiB)": 40.03, + "step": 10805, + "token_acc": 0.8252225018290575, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.5027044780110388, + "grad_norm": 9.503100395202637, + "learning_rate": 5.368391996853328e-06, + "loss": 0.6980850219726562, + "memory(GiB)": 40.03, + "step": 10810, + "token_acc": 0.8356107660455486, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.5029369962709884, + "grad_norm": 10.004739761352539, + "learning_rate": 5.364557436394331e-06, + "loss": 0.7618881225585937, + "memory(GiB)": 40.03, + "step": 10815, + "token_acc": 0.8205022643062989, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.5031695145309381, + "grad_norm": 6.893964767456055, + "learning_rate": 5.360722660361266e-06, + "loss": 0.8586786270141602, + "memory(GiB)": 40.03, + "step": 10820, + "token_acc": 0.8097094259390503, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.5034020327908876, + "grad_norm": 6.224417209625244, + "learning_rate": 5.3568876710217545e-06, + "loss": 0.6337433815002441, + "memory(GiB)": 40.03, + "step": 10825, + "token_acc": 0.845340383344349, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.5036345510508372, + "grad_norm": 6.450043678283691, + "learning_rate": 5.353052470643545e-06, + "loss": 0.6762244224548339, + "memory(GiB)": 40.03, + "step": 10830, + "token_acc": 0.8395061728395061, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.5038670693107868, + "grad_norm": 7.789783000946045, + "learning_rate": 5.349217061494509e-06, + "loss": 0.7612596035003663, + "memory(GiB)": 40.03, + "step": 10835, + "token_acc": 0.8089563019140484, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.5040995875707364, + "grad_norm": 8.258014678955078, + "learning_rate": 5.345381445842644e-06, + "loss": 0.5151895046234131, + "memory(GiB)": 40.03, + "step": 10840, + "token_acc": 0.8606260296540362, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.504332105830686, + "grad_norm": 6.241860389709473, + "learning_rate": 5.341545625956064e-06, + "loss": 0.7108976364135742, + "memory(GiB)": 40.03, + "step": 10845, + "token_acc": 0.8180952380952381, + "train_speed(iter/s)": 0.095996 + }, + { + "epoch": 0.5045646240906356, + "grad_norm": 6.1465301513671875, + "learning_rate": 5.337709604103013e-06, + "loss": 0.6371690273284912, + "memory(GiB)": 40.03, + "step": 10850, + "token_acc": 0.8452544704264099, + "train_speed(iter/s)": 0.096021 + }, + { + "epoch": 0.5045646240906356, + "eval_loss": 0.5983362793922424, + "eval_runtime": 290.2492, + "eval_samples_per_second": 11.972, + "eval_steps_per_second": 11.972, + "step": 10850 + }, + { + "epoch": 0.5047971423505853, + "grad_norm": 7.173838138580322, + "learning_rate": 5.3338733825518454e-06, + "loss": 0.5999796867370606, + "memory(GiB)": 40.03, + "step": 10855, + "token_acc": 0.8251333823200536, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.5050296606105348, + "grad_norm": 6.503340721130371, + "learning_rate": 5.330036963571039e-06, + "loss": 0.519847059249878, + "memory(GiB)": 40.03, + "step": 10860, + "token_acc": 0.8625856164383562, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.5052621788704844, + "grad_norm": 7.504303455352783, + "learning_rate": 5.326200349429185e-06, + "loss": 0.5535295486450196, + "memory(GiB)": 40.03, + "step": 10865, + "token_acc": 0.8465544871794872, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.505494697130434, + "grad_norm": 7.1386799812316895, + "learning_rate": 5.322363542394994e-06, + "loss": 0.7013700485229493, + "memory(GiB)": 40.03, + "step": 10870, + "token_acc": 0.8328748280605227, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.5057272153903837, + "grad_norm": 6.778848171234131, + "learning_rate": 5.318526544737288e-06, + "loss": 0.6343185424804687, + "memory(GiB)": 40.03, + "step": 10875, + "token_acc": 0.8425501937301867, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.5059597336503332, + "grad_norm": 6.905848026275635, + "learning_rate": 5.314689358725002e-06, + "loss": 0.7110846996307373, + "memory(GiB)": 40.03, + "step": 10880, + "token_acc": 0.8171044202434337, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.5061922519102828, + "grad_norm": 10.56027889251709, + "learning_rate": 5.31085198662718e-06, + "loss": 0.6131344795227051, + "memory(GiB)": 40.03, + "step": 10885, + "token_acc": 0.8370015948963317, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.5064247701702325, + "grad_norm": 8.732833862304688, + "learning_rate": 5.3070144307129834e-06, + "loss": 0.6883892059326172, + "memory(GiB)": 40.03, + "step": 10890, + "token_acc": 0.8293180890159249, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.506657288430182, + "grad_norm": 7.830416679382324, + "learning_rate": 5.303176693251675e-06, + "loss": 0.6924023151397705, + "memory(GiB)": 40.03, + "step": 10895, + "token_acc": 0.8322422258592471, + "train_speed(iter/s)": 0.095998 + }, + { + "epoch": 0.5068898066901316, + "grad_norm": 7.292912483215332, + "learning_rate": 5.2993387765126255e-06, + "loss": 0.6057341575622559, + "memory(GiB)": 40.03, + "step": 10900, + "token_acc": 0.8391360412637009, + "train_speed(iter/s)": 0.096023 + }, + { + "epoch": 0.5068898066901316, + "eval_loss": 0.6003403663635254, + "eval_runtime": 290.5621, + "eval_samples_per_second": 11.96, + "eval_steps_per_second": 11.96, + "step": 10900 + }, + { + "epoch": 0.5071223249500812, + "grad_norm": 9.880396842956543, + "learning_rate": 5.295500682765318e-06, + "loss": 0.7102957725524902, + "memory(GiB)": 40.03, + "step": 10905, + "token_acc": 0.8242088112303457, + "train_speed(iter/s)": 0.095802 + }, + { + "epoch": 0.5073548432100309, + "grad_norm": 7.285651206970215, + "learning_rate": 5.291662414279332e-06, + "loss": 0.7098144054412842, + "memory(GiB)": 40.03, + "step": 10910, + "token_acc": 0.8158052884615384, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.5075873614699804, + "grad_norm": 7.897537708282471, + "learning_rate": 5.287823973324355e-06, + "loss": 0.6831938743591308, + "memory(GiB)": 40.03, + "step": 10915, + "token_acc": 0.8302300109529025, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.50781987972993, + "grad_norm": 7.627477645874023, + "learning_rate": 5.283985362170176e-06, + "loss": 0.592428731918335, + "memory(GiB)": 40.03, + "step": 10920, + "token_acc": 0.8552695483244294, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.5080523979898797, + "grad_norm": 8.800142288208008, + "learning_rate": 5.280146583086686e-06, + "loss": 0.6185301303863525, + "memory(GiB)": 40.03, + "step": 10925, + "token_acc": 0.8554865424430642, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.5082849162498293, + "grad_norm": 7.865505695343018, + "learning_rate": 5.276307638343871e-06, + "loss": 0.8139777183532715, + "memory(GiB)": 40.03, + "step": 10930, + "token_acc": 0.8035950303991541, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.5085174345097788, + "grad_norm": 6.902219295501709, + "learning_rate": 5.272468530211821e-06, + "loss": 0.7540923118591308, + "memory(GiB)": 40.03, + "step": 10935, + "token_acc": 0.8090858416945373, + "train_speed(iter/s)": 0.09595 + }, + { + "epoch": 0.5087499527697285, + "grad_norm": 7.471006870269775, + "learning_rate": 5.268629260960714e-06, + "loss": 0.6599215030670166, + "memory(GiB)": 40.03, + "step": 10940, + "token_acc": 0.8369609856262834, + "train_speed(iter/s)": 0.095975 + }, + { + "epoch": 0.5089824710296781, + "grad_norm": 7.935706615447998, + "learning_rate": 5.2647898328608315e-06, + "loss": 0.6189352035522461, + "memory(GiB)": 40.03, + "step": 10945, + "token_acc": 0.8472222222222222, + "train_speed(iter/s)": 0.096 + }, + { + "epoch": 0.5092149892896276, + "grad_norm": 7.151297092437744, + "learning_rate": 5.260950248182546e-06, + "loss": 0.6623498439788819, + "memory(GiB)": 40.03, + "step": 10950, + "token_acc": 0.8259187620889749, + "train_speed(iter/s)": 0.096024 + }, + { + "epoch": 0.5092149892896276, + "eval_loss": 0.5966300368309021, + "eval_runtime": 293.6356, + "eval_samples_per_second": 11.834, + "eval_steps_per_second": 11.834, + "step": 10950 + }, + { + "epoch": 0.5094475075495772, + "grad_norm": 7.964531421661377, + "learning_rate": 5.257110509196322e-06, + "loss": 0.622746467590332, + "memory(GiB)": 40.03, + "step": 10955, + "token_acc": 0.825282466675729, + "train_speed(iter/s)": 0.095802 + }, + { + "epoch": 0.5096800258095269, + "grad_norm": 8.218613624572754, + "learning_rate": 5.253270618172717e-06, + "loss": 0.5788079261779785, + "memory(GiB)": 40.03, + "step": 10960, + "token_acc": 0.8508771929824561, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.5099125440694765, + "grad_norm": 7.500878810882568, + "learning_rate": 5.249430577382373e-06, + "loss": 0.6466068744659423, + "memory(GiB)": 40.03, + "step": 10965, + "token_acc": 0.84496996996997, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.510145062329426, + "grad_norm": 9.07535171508789, + "learning_rate": 5.245590389096031e-06, + "loss": 0.6920000076293945, + "memory(GiB)": 40.03, + "step": 10970, + "token_acc": 0.8427947598253275, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.5103775805893757, + "grad_norm": 7.486384391784668, + "learning_rate": 5.241750055584507e-06, + "loss": 0.7849728107452393, + "memory(GiB)": 40.03, + "step": 10975, + "token_acc": 0.8100734522560336, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.5106100988493253, + "grad_norm": 8.976439476013184, + "learning_rate": 5.237909579118713e-06, + "loss": 0.6393332004547119, + "memory(GiB)": 40.03, + "step": 10980, + "token_acc": 0.8386505317198386, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.5108426171092748, + "grad_norm": 5.604680061340332, + "learning_rate": 5.2340689619696375e-06, + "loss": 0.7520250797271728, + "memory(GiB)": 40.03, + "step": 10985, + "token_acc": 0.8177858439201452, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.5110751353692244, + "grad_norm": 6.231554985046387, + "learning_rate": 5.23022820640836e-06, + "loss": 0.743222713470459, + "memory(GiB)": 40.03, + "step": 10990, + "token_acc": 0.823906083244397, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.5113076536291741, + "grad_norm": 6.983364105224609, + "learning_rate": 5.226387314706035e-06, + "loss": 0.6753977298736572, + "memory(GiB)": 40.03, + "step": 10995, + "token_acc": 0.8220467658669905, + "train_speed(iter/s)": 0.095998 + }, + { + "epoch": 0.5115401718891237, + "grad_norm": 7.330320358276367, + "learning_rate": 5.222546289133902e-06, + "loss": 0.686239767074585, + "memory(GiB)": 40.03, + "step": 11000, + "token_acc": 0.8274095421069484, + "train_speed(iter/s)": 0.096023 + }, + { + "epoch": 0.5115401718891237, + "eval_loss": 0.599044919013977, + "eval_runtime": 293.7218, + "eval_samples_per_second": 11.831, + "eval_steps_per_second": 11.831, + "step": 11000 + }, + { + "epoch": 0.5117726901490732, + "grad_norm": 6.857234001159668, + "learning_rate": 5.218705131963275e-06, + "loss": 0.589632225036621, + "memory(GiB)": 40.03, + "step": 11005, + "token_acc": 0.825489095574086, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.5120052084090229, + "grad_norm": 7.7748613357543945, + "learning_rate": 5.214863845465553e-06, + "loss": 0.5865228652954102, + "memory(GiB)": 40.03, + "step": 11010, + "token_acc": 0.8419864559819413, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.5122377266689725, + "grad_norm": 8.886736869812012, + "learning_rate": 5.211022431912205e-06, + "loss": 0.6716622829437255, + "memory(GiB)": 40.03, + "step": 11015, + "token_acc": 0.8304431599229287, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.5124702449289221, + "grad_norm": 6.145265102386475, + "learning_rate": 5.207180893574778e-06, + "loss": 0.6420755386352539, + "memory(GiB)": 40.03, + "step": 11020, + "token_acc": 0.8376664552948636, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.5127027631888716, + "grad_norm": 8.65272331237793, + "learning_rate": 5.203339232724892e-06, + "loss": 0.6653483867645263, + "memory(GiB)": 40.03, + "step": 11025, + "token_acc": 0.8366606170598911, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.5129352814488213, + "grad_norm": 8.551532745361328, + "learning_rate": 5.19949745163424e-06, + "loss": 0.6726420402526856, + "memory(GiB)": 40.03, + "step": 11030, + "token_acc": 0.8368983957219251, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.5131677997087709, + "grad_norm": 8.132074356079102, + "learning_rate": 5.195655552574585e-06, + "loss": 0.5192743301391601, + "memory(GiB)": 40.03, + "step": 11035, + "token_acc": 0.8695299837925445, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.5134003179687204, + "grad_norm": 5.116249084472656, + "learning_rate": 5.1918135378177615e-06, + "loss": 0.6704733848571778, + "memory(GiB)": 40.03, + "step": 11040, + "token_acc": 0.8200234879624193, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.5136328362286701, + "grad_norm": 7.496029853820801, + "learning_rate": 5.1879714096356695e-06, + "loss": 0.6737764358520508, + "memory(GiB)": 40.03, + "step": 11045, + "token_acc": 0.8183986371379898, + "train_speed(iter/s)": 0.095995 + }, + { + "epoch": 0.5138653544886197, + "grad_norm": 8.291666030883789, + "learning_rate": 5.184129170300281e-06, + "loss": 0.6830921649932862, + "memory(GiB)": 40.03, + "step": 11050, + "token_acc": 0.8283981448448091, + "train_speed(iter/s)": 0.096019 + }, + { + "epoch": 0.5138653544886197, + "eval_loss": 0.5962172746658325, + "eval_runtime": 296.7148, + "eval_samples_per_second": 11.712, + "eval_steps_per_second": 11.712, + "step": 11050 + }, + { + "epoch": 0.5140978727485693, + "grad_norm": 8.21020221710205, + "learning_rate": 5.180286822083629e-06, + "loss": 0.5733434200286865, + "memory(GiB)": 40.03, + "step": 11055, + "token_acc": 0.8254607972492104, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.5143303910085189, + "grad_norm": 7.53735876083374, + "learning_rate": 5.176444367257812e-06, + "loss": 0.7004610538482666, + "memory(GiB)": 40.03, + "step": 11060, + "token_acc": 0.8246505717916137, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.5145629092684685, + "grad_norm": 8.39875316619873, + "learning_rate": 5.172601808094994e-06, + "loss": 0.6619822025299072, + "memory(GiB)": 40.03, + "step": 11065, + "token_acc": 0.8304177079614423, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.5147954275284181, + "grad_norm": 4.925565242767334, + "learning_rate": 5.168759146867397e-06, + "loss": 0.6136856555938721, + "memory(GiB)": 40.03, + "step": 11070, + "token_acc": 0.8474870017331022, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.5150279457883677, + "grad_norm": 6.475769519805908, + "learning_rate": 5.164916385847307e-06, + "loss": 0.6914380550384521, + "memory(GiB)": 40.03, + "step": 11075, + "token_acc": 0.8183244430498902, + "train_speed(iter/s)": 0.095893 + }, + { + "epoch": 0.5152604640483173, + "grad_norm": 6.658247947692871, + "learning_rate": 5.161073527307065e-06, + "loss": 0.6014257907867432, + "memory(GiB)": 40.03, + "step": 11080, + "token_acc": 0.857245337159254, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.5154929823082669, + "grad_norm": 7.923861503601074, + "learning_rate": 5.157230573519074e-06, + "loss": 0.695302152633667, + "memory(GiB)": 40.03, + "step": 11085, + "token_acc": 0.8259504708754796, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.5157255005682165, + "grad_norm": 5.608233451843262, + "learning_rate": 5.153387526755791e-06, + "loss": 0.47870712280273436, + "memory(GiB)": 40.03, + "step": 11090, + "token_acc": 0.877984952567877, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.515958018828166, + "grad_norm": 6.8638129234313965, + "learning_rate": 5.149544389289728e-06, + "loss": 0.6515926837921142, + "memory(GiB)": 40.03, + "step": 11095, + "token_acc": 0.8164341085271318, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.5161905370881157, + "grad_norm": 9.983770370483398, + "learning_rate": 5.145701163393449e-06, + "loss": 0.7209710121154785, + "memory(GiB)": 40.03, + "step": 11100, + "token_acc": 0.8370177719982661, + "train_speed(iter/s)": 0.096014 + }, + { + "epoch": 0.5161905370881157, + "eval_loss": 0.5951548218727112, + "eval_runtime": 292.8437, + "eval_samples_per_second": 11.866, + "eval_steps_per_second": 11.866, + "step": 11100 + }, + { + "epoch": 0.5164230553480653, + "grad_norm": 7.779696464538574, + "learning_rate": 5.141857851339574e-06, + "loss": 0.5973493576049804, + "memory(GiB)": 40.03, + "step": 11105, + "token_acc": 0.8255119248373886, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.516655573608015, + "grad_norm": 8.005353927612305, + "learning_rate": 5.138014455400773e-06, + "loss": 0.6842738151550293, + "memory(GiB)": 40.03, + "step": 11110, + "token_acc": 0.8142916493560449, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.5168880918679645, + "grad_norm": 7.510136127471924, + "learning_rate": 5.134170977849763e-06, + "loss": 0.6187559604644776, + "memory(GiB)": 40.03, + "step": 11115, + "token_acc": 0.8454123527311674, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.5171206101279141, + "grad_norm": 7.755091667175293, + "learning_rate": 5.130327420959311e-06, + "loss": 0.6718905448913575, + "memory(GiB)": 40.03, + "step": 11120, + "token_acc": 0.8301941466241669, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.5173531283878637, + "grad_norm": 6.52587890625, + "learning_rate": 5.126483787002231e-06, + "loss": 0.6481473922729493, + "memory(GiB)": 40.03, + "step": 11125, + "token_acc": 0.8374822190611664, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.5175856466478133, + "grad_norm": 9.18529224395752, + "learning_rate": 5.122640078251383e-06, + "loss": 0.6470609188079834, + "memory(GiB)": 40.03, + "step": 11130, + "token_acc": 0.8411287205257054, + "train_speed(iter/s)": 0.095915 + }, + { + "epoch": 0.5178181649077629, + "grad_norm": 6.860177516937256, + "learning_rate": 5.118796296979671e-06, + "loss": 0.6926799297332764, + "memory(GiB)": 40.03, + "step": 11135, + "token_acc": 0.8272789581905414, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.5180506831677125, + "grad_norm": 6.139833927154541, + "learning_rate": 5.11495244546004e-06, + "loss": 0.5168371200561523, + "memory(GiB)": 40.03, + "step": 11140, + "token_acc": 0.874955595026643, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.5182832014276622, + "grad_norm": 7.04421854019165, + "learning_rate": 5.111108525965478e-06, + "loss": 0.703952693939209, + "memory(GiB)": 40.03, + "step": 11145, + "token_acc": 0.8202391118701964, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.5185157196876117, + "grad_norm": 8.804143905639648, + "learning_rate": 5.107264540769016e-06, + "loss": 0.6630299091339111, + "memory(GiB)": 40.03, + "step": 11150, + "token_acc": 0.8420208500400962, + "train_speed(iter/s)": 0.096012 + }, + { + "epoch": 0.5185157196876117, + "eval_loss": 0.5949863791465759, + "eval_runtime": 298.501, + "eval_samples_per_second": 11.642, + "eval_steps_per_second": 11.642, + "step": 11150 + }, + { + "epoch": 0.5187482379475613, + "grad_norm": 7.5537028312683105, + "learning_rate": 5.103420492143718e-06, + "loss": 0.6842432975769043, + "memory(GiB)": 40.03, + "step": 11155, + "token_acc": 0.8257057996934083, + "train_speed(iter/s)": 0.095789 + }, + { + "epoch": 0.5189807562075109, + "grad_norm": 7.630836486816406, + "learning_rate": 5.0995763823626905e-06, + "loss": 0.6884300708770752, + "memory(GiB)": 40.03, + "step": 11160, + "token_acc": 0.8314069350338781, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.5192132744674606, + "grad_norm": 6.869633674621582, + "learning_rate": 5.0957322136990724e-06, + "loss": 0.5924717903137207, + "memory(GiB)": 40.03, + "step": 11165, + "token_acc": 0.8458379992534528, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.5194457927274101, + "grad_norm": 7.150241374969482, + "learning_rate": 5.091887988426043e-06, + "loss": 0.7052815914154053, + "memory(GiB)": 40.03, + "step": 11170, + "token_acc": 0.8319569120287253, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.5196783109873597, + "grad_norm": 7.968418598175049, + "learning_rate": 5.088043708816807e-06, + "loss": 0.651512622833252, + "memory(GiB)": 40.03, + "step": 11175, + "token_acc": 0.833808844507846, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.5199108292473094, + "grad_norm": 8.40583324432373, + "learning_rate": 5.08419937714461e-06, + "loss": 0.6126296520233154, + "memory(GiB)": 40.03, + "step": 11180, + "token_acc": 0.8552202283849919, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.5201433475072589, + "grad_norm": 7.4501519203186035, + "learning_rate": 5.0803549956827196e-06, + "loss": 0.6180335998535156, + "memory(GiB)": 40.03, + "step": 11185, + "token_acc": 0.8421973407977607, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.5203758657672085, + "grad_norm": 6.713845729827881, + "learning_rate": 5.07651056670444e-06, + "loss": 0.6531442642211914, + "memory(GiB)": 40.03, + "step": 11190, + "token_acc": 0.8341672623302359, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.5206083840271581, + "grad_norm": 8.381232261657715, + "learning_rate": 5.072666092483101e-06, + "loss": 0.7437289237976075, + "memory(GiB)": 40.03, + "step": 11195, + "token_acc": 0.8160783150400475, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.5208409022871078, + "grad_norm": 7.857691287994385, + "learning_rate": 5.068821575292057e-06, + "loss": 0.7598735809326171, + "memory(GiB)": 40.03, + "step": 11200, + "token_acc": 0.8187071144817624, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.5208409022871078, + "eval_loss": 0.5962448120117188, + "eval_runtime": 296.2073, + "eval_samples_per_second": 11.732, + "eval_steps_per_second": 11.732, + "step": 11200 + }, + { + "epoch": 0.5210734205470573, + "grad_norm": 7.9069132804870605, + "learning_rate": 5.06497701740469e-06, + "loss": 0.6560911178588867, + "memory(GiB)": 40.03, + "step": 11205, + "token_acc": 0.824959984710576, + "train_speed(iter/s)": 0.095784 + }, + { + "epoch": 0.5213059388070069, + "grad_norm": 6.1809258460998535, + "learning_rate": 5.061132421094408e-06, + "loss": 0.7430883884429932, + "memory(GiB)": 40.03, + "step": 11210, + "token_acc": 0.7977564102564103, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.5215384570669566, + "grad_norm": 9.742097854614258, + "learning_rate": 5.057287788634636e-06, + "loss": 0.5030066490173339, + "memory(GiB)": 40.03, + "step": 11215, + "token_acc": 0.880013596193066, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.5217709753269062, + "grad_norm": 7.676933288574219, + "learning_rate": 5.053443122298827e-06, + "loss": 0.5586160659790039, + "memory(GiB)": 40.03, + "step": 11220, + "token_acc": 0.8607882052736037, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.5220034935868557, + "grad_norm": 8.416631698608398, + "learning_rate": 5.049598424360449e-06, + "loss": 0.7418983936309814, + "memory(GiB)": 40.03, + "step": 11225, + "token_acc": 0.8017057569296375, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.5222360118468053, + "grad_norm": 5.841848373413086, + "learning_rate": 5.045753697092993e-06, + "loss": 0.7004424095153808, + "memory(GiB)": 40.03, + "step": 11230, + "token_acc": 0.8233865371269952, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.522468530106755, + "grad_norm": 9.230901718139648, + "learning_rate": 5.041908942769963e-06, + "loss": 0.6646398544311524, + "memory(GiB)": 40.03, + "step": 11235, + "token_acc": 0.8307048599935629, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.5227010483667045, + "grad_norm": 8.23694896697998, + "learning_rate": 5.038064163664881e-06, + "loss": 0.5914628028869628, + "memory(GiB)": 40.03, + "step": 11240, + "token_acc": 0.854236034036256, + "train_speed(iter/s)": 0.095953 + }, + { + "epoch": 0.5229335666266541, + "grad_norm": 6.6772308349609375, + "learning_rate": 5.0342193620512825e-06, + "loss": 0.7465476989746094, + "memory(GiB)": 40.03, + "step": 11245, + "token_acc": 0.8043956043956044, + "train_speed(iter/s)": 0.095977 + }, + { + "epoch": 0.5231660848866038, + "grad_norm": 7.63891077041626, + "learning_rate": 5.03037454020272e-06, + "loss": 0.6081331253051758, + "memory(GiB)": 40.03, + "step": 11250, + "token_acc": 0.8427753023551878, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.5231660848866038, + "eval_loss": 0.5921972393989563, + "eval_runtime": 290.7907, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 11.95, + "step": 11250 + }, + { + "epoch": 0.5233986031465534, + "grad_norm": 7.560066223144531, + "learning_rate": 5.026529700392754e-06, + "loss": 0.7295114040374756, + "memory(GiB)": 40.03, + "step": 11255, + "token_acc": 0.8256236840426211, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.5236311214065029, + "grad_norm": 6.873581409454346, + "learning_rate": 5.022684844894957e-06, + "loss": 0.7038755893707276, + "memory(GiB)": 40.03, + "step": 11260, + "token_acc": 0.8330605564648118, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.5238636396664526, + "grad_norm": 6.969046115875244, + "learning_rate": 5.0188399759829106e-06, + "loss": 0.6631447792053222, + "memory(GiB)": 40.03, + "step": 11265, + "token_acc": 0.8317152103559871, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.5240961579264022, + "grad_norm": 6.5216779708862305, + "learning_rate": 5.014995095930205e-06, + "loss": 0.6236719608306884, + "memory(GiB)": 40.03, + "step": 11270, + "token_acc": 0.8280766852195424, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.5243286761863517, + "grad_norm": 9.973017692565918, + "learning_rate": 5.011150207010437e-06, + "loss": 0.6285340785980225, + "memory(GiB)": 40.03, + "step": 11275, + "token_acc": 0.8375690607734807, + "train_speed(iter/s)": 0.095883 + }, + { + "epoch": 0.5245611944463013, + "grad_norm": 9.72734260559082, + "learning_rate": 5.007305311497206e-06, + "loss": 0.6574903964996338, + "memory(GiB)": 40.03, + "step": 11280, + "token_acc": 0.8370991253644315, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.524793712706251, + "grad_norm": 8.460036277770996, + "learning_rate": 5.003460411664118e-06, + "loss": 0.6372312068939209, + "memory(GiB)": 40.03, + "step": 11285, + "token_acc": 0.8438914027149321, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.5250262309662006, + "grad_norm": 7.125070095062256, + "learning_rate": 4.9996155097847834e-06, + "loss": 0.6930059432983399, + "memory(GiB)": 40.03, + "step": 11290, + "token_acc": 0.8248175182481752, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.5252587492261501, + "grad_norm": 4.946784019470215, + "learning_rate": 4.995770608132809e-06, + "loss": 0.7608030796051025, + "memory(GiB)": 40.03, + "step": 11295, + "token_acc": 0.8132656109949208, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.5254912674860998, + "grad_norm": 6.038403511047363, + "learning_rate": 4.991925708981806e-06, + "loss": 0.7435698509216309, + "memory(GiB)": 40.03, + "step": 11300, + "token_acc": 0.8154296875, + "train_speed(iter/s)": 0.096003 + }, + { + "epoch": 0.5254912674860998, + "eval_loss": 0.5952001810073853, + "eval_runtime": 291.1034, + "eval_samples_per_second": 11.937, + "eval_steps_per_second": 11.937, + "step": 11300 + }, + { + "epoch": 0.5257237857460494, + "grad_norm": 8.406840324401855, + "learning_rate": 4.9880808146053785e-06, + "loss": 0.5746410369873047, + "memory(GiB)": 40.03, + "step": 11305, + "token_acc": 0.826413462924119, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.525956304005999, + "grad_norm": 7.57234525680542, + "learning_rate": 4.984235927277135e-06, + "loss": 0.6861493587493896, + "memory(GiB)": 40.03, + "step": 11310, + "token_acc": 0.8226790876967556, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.5261888222659485, + "grad_norm": 7.232485294342041, + "learning_rate": 4.980391049270673e-06, + "loss": 0.6427253246307373, + "memory(GiB)": 40.03, + "step": 11315, + "token_acc": 0.8424116424116425, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.5264213405258982, + "grad_norm": 6.939469337463379, + "learning_rate": 4.976546182859591e-06, + "loss": 0.7365629196166992, + "memory(GiB)": 40.03, + "step": 11320, + "token_acc": 0.8238817891373802, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.5266538587858478, + "grad_norm": 9.328145027160645, + "learning_rate": 4.972701330317472e-06, + "loss": 0.7492372989654541, + "memory(GiB)": 40.03, + "step": 11325, + "token_acc": 0.818961818961819, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.5268863770457973, + "grad_norm": 5.637638568878174, + "learning_rate": 4.968856493917902e-06, + "loss": 0.6783174991607666, + "memory(GiB)": 40.03, + "step": 11330, + "token_acc": 0.8413669064748202, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.527118895305747, + "grad_norm": 6.447477340698242, + "learning_rate": 4.965011675934447e-06, + "loss": 0.6275362014770508, + "memory(GiB)": 40.03, + "step": 11335, + "token_acc": 0.8448905109489051, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.5273514135656966, + "grad_norm": 6.677289009094238, + "learning_rate": 4.961166878640671e-06, + "loss": 0.6838769912719727, + "memory(GiB)": 40.03, + "step": 11340, + "token_acc": 0.80778739184178, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.5275839318256462, + "grad_norm": 8.42552661895752, + "learning_rate": 4.957322104310115e-06, + "loss": 0.6776114940643311, + "memory(GiB)": 40.03, + "step": 11345, + "token_acc": 0.8311827956989247, + "train_speed(iter/s)": 0.09598 + }, + { + "epoch": 0.5278164500855957, + "grad_norm": 6.210771560668945, + "learning_rate": 4.953477355216318e-06, + "loss": 0.6211733818054199, + "memory(GiB)": 40.03, + "step": 11350, + "token_acc": 0.8471243042671615, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.5278164500855957, + "eval_loss": 0.5934306383132935, + "eval_runtime": 291.2941, + "eval_samples_per_second": 11.93, + "eval_steps_per_second": 11.93, + "step": 11350 + }, + { + "epoch": 0.5280489683455454, + "grad_norm": 9.925871849060059, + "learning_rate": 4.949632633632797e-06, + "loss": 0.672046422958374, + "memory(GiB)": 40.03, + "step": 11355, + "token_acc": 0.8262175103694569, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.528281486605495, + "grad_norm": 9.350289344787598, + "learning_rate": 4.945787941833056e-06, + "loss": 0.6249475479125977, + "memory(GiB)": 40.03, + "step": 11360, + "token_acc": 0.8458480565371025, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.5285140048654446, + "grad_norm": 9.15757942199707, + "learning_rate": 4.941943282090578e-06, + "loss": 0.7165769577026367, + "memory(GiB)": 40.03, + "step": 11365, + "token_acc": 0.8218045112781955, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.5287465231253942, + "grad_norm": 8.200997352600098, + "learning_rate": 4.9380986566788296e-06, + "loss": 0.7071670055389404, + "memory(GiB)": 40.03, + "step": 11370, + "token_acc": 0.8172683289914403, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.5289790413853438, + "grad_norm": 6.972626209259033, + "learning_rate": 4.934254067871255e-06, + "loss": 0.5738145351409912, + "memory(GiB)": 40.03, + "step": 11375, + "token_acc": 0.8671679197994987, + "train_speed(iter/s)": 0.095887 + }, + { + "epoch": 0.5292115596452934, + "grad_norm": 6.229203224182129, + "learning_rate": 4.930409517941284e-06, + "loss": 0.6540433406829834, + "memory(GiB)": 40.03, + "step": 11380, + "token_acc": 0.8375670840787119, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.529444077905243, + "grad_norm": 7.376658916473389, + "learning_rate": 4.926565009162309e-06, + "loss": 0.620716142654419, + "memory(GiB)": 40.03, + "step": 11385, + "token_acc": 0.8458440131819847, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.5296765961651926, + "grad_norm": 9.04910659790039, + "learning_rate": 4.9227205438077145e-06, + "loss": 0.5833307266235351, + "memory(GiB)": 40.03, + "step": 11390, + "token_acc": 0.8477661334804192, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.5299091144251422, + "grad_norm": 8.151079177856445, + "learning_rate": 4.918876124150846e-06, + "loss": 0.6190403938293457, + "memory(GiB)": 40.03, + "step": 11395, + "token_acc": 0.8311345646437994, + "train_speed(iter/s)": 0.09598 + }, + { + "epoch": 0.5301416326850918, + "grad_norm": 6.338625907897949, + "learning_rate": 4.915031752465033e-06, + "loss": 0.6119202613830567, + "memory(GiB)": 40.03, + "step": 11400, + "token_acc": 0.8471164309031556, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.5301416326850918, + "eval_loss": 0.5921990871429443, + "eval_runtime": 291.4397, + "eval_samples_per_second": 11.924, + "eval_steps_per_second": 11.924, + "step": 11400 + }, + { + "epoch": 0.5303741509450414, + "grad_norm": 7.004538059234619, + "learning_rate": 4.911187431023565e-06, + "loss": 0.5625624179840087, + "memory(GiB)": 40.03, + "step": 11405, + "token_acc": 0.8265071992197182, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.530606669204991, + "grad_norm": 7.3375701904296875, + "learning_rate": 4.907343162099712e-06, + "loss": 0.6645450115203857, + "memory(GiB)": 40.03, + "step": 11410, + "token_acc": 0.8184036249564308, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.5308391874649406, + "grad_norm": 7.548114776611328, + "learning_rate": 4.90349894796671e-06, + "loss": 0.6787260055541993, + "memory(GiB)": 40.03, + "step": 11415, + "token_acc": 0.8279603223806572, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.5310717057248902, + "grad_norm": 6.177823066711426, + "learning_rate": 4.899654790897757e-06, + "loss": 0.7073424339294434, + "memory(GiB)": 40.03, + "step": 11420, + "token_acc": 0.8293135435992579, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.5313042239848398, + "grad_norm": 6.371845245361328, + "learning_rate": 4.895810693166026e-06, + "loss": 0.6852655410766602, + "memory(GiB)": 40.03, + "step": 11425, + "token_acc": 0.83436087135224, + "train_speed(iter/s)": 0.095887 + }, + { + "epoch": 0.5315367422447894, + "grad_norm": 4.985559463500977, + "learning_rate": 4.891966657044647e-06, + "loss": 0.656800365447998, + "memory(GiB)": 40.03, + "step": 11430, + "token_acc": 0.8411088573360379, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.531769260504739, + "grad_norm": 8.7366943359375, + "learning_rate": 4.888122684806721e-06, + "loss": 0.5947208404541016, + "memory(GiB)": 40.03, + "step": 11435, + "token_acc": 0.858000858000858, + "train_speed(iter/s)": 0.095934 + }, + { + "epoch": 0.5320017787646886, + "grad_norm": 8.291216850280762, + "learning_rate": 4.884278778725304e-06, + "loss": 0.6520028114318848, + "memory(GiB)": 40.03, + "step": 11440, + "token_acc": 0.8529519618628529, + "train_speed(iter/s)": 0.095958 + }, + { + "epoch": 0.5322342970246382, + "grad_norm": 7.761775970458984, + "learning_rate": 4.8804349410734195e-06, + "loss": 0.5053174018859863, + "memory(GiB)": 40.03, + "step": 11445, + "token_acc": 0.8743396226415094, + "train_speed(iter/s)": 0.095982 + }, + { + "epoch": 0.5324668152845878, + "grad_norm": 7.208600044250488, + "learning_rate": 4.876591174124045e-06, + "loss": 0.5161089420318603, + "memory(GiB)": 40.03, + "step": 11450, + "token_acc": 0.8536853685368537, + "train_speed(iter/s)": 0.096005 + }, + { + "epoch": 0.5324668152845878, + "eval_loss": 0.5929927229881287, + "eval_runtime": 290.0243, + "eval_samples_per_second": 11.982, + "eval_steps_per_second": 11.982, + "step": 11450 + }, + { + "epoch": 0.5326993335445375, + "grad_norm": 8.39334774017334, + "learning_rate": 4.872747480150121e-06, + "loss": 0.6529646396636963, + "memory(GiB)": 40.03, + "step": 11455, + "token_acc": 0.8263123836610131, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.532931851804487, + "grad_norm": 7.040056228637695, + "learning_rate": 4.8689038614245384e-06, + "loss": 0.7571589946746826, + "memory(GiB)": 40.03, + "step": 11460, + "token_acc": 0.8244972577696527, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.5331643700644366, + "grad_norm": 6.09945821762085, + "learning_rate": 4.865060320220151e-06, + "loss": 0.5826333999633789, + "memory(GiB)": 40.03, + "step": 11465, + "token_acc": 0.8391376451077943, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.5333968883243863, + "grad_norm": 8.370718002319336, + "learning_rate": 4.861216858809762e-06, + "loss": 0.6425962448120117, + "memory(GiB)": 40.03, + "step": 11470, + "token_acc": 0.8435257943286641, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.5336294065843358, + "grad_norm": 8.308968544006348, + "learning_rate": 4.857373479466132e-06, + "loss": 0.6420434474945068, + "memory(GiB)": 40.03, + "step": 11475, + "token_acc": 0.8433778419343197, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.5338619248442854, + "grad_norm": 6.884545803070068, + "learning_rate": 4.853530184461964e-06, + "loss": 0.6627838134765625, + "memory(GiB)": 40.03, + "step": 11480, + "token_acc": 0.8381901840490797, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.534094443104235, + "grad_norm": 8.793212890625, + "learning_rate": 4.8496869760699206e-06, + "loss": 0.6520689487457275, + "memory(GiB)": 40.03, + "step": 11485, + "token_acc": 0.8455631399317406, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.5343269613641847, + "grad_norm": 9.045661926269531, + "learning_rate": 4.845843856562609e-06, + "loss": 0.5819193363189697, + "memory(GiB)": 40.03, + "step": 11490, + "token_acc": 0.851056338028169, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.5345594796241342, + "grad_norm": 7.690134525299072, + "learning_rate": 4.842000828212586e-06, + "loss": 0.7085441589355469, + "memory(GiB)": 40.03, + "step": 11495, + "token_acc": 0.8147433423388653, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.5347919978840838, + "grad_norm": 7.208715438842773, + "learning_rate": 4.83815789329235e-06, + "loss": 0.6616110801696777, + "memory(GiB)": 40.03, + "step": 11500, + "token_acc": 0.8222143364088006, + "train_speed(iter/s)": 0.096006 + }, + { + "epoch": 0.5347919978840838, + "eval_loss": 0.5930191874504089, + "eval_runtime": 290.603, + "eval_samples_per_second": 11.958, + "eval_steps_per_second": 11.958, + "step": 11500 + }, + { + "epoch": 0.5350245161440335, + "grad_norm": 6.396140098571777, + "learning_rate": 4.8343150540743485e-06, + "loss": 0.6394748687744141, + "memory(GiB)": 40.03, + "step": 11505, + "token_acc": 0.8267575437585998, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.5352570344039831, + "grad_norm": 7.510823726654053, + "learning_rate": 4.830472312830971e-06, + "loss": 0.6311664581298828, + "memory(GiB)": 40.03, + "step": 11510, + "token_acc": 0.8289065194894791, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.5354895526639326, + "grad_norm": 7.6930084228515625, + "learning_rate": 4.8266296718345505e-06, + "loss": 0.8164526939392089, + "memory(GiB)": 40.03, + "step": 11515, + "token_acc": 0.8156723063223509, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.5357220709238822, + "grad_norm": 9.583854675292969, + "learning_rate": 4.822787133357356e-06, + "loss": 0.7618258953094482, + "memory(GiB)": 40.03, + "step": 11520, + "token_acc": 0.8230184581976113, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.5359545891838319, + "grad_norm": 7.062460899353027, + "learning_rate": 4.818944699671602e-06, + "loss": 0.6841589927673339, + "memory(GiB)": 40.03, + "step": 11525, + "token_acc": 0.8321906627489389, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.5361871074437814, + "grad_norm": 9.660597801208496, + "learning_rate": 4.815102373049435e-06, + "loss": 0.7356135368347168, + "memory(GiB)": 40.03, + "step": 11530, + "token_acc": 0.8156158357771262, + "train_speed(iter/s)": 0.095914 + }, + { + "epoch": 0.536419625703731, + "grad_norm": 5.609492778778076, + "learning_rate": 4.811260155762947e-06, + "loss": 0.6264122009277344, + "memory(GiB)": 40.03, + "step": 11535, + "token_acc": 0.8382756727073036, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.5366521439636807, + "grad_norm": 8.03640079498291, + "learning_rate": 4.8074180500841535e-06, + "loss": 0.5942182064056396, + "memory(GiB)": 40.03, + "step": 11540, + "token_acc": 0.8525579917381634, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.5368846622236303, + "grad_norm": 6.38782262802124, + "learning_rate": 4.8035760582850124e-06, + "loss": 0.7426403522491455, + "memory(GiB)": 40.03, + "step": 11545, + "token_acc": 0.8122102009273571, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.5371171804835798, + "grad_norm": 7.080733299255371, + "learning_rate": 4.799734182637413e-06, + "loss": 0.6498960494995117, + "memory(GiB)": 40.03, + "step": 11550, + "token_acc": 0.824015748031496, + "train_speed(iter/s)": 0.096007 + }, + { + "epoch": 0.5371171804835798, + "eval_loss": 0.5918628573417664, + "eval_runtime": 288.3253, + "eval_samples_per_second": 12.052, + "eval_steps_per_second": 12.052, + "step": 11550 + }, + { + "epoch": 0.5373496987435294, + "grad_norm": 9.150640487670898, + "learning_rate": 4.795892425413175e-06, + "loss": 0.5990890979766845, + "memory(GiB)": 40.03, + "step": 11555, + "token_acc": 0.8264292393641254, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.5375822170034791, + "grad_norm": 8.70228099822998, + "learning_rate": 4.792050788884049e-06, + "loss": 0.6391530513763428, + "memory(GiB)": 40.03, + "step": 11560, + "token_acc": 0.8391319324836376, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.5378147352634287, + "grad_norm": 8.723603248596191, + "learning_rate": 4.78820927532171e-06, + "loss": 0.7898457527160645, + "memory(GiB)": 40.03, + "step": 11565, + "token_acc": 0.8129608071400853, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.5380472535233782, + "grad_norm": 7.317963600158691, + "learning_rate": 4.784367886997766e-06, + "loss": 0.6194943428039551, + "memory(GiB)": 40.03, + "step": 11570, + "token_acc": 0.8442244224422443, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.5382797717833279, + "grad_norm": 10.04311466217041, + "learning_rate": 4.780526626183746e-06, + "loss": 0.6792384147644043, + "memory(GiB)": 40.03, + "step": 11575, + "token_acc": 0.8399412628487518, + "train_speed(iter/s)": 0.095895 + }, + { + "epoch": 0.5385122900432775, + "grad_norm": 6.099349498748779, + "learning_rate": 4.7766854951511115e-06, + "loss": 0.5884709835052491, + "memory(GiB)": 40.03, + "step": 11580, + "token_acc": 0.8562348668280871, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.538744808303227, + "grad_norm": 9.135714530944824, + "learning_rate": 4.772844496171236e-06, + "loss": 0.5978000640869141, + "memory(GiB)": 40.03, + "step": 11585, + "token_acc": 0.8556073092081691, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.5389773265631767, + "grad_norm": 6.890235900878906, + "learning_rate": 4.769003631515424e-06, + "loss": 0.6094280242919922, + "memory(GiB)": 40.03, + "step": 11590, + "token_acc": 0.8448572411420708, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.5392098448231263, + "grad_norm": 7.566539287567139, + "learning_rate": 4.765162903454896e-06, + "loss": 0.5553547859191894, + "memory(GiB)": 40.03, + "step": 11595, + "token_acc": 0.8636524196397033, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.5394423630830759, + "grad_norm": 7.855575084686279, + "learning_rate": 4.761322314260795e-06, + "loss": 0.6286415100097656, + "memory(GiB)": 40.03, + "step": 11600, + "token_acc": 0.8369634849455477, + "train_speed(iter/s)": 0.096012 + }, + { + "epoch": 0.5394423630830759, + "eval_loss": 0.5954886078834534, + "eval_runtime": 290.9344, + "eval_samples_per_second": 11.944, + "eval_steps_per_second": 11.944, + "step": 11600 + }, + { + "epoch": 0.5396748813430254, + "grad_norm": 7.707352161407471, + "learning_rate": 4.757481866204178e-06, + "loss": 0.6133537292480469, + "memory(GiB)": 40.03, + "step": 11605, + "token_acc": 0.8269178291089458, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.5399073996029751, + "grad_norm": 8.7269926071167, + "learning_rate": 4.75364156155602e-06, + "loss": 0.6341817378997803, + "memory(GiB)": 40.03, + "step": 11610, + "token_acc": 0.8378782218901756, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.5401399178629247, + "grad_norm": 6.603315830230713, + "learning_rate": 4.749801402587214e-06, + "loss": 0.6399006366729736, + "memory(GiB)": 40.03, + "step": 11615, + "token_acc": 0.8493666552550496, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.5403724361228742, + "grad_norm": 6.983320713043213, + "learning_rate": 4.745961391568564e-06, + "loss": 0.7938889980316162, + "memory(GiB)": 40.03, + "step": 11620, + "token_acc": 0.8095981271946937, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.5406049543828239, + "grad_norm": 7.961282730102539, + "learning_rate": 4.7421215307707846e-06, + "loss": 0.546476936340332, + "memory(GiB)": 40.03, + "step": 11625, + "token_acc": 0.86741494212557, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.5408374726427735, + "grad_norm": 8.474226951599121, + "learning_rate": 4.738281822464508e-06, + "loss": 0.7628999710083008, + "memory(GiB)": 40.03, + "step": 11630, + "token_acc": 0.8200692041522492, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.5410699909027231, + "grad_norm": 6.8097639083862305, + "learning_rate": 4.734442268920268e-06, + "loss": 0.5580487251281738, + "memory(GiB)": 40.03, + "step": 11635, + "token_acc": 0.8571891191709845, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.5413025091626726, + "grad_norm": 6.000946521759033, + "learning_rate": 4.730602872408516e-06, + "loss": 0.720289659500122, + "memory(GiB)": 40.03, + "step": 11640, + "token_acc": 0.8221534227726178, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.5415350274226223, + "grad_norm": 6.937013149261475, + "learning_rate": 4.7267636351996e-06, + "loss": 0.656560754776001, + "memory(GiB)": 40.03, + "step": 11645, + "token_acc": 0.8447912273302404, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.5417675456825719, + "grad_norm": 8.265849113464355, + "learning_rate": 4.722924559563784e-06, + "loss": 0.7568728923797607, + "memory(GiB)": 40.03, + "step": 11650, + "token_acc": 0.805571510626077, + "train_speed(iter/s)": 0.096013 + }, + { + "epoch": 0.5417675456825719, + "eval_loss": 0.5918522477149963, + "eval_runtime": 296.0765, + "eval_samples_per_second": 11.737, + "eval_steps_per_second": 11.737, + "step": 11650 + }, + { + "epoch": 0.5420000639425215, + "grad_norm": 11.34377384185791, + "learning_rate": 4.71908564777123e-06, + "loss": 0.6885035514831543, + "memory(GiB)": 40.03, + "step": 11655, + "token_acc": 0.8265431850211395, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.5422325822024711, + "grad_norm": 8.528449058532715, + "learning_rate": 4.7152469020920075e-06, + "loss": 0.7828842163085937, + "memory(GiB)": 40.03, + "step": 11660, + "token_acc": 0.8166794773251345, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.5424651004624207, + "grad_norm": 8.462496757507324, + "learning_rate": 4.711408324796081e-06, + "loss": 0.7147085666656494, + "memory(GiB)": 40.03, + "step": 11665, + "token_acc": 0.8189625558227414, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.5426976187223703, + "grad_norm": 7.3300371170043945, + "learning_rate": 4.707569918153323e-06, + "loss": 0.6366332054138184, + "memory(GiB)": 40.03, + "step": 11670, + "token_acc": 0.8369641602248771, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.5429301369823198, + "grad_norm": 7.166123390197754, + "learning_rate": 4.7037316844335e-06, + "loss": 0.7396623611450195, + "memory(GiB)": 40.03, + "step": 11675, + "token_acc": 0.8181201221581269, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.5431626552422695, + "grad_norm": 8.054522514343262, + "learning_rate": 4.699893625906279e-06, + "loss": 0.653428602218628, + "memory(GiB)": 40.03, + "step": 11680, + "token_acc": 0.8363309352517986, + "train_speed(iter/s)": 0.095915 + }, + { + "epoch": 0.5433951735022191, + "grad_norm": 6.426419734954834, + "learning_rate": 4.696055744841225e-06, + "loss": 0.7306111335754395, + "memory(GiB)": 40.03, + "step": 11685, + "token_acc": 0.8120168657765284, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.5436276917621687, + "grad_norm": 8.696972846984863, + "learning_rate": 4.692218043507791e-06, + "loss": 0.5761496067047119, + "memory(GiB)": 40.03, + "step": 11690, + "token_acc": 0.8655980271270037, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.5438602100221183, + "grad_norm": 6.541189193725586, + "learning_rate": 4.688380524175332e-06, + "loss": 0.6662922859191894, + "memory(GiB)": 40.03, + "step": 11695, + "token_acc": 0.8318356867779204, + "train_speed(iter/s)": 0.095985 + }, + { + "epoch": 0.5440927282820679, + "grad_norm": 8.999557495117188, + "learning_rate": 4.684543189113089e-06, + "loss": 0.5919151782989502, + "memory(GiB)": 40.03, + "step": 11700, + "token_acc": 0.8485790408525755, + "train_speed(iter/s)": 0.096007 + }, + { + "epoch": 0.5440927282820679, + "eval_loss": 0.5910959839820862, + "eval_runtime": 295.316, + "eval_samples_per_second": 11.767, + "eval_steps_per_second": 11.767, + "step": 11700 + }, + { + "epoch": 0.5443252465420175, + "grad_norm": 11.527779579162598, + "learning_rate": 4.6807060405902e-06, + "loss": 0.6689294815063477, + "memory(GiB)": 40.03, + "step": 11705, + "token_acc": 0.825695442970865, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.5445577648019672, + "grad_norm": 7.013660430908203, + "learning_rate": 4.6768690808756835e-06, + "loss": 0.7742821216583252, + "memory(GiB)": 40.03, + "step": 11710, + "token_acc": 0.8161696895173686, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.5447902830619167, + "grad_norm": 6.9611735343933105, + "learning_rate": 4.673032312238459e-06, + "loss": 0.7051783084869385, + "memory(GiB)": 40.03, + "step": 11715, + "token_acc": 0.8249914879128363, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.5450228013218663, + "grad_norm": 8.261186599731445, + "learning_rate": 4.669195736947321e-06, + "loss": 0.6473368167877197, + "memory(GiB)": 40.03, + "step": 11720, + "token_acc": 0.8500555349870418, + "train_speed(iter/s)": 0.095867 + }, + { + "epoch": 0.545255319581816, + "grad_norm": 7.994866371154785, + "learning_rate": 4.665359357270959e-06, + "loss": 0.7445518493652343, + "memory(GiB)": 40.03, + "step": 11725, + "token_acc": 0.8127433628318584, + "train_speed(iter/s)": 0.095889 + }, + { + "epoch": 0.5454878378417655, + "grad_norm": 6.611346244812012, + "learning_rate": 4.661523175477939e-06, + "loss": 0.8385189056396485, + "memory(GiB)": 40.03, + "step": 11730, + "token_acc": 0.8054982817869416, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.5457203561017151, + "grad_norm": 6.896411418914795, + "learning_rate": 4.657687193836718e-06, + "loss": 0.5892057418823242, + "memory(GiB)": 40.03, + "step": 11735, + "token_acc": 0.8499684144030322, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.5459528743616647, + "grad_norm": 6.74559211730957, + "learning_rate": 4.653851414615626e-06, + "loss": 0.5817788600921631, + "memory(GiB)": 40.03, + "step": 11740, + "token_acc": 0.8544546850998463, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.5461853926216144, + "grad_norm": 5.590111255645752, + "learning_rate": 4.650015840082881e-06, + "loss": 0.7610962390899658, + "memory(GiB)": 40.03, + "step": 11745, + "token_acc": 0.8103011539544047, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.5464179108815639, + "grad_norm": 8.093123435974121, + "learning_rate": 4.646180472506573e-06, + "loss": 0.6921517848968506, + "memory(GiB)": 40.03, + "step": 11750, + "token_acc": 0.8304626815265113, + "train_speed(iter/s)": 0.096004 + }, + { + "epoch": 0.5464179108815639, + "eval_loss": 0.5902931094169617, + "eval_runtime": 294.1316, + "eval_samples_per_second": 11.814, + "eval_steps_per_second": 11.814, + "step": 11750 + }, + { + "epoch": 0.5466504291415135, + "grad_norm": 6.688976287841797, + "learning_rate": 4.6423453141546795e-06, + "loss": 0.65995774269104, + "memory(GiB)": 40.03, + "step": 11755, + "token_acc": 0.8267956965306658, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.5468829474014631, + "grad_norm": 7.701316833496094, + "learning_rate": 4.638510367295041e-06, + "loss": 0.6725353240966797, + "memory(GiB)": 40.03, + "step": 11760, + "token_acc": 0.8244811818501583, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.5471154656614127, + "grad_norm": 7.668078422546387, + "learning_rate": 4.6346756341953844e-06, + "loss": 0.6790274620056153, + "memory(GiB)": 40.03, + "step": 11765, + "token_acc": 0.8225134008338296, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.5473479839213623, + "grad_norm": 8.048089981079102, + "learning_rate": 4.630841117123303e-06, + "loss": 0.6271292686462402, + "memory(GiB)": 40.03, + "step": 11770, + "token_acc": 0.8438142211261817, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.5475805021813119, + "grad_norm": 9.47883415222168, + "learning_rate": 4.6270068183462695e-06, + "loss": 0.7323870182037353, + "memory(GiB)": 40.03, + "step": 11775, + "token_acc": 0.8140096618357487, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.5478130204412616, + "grad_norm": 8.775604248046875, + "learning_rate": 4.623172740131617e-06, + "loss": 0.5636983394622803, + "memory(GiB)": 40.03, + "step": 11780, + "token_acc": 0.8589160115846091, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.5480455387012111, + "grad_norm": 7.393858432769775, + "learning_rate": 4.61933888474656e-06, + "loss": 0.7762670993804932, + "memory(GiB)": 40.03, + "step": 11785, + "token_acc": 0.8098495212038304, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.5482780569611607, + "grad_norm": 8.181929588317871, + "learning_rate": 4.615505254458171e-06, + "loss": 0.6364175796508789, + "memory(GiB)": 40.03, + "step": 11790, + "token_acc": 0.834214002642008, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.5485105752211104, + "grad_norm": 6.371438026428223, + "learning_rate": 4.6116718515333986e-06, + "loss": 0.7543970108032226, + "memory(GiB)": 40.03, + "step": 11795, + "token_acc": 0.8123417721518987, + "train_speed(iter/s)": 0.095978 + }, + { + "epoch": 0.54874309348106, + "grad_norm": 7.374037742614746, + "learning_rate": 4.607838678239048e-06, + "loss": 0.6098850727081299, + "memory(GiB)": 40.03, + "step": 11800, + "token_acc": 0.8399006034788783, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.54874309348106, + "eval_loss": 0.5927510857582092, + "eval_runtime": 294.3173, + "eval_samples_per_second": 11.807, + "eval_steps_per_second": 11.807, + "step": 11800 + }, + { + "epoch": 0.5489756117410095, + "grad_norm": 11.768540382385254, + "learning_rate": 4.604005736841794e-06, + "loss": 0.762086009979248, + "memory(GiB)": 40.03, + "step": 11805, + "token_acc": 0.8265537176810663, + "train_speed(iter/s)": 0.095794 + }, + { + "epoch": 0.5492081300009591, + "grad_norm": 6.83040189743042, + "learning_rate": 4.6001730296081755e-06, + "loss": 0.6055526256561279, + "memory(GiB)": 40.03, + "step": 11810, + "token_acc": 0.8458769633507853, + "train_speed(iter/s)": 0.095817 + }, + { + "epoch": 0.5494406482609088, + "grad_norm": 7.891283988952637, + "learning_rate": 4.596340558804588e-06, + "loss": 0.7261328220367431, + "memory(GiB)": 40.03, + "step": 11815, + "token_acc": 0.8048154093097913, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.5496731665208583, + "grad_norm": 6.938855171203613, + "learning_rate": 4.592508326697292e-06, + "loss": 0.8132460594177247, + "memory(GiB)": 40.03, + "step": 11820, + "token_acc": 0.8011363636363636, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.5499056847808079, + "grad_norm": 6.9001336097717285, + "learning_rate": 4.588676335552403e-06, + "loss": 0.5898480892181397, + "memory(GiB)": 40.03, + "step": 11825, + "token_acc": 0.8442064264849075, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.5501382030407576, + "grad_norm": 6.822132587432861, + "learning_rate": 4.584844587635896e-06, + "loss": 0.6609106540679932, + "memory(GiB)": 40.03, + "step": 11830, + "token_acc": 0.8293972506168488, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.5503707213007072, + "grad_norm": 7.090336799621582, + "learning_rate": 4.581013085213601e-06, + "loss": 0.6086909770965576, + "memory(GiB)": 40.03, + "step": 11835, + "token_acc": 0.8393269548003959, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.5506032395606567, + "grad_norm": 7.868709564208984, + "learning_rate": 4.577181830551208e-06, + "loss": 0.7239909648895264, + "memory(GiB)": 40.03, + "step": 11840, + "token_acc": 0.8222000664672648, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.5508357578206063, + "grad_norm": 7.852298259735107, + "learning_rate": 4.573350825914249e-06, + "loss": 0.723827314376831, + "memory(GiB)": 40.03, + "step": 11845, + "token_acc": 0.8253404381290704, + "train_speed(iter/s)": 0.095977 + }, + { + "epoch": 0.551068276080556, + "grad_norm": 8.472371101379395, + "learning_rate": 4.569520073568121e-06, + "loss": 0.62130126953125, + "memory(GiB)": 40.03, + "step": 11850, + "token_acc": 0.8579749103942652, + "train_speed(iter/s)": 0.096001 + }, + { + "epoch": 0.551068276080556, + "eval_loss": 0.5887280702590942, + "eval_runtime": 293.7275, + "eval_samples_per_second": 11.831, + "eval_steps_per_second": 11.831, + "step": 11850 + }, + { + "epoch": 0.5513007943405056, + "grad_norm": 8.165416717529297, + "learning_rate": 4.565689575778064e-06, + "loss": 0.5155246734619141, + "memory(GiB)": 40.03, + "step": 11855, + "token_acc": 0.8275770606438131, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.5515333126004551, + "grad_norm": 7.239565372467041, + "learning_rate": 4.561859334809172e-06, + "loss": 0.5977548599243164, + "memory(GiB)": 40.03, + "step": 11860, + "token_acc": 0.8501628664495114, + "train_speed(iter/s)": 0.095817 + }, + { + "epoch": 0.5517658308604048, + "grad_norm": 8.126626968383789, + "learning_rate": 4.558029352926379e-06, + "loss": 0.8376049041748047, + "memory(GiB)": 40.03, + "step": 11865, + "token_acc": 0.8080301129234629, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.5519983491203544, + "grad_norm": 6.358272552490234, + "learning_rate": 4.5541996323944775e-06, + "loss": 0.6366981506347656, + "memory(GiB)": 40.03, + "step": 11870, + "token_acc": 0.834573043736983, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.5522308673803039, + "grad_norm": 7.45095682144165, + "learning_rate": 4.550370175478096e-06, + "loss": 0.7741560459136962, + "memory(GiB)": 40.03, + "step": 11875, + "token_acc": 0.8243014394580863, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.5524633856402535, + "grad_norm": 7.986889362335205, + "learning_rate": 4.546540984441713e-06, + "loss": 0.7031302452087402, + "memory(GiB)": 40.03, + "step": 11880, + "token_acc": 0.8319423368740516, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.5526959039002032, + "grad_norm": 7.913536548614502, + "learning_rate": 4.542712061549646e-06, + "loss": 0.679871940612793, + "memory(GiB)": 40.03, + "step": 11885, + "token_acc": 0.8283246977547496, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.5529284221601528, + "grad_norm": 7.337793350219727, + "learning_rate": 4.538883409066055e-06, + "loss": 0.6509832859039306, + "memory(GiB)": 40.03, + "step": 11890, + "token_acc": 0.831765935214211, + "train_speed(iter/s)": 0.095952 + }, + { + "epoch": 0.5531609404201023, + "grad_norm": 8.420065879821777, + "learning_rate": 4.53505502925494e-06, + "loss": 0.6353929996490478, + "memory(GiB)": 40.03, + "step": 11895, + "token_acc": 0.8337801608579088, + "train_speed(iter/s)": 0.095975 + }, + { + "epoch": 0.553393458680052, + "grad_norm": 6.823483943939209, + "learning_rate": 4.531226924380144e-06, + "loss": 0.7875633239746094, + "memory(GiB)": 40.03, + "step": 11900, + "token_acc": 0.8031420327027894, + "train_speed(iter/s)": 0.095998 + }, + { + "epoch": 0.553393458680052, + "eval_loss": 0.5897703170776367, + "eval_runtime": 295.526, + "eval_samples_per_second": 11.759, + "eval_steps_per_second": 11.759, + "step": 11900 + }, + { + "epoch": 0.5536259769400016, + "grad_norm": 8.597670555114746, + "learning_rate": 4.527399096705338e-06, + "loss": 0.6930715560913085, + "memory(GiB)": 40.03, + "step": 11905, + "token_acc": 0.8261207670600308, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.5538584951999511, + "grad_norm": 8.751448631286621, + "learning_rate": 4.523571548494039e-06, + "loss": 0.6656043052673339, + "memory(GiB)": 40.03, + "step": 11910, + "token_acc": 0.8314430973797419, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.5540910134599007, + "grad_norm": 6.747947692871094, + "learning_rate": 4.5197442820095906e-06, + "loss": 0.911475658416748, + "memory(GiB)": 40.03, + "step": 11915, + "token_acc": 0.7812105926860026, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.5543235317198504, + "grad_norm": 7.468943119049072, + "learning_rate": 4.5159172995151786e-06, + "loss": 0.767233419418335, + "memory(GiB)": 40.03, + "step": 11920, + "token_acc": 0.814017094017094, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.5545560499798, + "grad_norm": 7.417344093322754, + "learning_rate": 4.512090603273811e-06, + "loss": 0.7732417106628418, + "memory(GiB)": 40.03, + "step": 11925, + "token_acc": 0.8210053126277074, + "train_speed(iter/s)": 0.095883 + }, + { + "epoch": 0.5547885682397495, + "grad_norm": 7.629657745361328, + "learning_rate": 4.508264195548336e-06, + "loss": 0.7324337959289551, + "memory(GiB)": 40.03, + "step": 11930, + "token_acc": 0.8334142787761049, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.5550210864996992, + "grad_norm": 7.580898284912109, + "learning_rate": 4.504438078601421e-06, + "loss": 0.7392070770263672, + "memory(GiB)": 40.03, + "step": 11935, + "token_acc": 0.822000711997152, + "train_speed(iter/s)": 0.095928 + }, + { + "epoch": 0.5552536047596488, + "grad_norm": 8.211389541625977, + "learning_rate": 4.500612254695571e-06, + "loss": 0.8082739830017089, + "memory(GiB)": 40.03, + "step": 11940, + "token_acc": 0.794435857805255, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.5554861230195984, + "grad_norm": 6.073451519012451, + "learning_rate": 4.496786726093116e-06, + "loss": 0.686737060546875, + "memory(GiB)": 40.03, + "step": 11945, + "token_acc": 0.8204729309271935, + "train_speed(iter/s)": 0.095974 + }, + { + "epoch": 0.555718641279548, + "grad_norm": 6.639056205749512, + "learning_rate": 4.492961495056204e-06, + "loss": 0.7988576412200927, + "memory(GiB)": 40.03, + "step": 11950, + "token_acc": 0.8012508686587908, + "train_speed(iter/s)": 0.095996 + }, + { + "epoch": 0.555718641279548, + "eval_loss": 0.588585615158081, + "eval_runtime": 295.4879, + "eval_samples_per_second": 11.76, + "eval_steps_per_second": 11.76, + "step": 11950 + }, + { + "epoch": 0.5559511595394976, + "grad_norm": 6.849501609802246, + "learning_rate": 4.489136563846814e-06, + "loss": 0.7492703914642334, + "memory(GiB)": 40.03, + "step": 11955, + "token_acc": 0.8257185645526841, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.5561836777994472, + "grad_norm": 8.924171447753906, + "learning_rate": 4.485311934726747e-06, + "loss": 0.7083686351776123, + "memory(GiB)": 40.03, + "step": 11960, + "token_acc": 0.8093106535362579, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.5564161960593967, + "grad_norm": 5.738160133361816, + "learning_rate": 4.4814876099576254e-06, + "loss": 0.6795273780822754, + "memory(GiB)": 40.03, + "step": 11965, + "token_acc": 0.822052067381317, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.5566487143193464, + "grad_norm": 9.783073425292969, + "learning_rate": 4.477663591800887e-06, + "loss": 0.7246876716613769, + "memory(GiB)": 40.03, + "step": 11970, + "token_acc": 0.8208363374188897, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.556881232579296, + "grad_norm": 7.146920204162598, + "learning_rate": 4.473839882517794e-06, + "loss": 0.6436521530151367, + "memory(GiB)": 40.03, + "step": 11975, + "token_acc": 0.8347107438016529, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.5571137508392456, + "grad_norm": 7.048880577087402, + "learning_rate": 4.470016484369423e-06, + "loss": 0.695180606842041, + "memory(GiB)": 40.03, + "step": 11980, + "token_acc": 0.8230152949745084, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.5573462690991952, + "grad_norm": 5.961655139923096, + "learning_rate": 4.466193399616669e-06, + "loss": 0.6891386032104492, + "memory(GiB)": 40.03, + "step": 11985, + "token_acc": 0.814453125, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.5575787873591448, + "grad_norm": 6.8944878578186035, + "learning_rate": 4.462370630520237e-06, + "loss": 0.6040480613708497, + "memory(GiB)": 40.03, + "step": 11990, + "token_acc": 0.848985208118335, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.5578113056190944, + "grad_norm": 6.749599456787109, + "learning_rate": 4.458548179340651e-06, + "loss": 0.7882990360260009, + "memory(GiB)": 40.03, + "step": 11995, + "token_acc": 0.8040925863804093, + "train_speed(iter/s)": 0.09597 + }, + { + "epoch": 0.558043823879044, + "grad_norm": 8.951676368713379, + "learning_rate": 4.4547260483382435e-06, + "loss": 0.7259221076965332, + "memory(GiB)": 40.03, + "step": 12000, + "token_acc": 0.825390625, + "train_speed(iter/s)": 0.095992 + }, + { + "epoch": 0.558043823879044, + "eval_loss": 0.5902968049049377, + "eval_runtime": 294.0747, + "eval_samples_per_second": 11.817, + "eval_steps_per_second": 11.817, + "step": 12000 + }, + { + "epoch": 0.5582763421389936, + "grad_norm": 6.93931245803833, + "learning_rate": 4.45090423977316e-06, + "loss": 0.5834178447723388, + "memory(GiB)": 40.03, + "step": 12005, + "token_acc": 0.827311676684998, + "train_speed(iter/s)": 0.095789 + }, + { + "epoch": 0.5585088603989432, + "grad_norm": 9.755097389221191, + "learning_rate": 4.447082755905351e-06, + "loss": 0.7193635940551758, + "memory(GiB)": 40.03, + "step": 12010, + "token_acc": 0.8245924875974486, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.5587413786588928, + "grad_norm": 6.74008321762085, + "learning_rate": 4.4432615989945794e-06, + "loss": 0.5961836814880371, + "memory(GiB)": 40.03, + "step": 12015, + "token_acc": 0.846640872317974, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.5589738969188424, + "grad_norm": 10.712708473205566, + "learning_rate": 4.439440771300412e-06, + "loss": 0.5513602256774902, + "memory(GiB)": 40.03, + "step": 12020, + "token_acc": 0.8571959836370397, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.559206415178792, + "grad_norm": 7.622487545013428, + "learning_rate": 4.435620275082227e-06, + "loss": 0.5959040641784668, + "memory(GiB)": 40.03, + "step": 12025, + "token_acc": 0.8629697525206232, + "train_speed(iter/s)": 0.095878 + }, + { + "epoch": 0.5594389334387416, + "grad_norm": 7.523995399475098, + "learning_rate": 4.431800112599195e-06, + "loss": 0.7422564506530762, + "memory(GiB)": 40.03, + "step": 12030, + "token_acc": 0.8241792929292929, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.5596714516986913, + "grad_norm": 7.105021953582764, + "learning_rate": 4.427980286110301e-06, + "loss": 0.6166975498199463, + "memory(GiB)": 40.03, + "step": 12035, + "token_acc": 0.8445110528287748, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.5599039699586408, + "grad_norm": 7.011575222015381, + "learning_rate": 4.424160797874323e-06, + "loss": 0.5584123134613037, + "memory(GiB)": 40.03, + "step": 12040, + "token_acc": 0.85, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.5601364882185904, + "grad_norm": 8.523414611816406, + "learning_rate": 4.420341650149847e-06, + "loss": 0.7120149612426758, + "memory(GiB)": 40.03, + "step": 12045, + "token_acc": 0.8180006642311525, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.56036900647854, + "grad_norm": 10.580933570861816, + "learning_rate": 4.416522845195247e-06, + "loss": 0.6449977874755859, + "memory(GiB)": 40.03, + "step": 12050, + "token_acc": 0.8413223140495868, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.56036900647854, + "eval_loss": 0.5899468660354614, + "eval_runtime": 293.9174, + "eval_samples_per_second": 11.823, + "eval_steps_per_second": 11.823, + "step": 12050 + }, + { + "epoch": 0.5606015247384896, + "grad_norm": 6.270550727844238, + "learning_rate": 4.4127043852687045e-06, + "loss": 0.7345793724060059, + "memory(GiB)": 40.03, + "step": 12055, + "token_acc": 0.8270697756325593, + "train_speed(iter/s)": 0.095786 + }, + { + "epoch": 0.5608340429984392, + "grad_norm": 6.681027889251709, + "learning_rate": 4.40888627262819e-06, + "loss": 0.6348593711853028, + "memory(GiB)": 40.03, + "step": 12060, + "token_acc": 0.8431644691186676, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.5610665612583888, + "grad_norm": 8.465786933898926, + "learning_rate": 4.4050685095314755e-06, + "loss": 0.650502061843872, + "memory(GiB)": 40.03, + "step": 12065, + "token_acc": 0.8298568507157464, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.5612990795183385, + "grad_norm": 7.157223701477051, + "learning_rate": 4.401251098236116e-06, + "loss": 0.8636740684509278, + "memory(GiB)": 40.03, + "step": 12070, + "token_acc": 0.8011363636363636, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.561531597778288, + "grad_norm": 6.018905162811279, + "learning_rate": 4.397434040999469e-06, + "loss": 0.7139524936676025, + "memory(GiB)": 40.03, + "step": 12075, + "token_acc": 0.8330578512396695, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.5617641160382376, + "grad_norm": 6.583132743835449, + "learning_rate": 4.39361734007868e-06, + "loss": 0.8445042610168457, + "memory(GiB)": 40.03, + "step": 12080, + "token_acc": 0.7940761636107193, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.5619966342981872, + "grad_norm": 8.308008193969727, + "learning_rate": 4.389800997730677e-06, + "loss": 0.6492765426635743, + "memory(GiB)": 40.03, + "step": 12085, + "token_acc": 0.8364485981308412, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.5622291525581369, + "grad_norm": 7.715389728546143, + "learning_rate": 4.385985016212184e-06, + "loss": 0.5420804023742676, + "memory(GiB)": 40.03, + "step": 12090, + "token_acc": 0.8639160332971408, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.5624616708180864, + "grad_norm": 7.167847633361816, + "learning_rate": 4.382169397779708e-06, + "loss": 0.7379115104675293, + "memory(GiB)": 40.03, + "step": 12095, + "token_acc": 0.8219735503560529, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.562694189078036, + "grad_norm": 7.906615734100342, + "learning_rate": 4.378354144689544e-06, + "loss": 0.6950104236602783, + "memory(GiB)": 40.03, + "step": 12100, + "token_acc": 0.8354203935599285, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.562694189078036, + "eval_loss": 0.588950514793396, + "eval_runtime": 292.1716, + "eval_samples_per_second": 11.894, + "eval_steps_per_second": 11.894, + "step": 12100 + }, + { + "epoch": 0.5629267073379857, + "grad_norm": 6.199409484863281, + "learning_rate": 4.374539259197766e-06, + "loss": 0.6853800296783448, + "memory(GiB)": 40.03, + "step": 12105, + "token_acc": 0.8269081789938584, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.5631592255979352, + "grad_norm": 5.901808738708496, + "learning_rate": 4.370724743560235e-06, + "loss": 0.608131742477417, + "memory(GiB)": 40.03, + "step": 12110, + "token_acc": 0.8353520060560181, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.5633917438578848, + "grad_norm": 7.537030220031738, + "learning_rate": 4.36691060003259e-06, + "loss": 0.699574613571167, + "memory(GiB)": 40.03, + "step": 12115, + "token_acc": 0.8168229777256741, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.5636242621178345, + "grad_norm": 7.507444858551025, + "learning_rate": 4.363096830870257e-06, + "loss": 0.5918978691101074, + "memory(GiB)": 40.03, + "step": 12120, + "token_acc": 0.8476223533495314, + "train_speed(iter/s)": 0.095853 + }, + { + "epoch": 0.5638567803777841, + "grad_norm": 7.615386962890625, + "learning_rate": 4.35928343832843e-06, + "loss": 0.7293277740478515, + "memory(GiB)": 40.03, + "step": 12125, + "token_acc": 0.8251136761105281, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.5640892986377336, + "grad_norm": 7.207439422607422, + "learning_rate": 4.355470424662087e-06, + "loss": 0.6662022590637207, + "memory(GiB)": 40.03, + "step": 12130, + "token_acc": 0.8318496538081108, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.5643218168976832, + "grad_norm": 11.291145324707031, + "learning_rate": 4.351657792125981e-06, + "loss": 0.6410336971282959, + "memory(GiB)": 40.03, + "step": 12135, + "token_acc": 0.8378037235721048, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.5645543351576329, + "grad_norm": 8.108573913574219, + "learning_rate": 4.347845542974642e-06, + "loss": 0.7120551109313965, + "memory(GiB)": 40.03, + "step": 12140, + "token_acc": 0.8150989099717401, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.5647868534175825, + "grad_norm": 8.063129425048828, + "learning_rate": 4.344033679462367e-06, + "loss": 0.6626528263092041, + "memory(GiB)": 40.03, + "step": 12145, + "token_acc": 0.8348591549295775, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.565019371677532, + "grad_norm": 7.968915939331055, + "learning_rate": 4.3402222038432295e-06, + "loss": 0.6640778541564941, + "memory(GiB)": 40.03, + "step": 12150, + "token_acc": 0.8185752330226365, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.565019371677532, + "eval_loss": 0.5861648917198181, + "eval_runtime": 290.5533, + "eval_samples_per_second": 11.96, + "eval_steps_per_second": 11.96, + "step": 12150 + }, + { + "epoch": 0.5652518899374817, + "grad_norm": 8.727378845214844, + "learning_rate": 4.336411118371073e-06, + "loss": 0.6298631191253662, + "memory(GiB)": 40.03, + "step": 12155, + "token_acc": 0.8282516248094359, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.5654844081974313, + "grad_norm": 8.191335678100586, + "learning_rate": 4.332600425299512e-06, + "loss": 0.6278055667877197, + "memory(GiB)": 40.03, + "step": 12160, + "token_acc": 0.8424785367674505, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.5657169264573808, + "grad_norm": 9.833742141723633, + "learning_rate": 4.328790126881923e-06, + "loss": 0.6580445289611816, + "memory(GiB)": 40.03, + "step": 12165, + "token_acc": 0.8379080118694362, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.5659494447173304, + "grad_norm": 6.148544788360596, + "learning_rate": 4.324980225371456e-06, + "loss": 0.5997506618499756, + "memory(GiB)": 40.03, + "step": 12170, + "token_acc": 0.8537455410225921, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.5661819629772801, + "grad_norm": 6.668766975402832, + "learning_rate": 4.321170723021022e-06, + "loss": 0.5688246250152588, + "memory(GiB)": 40.03, + "step": 12175, + "token_acc": 0.8473724884080371, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.5664144812372297, + "grad_norm": 8.070734977722168, + "learning_rate": 4.3173616220833e-06, + "loss": 0.7162200927734375, + "memory(GiB)": 40.03, + "step": 12180, + "token_acc": 0.8325041459369817, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.5666469994971792, + "grad_norm": 8.31356143951416, + "learning_rate": 4.3135529248107245e-06, + "loss": 0.7518483161926269, + "memory(GiB)": 40.03, + "step": 12185, + "token_acc": 0.8049171566007483, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.5668795177571289, + "grad_norm": 5.885660648345947, + "learning_rate": 4.3097446334555e-06, + "loss": 0.8494339942932129, + "memory(GiB)": 40.03, + "step": 12190, + "token_acc": 0.7792833483890395, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.5671120360170785, + "grad_norm": 9.467564582824707, + "learning_rate": 4.305936750269583e-06, + "loss": 0.6303043842315674, + "memory(GiB)": 40.03, + "step": 12195, + "token_acc": 0.8442650521358896, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.567344554277028, + "grad_norm": 5.601880073547363, + "learning_rate": 4.302129277504696e-06, + "loss": 0.6847558975219726, + "memory(GiB)": 40.03, + "step": 12200, + "token_acc": 0.8275607958732498, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.567344554277028, + "eval_loss": 0.5879337787628174, + "eval_runtime": 294.3451, + "eval_samples_per_second": 11.806, + "eval_steps_per_second": 11.806, + "step": 12200 + }, + { + "epoch": 0.5675770725369776, + "grad_norm": 9.660368919372559, + "learning_rate": 4.298322217412312e-06, + "loss": 0.6761251926422119, + "memory(GiB)": 40.03, + "step": 12205, + "token_acc": 0.8278712867313605, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.5678095907969273, + "grad_norm": 8.12336254119873, + "learning_rate": 4.294515572243665e-06, + "loss": 0.6749348163604736, + "memory(GiB)": 40.03, + "step": 12210, + "token_acc": 0.8432692307692308, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.5680421090568769, + "grad_norm": 8.090500831604004, + "learning_rate": 4.290709344249743e-06, + "loss": 0.6176517009735107, + "memory(GiB)": 40.03, + "step": 12215, + "token_acc": 0.837516960651289, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.5682746273168264, + "grad_norm": 7.9386982917785645, + "learning_rate": 4.286903535681282e-06, + "loss": 0.6293925285339356, + "memory(GiB)": 40.03, + "step": 12220, + "token_acc": 0.8361252731245448, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.5685071455767761, + "grad_norm": 8.970488548278809, + "learning_rate": 4.283098148788781e-06, + "loss": 0.5741978168487549, + "memory(GiB)": 40.03, + "step": 12225, + "token_acc": 0.8548273431994362, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.5687396638367257, + "grad_norm": 6.578322887420654, + "learning_rate": 4.279293185822476e-06, + "loss": 0.6272590160369873, + "memory(GiB)": 40.03, + "step": 12230, + "token_acc": 0.837742980561555, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.5689721820966753, + "grad_norm": 9.832868576049805, + "learning_rate": 4.275488649032362e-06, + "loss": 0.6227746963500976, + "memory(GiB)": 40.03, + "step": 12235, + "token_acc": 0.8389763779527559, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.5692047003566248, + "grad_norm": 7.5642900466918945, + "learning_rate": 4.271684540668178e-06, + "loss": 0.6691617488861084, + "memory(GiB)": 40.03, + "step": 12240, + "token_acc": 0.8332721263312522, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.5694372186165745, + "grad_norm": 7.92784309387207, + "learning_rate": 4.267880862979414e-06, + "loss": 0.6101456165313721, + "memory(GiB)": 40.03, + "step": 12245, + "token_acc": 0.8562476962771839, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.5696697368765241, + "grad_norm": 7.504398345947266, + "learning_rate": 4.264077618215296e-06, + "loss": 0.6823426246643066, + "memory(GiB)": 40.03, + "step": 12250, + "token_acc": 0.8348559381588194, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.5696697368765241, + "eval_loss": 0.5861591100692749, + "eval_runtime": 295.5018, + "eval_samples_per_second": 11.76, + "eval_steps_per_second": 11.76, + "step": 12250 + }, + { + "epoch": 0.5699022551364736, + "grad_norm": 5.381181716918945, + "learning_rate": 4.260274808624805e-06, + "loss": 0.6528043746948242, + "memory(GiB)": 40.03, + "step": 12255, + "token_acc": 0.8278091189490675, + "train_speed(iter/s)": 0.095786 + }, + { + "epoch": 0.5701347733964233, + "grad_norm": 6.923305034637451, + "learning_rate": 4.256472436456658e-06, + "loss": 0.7982216835021972, + "memory(GiB)": 40.03, + "step": 12260, + "token_acc": 0.7843494085532302, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.5703672916563729, + "grad_norm": 8.104715347290039, + "learning_rate": 4.252670503959317e-06, + "loss": 0.6475061893463134, + "memory(GiB)": 40.03, + "step": 12265, + "token_acc": 0.8468261269549218, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.5705998099163225, + "grad_norm": 7.670345306396484, + "learning_rate": 4.248869013380977e-06, + "loss": 0.6680463790893555, + "memory(GiB)": 40.03, + "step": 12270, + "token_acc": 0.8414048059149722, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.570832328176272, + "grad_norm": 10.764370918273926, + "learning_rate": 4.245067966969583e-06, + "loss": 0.7138540267944335, + "memory(GiB)": 40.03, + "step": 12275, + "token_acc": 0.8249346771183277, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.5710648464362217, + "grad_norm": 9.680176734924316, + "learning_rate": 4.241267366972806e-06, + "loss": 0.5867207050323486, + "memory(GiB)": 40.03, + "step": 12280, + "token_acc": 0.8530480419862737, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.5712973646961713, + "grad_norm": 6.827235698699951, + "learning_rate": 4.237467215638064e-06, + "loss": 0.6321615219116211, + "memory(GiB)": 40.03, + "step": 12285, + "token_acc": 0.8348656294200849, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.571529882956121, + "grad_norm": 7.801530838012695, + "learning_rate": 4.233667515212496e-06, + "loss": 0.7946747779846192, + "memory(GiB)": 40.03, + "step": 12290, + "token_acc": 0.8071505958829902, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.5717624012160705, + "grad_norm": 7.390880107879639, + "learning_rate": 4.229868267942988e-06, + "loss": 0.6462111473083496, + "memory(GiB)": 40.03, + "step": 12295, + "token_acc": 0.8338423946243128, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.5719949194760201, + "grad_norm": 9.422018051147461, + "learning_rate": 4.226069476076151e-06, + "loss": 0.6602601051330567, + "memory(GiB)": 40.03, + "step": 12300, + "token_acc": 0.8338995847489619, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.5719949194760201, + "eval_loss": 0.5856576561927795, + "eval_runtime": 291.848, + "eval_samples_per_second": 11.907, + "eval_steps_per_second": 11.907, + "step": 12300 + }, + { + "epoch": 0.5722274377359697, + "grad_norm": 7.992288112640381, + "learning_rate": 4.222271141858328e-06, + "loss": 0.5441146850585937, + "memory(GiB)": 40.03, + "step": 12305, + "token_acc": 0.8283153004022423, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.5724599559959193, + "grad_norm": 7.502604961395264, + "learning_rate": 4.218473267535589e-06, + "loss": 0.6329378128051758, + "memory(GiB)": 40.03, + "step": 12310, + "token_acc": 0.8429888084265964, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.5726924742558689, + "grad_norm": 6.395047187805176, + "learning_rate": 4.214675855353737e-06, + "loss": 0.781887435913086, + "memory(GiB)": 40.03, + "step": 12315, + "token_acc": 0.8022071307300509, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.5729249925158185, + "grad_norm": 12.194600105285645, + "learning_rate": 4.210878907558298e-06, + "loss": 0.7144025325775146, + "memory(GiB)": 40.03, + "step": 12320, + "token_acc": 0.8248885285772193, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.5731575107757682, + "grad_norm": 7.285346508026123, + "learning_rate": 4.207082426394525e-06, + "loss": 0.6607818126678466, + "memory(GiB)": 40.03, + "step": 12325, + "token_acc": 0.8305728088336783, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.5733900290357177, + "grad_norm": 6.923439025878906, + "learning_rate": 4.203286414107394e-06, + "loss": 0.7327235698699951, + "memory(GiB)": 40.03, + "step": 12330, + "token_acc": 0.8300500834724541, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.5736225472956673, + "grad_norm": 9.087661743164062, + "learning_rate": 4.199490872941603e-06, + "loss": 0.6531758308410645, + "memory(GiB)": 40.03, + "step": 12335, + "token_acc": 0.8412962193601994, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.5738550655556169, + "grad_norm": 8.431983947753906, + "learning_rate": 4.195695805141575e-06, + "loss": 0.6540855407714844, + "memory(GiB)": 40.03, + "step": 12340, + "token_acc": 0.8342618384401114, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.5740875838155666, + "grad_norm": 7.269530773162842, + "learning_rate": 4.1919012129514494e-06, + "loss": 0.6406298637390136, + "memory(GiB)": 40.03, + "step": 12345, + "token_acc": 0.8300678221552373, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.5743201020755161, + "grad_norm": 8.626265525817871, + "learning_rate": 4.188107098615088e-06, + "loss": 0.7785932064056397, + "memory(GiB)": 40.03, + "step": 12350, + "token_acc": 0.7945383615084526, + "train_speed(iter/s)": 0.095985 + }, + { + "epoch": 0.5743201020755161, + "eval_loss": 0.5844415426254272, + "eval_runtime": 291.785, + "eval_samples_per_second": 11.909, + "eval_steps_per_second": 11.909, + "step": 12350 + }, + { + "epoch": 0.5745526203354657, + "grad_norm": 9.09610366821289, + "learning_rate": 4.1843134643760645e-06, + "loss": 0.6106031894683838, + "memory(GiB)": 40.03, + "step": 12355, + "token_acc": 0.8282516834546166, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.5747851385954154, + "grad_norm": 6.92556095123291, + "learning_rate": 4.180520312477674e-06, + "loss": 0.7823381423950195, + "memory(GiB)": 40.03, + "step": 12360, + "token_acc": 0.8039964736996768, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.5750176568553649, + "grad_norm": 7.292800426483154, + "learning_rate": 4.176727645162922e-06, + "loss": 0.6464691638946534, + "memory(GiB)": 40.03, + "step": 12365, + "token_acc": 0.8313665778454511, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.5752501751153145, + "grad_norm": 8.345467567443848, + "learning_rate": 4.172935464674535e-06, + "loss": 0.6117934226989746, + "memory(GiB)": 40.03, + "step": 12370, + "token_acc": 0.8359663865546219, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.5754826933752641, + "grad_norm": 5.228003978729248, + "learning_rate": 4.16914377325494e-06, + "loss": 0.5495120048522949, + "memory(GiB)": 40.03, + "step": 12375, + "token_acc": 0.8590203106332138, + "train_speed(iter/s)": 0.095878 + }, + { + "epoch": 0.5757152116352138, + "grad_norm": 10.157156944274902, + "learning_rate": 4.165352573146285e-06, + "loss": 0.7139785766601563, + "memory(GiB)": 40.03, + "step": 12380, + "token_acc": 0.8175206611570248, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.5759477298951633, + "grad_norm": 8.81885051727295, + "learning_rate": 4.161561866590421e-06, + "loss": 0.5811410427093506, + "memory(GiB)": 40.03, + "step": 12385, + "token_acc": 0.8459538511537211, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.5761802481551129, + "grad_norm": 7.776773929595947, + "learning_rate": 4.157771655828915e-06, + "loss": 0.6779936790466309, + "memory(GiB)": 40.03, + "step": 12390, + "token_acc": 0.8202472435683261, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.5764127664150626, + "grad_norm": 6.842904567718506, + "learning_rate": 4.1539819431030295e-06, + "loss": 0.5791988849639893, + "memory(GiB)": 40.03, + "step": 12395, + "token_acc": 0.854490337248958, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.5766452846750121, + "grad_norm": 6.902646064758301, + "learning_rate": 4.150192730653742e-06, + "loss": 0.7370592594146729, + "memory(GiB)": 40.03, + "step": 12400, + "token_acc": 0.8119092627599244, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.5766452846750121, + "eval_loss": 0.5841361880302429, + "eval_runtime": 291.209, + "eval_samples_per_second": 11.933, + "eval_steps_per_second": 11.933, + "step": 12400 + }, + { + "epoch": 0.5768778029349617, + "grad_norm": 8.645942687988281, + "learning_rate": 4.14640402072173e-06, + "loss": 0.5885090827941895, + "memory(GiB)": 40.03, + "step": 12405, + "token_acc": 0.8285636281878332, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.5771103211949113, + "grad_norm": 9.0007963180542, + "learning_rate": 4.142615815547376e-06, + "loss": 0.5649767398834229, + "memory(GiB)": 40.03, + "step": 12410, + "token_acc": 0.8557068741893644, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.577342839454861, + "grad_norm": 8.417078971862793, + "learning_rate": 4.138828117370759e-06, + "loss": 0.5798979282379151, + "memory(GiB)": 40.03, + "step": 12415, + "token_acc": 0.8643841707425522, + "train_speed(iter/s)": 0.095835 + }, + { + "epoch": 0.5775753577148105, + "grad_norm": 7.812051296234131, + "learning_rate": 4.135040928431667e-06, + "loss": 0.7484992980957031, + "memory(GiB)": 40.03, + "step": 12420, + "token_acc": 0.8171667829727843, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.5778078759747601, + "grad_norm": 6.615225791931152, + "learning_rate": 4.131254250969578e-06, + "loss": 0.6321574211120605, + "memory(GiB)": 40.03, + "step": 12425, + "token_acc": 0.8439282803585982, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.5780403942347098, + "grad_norm": 9.891707420349121, + "learning_rate": 4.1274680872236724e-06, + "loss": 0.6489062309265137, + "memory(GiB)": 40.03, + "step": 12430, + "token_acc": 0.8401946107784432, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.5782729124946594, + "grad_norm": 6.718799591064453, + "learning_rate": 4.123682439432826e-06, + "loss": 0.7149152755737305, + "memory(GiB)": 40.03, + "step": 12435, + "token_acc": 0.819376026272578, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.5785054307546089, + "grad_norm": 7.437344074249268, + "learning_rate": 4.1198973098356095e-06, + "loss": 0.5938013553619385, + "memory(GiB)": 40.03, + "step": 12440, + "token_acc": 0.8444444444444444, + "train_speed(iter/s)": 0.095946 + }, + { + "epoch": 0.5787379490145586, + "grad_norm": 6.788064002990723, + "learning_rate": 4.116112700670285e-06, + "loss": 0.612579345703125, + "memory(GiB)": 40.03, + "step": 12445, + "token_acc": 0.8387524883875249, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.5789704672745082, + "grad_norm": 9.03939437866211, + "learning_rate": 4.112328614174811e-06, + "loss": 0.6132975578308105, + "memory(GiB)": 40.03, + "step": 12450, + "token_acc": 0.8432807085098191, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.5789704672745082, + "eval_loss": 0.5835011601448059, + "eval_runtime": 291.6729, + "eval_samples_per_second": 11.914, + "eval_steps_per_second": 11.914, + "step": 12450 + }, + { + "epoch": 0.5792029855344577, + "grad_norm": 6.867413520812988, + "learning_rate": 4.108545052586833e-06, + "loss": 0.7212032794952392, + "memory(GiB)": 40.03, + "step": 12455, + "token_acc": 0.8281243760033865, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.5794355037944073, + "grad_norm": 8.361393928527832, + "learning_rate": 4.10476201814369e-06, + "loss": 0.7784335613250732, + "memory(GiB)": 40.03, + "step": 12460, + "token_acc": 0.8015446608462055, + "train_speed(iter/s)": 0.095817 + }, + { + "epoch": 0.579668022054357, + "grad_norm": 6.556368350982666, + "learning_rate": 4.100979513082404e-06, + "loss": 0.6438935279846192, + "memory(GiB)": 40.03, + "step": 12465, + "token_acc": 0.8374913374913375, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.5799005403143066, + "grad_norm": 6.511188507080078, + "learning_rate": 4.0971975396396894e-06, + "loss": 0.6152307987213135, + "memory(GiB)": 40.03, + "step": 12470, + "token_acc": 0.8479876160990713, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.5801330585742561, + "grad_norm": 9.094870567321777, + "learning_rate": 4.093416100051943e-06, + "loss": 0.717622709274292, + "memory(GiB)": 40.03, + "step": 12475, + "token_acc": 0.8252267106347898, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.5803655768342058, + "grad_norm": 8.88550853729248, + "learning_rate": 4.089635196555246e-06, + "loss": 0.5561723232269287, + "memory(GiB)": 40.03, + "step": 12480, + "token_acc": 0.8629441624365483, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.5805980950941554, + "grad_norm": 8.385820388793945, + "learning_rate": 4.085854831385367e-06, + "loss": 0.7090948581695556, + "memory(GiB)": 40.03, + "step": 12485, + "token_acc": 0.8193054738081225, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.580830613354105, + "grad_norm": 8.583136558532715, + "learning_rate": 4.082075006777747e-06, + "loss": 0.7321601390838623, + "memory(GiB)": 40.03, + "step": 12490, + "token_acc": 0.8369313801079414, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.5810631316140545, + "grad_norm": 7.687808036804199, + "learning_rate": 4.078295724967517e-06, + "loss": 0.642839765548706, + "memory(GiB)": 40.03, + "step": 12495, + "token_acc": 0.8467590857999251, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.5812956498740042, + "grad_norm": 8.041131019592285, + "learning_rate": 4.074516988189482e-06, + "loss": 0.6586986064910889, + "memory(GiB)": 40.03, + "step": 12500, + "token_acc": 0.8389610389610389, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.5812956498740042, + "eval_loss": 0.5843559503555298, + "eval_runtime": 290.4963, + "eval_samples_per_second": 11.962, + "eval_steps_per_second": 11.962, + "step": 12500 + }, + { + "epoch": 0.5815281681339538, + "grad_norm": 8.2643404006958, + "learning_rate": 4.070738798678126e-06, + "loss": 0.6503505229949951, + "memory(GiB)": 40.03, + "step": 12505, + "token_acc": 0.8280452326951281, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.5817606863939033, + "grad_norm": 8.272042274475098, + "learning_rate": 4.066961158667609e-06, + "loss": 0.6874100685119628, + "memory(GiB)": 40.03, + "step": 12510, + "token_acc": 0.8293502613890963, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.581993204653853, + "grad_norm": 7.307247638702393, + "learning_rate": 4.063184070391765e-06, + "loss": 0.6720140457153321, + "memory(GiB)": 40.03, + "step": 12515, + "token_acc": 0.839740995548361, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.5822257229138026, + "grad_norm": 5.765705108642578, + "learning_rate": 4.0594075360841035e-06, + "loss": 0.6570749282836914, + "memory(GiB)": 40.03, + "step": 12520, + "token_acc": 0.82145236508994, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.5824582411737522, + "grad_norm": 10.116802215576172, + "learning_rate": 4.055631557977808e-06, + "loss": 0.6857599258422852, + "memory(GiB)": 40.03, + "step": 12525, + "token_acc": 0.8437649307214524, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.5826907594337017, + "grad_norm": 9.83465576171875, + "learning_rate": 4.051856138305727e-06, + "loss": 0.6695918560028076, + "memory(GiB)": 40.03, + "step": 12530, + "token_acc": 0.8444125044915559, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.5829232776936514, + "grad_norm": 9.249621391296387, + "learning_rate": 4.048081279300386e-06, + "loss": 0.6529331207275391, + "memory(GiB)": 40.03, + "step": 12535, + "token_acc": 0.8379413015737984, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.583155795953601, + "grad_norm": 8.802436828613281, + "learning_rate": 4.044306983193973e-06, + "loss": 0.5567544460296631, + "memory(GiB)": 40.03, + "step": 12540, + "token_acc": 0.8690082644628099, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.5833883142135505, + "grad_norm": 8.022923469543457, + "learning_rate": 4.04053325221835e-06, + "loss": 0.615961742401123, + "memory(GiB)": 40.03, + "step": 12545, + "token_acc": 0.8515337423312883, + "train_speed(iter/s)": 0.09597 + }, + { + "epoch": 0.5836208324735002, + "grad_norm": 8.537787437438965, + "learning_rate": 4.036760088605035e-06, + "loss": 0.7440320968627929, + "memory(GiB)": 40.03, + "step": 12550, + "token_acc": 0.8174474959612278, + "train_speed(iter/s)": 0.095991 + }, + { + "epoch": 0.5836208324735002, + "eval_loss": 0.5858432054519653, + "eval_runtime": 291.3931, + "eval_samples_per_second": 11.925, + "eval_steps_per_second": 11.925, + "step": 12550 + }, + { + "epoch": 0.5838533507334498, + "grad_norm": 9.390138626098633, + "learning_rate": 4.032987494585221e-06, + "loss": 0.6783319473266601, + "memory(GiB)": 40.03, + "step": 12555, + "token_acc": 0.828230028373896, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.5840858689933994, + "grad_norm": 9.66921615600586, + "learning_rate": 4.029215472389756e-06, + "loss": 0.7414599418640136, + "memory(GiB)": 40.03, + "step": 12560, + "token_acc": 0.8162729658792651, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.584318387253349, + "grad_norm": 6.690301895141602, + "learning_rate": 4.0254440242491565e-06, + "loss": 0.6200287342071533, + "memory(GiB)": 40.03, + "step": 12565, + "token_acc": 0.8414403032217309, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.5845509055132986, + "grad_norm": 7.844669342041016, + "learning_rate": 4.0216731523935925e-06, + "loss": 0.5903035163879394, + "memory(GiB)": 40.03, + "step": 12570, + "token_acc": 0.8531626506024096, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.5847834237732482, + "grad_norm": 8.23584270477295, + "learning_rate": 4.0179028590529e-06, + "loss": 0.5819211959838867, + "memory(GiB)": 40.03, + "step": 12575, + "token_acc": 0.8545808966861599, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.5850159420331978, + "grad_norm": 8.184552192687988, + "learning_rate": 4.014133146456568e-06, + "loss": 0.7459378719329834, + "memory(GiB)": 40.03, + "step": 12580, + "token_acc": 0.8132008971483499, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.5852484602931474, + "grad_norm": 6.241691589355469, + "learning_rate": 4.010364016833745e-06, + "loss": 0.6224756240844727, + "memory(GiB)": 40.03, + "step": 12585, + "token_acc": 0.8384336952945047, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.585480978553097, + "grad_norm": 8.508033752441406, + "learning_rate": 4.006595472413233e-06, + "loss": 0.5763284206390381, + "memory(GiB)": 40.03, + "step": 12590, + "token_acc": 0.8516377649325626, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.5857134968130466, + "grad_norm": 8.421253204345703, + "learning_rate": 4.0028275154234885e-06, + "loss": 0.525826358795166, + "memory(GiB)": 40.03, + "step": 12595, + "token_acc": 0.8619173262972736, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.5859460150729962, + "grad_norm": 7.334120273590088, + "learning_rate": 3.999060148092621e-06, + "loss": 0.6416743278503418, + "memory(GiB)": 40.03, + "step": 12600, + "token_acc": 0.835724043715847, + "train_speed(iter/s)": 0.09599 + }, + { + "epoch": 0.5859460150729962, + "eval_loss": 0.5827152132987976, + "eval_runtime": 292.4291, + "eval_samples_per_second": 11.883, + "eval_steps_per_second": 11.883, + "step": 12600 + }, + { + "epoch": 0.5861785333329458, + "grad_norm": 8.851205825805664, + "learning_rate": 3.995293372648391e-06, + "loss": 0.5905053615570068, + "memory(GiB)": 40.03, + "step": 12605, + "token_acc": 0.8291728479210938, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.5864110515928954, + "grad_norm": 6.9212493896484375, + "learning_rate": 3.9915271913182115e-06, + "loss": 0.7106484413146973, + "memory(GiB)": 40.03, + "step": 12610, + "token_acc": 0.8066332916145181, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.586643569852845, + "grad_norm": 7.967902660369873, + "learning_rate": 3.987761606329138e-06, + "loss": 0.631581974029541, + "memory(GiB)": 40.03, + "step": 12615, + "token_acc": 0.8341113105924596, + "train_speed(iter/s)": 0.095841 + }, + { + "epoch": 0.5868760881127946, + "grad_norm": 7.79425573348999, + "learning_rate": 3.98399661990788e-06, + "loss": 0.5869490623474121, + "memory(GiB)": 40.03, + "step": 12620, + "token_acc": 0.8605851979345955, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.5871086063727442, + "grad_norm": 10.994945526123047, + "learning_rate": 3.980232234280788e-06, + "loss": 0.6822981834411621, + "memory(GiB)": 40.03, + "step": 12625, + "token_acc": 0.8275714895433205, + "train_speed(iter/s)": 0.095883 + }, + { + "epoch": 0.5873411246326938, + "grad_norm": 7.380897521972656, + "learning_rate": 3.976468451673864e-06, + "loss": 0.6111745834350586, + "memory(GiB)": 40.03, + "step": 12630, + "token_acc": 0.8547326279668287, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.5875736428926435, + "grad_norm": 6.481570243835449, + "learning_rate": 3.972705274312741e-06, + "loss": 0.6143715381622314, + "memory(GiB)": 40.03, + "step": 12635, + "token_acc": 0.8486401261332283, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.587806161152593, + "grad_norm": 8.564242362976074, + "learning_rate": 3.968942704422709e-06, + "loss": 0.6853072166442871, + "memory(GiB)": 40.03, + "step": 12640, + "token_acc": 0.8207063084817386, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.5880386794125426, + "grad_norm": 6.765157222747803, + "learning_rate": 3.965180744228688e-06, + "loss": 0.5456812381744385, + "memory(GiB)": 40.03, + "step": 12645, + "token_acc": 0.8647798742138365, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.5882711976724923, + "grad_norm": 8.51491641998291, + "learning_rate": 3.961419395955244e-06, + "loss": 0.6321295261383056, + "memory(GiB)": 40.03, + "step": 12650, + "token_acc": 0.8409266409266409, + "train_speed(iter/s)": 0.095989 + }, + { + "epoch": 0.5882711976724923, + "eval_loss": 0.5844881534576416, + "eval_runtime": 291.3342, + "eval_samples_per_second": 11.928, + "eval_steps_per_second": 11.928, + "step": 12650 + }, + { + "epoch": 0.5885037159324418, + "grad_norm": 8.120543479919434, + "learning_rate": 3.957658661826575e-06, + "loss": 0.5859105587005615, + "memory(GiB)": 40.03, + "step": 12655, + "token_acc": 0.8287495797781228, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.5887362341923914, + "grad_norm": 9.27128791809082, + "learning_rate": 3.953898544066522e-06, + "loss": 0.5901806354522705, + "memory(GiB)": 40.03, + "step": 12660, + "token_acc": 0.84593837535014, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.588968752452341, + "grad_norm": 6.452368259429932, + "learning_rate": 3.9501390448985565e-06, + "loss": 0.6747704982757569, + "memory(GiB)": 40.03, + "step": 12665, + "token_acc": 0.8348032564450475, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.5892012707122907, + "grad_norm": 5.982104778289795, + "learning_rate": 3.946380166545789e-06, + "loss": 0.6910391330718995, + "memory(GiB)": 40.03, + "step": 12670, + "token_acc": 0.8227188081936685, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.5894337889722402, + "grad_norm": 9.366355895996094, + "learning_rate": 3.9426219112309585e-06, + "loss": 0.5870296001434326, + "memory(GiB)": 40.03, + "step": 12675, + "token_acc": 0.85423197492163, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.5896663072321898, + "grad_norm": 8.060824394226074, + "learning_rate": 3.938864281176438e-06, + "loss": 0.6449044704437256, + "memory(GiB)": 40.03, + "step": 12680, + "token_acc": 0.8468025298664793, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.5898988254921395, + "grad_norm": 8.35085678100586, + "learning_rate": 3.935107278604229e-06, + "loss": 0.6621711730957032, + "memory(GiB)": 40.03, + "step": 12685, + "token_acc": 0.8270584634220998, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.590131343752089, + "grad_norm": 7.202229976654053, + "learning_rate": 3.931350905735965e-06, + "loss": 0.6891547203063965, + "memory(GiB)": 40.03, + "step": 12690, + "token_acc": 0.8342198581560284, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.5903638620120386, + "grad_norm": 7.003572463989258, + "learning_rate": 3.9275951647929034e-06, + "loss": 0.6476888656616211, + "memory(GiB)": 40.03, + "step": 12695, + "token_acc": 0.8447937131630648, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.5905963802719882, + "grad_norm": 9.953988075256348, + "learning_rate": 3.9238400579959316e-06, + "loss": 0.7107308387756348, + "memory(GiB)": 40.03, + "step": 12700, + "token_acc": 0.8269881556683587, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.5905963802719882, + "eval_loss": 0.5821236968040466, + "eval_runtime": 291.7847, + "eval_samples_per_second": 11.909, + "eval_steps_per_second": 11.909, + "step": 12700 + }, + { + "epoch": 0.5908288985319379, + "grad_norm": 5.7399187088012695, + "learning_rate": 3.920085587565558e-06, + "loss": 0.5839637756347656, + "memory(GiB)": 40.03, + "step": 12705, + "token_acc": 0.8290176177249842, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.5910614167918874, + "grad_norm": 8.689284324645996, + "learning_rate": 3.916331755721921e-06, + "loss": 0.5548437595367431, + "memory(GiB)": 40.03, + "step": 12710, + "token_acc": 0.8688783570300158, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.591293935051837, + "grad_norm": 8.828099250793457, + "learning_rate": 3.912578564684772e-06, + "loss": 0.660148286819458, + "memory(GiB)": 40.03, + "step": 12715, + "token_acc": 0.8491717523975588, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.5915264533117867, + "grad_norm": 8.250561714172363, + "learning_rate": 3.908826016673493e-06, + "loss": 0.6170506954193116, + "memory(GiB)": 40.03, + "step": 12720, + "token_acc": 0.8587921847246892, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.5917589715717363, + "grad_norm": 6.9805073738098145, + "learning_rate": 3.905074113907077e-06, + "loss": 0.7492640495300293, + "memory(GiB)": 40.03, + "step": 12725, + "token_acc": 0.8194444444444444, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.5919914898316858, + "grad_norm": 8.539275169372559, + "learning_rate": 3.901322858604144e-06, + "loss": 0.6248363018035888, + "memory(GiB)": 40.03, + "step": 12730, + "token_acc": 0.8336466165413534, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.5922240080916354, + "grad_norm": 7.449262619018555, + "learning_rate": 3.897572252982927e-06, + "loss": 0.6663394927978515, + "memory(GiB)": 40.03, + "step": 12735, + "token_acc": 0.8276753960556095, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.5924565263515851, + "grad_norm": 8.026508331298828, + "learning_rate": 3.893822299261271e-06, + "loss": 0.63345365524292, + "memory(GiB)": 40.03, + "step": 12740, + "token_acc": 0.8415265200517464, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.5926890446115346, + "grad_norm": 8.786938667297363, + "learning_rate": 3.890072999656645e-06, + "loss": 0.6941215515136718, + "memory(GiB)": 40.03, + "step": 12745, + "token_acc": 0.8345984818218138, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.5929215628714842, + "grad_norm": 6.686178684234619, + "learning_rate": 3.886324356386121e-06, + "loss": 0.6077319145202636, + "memory(GiB)": 40.03, + "step": 12750, + "token_acc": 0.8468640560331104, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.5929215628714842, + "eval_loss": 0.582139790058136, + "eval_runtime": 291.4567, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 11.923, + "step": 12750 + }, + { + "epoch": 0.5931540811314339, + "grad_norm": 7.007806301116943, + "learning_rate": 3.8825763716663895e-06, + "loss": 0.5417373180389404, + "memory(GiB)": 40.03, + "step": 12755, + "token_acc": 0.8294199720948774, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.5933865993913835, + "grad_norm": 6.337910175323486, + "learning_rate": 3.878829047713748e-06, + "loss": 0.6792127609252929, + "memory(GiB)": 40.03, + "step": 12760, + "token_acc": 0.8312189740761169, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.593619117651333, + "grad_norm": 8.660492897033691, + "learning_rate": 3.875082386744109e-06, + "loss": 0.7165204048156738, + "memory(GiB)": 40.03, + "step": 12765, + "token_acc": 0.8216374269005848, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.5938516359112826, + "grad_norm": 7.1973419189453125, + "learning_rate": 3.871336390972983e-06, + "loss": 0.6989931106567383, + "memory(GiB)": 40.03, + "step": 12770, + "token_acc": 0.8199863107460643, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.5940841541712323, + "grad_norm": 6.249396800994873, + "learning_rate": 3.867591062615497e-06, + "loss": 0.6296061992645263, + "memory(GiB)": 40.03, + "step": 12775, + "token_acc": 0.8337129840546698, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.5943166724311819, + "grad_norm": 6.071090221405029, + "learning_rate": 3.8638464038863785e-06, + "loss": 0.6844027042388916, + "memory(GiB)": 40.03, + "step": 12780, + "token_acc": 0.836848635235732, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.5945491906911314, + "grad_norm": 7.891200065612793, + "learning_rate": 3.8601024169999605e-06, + "loss": 0.8278802871704102, + "memory(GiB)": 40.03, + "step": 12785, + "token_acc": 0.7972350230414746, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.5947817089510811, + "grad_norm": 9.561638832092285, + "learning_rate": 3.856359104170174e-06, + "loss": 0.6455109119415283, + "memory(GiB)": 40.03, + "step": 12790, + "token_acc": 0.8415918845103394, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.5950142272110307, + "grad_norm": 7.377737045288086, + "learning_rate": 3.852616467610561e-06, + "loss": 0.7129979610443116, + "memory(GiB)": 40.03, + "step": 12795, + "token_acc": 0.7946912242686891, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.5952467454709802, + "grad_norm": 7.447319030761719, + "learning_rate": 3.848874509534254e-06, + "loss": 0.7946955680847168, + "memory(GiB)": 40.03, + "step": 12800, + "token_acc": 0.8070114543561263, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.5952467454709802, + "eval_loss": 0.5835894346237183, + "eval_runtime": 290.9893, + "eval_samples_per_second": 11.942, + "eval_steps_per_second": 11.942, + "step": 12800 + }, + { + "epoch": 0.5954792637309299, + "grad_norm": 8.0943603515625, + "learning_rate": 3.8451332321539915e-06, + "loss": 0.5825368404388428, + "memory(GiB)": 40.03, + "step": 12805, + "token_acc": 0.8292804408775778, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.5957117819908795, + "grad_norm": 7.130446434020996, + "learning_rate": 3.841392637682103e-06, + "loss": 0.6323969841003418, + "memory(GiB)": 40.03, + "step": 12810, + "token_acc": 0.8424920127795528, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.5959443002508291, + "grad_norm": 8.11547565460205, + "learning_rate": 3.83765272833052e-06, + "loss": 0.6336262702941895, + "memory(GiB)": 40.03, + "step": 12815, + "token_acc": 0.8367215230719587, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.5961768185107786, + "grad_norm": 9.913498878479004, + "learning_rate": 3.833913506310765e-06, + "loss": 0.6936050415039062, + "memory(GiB)": 40.03, + "step": 12820, + "token_acc": 0.8213120695904313, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.5964093367707283, + "grad_norm": 8.244888305664062, + "learning_rate": 3.830174973833956e-06, + "loss": 0.5694565773010254, + "memory(GiB)": 40.03, + "step": 12825, + "token_acc": 0.8766328011611031, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.5966418550306779, + "grad_norm": 6.958888530731201, + "learning_rate": 3.826437133110803e-06, + "loss": 0.7474677562713623, + "memory(GiB)": 40.03, + "step": 12830, + "token_acc": 0.8297191610380377, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.5968743732906274, + "grad_norm": 7.2198052406311035, + "learning_rate": 3.822699986351607e-06, + "loss": 0.6751950263977051, + "memory(GiB)": 40.03, + "step": 12835, + "token_acc": 0.8365192582025678, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.5971068915505771, + "grad_norm": 7.975522518157959, + "learning_rate": 3.818963535766255e-06, + "loss": 0.6447204113006592, + "memory(GiB)": 40.03, + "step": 12840, + "token_acc": 0.8336427775714816, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.5973394098105267, + "grad_norm": 8.00240421295166, + "learning_rate": 3.8152277835642315e-06, + "loss": 0.6401217460632325, + "memory(GiB)": 40.03, + "step": 12845, + "token_acc": 0.8358505564387917, + "train_speed(iter/s)": 0.095963 + }, + { + "epoch": 0.5975719280704763, + "grad_norm": 8.330437660217285, + "learning_rate": 3.8114927319545962e-06, + "loss": 0.5822761535644532, + "memory(GiB)": 40.03, + "step": 12850, + "token_acc": 0.8531309297912714, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.5975719280704763, + "eval_loss": 0.581326961517334, + "eval_runtime": 290.2078, + "eval_samples_per_second": 11.974, + "eval_steps_per_second": 11.974, + "step": 12850 + }, + { + "epoch": 0.5978044463304258, + "grad_norm": 6.689164638519287, + "learning_rate": 3.807758383146004e-06, + "loss": 0.6984948635101318, + "memory(GiB)": 40.03, + "step": 12855, + "token_acc": 0.8286753629691356, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.5980369645903755, + "grad_norm": 7.692912578582764, + "learning_rate": 3.804024739346689e-06, + "loss": 0.718368673324585, + "memory(GiB)": 40.03, + "step": 12860, + "token_acc": 0.8182618907809748, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.5982694828503251, + "grad_norm": 7.318233966827393, + "learning_rate": 3.8002918027644697e-06, + "loss": 0.7070892333984375, + "memory(GiB)": 40.03, + "step": 12865, + "token_acc": 0.8297933409873708, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.5985020011102747, + "grad_norm": 8.469462394714355, + "learning_rate": 3.7965595756067507e-06, + "loss": 0.7083279609680175, + "memory(GiB)": 40.03, + "step": 12870, + "token_acc": 0.831043445971036, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.5987345193702243, + "grad_norm": 9.082457542419434, + "learning_rate": 3.792828060080508e-06, + "loss": 0.5696929931640625, + "memory(GiB)": 40.03, + "step": 12875, + "token_acc": 0.8571428571428571, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.5989670376301739, + "grad_norm": 9.04974365234375, + "learning_rate": 3.789097258392305e-06, + "loss": 0.751149845123291, + "memory(GiB)": 40.03, + "step": 12880, + "token_acc": 0.8173846740373618, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.5991995558901235, + "grad_norm": 9.381528854370117, + "learning_rate": 3.7853671727482775e-06, + "loss": 0.5614064693450928, + "memory(GiB)": 40.03, + "step": 12885, + "token_acc": 0.8481468154012235, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.599432074150073, + "grad_norm": 8.049118041992188, + "learning_rate": 3.7816378053541446e-06, + "loss": 0.5766382217407227, + "memory(GiB)": 40.03, + "step": 12890, + "token_acc": 0.8552249637155298, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.5996645924100227, + "grad_norm": 6.832623481750488, + "learning_rate": 3.7779091584151912e-06, + "loss": 0.6892680644989013, + "memory(GiB)": 40.03, + "step": 12895, + "token_acc": 0.8299130434782609, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.5998971106699723, + "grad_norm": 8.119552612304688, + "learning_rate": 3.7741812341362848e-06, + "loss": 0.6701316833496094, + "memory(GiB)": 40.03, + "step": 12900, + "token_acc": 0.8302945301542777, + "train_speed(iter/s)": 0.095987 + }, + { + "epoch": 0.5998971106699723, + "eval_loss": 0.5804882645606995, + "eval_runtime": 290.8058, + "eval_samples_per_second": 11.95, + "eval_steps_per_second": 11.95, + "step": 12900 + }, + { + "epoch": 0.6001296289299219, + "grad_norm": 9.428837776184082, + "learning_rate": 3.7704540347218598e-06, + "loss": 0.650011682510376, + "memory(GiB)": 40.03, + "step": 12905, + "token_acc": 0.8292998026030717, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.6003621471898715, + "grad_norm": 8.723655700683594, + "learning_rate": 3.766727562375928e-06, + "loss": 0.5859549045562744, + "memory(GiB)": 40.03, + "step": 12910, + "token_acc": 0.8322303110522833, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.6005946654498211, + "grad_norm": 7.602426528930664, + "learning_rate": 3.7630018193020635e-06, + "loss": 0.7068216323852539, + "memory(GiB)": 40.03, + "step": 12915, + "token_acc": 0.828998505231689, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.6008271837097707, + "grad_norm": 7.832668304443359, + "learning_rate": 3.759276807703415e-06, + "loss": 0.7375150680541992, + "memory(GiB)": 40.03, + "step": 12920, + "token_acc": 0.8146705615060046, + "train_speed(iter/s)": 0.095864 + }, + { + "epoch": 0.6010597019697204, + "grad_norm": 8.143230438232422, + "learning_rate": 3.7555525297826963e-06, + "loss": 0.8059114456176758, + "memory(GiB)": 40.03, + "step": 12925, + "token_acc": 0.7878280290340591, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.6012922202296699, + "grad_norm": 6.759118556976318, + "learning_rate": 3.7518289877421898e-06, + "loss": 0.5784881114959717, + "memory(GiB)": 40.03, + "step": 12930, + "token_acc": 0.8483215913800248, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.6015247384896195, + "grad_norm": 7.4658122062683105, + "learning_rate": 3.748106183783738e-06, + "loss": 0.6297882556915283, + "memory(GiB)": 40.03, + "step": 12935, + "token_acc": 0.8391791044776119, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.6017572567495691, + "grad_norm": 7.309219837188721, + "learning_rate": 3.7443841201087515e-06, + "loss": 0.745161485671997, + "memory(GiB)": 40.03, + "step": 12940, + "token_acc": 0.8164102564102564, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.6019897750095187, + "grad_norm": 8.289355278015137, + "learning_rate": 3.740662798918201e-06, + "loss": 0.7899196147918701, + "memory(GiB)": 40.03, + "step": 12945, + "token_acc": 0.8073065902578797, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.6022222932694683, + "grad_norm": 8.108407974243164, + "learning_rate": 3.7369422224126205e-06, + "loss": 0.654511833190918, + "memory(GiB)": 40.03, + "step": 12950, + "token_acc": 0.8181179775280899, + "train_speed(iter/s)": 0.095988 + }, + { + "epoch": 0.6022222932694683, + "eval_loss": 0.5823829770088196, + "eval_runtime": 292.6367, + "eval_samples_per_second": 11.875, + "eval_steps_per_second": 11.875, + "step": 12950 + }, + { + "epoch": 0.6024548115294179, + "grad_norm": 8.292499542236328, + "learning_rate": 3.733222392792098e-06, + "loss": 0.6521989822387695, + "memory(GiB)": 40.03, + "step": 12955, + "token_acc": 0.8286181547070283, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.6026873297893676, + "grad_norm": 10.976821899414062, + "learning_rate": 3.729503312256287e-06, + "loss": 0.6724837303161622, + "memory(GiB)": 40.03, + "step": 12960, + "token_acc": 0.843737882900349, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.6029198480493171, + "grad_norm": 7.344293117523193, + "learning_rate": 3.7257849830043913e-06, + "loss": 0.7633102893829345, + "memory(GiB)": 40.03, + "step": 12965, + "token_acc": 0.8059814023624026, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.6031523663092667, + "grad_norm": 8.665361404418945, + "learning_rate": 3.722067407235179e-06, + "loss": 0.7092705726623535, + "memory(GiB)": 40.03, + "step": 12970, + "token_acc": 0.8277597986335851, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.6033848845692164, + "grad_norm": 8.475383758544922, + "learning_rate": 3.7183505871469618e-06, + "loss": 0.6085611820220947, + "memory(GiB)": 40.03, + "step": 12975, + "token_acc": 0.8396268325188805, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.603617402829166, + "grad_norm": 5.808564186096191, + "learning_rate": 3.7146345249376132e-06, + "loss": 0.7104721546173096, + "memory(GiB)": 40.03, + "step": 12980, + "token_acc": 0.8194636439037877, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.6038499210891155, + "grad_norm": 6.993358135223389, + "learning_rate": 3.7109192228045542e-06, + "loss": 0.6546235084533691, + "memory(GiB)": 40.03, + "step": 12985, + "token_acc": 0.828125, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.6040824393490651, + "grad_norm": 8.322504043579102, + "learning_rate": 3.7072046829447607e-06, + "loss": 0.6930451393127441, + "memory(GiB)": 40.03, + "step": 12990, + "token_acc": 0.8300180831826401, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.6043149576090148, + "grad_norm": 9.36544132232666, + "learning_rate": 3.7034909075547498e-06, + "loss": 0.592886209487915, + "memory(GiB)": 40.03, + "step": 12995, + "token_acc": 0.859493670886076, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.6045474758689643, + "grad_norm": 7.081161975860596, + "learning_rate": 3.6997778988305944e-06, + "loss": 0.6275952816009521, + "memory(GiB)": 40.03, + "step": 13000, + "token_acc": 0.8473684210526315, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.6045474758689643, + "eval_loss": 0.5824636220932007, + "eval_runtime": 290.8141, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 11.949, + "step": 13000 + }, + { + "epoch": 0.6047799941289139, + "grad_norm": 6.186984062194824, + "learning_rate": 3.6960656589679124e-06, + "loss": 0.6313210010528565, + "memory(GiB)": 40.03, + "step": 13005, + "token_acc": 0.8288772616527539, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.6050125123888636, + "grad_norm": 9.29638671875, + "learning_rate": 3.692354190161863e-06, + "loss": 0.8241156578063965, + "memory(GiB)": 40.03, + "step": 13010, + "token_acc": 0.8011299435028248, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.6052450306488132, + "grad_norm": 7.913212299346924, + "learning_rate": 3.688643494607156e-06, + "loss": 0.6520851612091064, + "memory(GiB)": 40.03, + "step": 13015, + "token_acc": 0.8400690846286701, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.6054775489087627, + "grad_norm": 7.703744411468506, + "learning_rate": 3.6849335744980364e-06, + "loss": 0.6411514282226562, + "memory(GiB)": 40.03, + "step": 13020, + "token_acc": 0.831096196868009, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.6057100671687123, + "grad_norm": 9.644156455993652, + "learning_rate": 3.6812244320282965e-06, + "loss": 0.6027958393096924, + "memory(GiB)": 40.03, + "step": 13025, + "token_acc": 0.848517327617006, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.605942585428662, + "grad_norm": 10.0267333984375, + "learning_rate": 3.677516069391266e-06, + "loss": 0.6812788963317871, + "memory(GiB)": 40.03, + "step": 13030, + "token_acc": 0.8278537125969708, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.6061751036886115, + "grad_norm": 8.00976276397705, + "learning_rate": 3.673808488779816e-06, + "loss": 0.6315018653869628, + "memory(GiB)": 40.03, + "step": 13035, + "token_acc": 0.840867992766727, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.6064076219485611, + "grad_norm": 9.463610649108887, + "learning_rate": 3.6701016923863495e-06, + "loss": 0.8386247634887696, + "memory(GiB)": 40.03, + "step": 13040, + "token_acc": 0.8117283950617284, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.6066401402085108, + "grad_norm": 7.517702102661133, + "learning_rate": 3.6663956824028145e-06, + "loss": 0.6400721549987793, + "memory(GiB)": 40.03, + "step": 13045, + "token_acc": 0.8454281567489115, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.6068726584684604, + "grad_norm": 8.288093566894531, + "learning_rate": 3.6626904610206847e-06, + "loss": 0.6020816326141357, + "memory(GiB)": 40.03, + "step": 13050, + "token_acc": 0.844327990135635, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.6068726584684604, + "eval_loss": 0.5813098549842834, + "eval_runtime": 292.0744, + "eval_samples_per_second": 11.898, + "eval_steps_per_second": 11.898, + "step": 13050 + }, + { + "epoch": 0.6071051767284099, + "grad_norm": 6.002388000488281, + "learning_rate": 3.6589860304309767e-06, + "loss": 0.6771800994873047, + "memory(GiB)": 40.03, + "step": 13055, + "token_acc": 0.8286898971373894, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.6073376949883595, + "grad_norm": 8.348504066467285, + "learning_rate": 3.655282392824229e-06, + "loss": 0.7498507976531983, + "memory(GiB)": 40.03, + "step": 13060, + "token_acc": 0.815230961298377, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.6075702132483092, + "grad_norm": 6.031515121459961, + "learning_rate": 3.6515795503905216e-06, + "loss": 0.6623213291168213, + "memory(GiB)": 40.03, + "step": 13065, + "token_acc": 0.8269918699186992, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.6078027315082588, + "grad_norm": 8.771651268005371, + "learning_rate": 3.647877505319456e-06, + "loss": 0.5782103538513184, + "memory(GiB)": 40.03, + "step": 13070, + "token_acc": 0.8602808786460209, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.6080352497682083, + "grad_norm": 9.957589149475098, + "learning_rate": 3.6441762598001706e-06, + "loss": 0.5717226505279541, + "memory(GiB)": 40.03, + "step": 13075, + "token_acc": 0.8529185867895546, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.608267768028158, + "grad_norm": 9.755043983459473, + "learning_rate": 3.640475816021319e-06, + "loss": 0.6282239437103272, + "memory(GiB)": 40.03, + "step": 13080, + "token_acc": 0.8373353989155693, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.6085002862881076, + "grad_norm": 9.720813751220703, + "learning_rate": 3.636776176171095e-06, + "loss": 0.724711799621582, + "memory(GiB)": 40.03, + "step": 13085, + "token_acc": 0.8197911938266, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.6087328045480571, + "grad_norm": 7.896881580352783, + "learning_rate": 3.6330773424372055e-06, + "loss": 0.6664868354797363, + "memory(GiB)": 40.03, + "step": 13090, + "token_acc": 0.8358640636297903, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.6089653228080067, + "grad_norm": 7.925256729125977, + "learning_rate": 3.6293793170068883e-06, + "loss": 0.8178078651428222, + "memory(GiB)": 40.03, + "step": 13095, + "token_acc": 0.7784163473818646, + "train_speed(iter/s)": 0.095963 + }, + { + "epoch": 0.6091978410679564, + "grad_norm": 8.73812484741211, + "learning_rate": 3.6256821020668944e-06, + "loss": 0.5127421855926514, + "memory(GiB)": 40.03, + "step": 13100, + "token_acc": 0.8643181025462155, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.6091978410679564, + "eval_loss": 0.5807544589042664, + "eval_runtime": 294.0126, + "eval_samples_per_second": 11.819, + "eval_steps_per_second": 11.819, + "step": 13100 + }, + { + "epoch": 0.609430359327906, + "grad_norm": 7.597916126251221, + "learning_rate": 3.621985699803508e-06, + "loss": 0.6496970176696777, + "memory(GiB)": 40.03, + "step": 13105, + "token_acc": 0.829230387169648, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.6096628775878555, + "grad_norm": 7.8660101890563965, + "learning_rate": 3.6182901124025205e-06, + "loss": 0.5560397148132324, + "memory(GiB)": 40.03, + "step": 13110, + "token_acc": 0.8621080468454855, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.6098953958478052, + "grad_norm": 10.034387588500977, + "learning_rate": 3.6145953420492506e-06, + "loss": 0.5755732536315918, + "memory(GiB)": 40.03, + "step": 13115, + "token_acc": 0.858712236801953, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.6101279141077548, + "grad_norm": 6.889771938323975, + "learning_rate": 3.6109013909285275e-06, + "loss": 0.5321535587310791, + "memory(GiB)": 40.03, + "step": 13120, + "token_acc": 0.8643453028654334, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.6103604323677044, + "grad_norm": 9.340177536010742, + "learning_rate": 3.6072082612247006e-06, + "loss": 0.7592248916625977, + "memory(GiB)": 40.03, + "step": 13125, + "token_acc": 0.804501175680215, + "train_speed(iter/s)": 0.09588 + }, + { + "epoch": 0.610592950627654, + "grad_norm": 9.322178840637207, + "learning_rate": 3.603515955121629e-06, + "loss": 0.5940544128417968, + "memory(GiB)": 40.03, + "step": 13130, + "token_acc": 0.8639663737103553, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.6108254688876036, + "grad_norm": 9.264908790588379, + "learning_rate": 3.599824474802689e-06, + "loss": 0.6264768123626709, + "memory(GiB)": 40.03, + "step": 13135, + "token_acc": 0.8332737030411449, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.6110579871475532, + "grad_norm": 8.198921203613281, + "learning_rate": 3.596133822450768e-06, + "loss": 0.7603225231170654, + "memory(GiB)": 40.03, + "step": 13140, + "token_acc": 0.8068370394955194, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.6112905054075027, + "grad_norm": 10.895846366882324, + "learning_rate": 3.5924440002482595e-06, + "loss": 0.6804223537445069, + "memory(GiB)": 40.03, + "step": 13145, + "token_acc": 0.8257232485186476, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.6115230236674524, + "grad_norm": 11.369791030883789, + "learning_rate": 3.588755010377074e-06, + "loss": 0.7144616603851318, + "memory(GiB)": 40.03, + "step": 13150, + "token_acc": 0.8176609369733738, + "train_speed(iter/s)": 0.095982 + }, + { + "epoch": 0.6115230236674524, + "eval_loss": 0.5825229287147522, + "eval_runtime": 293.787, + "eval_samples_per_second": 11.828, + "eval_steps_per_second": 11.828, + "step": 13150 + }, + { + "epoch": 0.611755541927402, + "grad_norm": 8.719143867492676, + "learning_rate": 3.58506685501862e-06, + "loss": 0.7275121688842774, + "memory(GiB)": 40.03, + "step": 13155, + "token_acc": 0.8287984493019456, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.6119880601873516, + "grad_norm": 7.496494293212891, + "learning_rate": 3.58137953635382e-06, + "loss": 0.6511481285095215, + "memory(GiB)": 40.03, + "step": 13160, + "token_acc": 0.8354237932345115, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.6122205784473012, + "grad_norm": 7.9762773513793945, + "learning_rate": 3.5776930565630985e-06, + "loss": 0.571917200088501, + "memory(GiB)": 40.03, + "step": 13165, + "token_acc": 0.8398208749569411, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.6124530967072508, + "grad_norm": 8.024611473083496, + "learning_rate": 3.5740074178263883e-06, + "loss": 0.6731865406036377, + "memory(GiB)": 40.03, + "step": 13170, + "token_acc": 0.838391502276176, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.6126856149672004, + "grad_norm": 7.202920913696289, + "learning_rate": 3.5703226223231164e-06, + "loss": 0.622395133972168, + "memory(GiB)": 40.03, + "step": 13175, + "token_acc": 0.8469915600326708, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.6129181332271499, + "grad_norm": 10.924602508544922, + "learning_rate": 3.5666386722322187e-06, + "loss": 0.5626607418060303, + "memory(GiB)": 40.03, + "step": 13180, + "token_acc": 0.8612159329140461, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.6131506514870996, + "grad_norm": 7.208611965179443, + "learning_rate": 3.5629555697321284e-06, + "loss": 0.7295779228210449, + "memory(GiB)": 40.03, + "step": 13185, + "token_acc": 0.8218286953379111, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.6133831697470492, + "grad_norm": 9.755807876586914, + "learning_rate": 3.559273317000779e-06, + "loss": 0.5655300617218018, + "memory(GiB)": 40.03, + "step": 13190, + "token_acc": 0.8583294877711122, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.6136156880069988, + "grad_norm": 8.565410614013672, + "learning_rate": 3.5555919162155968e-06, + "loss": 0.5593877792358398, + "memory(GiB)": 40.03, + "step": 13195, + "token_acc": 0.86209216279852, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.6138482062669484, + "grad_norm": 9.327910423278809, + "learning_rate": 3.5519113695535105e-06, + "loss": 0.7032355785369873, + "memory(GiB)": 40.03, + "step": 13200, + "token_acc": 0.8238464129336477, + "train_speed(iter/s)": 0.095982 + }, + { + "epoch": 0.6138482062669484, + "eval_loss": 0.581161379814148, + "eval_runtime": 292.8198, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 11.867, + "step": 13200 + }, + { + "epoch": 0.614080724526898, + "grad_norm": 5.984133243560791, + "learning_rate": 3.54823167919094e-06, + "loss": 0.6683717727661133, + "memory(GiB)": 40.03, + "step": 13205, + "token_acc": 0.8294327679302749, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.6143132427868476, + "grad_norm": 7.557715892791748, + "learning_rate": 3.5445528473038016e-06, + "loss": 0.7758615970611572, + "memory(GiB)": 40.03, + "step": 13210, + "token_acc": 0.8118092832333439, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.6145457610467973, + "grad_norm": 7.7075514793396, + "learning_rate": 3.540874876067499e-06, + "loss": 0.5775607585906982, + "memory(GiB)": 40.03, + "step": 13215, + "token_acc": 0.8622920517560074, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.6147782793067468, + "grad_norm": 6.942788124084473, + "learning_rate": 3.5371977676569323e-06, + "loss": 0.5903857707977295, + "memory(GiB)": 40.03, + "step": 13220, + "token_acc": 0.8548087634608244, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.6150107975666964, + "grad_norm": 7.968829154968262, + "learning_rate": 3.533521524246488e-06, + "loss": 0.7749314785003663, + "memory(GiB)": 40.03, + "step": 13225, + "token_acc": 0.8158175988599928, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.615243315826646, + "grad_norm": 7.358787536621094, + "learning_rate": 3.5298461480100456e-06, + "loss": 0.7289624214172363, + "memory(GiB)": 40.03, + "step": 13230, + "token_acc": 0.8090257023311417, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.6154758340865956, + "grad_norm": 6.563849449157715, + "learning_rate": 3.5261716411209632e-06, + "loss": 0.6173213481903076, + "memory(GiB)": 40.03, + "step": 13235, + "token_acc": 0.8423772609819121, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.6157083523465452, + "grad_norm": 9.109136581420898, + "learning_rate": 3.522498005752094e-06, + "loss": 0.6493964195251465, + "memory(GiB)": 40.03, + "step": 13240, + "token_acc": 0.8377281947261663, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.6159408706064948, + "grad_norm": 9.26567554473877, + "learning_rate": 3.5188252440757707e-06, + "loss": 0.6601822376251221, + "memory(GiB)": 40.03, + "step": 13245, + "token_acc": 0.8382547377699427, + "train_speed(iter/s)": 0.09596 + }, + { + "epoch": 0.6161733888664445, + "grad_norm": 8.409039497375488, + "learning_rate": 3.515153358263813e-06, + "loss": 0.6446426391601563, + "memory(GiB)": 40.03, + "step": 13250, + "token_acc": 0.8314216197427876, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.6161733888664445, + "eval_loss": 0.5819464325904846, + "eval_runtime": 294.1628, + "eval_samples_per_second": 11.813, + "eval_steps_per_second": 11.813, + "step": 13250 + }, + { + "epoch": 0.616405907126394, + "grad_norm": 6.008672714233398, + "learning_rate": 3.511482350487516e-06, + "loss": 0.7331273078918457, + "memory(GiB)": 40.03, + "step": 13255, + "token_acc": 0.8281281177418969, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.6166384253863436, + "grad_norm": 6.8742828369140625, + "learning_rate": 3.507812222917662e-06, + "loss": 0.5720431327819824, + "memory(GiB)": 40.03, + "step": 13260, + "token_acc": 0.8560767590618337, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.6168709436462932, + "grad_norm": 9.703290939331055, + "learning_rate": 3.504142977724512e-06, + "loss": 0.6921139717102051, + "memory(GiB)": 40.03, + "step": 13265, + "token_acc": 0.8224266006367174, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.6171034619062429, + "grad_norm": 7.2310709953308105, + "learning_rate": 3.5004746170778024e-06, + "loss": 0.6525343418121338, + "memory(GiB)": 40.03, + "step": 13270, + "token_acc": 0.8408477842003853, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.6173359801661924, + "grad_norm": 7.915954113006592, + "learning_rate": 3.496807143146751e-06, + "loss": 0.7520557880401612, + "memory(GiB)": 40.03, + "step": 13275, + "token_acc": 0.8154848046309696, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.617568498426142, + "grad_norm": 7.646523475646973, + "learning_rate": 3.493140558100043e-06, + "loss": 0.5795305252075196, + "memory(GiB)": 40.03, + "step": 13280, + "token_acc": 0.8563643441027637, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.6178010166860917, + "grad_norm": 8.842355728149414, + "learning_rate": 3.4894748641058483e-06, + "loss": 0.7470812320709228, + "memory(GiB)": 40.03, + "step": 13285, + "token_acc": 0.8251192368839427, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.6180335349460412, + "grad_norm": 6.617765426635742, + "learning_rate": 3.4858100633318014e-06, + "loss": 0.7513682842254639, + "memory(GiB)": 40.03, + "step": 13290, + "token_acc": 0.8246217331499313, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.6182660532059908, + "grad_norm": 11.116613388061523, + "learning_rate": 3.4821461579450166e-06, + "loss": 0.6056669235229493, + "memory(GiB)": 40.03, + "step": 13295, + "token_acc": 0.8522577816747041, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.6184985714659404, + "grad_norm": 8.063444137573242, + "learning_rate": 3.4784831501120687e-06, + "loss": 0.6425219058990479, + "memory(GiB)": 40.03, + "step": 13300, + "token_acc": 0.8359303391384051, + "train_speed(iter/s)": 0.095976 + }, + { + "epoch": 0.6184985714659404, + "eval_loss": 0.5811650156974792, + "eval_runtime": 293.3389, + "eval_samples_per_second": 11.846, + "eval_steps_per_second": 11.846, + "step": 13300 + }, + { + "epoch": 0.6187310897258901, + "grad_norm": 8.379485130310059, + "learning_rate": 3.4748210419990116e-06, + "loss": 0.7794717311859131, + "memory(GiB)": 40.03, + "step": 13305, + "token_acc": 0.8279750441660964, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.6189636079858396, + "grad_norm": 8.657197952270508, + "learning_rate": 3.4711598357713607e-06, + "loss": 0.775357723236084, + "memory(GiB)": 40.03, + "step": 13310, + "token_acc": 0.7870988242770893, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.6191961262457892, + "grad_norm": 7.7351765632629395, + "learning_rate": 3.467499533594102e-06, + "loss": 0.6825555324554443, + "memory(GiB)": 40.03, + "step": 13315, + "token_acc": 0.8237533307955843, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.6194286445057389, + "grad_norm": 11.018149375915527, + "learning_rate": 3.463840137631682e-06, + "loss": 0.6458070278167725, + "memory(GiB)": 40.03, + "step": 13320, + "token_acc": 0.8359375, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.6196611627656884, + "grad_norm": 5.231941223144531, + "learning_rate": 3.4601816500480188e-06, + "loss": 0.643475866317749, + "memory(GiB)": 40.03, + "step": 13325, + "token_acc": 0.8257418909592823, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.619893681025638, + "grad_norm": 9.33646297454834, + "learning_rate": 3.456524073006485e-06, + "loss": 0.6933014869689942, + "memory(GiB)": 40.03, + "step": 13330, + "token_acc": 0.829021372328459, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.6201261992855877, + "grad_norm": 7.163576126098633, + "learning_rate": 3.4528674086699234e-06, + "loss": 0.6457261085510254, + "memory(GiB)": 40.03, + "step": 13335, + "token_acc": 0.8344663494221618, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.6203587175455373, + "grad_norm": 5.749244689941406, + "learning_rate": 3.4492116592006274e-06, + "loss": 0.6414624214172363, + "memory(GiB)": 40.03, + "step": 13340, + "token_acc": 0.8356120826709063, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.6205912358054868, + "grad_norm": 8.368809700012207, + "learning_rate": 3.4455568267603577e-06, + "loss": 0.5333932399749756, + "memory(GiB)": 40.03, + "step": 13345, + "token_acc": 0.8651933701657458, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.6208237540654364, + "grad_norm": 6.098763942718506, + "learning_rate": 3.4419029135103288e-06, + "loss": 0.7009202003479004, + "memory(GiB)": 40.03, + "step": 13350, + "token_acc": 0.8158658497258948, + "train_speed(iter/s)": 0.095977 + }, + { + "epoch": 0.6208237540654364, + "eval_loss": 0.5797573328018188, + "eval_runtime": 293.8327, + "eval_samples_per_second": 11.826, + "eval_steps_per_second": 11.826, + "step": 13350 + }, + { + "epoch": 0.6210562723253861, + "grad_norm": 5.722322463989258, + "learning_rate": 3.438249921611214e-06, + "loss": 0.6894258499145508, + "memory(GiB)": 40.03, + "step": 13355, + "token_acc": 0.8296364362764682, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.6212887905853357, + "grad_norm": 8.34636402130127, + "learning_rate": 3.4345978532231367e-06, + "loss": 0.6312924861907959, + "memory(GiB)": 40.03, + "step": 13360, + "token_acc": 0.8543487307532251, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.6215213088452852, + "grad_norm": 6.663353443145752, + "learning_rate": 3.4309467105056802e-06, + "loss": 0.616228437423706, + "memory(GiB)": 40.03, + "step": 13365, + "token_acc": 0.8325808878856283, + "train_speed(iter/s)": 0.095835 + }, + { + "epoch": 0.6217538271052349, + "grad_norm": 9.392459869384766, + "learning_rate": 3.4272964956178774e-06, + "loss": 0.7160651683807373, + "memory(GiB)": 40.03, + "step": 13370, + "token_acc": 0.825006825006825, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.6219863453651845, + "grad_norm": 6.7092366218566895, + "learning_rate": 3.423647210718214e-06, + "loss": 0.6473873138427735, + "memory(GiB)": 40.03, + "step": 13375, + "token_acc": 0.8419076229815997, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.622218863625134, + "grad_norm": 8.367863655090332, + "learning_rate": 3.4199988579646226e-06, + "loss": 0.7488775253295898, + "memory(GiB)": 40.03, + "step": 13380, + "token_acc": 0.8174308137133416, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.6224513818850836, + "grad_norm": 7.5913777351379395, + "learning_rate": 3.4163514395144892e-06, + "loss": 0.5292999744415283, + "memory(GiB)": 40.03, + "step": 13385, + "token_acc": 0.8726533166458073, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.6226839001450333, + "grad_norm": 7.731056213378906, + "learning_rate": 3.4127049575246417e-06, + "loss": 0.7689545154571533, + "memory(GiB)": 40.03, + "step": 13390, + "token_acc": 0.821762349799733, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.6229164184049829, + "grad_norm": 8.29542064666748, + "learning_rate": 3.409059414151361e-06, + "loss": 0.6694583892822266, + "memory(GiB)": 40.03, + "step": 13395, + "token_acc": 0.830316047867444, + "train_speed(iter/s)": 0.095958 + }, + { + "epoch": 0.6231489366649324, + "grad_norm": 8.759634017944336, + "learning_rate": 3.4054148115503695e-06, + "loss": 0.7301998138427734, + "memory(GiB)": 40.03, + "step": 13400, + "token_acc": 0.8189996401583304, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.6231489366649324, + "eval_loss": 0.5786752700805664, + "eval_runtime": 293.4147, + "eval_samples_per_second": 11.843, + "eval_steps_per_second": 11.843, + "step": 13400 + }, + { + "epoch": 0.6233814549248821, + "grad_norm": 7.837320327758789, + "learning_rate": 3.4017711518768293e-06, + "loss": 0.787266206741333, + "memory(GiB)": 40.03, + "step": 13405, + "token_acc": 0.8289615867959114, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.6236139731848317, + "grad_norm": 9.125218391418457, + "learning_rate": 3.398128437285353e-06, + "loss": 0.6132421016693115, + "memory(GiB)": 40.03, + "step": 13410, + "token_acc": 0.8308845577211394, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.6238464914447813, + "grad_norm": 6.4724273681640625, + "learning_rate": 3.3944866699299872e-06, + "loss": 0.49155464172363283, + "memory(GiB)": 40.03, + "step": 13415, + "token_acc": 0.8774436090225564, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.6240790097047308, + "grad_norm": 7.698394298553467, + "learning_rate": 3.3908458519642252e-06, + "loss": 0.6231107711791992, + "memory(GiB)": 40.03, + "step": 13420, + "token_acc": 0.8484301696138579, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.6243115279646805, + "grad_norm": 9.802258491516113, + "learning_rate": 3.3872059855409893e-06, + "loss": 0.6324323177337646, + "memory(GiB)": 40.03, + "step": 13425, + "token_acc": 0.8554396423248882, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.6245440462246301, + "grad_norm": 8.112380981445312, + "learning_rate": 3.383567072812651e-06, + "loss": 0.6585474491119385, + "memory(GiB)": 40.03, + "step": 13430, + "token_acc": 0.8506078055022392, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.6247765644845796, + "grad_norm": 7.874328136444092, + "learning_rate": 3.3799291159310077e-06, + "loss": 0.5667964935302734, + "memory(GiB)": 40.03, + "step": 13435, + "token_acc": 0.8522376543209876, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.6250090827445293, + "grad_norm": 9.904825210571289, + "learning_rate": 3.3762921170472973e-06, + "loss": 0.6364833354949951, + "memory(GiB)": 40.03, + "step": 13440, + "token_acc": 0.8434704830053668, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.6252416010044789, + "grad_norm": 11.402176856994629, + "learning_rate": 3.372656078312189e-06, + "loss": 0.7669492244720459, + "memory(GiB)": 40.03, + "step": 13445, + "token_acc": 0.8166441136671178, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.6254741192644285, + "grad_norm": 8.771894454956055, + "learning_rate": 3.3690210018757864e-06, + "loss": 0.5411834239959716, + "memory(GiB)": 40.03, + "step": 13450, + "token_acc": 0.8685134607881388, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.6254741192644285, + "eval_loss": 0.5777615904808044, + "eval_runtime": 290.3126, + "eval_samples_per_second": 11.97, + "eval_steps_per_second": 11.97, + "step": 13450 + }, + { + "epoch": 0.625706637524378, + "grad_norm": 9.126480102539062, + "learning_rate": 3.3653868898876187e-06, + "loss": 0.5853212833404541, + "memory(GiB)": 40.03, + "step": 13455, + "token_acc": 0.8296562545029539, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.6259391557843277, + "grad_norm": 9.626083374023438, + "learning_rate": 3.3617537444966515e-06, + "loss": 0.576531457901001, + "memory(GiB)": 40.03, + "step": 13460, + "token_acc": 0.8597442851607904, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.6261716740442773, + "grad_norm": 11.668168067932129, + "learning_rate": 3.358121567851274e-06, + "loss": 0.6511495590209961, + "memory(GiB)": 40.03, + "step": 13465, + "token_acc": 0.8036732108929703, + "train_speed(iter/s)": 0.095841 + }, + { + "epoch": 0.6264041923042268, + "grad_norm": 6.140625, + "learning_rate": 3.354490362099308e-06, + "loss": 0.7017635822296142, + "memory(GiB)": 40.03, + "step": 13470, + "token_acc": 0.8272281511393135, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.6266367105641765, + "grad_norm": 8.743377685546875, + "learning_rate": 3.350860129387993e-06, + "loss": 0.7101643562316895, + "memory(GiB)": 40.03, + "step": 13475, + "token_acc": 0.8201840894148587, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.6268692288241261, + "grad_norm": 6.7029337882995605, + "learning_rate": 3.347230871864e-06, + "loss": 0.6393361568450928, + "memory(GiB)": 40.03, + "step": 13480, + "token_acc": 0.835424883470778, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.6271017470840757, + "grad_norm": 9.114521026611328, + "learning_rate": 3.3436025916734207e-06, + "loss": 0.6931623458862305, + "memory(GiB)": 40.03, + "step": 13485, + "token_acc": 0.8219291014014839, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.6273342653440253, + "grad_norm": 7.915518283843994, + "learning_rate": 3.339975290961771e-06, + "loss": 0.6445385456085205, + "memory(GiB)": 40.03, + "step": 13490, + "token_acc": 0.8270348837209303, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.6275667836039749, + "grad_norm": 10.833139419555664, + "learning_rate": 3.3363489718739817e-06, + "loss": 0.7753934383392334, + "memory(GiB)": 43.68, + "step": 13495, + "token_acc": 0.7683600947617795, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.6277993018639245, + "grad_norm": 7.672884464263916, + "learning_rate": 3.3327236365544115e-06, + "loss": 0.7468546390533447, + "memory(GiB)": 43.68, + "step": 13500, + "token_acc": 0.8008849557522124, + "train_speed(iter/s)": 0.095979 + }, + { + "epoch": 0.6277993018639245, + "eval_loss": 0.5770713686943054, + "eval_runtime": 293.1909, + "eval_samples_per_second": 11.852, + "eval_steps_per_second": 11.852, + "step": 13500 + }, + { + "epoch": 0.6280318201238742, + "grad_norm": 8.18341064453125, + "learning_rate": 3.3290992871468286e-06, + "loss": 0.5357805252075195, + "memory(GiB)": 43.68, + "step": 13505, + "token_acc": 0.8304309882384967, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.6282643383838237, + "grad_norm": 8.50880241394043, + "learning_rate": 3.3254759257944284e-06, + "loss": 0.5876242637634277, + "memory(GiB)": 43.68, + "step": 13510, + "token_acc": 0.8617998163452709, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.6284968566437733, + "grad_norm": 8.151193618774414, + "learning_rate": 3.321853554639811e-06, + "loss": 0.6524288654327393, + "memory(GiB)": 43.68, + "step": 13515, + "token_acc": 0.8302622253720765, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.6287293749037229, + "grad_norm": 11.721169471740723, + "learning_rate": 3.3182321758249997e-06, + "loss": 0.5465070724487304, + "memory(GiB)": 43.68, + "step": 13520, + "token_acc": 0.8517136070580251, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.6289618931636725, + "grad_norm": 5.964034557342529, + "learning_rate": 3.3146117914914257e-06, + "loss": 0.628056812286377, + "memory(GiB)": 43.68, + "step": 13525, + "token_acc": 0.84767393989296, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.6291944114236221, + "grad_norm": 6.987983703613281, + "learning_rate": 3.310992403779934e-06, + "loss": 0.6603175640106201, + "memory(GiB)": 43.68, + "step": 13530, + "token_acc": 0.8353322528363047, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.6294269296835717, + "grad_norm": 8.833318710327148, + "learning_rate": 3.3073740148307833e-06, + "loss": 0.6114417552947998, + "memory(GiB)": 43.68, + "step": 13535, + "token_acc": 0.8437158469945355, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.6296594479435214, + "grad_norm": 7.671047210693359, + "learning_rate": 3.3037566267836347e-06, + "loss": 0.6404377460479737, + "memory(GiB)": 43.68, + "step": 13540, + "token_acc": 0.8379737045630317, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.6298919662034709, + "grad_norm": 8.291642189025879, + "learning_rate": 3.300140241777564e-06, + "loss": 0.5734952449798584, + "memory(GiB)": 43.68, + "step": 13545, + "token_acc": 0.8604651162790697, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.6301244844634205, + "grad_norm": 8.439014434814453, + "learning_rate": 3.2965248619510494e-06, + "loss": 0.6413207530975342, + "memory(GiB)": 43.68, + "step": 13550, + "token_acc": 0.8401782871696912, + "train_speed(iter/s)": 0.095982 + }, + { + "epoch": 0.6301244844634205, + "eval_loss": 0.578080952167511, + "eval_runtime": 289.0829, + "eval_samples_per_second": 12.021, + "eval_steps_per_second": 12.021, + "step": 13550 + }, + { + "epoch": 0.6303570027233701, + "grad_norm": 6.432934761047363, + "learning_rate": 3.2929104894419806e-06, + "loss": 0.7349601745605469, + "memory(GiB)": 43.68, + "step": 13555, + "token_acc": 0.8285882287503278, + "train_speed(iter/s)": 0.095805 + }, + { + "epoch": 0.6305895209833198, + "grad_norm": 7.739002227783203, + "learning_rate": 3.2892971263876416e-06, + "loss": 0.5691585540771484, + "memory(GiB)": 43.68, + "step": 13560, + "token_acc": 0.8558882235528942, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.6308220392432693, + "grad_norm": 8.967711448669434, + "learning_rate": 3.28568477492473e-06, + "loss": 0.7265130996704101, + "memory(GiB)": 43.68, + "step": 13565, + "token_acc": 0.8324140857021638, + "train_speed(iter/s)": 0.095845 + }, + { + "epoch": 0.6310545575032189, + "grad_norm": 7.715224266052246, + "learning_rate": 3.2820734371893394e-06, + "loss": 0.6721633434295654, + "memory(GiB)": 43.68, + "step": 13570, + "token_acc": 0.8380987746008169, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.6312870757631686, + "grad_norm": 6.480725288391113, + "learning_rate": 3.2784631153169667e-06, + "loss": 0.6947125434875489, + "memory(GiB)": 43.68, + "step": 13575, + "token_acc": 0.8141641504254735, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.6315195940231181, + "grad_norm": 8.174640655517578, + "learning_rate": 3.274853811442503e-06, + "loss": 0.6244683742523194, + "memory(GiB)": 43.68, + "step": 13580, + "token_acc": 0.843908135461269, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.6317521122830677, + "grad_norm": 10.560827255249023, + "learning_rate": 3.271245527700245e-06, + "loss": 0.6520066738128663, + "memory(GiB)": 43.68, + "step": 13585, + "token_acc": 0.8298475717830557, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.6319846305430173, + "grad_norm": 6.402472972869873, + "learning_rate": 3.2676382662238792e-06, + "loss": 0.6191014289855957, + "memory(GiB)": 43.68, + "step": 13590, + "token_acc": 0.8441325768886234, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.632217148802967, + "grad_norm": 9.326212882995605, + "learning_rate": 3.264032029146495e-06, + "loss": 0.8372744560241699, + "memory(GiB)": 43.68, + "step": 13595, + "token_acc": 0.80083857442348, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.6324496670629165, + "grad_norm": 5.53291130065918, + "learning_rate": 3.260426818600566e-06, + "loss": 0.6795487403869629, + "memory(GiB)": 43.68, + "step": 13600, + "token_acc": 0.8197094844773569, + "train_speed(iter/s)": 0.095986 + }, + { + "epoch": 0.6324496670629165, + "eval_loss": 0.5772190690040588, + "eval_runtime": 295.8738, + "eval_samples_per_second": 11.745, + "eval_steps_per_second": 11.745, + "step": 13600 + }, + { + "epoch": 0.6326821853228661, + "grad_norm": 5.555092811584473, + "learning_rate": 3.2568226367179695e-06, + "loss": 0.7151205539703369, + "memory(GiB)": 43.68, + "step": 13605, + "token_acc": 0.8287276524178828, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.6329147035828158, + "grad_norm": 8.747469902038574, + "learning_rate": 3.253219485629966e-06, + "loss": 0.7140115737915039, + "memory(GiB)": 43.68, + "step": 13610, + "token_acc": 0.8367752184273233, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.6331472218427653, + "grad_norm": 6.2085747718811035, + "learning_rate": 3.249617367467214e-06, + "loss": 0.6731038570404053, + "memory(GiB)": 43.68, + "step": 13615, + "token_acc": 0.8313609467455622, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.6333797401027149, + "grad_norm": 9.040670394897461, + "learning_rate": 3.246016284359752e-06, + "loss": 0.7263114929199219, + "memory(GiB)": 43.68, + "step": 13620, + "token_acc": 0.8259759211966435, + "train_speed(iter/s)": 0.095864 + }, + { + "epoch": 0.6336122583626645, + "grad_norm": 9.780359268188477, + "learning_rate": 3.242416238437015e-06, + "loss": 0.631324291229248, + "memory(GiB)": 43.68, + "step": 13625, + "token_acc": 0.8524399690162665, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.6338447766226142, + "grad_norm": 8.346222877502441, + "learning_rate": 3.23881723182782e-06, + "loss": 0.5538475036621093, + "memory(GiB)": 43.68, + "step": 13630, + "token_acc": 0.8669991687448046, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.6340772948825637, + "grad_norm": 7.389737129211426, + "learning_rate": 3.2352192666603733e-06, + "loss": 0.6332985877990722, + "memory(GiB)": 43.68, + "step": 13635, + "token_acc": 0.8622803432774826, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.6343098131425133, + "grad_norm": 7.637555122375488, + "learning_rate": 3.231622345062259e-06, + "loss": 0.5670449733734131, + "memory(GiB)": 43.68, + "step": 13640, + "token_acc": 0.8635014836795252, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.634542331402463, + "grad_norm": 8.666919708251953, + "learning_rate": 3.2280264691604505e-06, + "loss": 0.6867550373077392, + "memory(GiB)": 43.68, + "step": 13645, + "token_acc": 0.8319641523525019, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.6347748496624126, + "grad_norm": 9.447999000549316, + "learning_rate": 3.224431641081298e-06, + "loss": 0.6106939792633057, + "memory(GiB)": 43.68, + "step": 13650, + "token_acc": 0.8417582417582418, + "train_speed(iter/s)": 0.095983 + }, + { + "epoch": 0.6347748496624126, + "eval_loss": 0.5787781476974487, + "eval_runtime": 291.6325, + "eval_samples_per_second": 11.916, + "eval_steps_per_second": 11.916, + "step": 13650 + }, + { + "epoch": 0.6350073679223621, + "grad_norm": 8.59967041015625, + "learning_rate": 3.2208378629505366e-06, + "loss": 0.7382328510284424, + "memory(GiB)": 43.68, + "step": 13655, + "token_acc": 0.8293154714326821, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.6352398861823118, + "grad_norm": 9.68194580078125, + "learning_rate": 3.217245136893279e-06, + "loss": 0.7147025108337403, + "memory(GiB)": 43.68, + "step": 13660, + "token_acc": 0.8270995059985886, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.6354724044422614, + "grad_norm": 6.674454212188721, + "learning_rate": 3.2136534650340117e-06, + "loss": 0.700990867614746, + "memory(GiB)": 43.68, + "step": 13665, + "token_acc": 0.8329571106094809, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.6357049227022109, + "grad_norm": 7.11359167098999, + "learning_rate": 3.2100628494966033e-06, + "loss": 0.7956765651702881, + "memory(GiB)": 43.68, + "step": 13670, + "token_acc": 0.7988103568929321, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.6359374409621605, + "grad_norm": 8.432509422302246, + "learning_rate": 3.206473292404295e-06, + "loss": 0.6146938323974609, + "memory(GiB)": 43.68, + "step": 13675, + "token_acc": 0.8451695457453615, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.6361699592221102, + "grad_norm": 7.3090901374816895, + "learning_rate": 3.202884795879705e-06, + "loss": 0.6174387454986572, + "memory(GiB)": 43.68, + "step": 13680, + "token_acc": 0.8505385996409336, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.6364024774820598, + "grad_norm": 9.991720199584961, + "learning_rate": 3.1992973620448178e-06, + "loss": 0.6521985054016113, + "memory(GiB)": 43.68, + "step": 13685, + "token_acc": 0.8370843130668458, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.6366349957420093, + "grad_norm": 6.456875324249268, + "learning_rate": 3.1957109930209972e-06, + "loss": 0.6120789051055908, + "memory(GiB)": 43.68, + "step": 13690, + "token_acc": 0.845568783068783, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.636867514001959, + "grad_norm": 6.283238410949707, + "learning_rate": 3.1921256909289717e-06, + "loss": 0.5926040172576904, + "memory(GiB)": 43.68, + "step": 13695, + "token_acc": 0.8515030785947121, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.6371000322619086, + "grad_norm": 6.858433246612549, + "learning_rate": 3.188541457888844e-06, + "loss": 0.6875529289245605, + "memory(GiB)": 43.68, + "step": 13700, + "token_acc": 0.8313755210306935, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.6371000322619086, + "eval_loss": 0.5758278965950012, + "eval_runtime": 293.8539, + "eval_samples_per_second": 11.826, + "eval_steps_per_second": 11.826, + "step": 13700 + }, + { + "epoch": 0.6373325505218582, + "grad_norm": 8.783116340637207, + "learning_rate": 3.184958296020078e-06, + "loss": 0.5677198886871337, + "memory(GiB)": 43.68, + "step": 13705, + "token_acc": 0.8306381276255251, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.6375650687818077, + "grad_norm": 8.005859375, + "learning_rate": 3.181376207441511e-06, + "loss": 0.647659158706665, + "memory(GiB)": 43.68, + "step": 13710, + "token_acc": 0.840589417280643, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.6377975870417574, + "grad_norm": 6.116490840911865, + "learning_rate": 3.1777951942713407e-06, + "loss": 0.643339729309082, + "memory(GiB)": 43.68, + "step": 13715, + "token_acc": 0.8404487379245871, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.638030105301707, + "grad_norm": 8.654963493347168, + "learning_rate": 3.1742152586271336e-06, + "loss": 0.6308553695678711, + "memory(GiB)": 43.68, + "step": 13720, + "token_acc": 0.8474514118078474, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.6382626235616565, + "grad_norm": 8.51541519165039, + "learning_rate": 3.170636402625812e-06, + "loss": 0.6695310592651367, + "memory(GiB)": 43.68, + "step": 13725, + "token_acc": 0.8303973781237198, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.6384951418216062, + "grad_norm": 8.290445327758789, + "learning_rate": 3.167058628383667e-06, + "loss": 0.6182727336883544, + "memory(GiB)": 43.68, + "step": 13730, + "token_acc": 0.8479587048334115, + "train_speed(iter/s)": 0.095906 + }, + { + "epoch": 0.6387276600815558, + "grad_norm": 6.582568645477295, + "learning_rate": 3.163481938016345e-06, + "loss": 0.5673922538757324, + "memory(GiB)": 43.68, + "step": 13735, + "token_acc": 0.8598392170569731, + "train_speed(iter/s)": 0.095925 + }, + { + "epoch": 0.6389601783415054, + "grad_norm": 7.43300724029541, + "learning_rate": 3.159906333638856e-06, + "loss": 0.6439460754394531, + "memory(GiB)": 43.68, + "step": 13740, + "token_acc": 0.8429054054054054, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.639192696601455, + "grad_norm": 9.422440528869629, + "learning_rate": 3.1563318173655623e-06, + "loss": 0.5517416000366211, + "memory(GiB)": 43.68, + "step": 13745, + "token_acc": 0.8609561752988047, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.6394252148614046, + "grad_norm": 9.674040794372559, + "learning_rate": 3.1527583913101878e-06, + "loss": 0.6156484127044678, + "memory(GiB)": 43.68, + "step": 13750, + "token_acc": 0.8474264705882353, + "train_speed(iter/s)": 0.095984 + }, + { + "epoch": 0.6394252148614046, + "eval_loss": 0.5760475397109985, + "eval_runtime": 294.3994, + "eval_samples_per_second": 11.804, + "eval_steps_per_second": 11.804, + "step": 13750 + }, + { + "epoch": 0.6396577331213542, + "grad_norm": 7.030127048492432, + "learning_rate": 3.1491860575858084e-06, + "loss": 0.6011571884155273, + "memory(GiB)": 43.68, + "step": 13755, + "token_acc": 0.8305973680200807, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.6398902513813038, + "grad_norm": 9.282323837280273, + "learning_rate": 3.1456148183048583e-06, + "loss": 0.6616458892822266, + "memory(GiB)": 43.68, + "step": 13760, + "token_acc": 0.8436350257542311, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.6401227696412534, + "grad_norm": 6.603138446807861, + "learning_rate": 3.1420446755791157e-06, + "loss": 0.7302883625030517, + "memory(GiB)": 43.68, + "step": 13765, + "token_acc": 0.8173349534224382, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.640355287901203, + "grad_norm": 5.0972137451171875, + "learning_rate": 3.138475631519723e-06, + "loss": 0.859835147857666, + "memory(GiB)": 43.68, + "step": 13770, + "token_acc": 0.7932816537467701, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.6405878061611526, + "grad_norm": 8.140726089477539, + "learning_rate": 3.1349076882371597e-06, + "loss": 0.6677399635314941, + "memory(GiB)": 43.68, + "step": 13775, + "token_acc": 0.8311345646437994, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.6408203244211021, + "grad_norm": 9.439065933227539, + "learning_rate": 3.1313408478412677e-06, + "loss": 0.5588486671447754, + "memory(GiB)": 43.68, + "step": 13780, + "token_acc": 0.8525798525798526, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.6410528426810518, + "grad_norm": 6.783348083496094, + "learning_rate": 3.127775112441222e-06, + "loss": 0.696048355102539, + "memory(GiB)": 43.68, + "step": 13785, + "token_acc": 0.8311071534579995, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.6412853609410014, + "grad_norm": 6.2189531326293945, + "learning_rate": 3.124210484145558e-06, + "loss": 0.7082841396331787, + "memory(GiB)": 43.68, + "step": 13790, + "token_acc": 0.8230918499353169, + "train_speed(iter/s)": 0.095942 + }, + { + "epoch": 0.641517879200951, + "grad_norm": 8.979401588439941, + "learning_rate": 3.1206469650621496e-06, + "loss": 0.7509594440460206, + "memory(GiB)": 43.68, + "step": 13795, + "token_acc": 0.8189786585365854, + "train_speed(iter/s)": 0.095962 + }, + { + "epoch": 0.6417503974609006, + "grad_norm": 6.314192295074463, + "learning_rate": 3.117084557298213e-06, + "loss": 0.6499105453491211, + "memory(GiB)": 43.68, + "step": 13800, + "token_acc": 0.8217032184562802, + "train_speed(iter/s)": 0.095981 + }, + { + "epoch": 0.6417503974609006, + "eval_loss": 0.5744973421096802, + "eval_runtime": 294.748, + "eval_samples_per_second": 11.79, + "eval_steps_per_second": 11.79, + "step": 13800 + }, + { + "epoch": 0.6419829157208502, + "grad_norm": 6.625218391418457, + "learning_rate": 3.113523262960313e-06, + "loss": 0.7195854663848877, + "memory(GiB)": 43.68, + "step": 13805, + "token_acc": 0.8297727636421828, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.6422154339807998, + "grad_norm": 7.311429500579834, + "learning_rate": 3.1099630841543504e-06, + "loss": 0.6338868618011475, + "memory(GiB)": 43.68, + "step": 13810, + "token_acc": 0.8433029908972692, + "train_speed(iter/s)": 0.095823 + }, + { + "epoch": 0.6424479522407494, + "grad_norm": 12.813942909240723, + "learning_rate": 3.106404022985572e-06, + "loss": 0.6633894443511963, + "memory(GiB)": 43.68, + "step": 13815, + "token_acc": 0.8306645316253003, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.642680470500699, + "grad_norm": 6.483264446258545, + "learning_rate": 3.102846081558556e-06, + "loss": 0.7106656551361084, + "memory(GiB)": 43.68, + "step": 13820, + "token_acc": 0.8238815374921235, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.6429129887606486, + "grad_norm": 8.264899253845215, + "learning_rate": 3.099289261977227e-06, + "loss": 0.8299205780029297, + "memory(GiB)": 43.68, + "step": 13825, + "token_acc": 0.7785035629453682, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.6431455070205983, + "grad_norm": 8.50506591796875, + "learning_rate": 3.0957335663448397e-06, + "loss": 0.6548618793487548, + "memory(GiB)": 43.68, + "step": 13830, + "token_acc": 0.8282962470939887, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.6433780252805478, + "grad_norm": 6.441330432891846, + "learning_rate": 3.0921789967639893e-06, + "loss": 0.6297261238098144, + "memory(GiB)": 43.68, + "step": 13835, + "token_acc": 0.8375950241879752, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.6436105435404974, + "grad_norm": 9.392109870910645, + "learning_rate": 3.088625555336599e-06, + "loss": 0.768109655380249, + "memory(GiB)": 43.68, + "step": 13840, + "token_acc": 0.8078817733990148, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.643843061800447, + "grad_norm": 8.62150764465332, + "learning_rate": 3.085073244163932e-06, + "loss": 0.7165932178497314, + "memory(GiB)": 43.68, + "step": 13845, + "token_acc": 0.8293402137854773, + "train_speed(iter/s)": 0.095958 + }, + { + "epoch": 0.6440755800603967, + "grad_norm": 8.874046325683594, + "learning_rate": 3.081522065346576e-06, + "loss": 0.6085056304931641, + "memory(GiB)": 43.68, + "step": 13850, + "token_acc": 0.8465703971119134, + "train_speed(iter/s)": 0.095978 + }, + { + "epoch": 0.6440755800603967, + "eval_loss": 0.5765819549560547, + "eval_runtime": 291.2604, + "eval_samples_per_second": 11.931, + "eval_steps_per_second": 11.931, + "step": 13850 + }, + { + "epoch": 0.6443080983203462, + "grad_norm": 8.001680374145508, + "learning_rate": 3.077972020984458e-06, + "loss": 0.6871384620666504, + "memory(GiB)": 43.68, + "step": 13855, + "token_acc": 0.8298419082732809, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.6445406165802958, + "grad_norm": 5.726327419281006, + "learning_rate": 3.074423113176822e-06, + "loss": 0.6752862453460693, + "memory(GiB)": 43.68, + "step": 13860, + "token_acc": 0.8349722103463019, + "train_speed(iter/s)": 0.095823 + }, + { + "epoch": 0.6447731348402455, + "grad_norm": 6.493947505950928, + "learning_rate": 3.070875344022252e-06, + "loss": 0.6672042369842529, + "memory(GiB)": 43.68, + "step": 13865, + "token_acc": 0.8307134220072552, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.645005653100195, + "grad_norm": 8.697665214538574, + "learning_rate": 3.0673287156186503e-06, + "loss": 0.6020650386810302, + "memory(GiB)": 43.68, + "step": 13870, + "token_acc": 0.8491048593350383, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.6452381713601446, + "grad_norm": 7.544933319091797, + "learning_rate": 3.063783230063252e-06, + "loss": 0.6294498920440674, + "memory(GiB)": 43.68, + "step": 13875, + "token_acc": 0.8423893486865779, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.6454706896200942, + "grad_norm": 9.882427215576172, + "learning_rate": 3.060238889452607e-06, + "loss": 0.6869438648223877, + "memory(GiB)": 43.68, + "step": 13880, + "token_acc": 0.8407671721677074, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.6457032078800439, + "grad_norm": 7.94822359085083, + "learning_rate": 3.0566956958825965e-06, + "loss": 0.6243470191955567, + "memory(GiB)": 43.68, + "step": 13885, + "token_acc": 0.8395860284605433, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.6459357261399934, + "grad_norm": 8.722342491149902, + "learning_rate": 3.0531536514484183e-06, + "loss": 0.759521484375, + "memory(GiB)": 43.68, + "step": 13890, + "token_acc": 0.8233936129280492, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.646168244399943, + "grad_norm": 8.995368003845215, + "learning_rate": 3.0496127582445955e-06, + "loss": 0.6832744598388671, + "memory(GiB)": 43.68, + "step": 13895, + "token_acc": 0.8318135764944276, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.6464007626598927, + "grad_norm": 8.118508338928223, + "learning_rate": 3.0460730183649646e-06, + "loss": 0.6283859252929688, + "memory(GiB)": 43.68, + "step": 13900, + "token_acc": 0.8495370370370371, + "train_speed(iter/s)": 0.095978 + }, + { + "epoch": 0.6464007626598927, + "eval_loss": 0.5750031471252441, + "eval_runtime": 292.3705, + "eval_samples_per_second": 11.886, + "eval_steps_per_second": 11.886, + "step": 13900 + }, + { + "epoch": 0.6466332809198423, + "grad_norm": 6.906221389770508, + "learning_rate": 3.0425344339026842e-06, + "loss": 0.6433838367462158, + "memory(GiB)": 43.68, + "step": 13905, + "token_acc": 0.8298588838999359, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.6468657991797918, + "grad_norm": 7.201536178588867, + "learning_rate": 3.0389970069502282e-06, + "loss": 0.5808377742767334, + "memory(GiB)": 43.68, + "step": 13910, + "token_acc": 0.8401525658807212, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.6470983174397414, + "grad_norm": 6.924560070037842, + "learning_rate": 3.0354607395993897e-06, + "loss": 0.5945795059204102, + "memory(GiB)": 43.68, + "step": 13915, + "token_acc": 0.849733570159858, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.6473308356996911, + "grad_norm": 9.050676345825195, + "learning_rate": 3.031925633941267e-06, + "loss": 0.68319673538208, + "memory(GiB)": 43.68, + "step": 13920, + "token_acc": 0.8365591397849462, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.6475633539596406, + "grad_norm": 8.45664119720459, + "learning_rate": 3.02839169206628e-06, + "loss": 0.5608391761779785, + "memory(GiB)": 43.68, + "step": 13925, + "token_acc": 0.8556187766714083, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.6477958722195902, + "grad_norm": 6.573840618133545, + "learning_rate": 3.024858916064158e-06, + "loss": 0.664864444732666, + "memory(GiB)": 43.68, + "step": 13930, + "token_acc": 0.8237134909596662, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.6480283904795399, + "grad_norm": 9.13306713104248, + "learning_rate": 3.0213273080239407e-06, + "loss": 0.6471784114837646, + "memory(GiB)": 43.68, + "step": 13935, + "token_acc": 0.8430813124108416, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.6482609087394895, + "grad_norm": 8.26430892944336, + "learning_rate": 3.0177968700339783e-06, + "loss": 0.6823818683624268, + "memory(GiB)": 43.68, + "step": 13940, + "token_acc": 0.8463696948439144, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.648493426999439, + "grad_norm": 7.3765435218811035, + "learning_rate": 3.0142676041819235e-06, + "loss": 0.5814319610595703, + "memory(GiB)": 43.68, + "step": 13945, + "token_acc": 0.8620386643233744, + "train_speed(iter/s)": 0.095959 + }, + { + "epoch": 0.6487259452593886, + "grad_norm": 6.571820259094238, + "learning_rate": 3.010739512554744e-06, + "loss": 0.6527836322784424, + "memory(GiB)": 43.68, + "step": 13950, + "token_acc": 0.8295980078263963, + "train_speed(iter/s)": 0.095978 + }, + { + "epoch": 0.6487259452593886, + "eval_loss": 0.574194610118866, + "eval_runtime": 295.9557, + "eval_samples_per_second": 11.742, + "eval_steps_per_second": 11.742, + "step": 13950 + }, + { + "epoch": 0.6489584635193383, + "grad_norm": 7.557634353637695, + "learning_rate": 3.0072125972387066e-06, + "loss": 0.562659215927124, + "memory(GiB)": 43.68, + "step": 13955, + "token_acc": 0.8313536907910556, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.6491909817792878, + "grad_norm": 10.294422149658203, + "learning_rate": 3.0036868603193894e-06, + "loss": 0.664063835144043, + "memory(GiB)": 43.68, + "step": 13960, + "token_acc": 0.8424485699949824, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.6494235000392374, + "grad_norm": 9.17708969116211, + "learning_rate": 3.000162303881664e-06, + "loss": 0.6378396511077881, + "memory(GiB)": 43.68, + "step": 13965, + "token_acc": 0.8550347222222222, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.6496560182991871, + "grad_norm": 7.761695384979248, + "learning_rate": 2.996638930009713e-06, + "loss": 0.6696043491363526, + "memory(GiB)": 43.68, + "step": 13970, + "token_acc": 0.8345890410958904, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.6498885365591367, + "grad_norm": 8.447530746459961, + "learning_rate": 2.9931167407870142e-06, + "loss": 0.7301533222198486, + "memory(GiB)": 43.68, + "step": 13975, + "token_acc": 0.8180955393862702, + "train_speed(iter/s)": 0.095878 + }, + { + "epoch": 0.6501210548190862, + "grad_norm": 5.710422992706299, + "learning_rate": 2.9895957382963507e-06, + "loss": 0.6815677642822265, + "memory(GiB)": 43.68, + "step": 13980, + "token_acc": 0.8277608915906788, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.6503535730790359, + "grad_norm": 7.333799839019775, + "learning_rate": 2.9860759246197948e-06, + "loss": 0.6125518321990967, + "memory(GiB)": 43.68, + "step": 13985, + "token_acc": 0.839831401475237, + "train_speed(iter/s)": 0.095915 + }, + { + "epoch": 0.6505860913389855, + "grad_norm": 5.481876850128174, + "learning_rate": 2.9825573018387245e-06, + "loss": 0.781313419342041, + "memory(GiB)": 43.68, + "step": 13990, + "token_acc": 0.7800857237059018, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.6508186095989351, + "grad_norm": 7.97532844543457, + "learning_rate": 2.9790398720338076e-06, + "loss": 0.5176995277404786, + "memory(GiB)": 43.68, + "step": 13995, + "token_acc": 0.8639365918097754, + "train_speed(iter/s)": 0.095953 + }, + { + "epoch": 0.6510511278588846, + "grad_norm": 9.163570404052734, + "learning_rate": 2.975523637285013e-06, + "loss": 0.6193868160247803, + "memory(GiB)": 43.68, + "step": 14000, + "token_acc": 0.8512965964343598, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.6510511278588846, + "eval_loss": 0.5734658241271973, + "eval_runtime": 295.8049, + "eval_samples_per_second": 11.748, + "eval_steps_per_second": 11.748, + "step": 14000 + }, + { + "epoch": 0.6512836461188343, + "grad_norm": 6.884223937988281, + "learning_rate": 2.9720085996715934e-06, + "loss": 0.7771946430206299, + "memory(GiB)": 43.68, + "step": 14005, + "token_acc": 0.8298910314151567, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.6515161643787839, + "grad_norm": 6.871092319488525, + "learning_rate": 2.968494761272104e-06, + "loss": 0.6113077640533447, + "memory(GiB)": 43.68, + "step": 14010, + "token_acc": 0.8521489971346705, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.6517486826387334, + "grad_norm": 9.444489479064941, + "learning_rate": 2.9649821241643815e-06, + "loss": 0.7457359790802002, + "memory(GiB)": 43.68, + "step": 14015, + "token_acc": 0.8287831513260531, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.651981200898683, + "grad_norm": 8.061306953430176, + "learning_rate": 2.9614706904255618e-06, + "loss": 0.7024789333343506, + "memory(GiB)": 43.68, + "step": 14020, + "token_acc": 0.8234827449424832, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.6522137191586327, + "grad_norm": 6.835376739501953, + "learning_rate": 2.957960462132059e-06, + "loss": 0.737022066116333, + "memory(GiB)": 43.68, + "step": 14025, + "token_acc": 0.8126159554730983, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.6524462374185823, + "grad_norm": 8.676056861877441, + "learning_rate": 2.9544514413595826e-06, + "loss": 0.6453481674194336, + "memory(GiB)": 43.68, + "step": 14030, + "token_acc": 0.8382838283828383, + "train_speed(iter/s)": 0.095894 + }, + { + "epoch": 0.6526787556785318, + "grad_norm": 5.367279529571533, + "learning_rate": 2.950943630183123e-06, + "loss": 0.6223780155181885, + "memory(GiB)": 43.68, + "step": 14035, + "token_acc": 0.8357348703170029, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.6529112739384815, + "grad_norm": 6.280850410461426, + "learning_rate": 2.947437030676961e-06, + "loss": 0.6186736583709717, + "memory(GiB)": 43.68, + "step": 14040, + "token_acc": 0.844964314036479, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.6531437921984311, + "grad_norm": 7.411701679229736, + "learning_rate": 2.9439316449146515e-06, + "loss": 0.6844239234924316, + "memory(GiB)": 43.68, + "step": 14045, + "token_acc": 0.8381320224719101, + "train_speed(iter/s)": 0.095952 + }, + { + "epoch": 0.6533763104583807, + "grad_norm": 6.25632381439209, + "learning_rate": 2.940427474969042e-06, + "loss": 0.45699324607849123, + "memory(GiB)": 43.68, + "step": 14050, + "token_acc": 0.8847736625514403, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.6533763104583807, + "eval_loss": 0.5746976137161255, + "eval_runtime": 295.928, + "eval_samples_per_second": 11.743, + "eval_steps_per_second": 11.743, + "step": 14050 + }, + { + "epoch": 0.6536088287183303, + "grad_norm": 9.883111953735352, + "learning_rate": 2.9369245229122532e-06, + "loss": 0.5956651210784912, + "memory(GiB)": 43.68, + "step": 14055, + "token_acc": 0.8314616620441049, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.6538413469782799, + "grad_norm": 6.37638521194458, + "learning_rate": 2.93342279081569e-06, + "loss": 0.7650864124298096, + "memory(GiB)": 43.68, + "step": 14060, + "token_acc": 0.7998368234974164, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.6540738652382295, + "grad_norm": 7.586302757263184, + "learning_rate": 2.929922280750037e-06, + "loss": 0.559014081954956, + "memory(GiB)": 43.68, + "step": 14065, + "token_acc": 0.8657630083078268, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.654306383498179, + "grad_norm": 8.94262409210205, + "learning_rate": 2.9264229947852506e-06, + "loss": 0.5777543067932129, + "memory(GiB)": 43.68, + "step": 14070, + "token_acc": 0.8611764705882353, + "train_speed(iter/s)": 0.095853 + }, + { + "epoch": 0.6545389017581287, + "grad_norm": 7.716413497924805, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.6495201587677002, + "memory(GiB)": 43.68, + "step": 14075, + "token_acc": 0.8377230246389125, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.6547714200180783, + "grad_norm": 6.944495677947998, + "learning_rate": 2.9194281034344995e-06, + "loss": 0.5988493919372558, + "memory(GiB)": 43.68, + "step": 14080, + "token_acc": 0.8444821731748726, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.6550039382780279, + "grad_norm": 8.53555965423584, + "learning_rate": 2.9159325021848305e-06, + "loss": 0.6086976528167725, + "memory(GiB)": 43.68, + "step": 14085, + "token_acc": 0.8399395998489996, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.6552364565379775, + "grad_norm": 7.036818981170654, + "learning_rate": 2.9124381333086173e-06, + "loss": 0.6461452007293701, + "memory(GiB)": 43.68, + "step": 14090, + "token_acc": 0.8341232227488151, + "train_speed(iter/s)": 0.095928 + }, + { + "epoch": 0.6554689747979271, + "grad_norm": 9.432230949401855, + "learning_rate": 2.9089449988721883e-06, + "loss": 0.6820971965789795, + "memory(GiB)": 43.68, + "step": 14095, + "token_acc": 0.8365271802894017, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.6557014930578767, + "grad_norm": 7.816703796386719, + "learning_rate": 2.9054531009411414e-06, + "loss": 0.6649023532867432, + "memory(GiB)": 43.68, + "step": 14100, + "token_acc": 0.835795836131632, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.6557014930578767, + "eval_loss": 0.572761595249176, + "eval_runtime": 296.5335, + "eval_samples_per_second": 11.719, + "eval_steps_per_second": 11.719, + "step": 14100 + }, + { + "epoch": 0.6559340113178262, + "grad_norm": 9.045654296875, + "learning_rate": 2.901962441580345e-06, + "loss": 0.6884272575378418, + "memory(GiB)": 43.68, + "step": 14105, + "token_acc": 0.8311472914179493, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.6561665295777759, + "grad_norm": 8.36978816986084, + "learning_rate": 2.8984730228539304e-06, + "loss": 0.6390267372131347, + "memory(GiB)": 43.68, + "step": 14110, + "token_acc": 0.844022770398482, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.6563990478377255, + "grad_norm": 9.524076461791992, + "learning_rate": 2.894984846825303e-06, + "loss": 0.6867616653442383, + "memory(GiB)": 43.68, + "step": 14115, + "token_acc": 0.8194259012016022, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.6566315660976751, + "grad_norm": 4.460529804229736, + "learning_rate": 2.8914979155571227e-06, + "loss": 0.8245270729064942, + "memory(GiB)": 43.68, + "step": 14120, + "token_acc": 0.7884366087632876, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.6568640843576247, + "grad_norm": 8.338356971740723, + "learning_rate": 2.888012231111328e-06, + "loss": 0.7064074516296387, + "memory(GiB)": 43.68, + "step": 14125, + "token_acc": 0.8225, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.6570966026175743, + "grad_norm": 8.58897590637207, + "learning_rate": 2.8845277955491046e-06, + "loss": 0.6155064582824707, + "memory(GiB)": 43.68, + "step": 14130, + "token_acc": 0.8537222464083587, + "train_speed(iter/s)": 0.095887 + }, + { + "epoch": 0.6573291208775239, + "grad_norm": 6.647374153137207, + "learning_rate": 2.8810446109309128e-06, + "loss": 0.6504099845886231, + "memory(GiB)": 43.68, + "step": 14135, + "token_acc": 0.8369781312127237, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.6575616391374736, + "grad_norm": 6.2928547859191895, + "learning_rate": 2.8775626793164613e-06, + "loss": 0.7517566204071044, + "memory(GiB)": 43.68, + "step": 14140, + "token_acc": 0.8156510980513455, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.6577941573974231, + "grad_norm": 6.533952236175537, + "learning_rate": 2.8740820027647287e-06, + "loss": 0.6002644062042236, + "memory(GiB)": 43.68, + "step": 14145, + "token_acc": 0.8577603143418467, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.6580266756573727, + "grad_norm": 8.172672271728516, + "learning_rate": 2.8706025833339426e-06, + "loss": 0.6353270530700683, + "memory(GiB)": 43.68, + "step": 14150, + "token_acc": 0.8471074380165289, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.6580266756573727, + "eval_loss": 0.5736487507820129, + "eval_runtime": 291.4912, + "eval_samples_per_second": 11.921, + "eval_steps_per_second": 11.921, + "step": 14150 + }, + { + "epoch": 0.6582591939173223, + "grad_norm": 8.132963180541992, + "learning_rate": 2.867124423081592e-06, + "loss": 0.6144407749176025, + "memory(GiB)": 43.68, + "step": 14155, + "token_acc": 0.8309699194599739, + "train_speed(iter/s)": 0.095794 + }, + { + "epoch": 0.6584917121772719, + "grad_norm": 8.013964653015137, + "learning_rate": 2.8636475240644224e-06, + "loss": 0.7096580982208252, + "memory(GiB)": 43.68, + "step": 14160, + "token_acc": 0.8162005085361423, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.6587242304372215, + "grad_norm": 6.895023345947266, + "learning_rate": 2.8601718883384323e-06, + "loss": 0.6625951766967774, + "memory(GiB)": 43.68, + "step": 14165, + "token_acc": 0.8329945799457995, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.6589567486971711, + "grad_norm": 5.4100260734558105, + "learning_rate": 2.856697517958868e-06, + "loss": 0.7325220108032227, + "memory(GiB)": 43.68, + "step": 14170, + "token_acc": 0.8107576254509675, + "train_speed(iter/s)": 0.095853 + }, + { + "epoch": 0.6591892669571208, + "grad_norm": 7.335725784301758, + "learning_rate": 2.853224414980237e-06, + "loss": 0.6887342453002929, + "memory(GiB)": 43.68, + "step": 14175, + "token_acc": 0.8252652519893899, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.6594217852170703, + "grad_norm": 8.461999893188477, + "learning_rate": 2.849752581456288e-06, + "loss": 0.7595938682556153, + "memory(GiB)": 43.68, + "step": 14180, + "token_acc": 0.8095395826432593, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.6596543034770199, + "grad_norm": 8.143224716186523, + "learning_rate": 2.846282019440024e-06, + "loss": 0.7325577735900879, + "memory(GiB)": 43.68, + "step": 14185, + "token_acc": 0.8076923076923077, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.6598868217369696, + "grad_norm": 8.338869094848633, + "learning_rate": 2.8428127309837e-06, + "loss": 0.6581947326660156, + "memory(GiB)": 43.68, + "step": 14190, + "token_acc": 0.8365147783251231, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.6601193399969192, + "grad_norm": 7.938302516937256, + "learning_rate": 2.839344718138808e-06, + "loss": 0.7366507530212403, + "memory(GiB)": 43.68, + "step": 14195, + "token_acc": 0.8035604665438919, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.6603518582568687, + "grad_norm": 8.492761611938477, + "learning_rate": 2.8358779829560925e-06, + "loss": 0.6317257404327392, + "memory(GiB)": 43.68, + "step": 14200, + "token_acc": 0.835741980965809, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.6603518582568687, + "eval_loss": 0.5748091340065002, + "eval_runtime": 292.2811, + "eval_samples_per_second": 11.889, + "eval_steps_per_second": 11.889, + "step": 14200 + }, + { + "epoch": 0.6605843765168183, + "grad_norm": 9.483651161193848, + "learning_rate": 2.8324125274855417e-06, + "loss": 0.6270530700683594, + "memory(GiB)": 43.68, + "step": 14205, + "token_acc": 0.8306502044415939, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.660816894776768, + "grad_norm": 11.275638580322266, + "learning_rate": 2.8289483537763896e-06, + "loss": 0.6266158103942872, + "memory(GiB)": 43.68, + "step": 14210, + "token_acc": 0.8413255360623781, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.6610494130367175, + "grad_norm": 6.486075401306152, + "learning_rate": 2.8254854638771024e-06, + "loss": 0.5724782943725586, + "memory(GiB)": 43.68, + "step": 14215, + "token_acc": 0.8566433566433567, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.6612819312966671, + "grad_norm": 8.653912544250488, + "learning_rate": 2.8220238598354e-06, + "loss": 0.7338716983795166, + "memory(GiB)": 43.68, + "step": 14220, + "token_acc": 0.8108202443280977, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.6615144495566168, + "grad_norm": 8.97234058380127, + "learning_rate": 2.8185635436982304e-06, + "loss": 0.7180376052856445, + "memory(GiB)": 43.68, + "step": 14225, + "token_acc": 0.8164863856769862, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.6617469678165664, + "grad_norm": 7.8891730308532715, + "learning_rate": 2.815104517511791e-06, + "loss": 0.7036499977111816, + "memory(GiB)": 43.68, + "step": 14230, + "token_acc": 0.8244940683879972, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.6619794860765159, + "grad_norm": 10.813043594360352, + "learning_rate": 2.8116467833215056e-06, + "loss": 0.5858555316925049, + "memory(GiB)": 43.68, + "step": 14235, + "token_acc": 0.8422420193021529, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.6622120043364655, + "grad_norm": 8.522062301635742, + "learning_rate": 2.8081903431720403e-06, + "loss": 0.6352302074432373, + "memory(GiB)": 43.68, + "step": 14240, + "token_acc": 0.8485938521909745, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.6624445225964152, + "grad_norm": 8.544513702392578, + "learning_rate": 2.804735199107297e-06, + "loss": 0.6336312294006348, + "memory(GiB)": 43.68, + "step": 14245, + "token_acc": 0.8369250562881956, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.6626770408563647, + "grad_norm": 9.956283569335938, + "learning_rate": 2.8012813531704097e-06, + "loss": 0.642596435546875, + "memory(GiB)": 43.68, + "step": 14250, + "token_acc": 0.8246120534103212, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.6626770408563647, + "eval_loss": 0.5738605856895447, + "eval_runtime": 291.7651, + "eval_samples_per_second": 11.91, + "eval_steps_per_second": 11.91, + "step": 14250 + }, + { + "epoch": 0.6629095591163143, + "grad_norm": 9.033814430236816, + "learning_rate": 2.7978288074037397e-06, + "loss": 0.6293091773986816, + "memory(GiB)": 43.68, + "step": 14255, + "token_acc": 0.8305877812884397, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.663142077376264, + "grad_norm": 7.8562703132629395, + "learning_rate": 2.7943775638488896e-06, + "loss": 0.5436077117919922, + "memory(GiB)": 43.68, + "step": 14260, + "token_acc": 0.8550860719874804, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.6633745956362136, + "grad_norm": 8.680620193481445, + "learning_rate": 2.790927624546681e-06, + "loss": 0.6706278324127197, + "memory(GiB)": 43.68, + "step": 14265, + "token_acc": 0.834733893557423, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.6636071138961631, + "grad_norm": 8.097391128540039, + "learning_rate": 2.7874789915371736e-06, + "loss": 0.6728082656860351, + "memory(GiB)": 43.68, + "step": 14270, + "token_acc": 0.8357296908698778, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.6638396321561127, + "grad_norm": 6.790459632873535, + "learning_rate": 2.7840316668596468e-06, + "loss": 0.7238803386688233, + "memory(GiB)": 43.68, + "step": 14275, + "token_acc": 0.8292777134028583, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.6640721504160624, + "grad_norm": 7.590424060821533, + "learning_rate": 2.7805856525526125e-06, + "loss": 0.6790872573852539, + "memory(GiB)": 43.68, + "step": 14280, + "token_acc": 0.83116095786602, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.664304668676012, + "grad_norm": 8.506343841552734, + "learning_rate": 2.777140950653805e-06, + "loss": 0.6780567646026612, + "memory(GiB)": 43.68, + "step": 14285, + "token_acc": 0.8256519102486355, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.6645371869359615, + "grad_norm": 8.424605369567871, + "learning_rate": 2.7736975632001843e-06, + "loss": 0.7088619709014893, + "memory(GiB)": 43.68, + "step": 14290, + "token_acc": 0.8216023353876095, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.6647697051959112, + "grad_norm": 8.809297561645508, + "learning_rate": 2.770255492227929e-06, + "loss": 0.6911493301391601, + "memory(GiB)": 43.68, + "step": 14295, + "token_acc": 0.8151062155782848, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.6650022234558608, + "grad_norm": 6.6492509841918945, + "learning_rate": 2.766814739772444e-06, + "loss": 0.5668047904968262, + "memory(GiB)": 43.68, + "step": 14300, + "token_acc": 0.8501394978078916, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.6650022234558608, + "eval_loss": 0.5735751986503601, + "eval_runtime": 292.2407, + "eval_samples_per_second": 11.891, + "eval_steps_per_second": 11.891, + "step": 14300 + }, + { + "epoch": 0.6652347417158103, + "grad_norm": 9.933414459228516, + "learning_rate": 2.763375307868351e-06, + "loss": 0.6078849792480469, + "memory(GiB)": 43.68, + "step": 14305, + "token_acc": 0.8308808820823032, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.66546725997576, + "grad_norm": 9.008543014526367, + "learning_rate": 2.7599371985494936e-06, + "loss": 0.7673866748809814, + "memory(GiB)": 43.68, + "step": 14310, + "token_acc": 0.8110333470564018, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.6656997782357096, + "grad_norm": 8.77670955657959, + "learning_rate": 2.75650041384893e-06, + "loss": 0.5974670886993408, + "memory(GiB)": 43.68, + "step": 14315, + "token_acc": 0.8492482730597318, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.6659322964956592, + "grad_norm": 9.20003604888916, + "learning_rate": 2.7530649557989392e-06, + "loss": 0.655994987487793, + "memory(GiB)": 43.68, + "step": 14320, + "token_acc": 0.8544620517097581, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.6661648147556087, + "grad_norm": 8.369126319885254, + "learning_rate": 2.7496308264310124e-06, + "loss": 0.6682295322418212, + "memory(GiB)": 43.68, + "step": 14325, + "token_acc": 0.8262195121951219, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.6663973330155584, + "grad_norm": 9.203055381774902, + "learning_rate": 2.7461980277758566e-06, + "loss": 0.7167951583862304, + "memory(GiB)": 43.68, + "step": 14330, + "token_acc": 0.8232542599398597, + "train_speed(iter/s)": 0.095894 + }, + { + "epoch": 0.666629851275508, + "grad_norm": 8.59557056427002, + "learning_rate": 2.7427665618633938e-06, + "loss": 0.6695326805114746, + "memory(GiB)": 43.68, + "step": 14335, + "token_acc": 0.83780276816609, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.6668623695354576, + "grad_norm": 10.057333946228027, + "learning_rate": 2.7393364307227516e-06, + "loss": 0.5996942043304443, + "memory(GiB)": 43.68, + "step": 14340, + "token_acc": 0.8568464730290456, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.6670948877954072, + "grad_norm": 7.976700782775879, + "learning_rate": 2.7359076363822767e-06, + "loss": 0.635762882232666, + "memory(GiB)": 43.68, + "step": 14345, + "token_acc": 0.8433179723502304, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.6673274060553568, + "grad_norm": 8.137279510498047, + "learning_rate": 2.7324801808695168e-06, + "loss": 0.6215654850006104, + "memory(GiB)": 43.68, + "step": 14350, + "token_acc": 0.8468543046357616, + "train_speed(iter/s)": 0.095969 + }, + { + "epoch": 0.6673274060553568, + "eval_loss": 0.5712202191352844, + "eval_runtime": 294.1138, + "eval_samples_per_second": 11.815, + "eval_steps_per_second": 11.815, + "step": 14350 + }, + { + "epoch": 0.6675599243153064, + "grad_norm": 7.595315933227539, + "learning_rate": 2.7290540662112363e-06, + "loss": 0.6087830543518067, + "memory(GiB)": 43.68, + "step": 14355, + "token_acc": 0.8307561379000273, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.6677924425752559, + "grad_norm": 9.151399612426758, + "learning_rate": 2.7256292944333983e-06, + "loss": 0.6383302688598633, + "memory(GiB)": 43.68, + "step": 14360, + "token_acc": 0.8437638703950289, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.6680249608352056, + "grad_norm": 8.153346061706543, + "learning_rate": 2.722205867561179e-06, + "loss": 0.7106346607208252, + "memory(GiB)": 43.68, + "step": 14365, + "token_acc": 0.8124318429661941, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.6682574790951552, + "grad_norm": 9.878233909606934, + "learning_rate": 2.718783787618956e-06, + "loss": 0.5851165294647217, + "memory(GiB)": 43.68, + "step": 14370, + "token_acc": 0.847217298830202, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.6684899973551048, + "grad_norm": 7.442888259887695, + "learning_rate": 2.715363056630312e-06, + "loss": 0.6458406448364258, + "memory(GiB)": 43.68, + "step": 14375, + "token_acc": 0.8395563770794824, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.6687225156150544, + "grad_norm": 8.392727851867676, + "learning_rate": 2.7119436766180273e-06, + "loss": 0.6156882286071778, + "memory(GiB)": 43.68, + "step": 14380, + "token_acc": 0.8352173913043478, + "train_speed(iter/s)": 0.095893 + }, + { + "epoch": 0.668955033875004, + "grad_norm": 7.722550392150879, + "learning_rate": 2.7085256496040914e-06, + "loss": 0.5698171615600586, + "memory(GiB)": 43.68, + "step": 14385, + "token_acc": 0.8557172557172558, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.6691875521349536, + "grad_norm": 7.979699611663818, + "learning_rate": 2.7051089776096846e-06, + "loss": 0.6607985973358155, + "memory(GiB)": 43.68, + "step": 14390, + "token_acc": 0.8231414868105515, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.6694200703949033, + "grad_norm": 7.391639709472656, + "learning_rate": 2.701693662655195e-06, + "loss": 0.5928860187530518, + "memory(GiB)": 43.68, + "step": 14395, + "token_acc": 0.8518250813155042, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.6696525886548528, + "grad_norm": 8.907283782958984, + "learning_rate": 2.6982797067601997e-06, + "loss": 0.538227128982544, + "memory(GiB)": 43.68, + "step": 14400, + "token_acc": 0.8650433347090384, + "train_speed(iter/s)": 0.095967 + }, + { + "epoch": 0.6696525886548528, + "eval_loss": 0.5737596154212952, + "eval_runtime": 297.3936, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 11.685, + "step": 14400 + }, + { + "epoch": 0.6698851069148024, + "grad_norm": 6.146030902862549, + "learning_rate": 2.694867111943478e-06, + "loss": 0.6597367763519287, + "memory(GiB)": 43.68, + "step": 14405, + "token_acc": 0.8308298941037208, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.670117625174752, + "grad_norm": 6.513291835784912, + "learning_rate": 2.6914558802230018e-06, + "loss": 0.5947196006774902, + "memory(GiB)": 43.68, + "step": 14410, + "token_acc": 0.8473229706390328, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.6703501434347016, + "grad_norm": 9.858675003051758, + "learning_rate": 2.6880460136159415e-06, + "loss": 0.5992252349853515, + "memory(GiB)": 43.68, + "step": 14415, + "token_acc": 0.8595160707836764, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.6705826616946512, + "grad_norm": 6.774270534515381, + "learning_rate": 2.684637514138651e-06, + "loss": 0.6149023056030274, + "memory(GiB)": 43.68, + "step": 14420, + "token_acc": 0.8390723822909346, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.6708151799546008, + "grad_norm": 9.595948219299316, + "learning_rate": 2.6812303838066863e-06, + "loss": 0.5945103645324707, + "memory(GiB)": 43.68, + "step": 14425, + "token_acc": 0.8649948471315699, + "train_speed(iter/s)": 0.095871 + }, + { + "epoch": 0.6710476982145505, + "grad_norm": 10.324151039123535, + "learning_rate": 2.677824624634784e-06, + "loss": 0.6018610954284668, + "memory(GiB)": 43.68, + "step": 14430, + "token_acc": 0.8574784651527017, + "train_speed(iter/s)": 0.09589 + }, + { + "epoch": 0.6712802164745, + "grad_norm": 6.825169563293457, + "learning_rate": 2.674420238636879e-06, + "loss": 0.6257966518402099, + "memory(GiB)": 43.68, + "step": 14435, + "token_acc": 0.8469807145115397, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.6715127347344496, + "grad_norm": 6.618413925170898, + "learning_rate": 2.671017227826086e-06, + "loss": 0.6099302768707275, + "memory(GiB)": 43.68, + "step": 14440, + "token_acc": 0.842526436124607, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.6717452529943992, + "grad_norm": 8.459787368774414, + "learning_rate": 2.667615594214712e-06, + "loss": 0.627414321899414, + "memory(GiB)": 43.68, + "step": 14445, + "token_acc": 0.8332191780821918, + "train_speed(iter/s)": 0.095946 + }, + { + "epoch": 0.6719777712543488, + "grad_norm": 7.22158670425415, + "learning_rate": 2.664215339814248e-06, + "loss": 0.5792049407958985, + "memory(GiB)": 43.68, + "step": 14450, + "token_acc": 0.8588117489986649, + "train_speed(iter/s)": 0.095965 + }, + { + "epoch": 0.6719777712543488, + "eval_loss": 0.573811411857605, + "eval_runtime": 294.5723, + "eval_samples_per_second": 11.797, + "eval_steps_per_second": 11.797, + "step": 14450 + }, + { + "epoch": 0.6722102895142984, + "grad_norm": 9.8744535446167, + "learning_rate": 2.66081646663537e-06, + "loss": 0.6201520442962647, + "memory(GiB)": 43.68, + "step": 14455, + "token_acc": 0.8305132160526884, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.672442807774248, + "grad_norm": 6.522641658782959, + "learning_rate": 2.6574189766879377e-06, + "loss": 0.6753710746765137, + "memory(GiB)": 43.68, + "step": 14460, + "token_acc": 0.8397988505747126, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.6726753260341977, + "grad_norm": 7.374817371368408, + "learning_rate": 2.654022871980989e-06, + "loss": 0.6833240032196045, + "memory(GiB)": 43.68, + "step": 14465, + "token_acc": 0.8307291666666666, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.6729078442941472, + "grad_norm": 7.5340118408203125, + "learning_rate": 2.650628154522748e-06, + "loss": 0.726350975036621, + "memory(GiB)": 43.68, + "step": 14470, + "token_acc": 0.8182887386062923, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.6731403625540968, + "grad_norm": 7.650021076202393, + "learning_rate": 2.647234826320613e-06, + "loss": 0.7587420463562011, + "memory(GiB)": 43.68, + "step": 14475, + "token_acc": 0.8096377306903623, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.6733728808140464, + "grad_norm": 8.88990306854248, + "learning_rate": 2.6438428893811657e-06, + "loss": 0.7666681289672852, + "memory(GiB)": 43.68, + "step": 14480, + "token_acc": 0.8095693779904306, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.6736053990739961, + "grad_norm": 8.150176048278809, + "learning_rate": 2.640452345710163e-06, + "loss": 0.710899019241333, + "memory(GiB)": 43.68, + "step": 14485, + "token_acc": 0.8158881691101262, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.6738379173339456, + "grad_norm": 8.934723854064941, + "learning_rate": 2.6370631973125394e-06, + "loss": 0.7278150081634521, + "memory(GiB)": 43.68, + "step": 14490, + "token_acc": 0.7667876588021778, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.6740704355938952, + "grad_norm": 11.422002792358398, + "learning_rate": 2.6336754461923997e-06, + "loss": 0.5746397495269775, + "memory(GiB)": 43.68, + "step": 14495, + "token_acc": 0.8616751269035533, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.6743029538538449, + "grad_norm": 9.185924530029297, + "learning_rate": 2.6302890943530294e-06, + "loss": 0.7092298984527587, + "memory(GiB)": 43.68, + "step": 14500, + "token_acc": 0.8205882352941176, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.6743029538538449, + "eval_loss": 0.5714608430862427, + "eval_runtime": 291.671, + "eval_samples_per_second": 11.914, + "eval_steps_per_second": 11.914, + "step": 14500 + }, + { + "epoch": 0.6745354721137944, + "grad_norm": 10.095582008361816, + "learning_rate": 2.6269041437968794e-06, + "loss": 0.6917707920074463, + "memory(GiB)": 43.68, + "step": 14505, + "token_acc": 0.8309227052367162, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.674767990373744, + "grad_norm": 9.785516738891602, + "learning_rate": 2.6235205965255794e-06, + "loss": 0.6394733428955078, + "memory(GiB)": 43.68, + "step": 14510, + "token_acc": 0.8392096086788067, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.6750005086336937, + "grad_norm": 7.499686241149902, + "learning_rate": 2.6201384545399205e-06, + "loss": 0.6602446556091308, + "memory(GiB)": 43.68, + "step": 14515, + "token_acc": 0.8349798755945848, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.6752330268936433, + "grad_norm": 8.838566780090332, + "learning_rate": 2.616757719839871e-06, + "loss": 0.710049057006836, + "memory(GiB)": 43.68, + "step": 14520, + "token_acc": 0.8274932614555256, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.6754655451535928, + "grad_norm": 10.444742202758789, + "learning_rate": 2.6133783944245617e-06, + "loss": 0.67647705078125, + "memory(GiB)": 43.68, + "step": 14525, + "token_acc": 0.8286158631415241, + "train_speed(iter/s)": 0.095871 + }, + { + "epoch": 0.6756980634135424, + "grad_norm": 7.48112154006958, + "learning_rate": 2.6100004802922963e-06, + "loss": 0.6187932968139649, + "memory(GiB)": 43.68, + "step": 14530, + "token_acc": 0.8535674262233844, + "train_speed(iter/s)": 0.09589 + }, + { + "epoch": 0.6759305816734921, + "grad_norm": 7.320490837097168, + "learning_rate": 2.6066239794405346e-06, + "loss": 0.5680778503417969, + "memory(GiB)": 43.68, + "step": 14535, + "token_acc": 0.8603896103896104, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.6761630999334417, + "grad_norm": 9.288532257080078, + "learning_rate": 2.6032488938659096e-06, + "loss": 0.6736385345458984, + "memory(GiB)": 43.68, + "step": 14540, + "token_acc": 0.8226904376012966, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.6763956181933912, + "grad_norm": 8.691412925720215, + "learning_rate": 2.59987522556421e-06, + "loss": 0.5699333667755127, + "memory(GiB)": 43.68, + "step": 14545, + "token_acc": 0.8680815647107782, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.6766281364533409, + "grad_norm": 6.899057388305664, + "learning_rate": 2.596502976530394e-06, + "loss": 0.5840402603149414, + "memory(GiB)": 43.68, + "step": 14550, + "token_acc": 0.8597560975609756, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.6766281364533409, + "eval_loss": 0.5713512301445007, + "eval_runtime": 293.982, + "eval_samples_per_second": 11.82, + "eval_steps_per_second": 11.82, + "step": 14550 + }, + { + "epoch": 0.6768606547132905, + "grad_norm": 7.8339080810546875, + "learning_rate": 2.593132148758573e-06, + "loss": 0.6375294685363769, + "memory(GiB)": 43.68, + "step": 14555, + "token_acc": 0.831493550740322, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.67709317297324, + "grad_norm": 10.36740493774414, + "learning_rate": 2.5897627442420224e-06, + "loss": 0.6267732143402099, + "memory(GiB)": 43.68, + "step": 14560, + "token_acc": 0.8491691521090754, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.6773256912331896, + "grad_norm": 8.344315528869629, + "learning_rate": 2.586394764973177e-06, + "loss": 0.6922134399414063, + "memory(GiB)": 43.68, + "step": 14565, + "token_acc": 0.838535164377861, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.6775582094931393, + "grad_norm": 8.412717819213867, + "learning_rate": 2.583028212943627e-06, + "loss": 0.6234781265258789, + "memory(GiB)": 43.68, + "step": 14570, + "token_acc": 0.8586094260382641, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.6777907277530889, + "grad_norm": 6.043086051940918, + "learning_rate": 2.5796630901441144e-06, + "loss": 0.6654970645904541, + "memory(GiB)": 43.68, + "step": 14575, + "token_acc": 0.8365173817726276, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.6780232460130384, + "grad_norm": 7.909218788146973, + "learning_rate": 2.576299398564544e-06, + "loss": 0.635191822052002, + "memory(GiB)": 43.68, + "step": 14580, + "token_acc": 0.82876254180602, + "train_speed(iter/s)": 0.095889 + }, + { + "epoch": 0.6782557642729881, + "grad_norm": 7.435328960418701, + "learning_rate": 2.572937140193972e-06, + "loss": 0.6927794456481934, + "memory(GiB)": 43.68, + "step": 14585, + "token_acc": 0.8353344768439108, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.6784882825329377, + "grad_norm": 9.59237289428711, + "learning_rate": 2.5695763170206e-06, + "loss": 0.6599873065948486, + "memory(GiB)": 43.68, + "step": 14590, + "token_acc": 0.8305149884704074, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.6787208007928872, + "grad_norm": 7.779507637023926, + "learning_rate": 2.5662169310317923e-06, + "loss": 0.6364209651947021, + "memory(GiB)": 43.68, + "step": 14595, + "token_acc": 0.831943981327109, + "train_speed(iter/s)": 0.095946 + }, + { + "epoch": 0.6789533190528368, + "grad_norm": 7.693490505218506, + "learning_rate": 2.5628589842140528e-06, + "loss": 0.6035785675048828, + "memory(GiB)": 43.68, + "step": 14600, + "token_acc": 0.8487282463186078, + "train_speed(iter/s)": 0.095964 + }, + { + "epoch": 0.6789533190528368, + "eval_loss": 0.5727770328521729, + "eval_runtime": 291.4508, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 11.923, + "step": 14600 + }, + { + "epoch": 0.6791858373127865, + "grad_norm": 9.653355598449707, + "learning_rate": 2.5595024785530415e-06, + "loss": 0.6479739189147949, + "memory(GiB)": 43.68, + "step": 14605, + "token_acc": 0.8313283952202489, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.6794183555727361, + "grad_norm": 5.43248987197876, + "learning_rate": 2.5561474160335633e-06, + "loss": 0.6288150787353516, + "memory(GiB)": 43.68, + "step": 14610, + "token_acc": 0.8369351669941061, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.6796508738326856, + "grad_norm": 8.895267486572266, + "learning_rate": 2.5527937986395722e-06, + "loss": 0.7126208782196045, + "memory(GiB)": 43.68, + "step": 14615, + "token_acc": 0.8321744627054362, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.6798833920926353, + "grad_norm": 6.117085933685303, + "learning_rate": 2.549441628354163e-06, + "loss": 0.5901892185211182, + "memory(GiB)": 43.68, + "step": 14620, + "token_acc": 0.8526863084922011, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.6801159103525849, + "grad_norm": 7.739634037017822, + "learning_rate": 2.5460909071595795e-06, + "loss": 0.6293376922607422, + "memory(GiB)": 43.68, + "step": 14625, + "token_acc": 0.8395632087358252, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.6803484286125345, + "grad_norm": 9.097126007080078, + "learning_rate": 2.542741637037204e-06, + "loss": 0.5195389270782471, + "memory(GiB)": 43.68, + "step": 14630, + "token_acc": 0.8595877090626215, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.680580946872484, + "grad_norm": 7.753961563110352, + "learning_rate": 2.5393938199675673e-06, + "loss": 0.7167007923126221, + "memory(GiB)": 43.68, + "step": 14635, + "token_acc": 0.8415798611111112, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.6808134651324337, + "grad_norm": 8.212662696838379, + "learning_rate": 2.5360474579303323e-06, + "loss": 0.643263578414917, + "memory(GiB)": 43.68, + "step": 14640, + "token_acc": 0.8508230452674898, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.6810459833923833, + "grad_norm": 8.01890754699707, + "learning_rate": 2.5327025529043083e-06, + "loss": 0.6267054080963135, + "memory(GiB)": 43.68, + "step": 14645, + "token_acc": 0.8474452554744526, + "train_speed(iter/s)": 0.095949 + }, + { + "epoch": 0.6812785016523328, + "grad_norm": 8.446707725524902, + "learning_rate": 2.5293591068674418e-06, + "loss": 0.7221577167510986, + "memory(GiB)": 43.68, + "step": 14650, + "token_acc": 0.8208, + "train_speed(iter/s)": 0.095968 + }, + { + "epoch": 0.6812785016523328, + "eval_loss": 0.5712907910346985, + "eval_runtime": 292.4027, + "eval_samples_per_second": 11.884, + "eval_steps_per_second": 11.884, + "step": 14650 + }, + { + "epoch": 0.6815110199122825, + "grad_norm": 8.568792343139648, + "learning_rate": 2.5260171217968164e-06, + "loss": 0.5040855407714844, + "memory(GiB)": 43.68, + "step": 14655, + "token_acc": 0.8315160352189971, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.6817435381722321, + "grad_norm": 8.438779830932617, + "learning_rate": 2.5226765996686477e-06, + "loss": 0.6068079471588135, + "memory(GiB)": 43.68, + "step": 14660, + "token_acc": 0.8387342737323675, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.6819760564321817, + "grad_norm": 7.259321212768555, + "learning_rate": 2.5193375424582933e-06, + "loss": 0.6753248691558837, + "memory(GiB)": 43.68, + "step": 14665, + "token_acc": 0.8390367553865653, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.6822085746921313, + "grad_norm": 9.771519660949707, + "learning_rate": 2.5159999521402377e-06, + "loss": 0.6740274429321289, + "memory(GiB)": 43.68, + "step": 14670, + "token_acc": 0.839384878257155, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.6824410929520809, + "grad_norm": 10.084802627563477, + "learning_rate": 2.512663830688104e-06, + "loss": 0.637111759185791, + "memory(GiB)": 43.68, + "step": 14675, + "token_acc": 0.8309859154929577, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.6826736112120305, + "grad_norm": 11.251710891723633, + "learning_rate": 2.5093291800746404e-06, + "loss": 0.6224228858947753, + "memory(GiB)": 43.68, + "step": 14680, + "token_acc": 0.8412305516265912, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.6829061294719802, + "grad_norm": 6.67312479019165, + "learning_rate": 2.505996002271731e-06, + "loss": 0.6442068576812744, + "memory(GiB)": 43.68, + "step": 14685, + "token_acc": 0.842228935884525, + "train_speed(iter/s)": 0.095914 + }, + { + "epoch": 0.6831386477319297, + "grad_norm": 8.050361633300781, + "learning_rate": 2.502664299250386e-06, + "loss": 0.684641170501709, + "memory(GiB)": 43.68, + "step": 14690, + "token_acc": 0.8318224445139106, + "train_speed(iter/s)": 0.095933 + }, + { + "epoch": 0.6833711659918793, + "grad_norm": 8.623970031738281, + "learning_rate": 2.4993340729807463e-06, + "loss": 0.6142326354980469, + "memory(GiB)": 43.68, + "step": 14695, + "token_acc": 0.8480436760691538, + "train_speed(iter/s)": 0.095952 + }, + { + "epoch": 0.6836036842518289, + "grad_norm": 8.763551712036133, + "learning_rate": 2.496005325432074e-06, + "loss": 0.6094542026519776, + "memory(GiB)": 43.68, + "step": 14700, + "token_acc": 0.8512938093678349, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.6836036842518289, + "eval_loss": 0.569529116153717, + "eval_runtime": 291.8592, + "eval_samples_per_second": 11.906, + "eval_steps_per_second": 11.906, + "step": 14700 + }, + { + "epoch": 0.6838362025117785, + "grad_norm": 7.144622325897217, + "learning_rate": 2.492678058572765e-06, + "loss": 0.5630511283874512, + "memory(GiB)": 43.68, + "step": 14705, + "token_acc": 0.8320883273685138, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.6840687207717281, + "grad_norm": 9.401549339294434, + "learning_rate": 2.4893522743703293e-06, + "loss": 0.6558740615844727, + "memory(GiB)": 43.68, + "step": 14710, + "token_acc": 0.8358806404657934, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.6843012390316777, + "grad_norm": 8.555597305297852, + "learning_rate": 2.4860279747914083e-06, + "loss": 0.7010702610015869, + "memory(GiB)": 43.68, + "step": 14715, + "token_acc": 0.8217913204062789, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.6845337572916274, + "grad_norm": 7.839824199676514, + "learning_rate": 2.482705161801766e-06, + "loss": 0.7387192249298096, + "memory(GiB)": 43.68, + "step": 14720, + "token_acc": 0.8162778366914104, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.6847662755515769, + "grad_norm": 7.019917011260986, + "learning_rate": 2.4793838373662787e-06, + "loss": 0.6605482578277588, + "memory(GiB)": 43.68, + "step": 14725, + "token_acc": 0.8397932816537468, + "train_speed(iter/s)": 0.09588 + }, + { + "epoch": 0.6849987938115265, + "grad_norm": 10.25475025177002, + "learning_rate": 2.476064003448952e-06, + "loss": 0.6720106124877929, + "memory(GiB)": 43.68, + "step": 14730, + "token_acc": 0.8458235753317721, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.6852313120714761, + "grad_norm": 9.277771949768066, + "learning_rate": 2.472745662012904e-06, + "loss": 0.7551665306091309, + "memory(GiB)": 43.68, + "step": 14735, + "token_acc": 0.7976900149031296, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.6854638303314257, + "grad_norm": 9.814400672912598, + "learning_rate": 2.469428815020376e-06, + "loss": 0.6167050361633301, + "memory(GiB)": 43.68, + "step": 14740, + "token_acc": 0.849003984063745, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.6856963485913753, + "grad_norm": 6.580095291137695, + "learning_rate": 2.466113464432718e-06, + "loss": 0.6547997474670411, + "memory(GiB)": 43.68, + "step": 14745, + "token_acc": 0.8346456692913385, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.6859288668513249, + "grad_norm": 8.515876770019531, + "learning_rate": 2.462799612210402e-06, + "loss": 0.641704797744751, + "memory(GiB)": 43.68, + "step": 14750, + "token_acc": 0.8397271952259164, + "train_speed(iter/s)": 0.095972 + }, + { + "epoch": 0.6859288668513249, + "eval_loss": 0.5697548985481262, + "eval_runtime": 292.9245, + "eval_samples_per_second": 11.863, + "eval_steps_per_second": 11.863, + "step": 14750 + }, + { + "epoch": 0.6861613851112746, + "grad_norm": 7.716666221618652, + "learning_rate": 2.459487260313008e-06, + "loss": 0.5847614765167236, + "memory(GiB)": 43.68, + "step": 14755, + "token_acc": 0.8317114187568244, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.6863939033712241, + "grad_norm": 9.125048637390137, + "learning_rate": 2.4561764106992364e-06, + "loss": 0.5890578746795654, + "memory(GiB)": 43.68, + "step": 14760, + "token_acc": 0.853467073702573, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.6866264216311737, + "grad_norm": 10.58772087097168, + "learning_rate": 2.45286706532689e-06, + "loss": 0.5699628829956055, + "memory(GiB)": 43.68, + "step": 14765, + "token_acc": 0.8517412935323383, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.6868589398911233, + "grad_norm": 9.083113670349121, + "learning_rate": 2.449559226152889e-06, + "loss": 0.559787654876709, + "memory(GiB)": 43.68, + "step": 14770, + "token_acc": 0.8571428571428571, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.687091458151073, + "grad_norm": 7.75703239440918, + "learning_rate": 2.446252895133261e-06, + "loss": 0.5728509426116943, + "memory(GiB)": 43.68, + "step": 14775, + "token_acc": 0.8530890804597702, + "train_speed(iter/s)": 0.09588 + }, + { + "epoch": 0.6873239764110225, + "grad_norm": 9.354948997497559, + "learning_rate": 2.4429480742231433e-06, + "loss": 0.6882061004638672, + "memory(GiB)": 43.68, + "step": 14780, + "token_acc": 0.8113659705580281, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.6875564946709721, + "grad_norm": 7.420952320098877, + "learning_rate": 2.4396447653767746e-06, + "loss": 0.811635684967041, + "memory(GiB)": 43.68, + "step": 14785, + "token_acc": 0.7832310838445807, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.6877890129309218, + "grad_norm": 7.162716865539551, + "learning_rate": 2.4363429705475082e-06, + "loss": 0.7320106506347657, + "memory(GiB)": 43.68, + "step": 14790, + "token_acc": 0.8040944881889763, + "train_speed(iter/s)": 0.095934 + }, + { + "epoch": 0.6880215311908713, + "grad_norm": 9.993701934814453, + "learning_rate": 2.4330426916877927e-06, + "loss": 0.6562893867492676, + "memory(GiB)": 43.68, + "step": 14795, + "token_acc": 0.832933653077538, + "train_speed(iter/s)": 0.095952 + }, + { + "epoch": 0.6882540494508209, + "grad_norm": 6.7831950187683105, + "learning_rate": 2.429743930749189e-06, + "loss": 0.6879189014434814, + "memory(GiB)": 43.68, + "step": 14800, + "token_acc": 0.8200647249190939, + "train_speed(iter/s)": 0.09597 + }, + { + "epoch": 0.6882540494508209, + "eval_loss": 0.571153461933136, + "eval_runtime": 291.7537, + "eval_samples_per_second": 11.911, + "eval_steps_per_second": 11.911, + "step": 14800 + }, + { + "epoch": 0.6884865677107705, + "grad_norm": 8.59450626373291, + "learning_rate": 2.4264466896823494e-06, + "loss": 0.5881685256958008, + "memory(GiB)": 43.68, + "step": 14805, + "token_acc": 0.831436996201984, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.6887190859707202, + "grad_norm": 6.911591053009033, + "learning_rate": 2.4231509704370438e-06, + "loss": 0.6806635856628418, + "memory(GiB)": 43.68, + "step": 14810, + "token_acc": 0.8354381936471489, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.6889516042306697, + "grad_norm": 10.008810043334961, + "learning_rate": 2.419856774962126e-06, + "loss": 0.6647405624389648, + "memory(GiB)": 43.68, + "step": 14815, + "token_acc": 0.8392990305741984, + "train_speed(iter/s)": 0.095845 + }, + { + "epoch": 0.6891841224906193, + "grad_norm": 7.691843509674072, + "learning_rate": 2.4165641052055592e-06, + "loss": 0.6704154014587402, + "memory(GiB)": 43.68, + "step": 14820, + "token_acc": 0.83171657528469, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.689416640750569, + "grad_norm": 7.425168037414551, + "learning_rate": 2.4132729631143974e-06, + "loss": 0.6791874408721924, + "memory(GiB)": 43.68, + "step": 14825, + "token_acc": 0.8379405666897028, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.6896491590105186, + "grad_norm": 7.715751647949219, + "learning_rate": 2.4099833506347984e-06, + "loss": 0.6338499546051025, + "memory(GiB)": 43.68, + "step": 14830, + "token_acc": 0.8356896010053408, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.6898816772704681, + "grad_norm": 10.072502136230469, + "learning_rate": 2.4066952697120073e-06, + "loss": 0.6765162467956543, + "memory(GiB)": 43.68, + "step": 14835, + "token_acc": 0.8386277001270648, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.6901141955304178, + "grad_norm": 10.854269027709961, + "learning_rate": 2.4034087222903703e-06, + "loss": 0.5723191738128662, + "memory(GiB)": 43.68, + "step": 14840, + "token_acc": 0.8614406779661017, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.6903467137903674, + "grad_norm": 9.747332572937012, + "learning_rate": 2.4001237103133233e-06, + "loss": 0.6472094535827637, + "memory(GiB)": 43.68, + "step": 14845, + "token_acc": 0.8457767722473605, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.6905792320503169, + "grad_norm": 7.175398826599121, + "learning_rate": 2.3968402357233966e-06, + "loss": 0.6688910007476807, + "memory(GiB)": 43.68, + "step": 14850, + "token_acc": 0.8367983367983368, + "train_speed(iter/s)": 0.095971 + }, + { + "epoch": 0.6905792320503169, + "eval_loss": 0.5700727105140686, + "eval_runtime": 291.5929, + "eval_samples_per_second": 11.917, + "eval_steps_per_second": 11.917, + "step": 14850 + }, + { + "epoch": 0.6908117503102665, + "grad_norm": 9.811661720275879, + "learning_rate": 2.3935583004622117e-06, + "loss": 0.642160701751709, + "memory(GiB)": 43.68, + "step": 14855, + "token_acc": 0.8314214066791387, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.6910442685702162, + "grad_norm": 9.53758430480957, + "learning_rate": 2.3902779064704733e-06, + "loss": 0.6385757923126221, + "memory(GiB)": 43.68, + "step": 14860, + "token_acc": 0.8374074074074074, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.6912767868301658, + "grad_norm": 7.413933753967285, + "learning_rate": 2.386999055687985e-06, + "loss": 0.6519462108612061, + "memory(GiB)": 43.68, + "step": 14865, + "token_acc": 0.8399087055754809, + "train_speed(iter/s)": 0.095845 + }, + { + "epoch": 0.6915093050901153, + "grad_norm": 8.833547592163086, + "learning_rate": 2.3837217500536283e-06, + "loss": 0.6524216175079346, + "memory(GiB)": 43.68, + "step": 14870, + "token_acc": 0.8381706244503079, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.691741823350065, + "grad_norm": 7.5595173835754395, + "learning_rate": 2.3804459915053777e-06, + "loss": 0.6694557666778564, + "memory(GiB)": 43.68, + "step": 14875, + "token_acc": 0.8293471234647706, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.6919743416100146, + "grad_norm": 9.778063774108887, + "learning_rate": 2.3771717819802885e-06, + "loss": 0.665160083770752, + "memory(GiB)": 43.68, + "step": 14880, + "token_acc": 0.8377831715210357, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.6922068598699641, + "grad_norm": 5.986135959625244, + "learning_rate": 2.3738991234145025e-06, + "loss": 0.6411314964294433, + "memory(GiB)": 43.68, + "step": 14885, + "token_acc": 0.8561580882352942, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.6924393781299137, + "grad_norm": 7.493768692016602, + "learning_rate": 2.3706280177432444e-06, + "loss": 0.6026975631713867, + "memory(GiB)": 43.68, + "step": 14890, + "token_acc": 0.8538228359717076, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.6926718963898634, + "grad_norm": 9.839439392089844, + "learning_rate": 2.367358466900822e-06, + "loss": 0.603148365020752, + "memory(GiB)": 43.68, + "step": 14895, + "token_acc": 0.8563741721854304, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.692904414649813, + "grad_norm": 9.162924766540527, + "learning_rate": 2.364090472820617e-06, + "loss": 0.7842535972595215, + "memory(GiB)": 43.68, + "step": 14900, + "token_acc": 0.7974545454545454, + "train_speed(iter/s)": 0.095973 + }, + { + "epoch": 0.692904414649813, + "eval_loss": 0.5699092149734497, + "eval_runtime": 292.1984, + "eval_samples_per_second": 11.893, + "eval_steps_per_second": 11.893, + "step": 14900 + }, + { + "epoch": 0.6931369329097625, + "grad_norm": 6.95413064956665, + "learning_rate": 2.3608240374350994e-06, + "loss": 0.7118723869323731, + "memory(GiB)": 43.68, + "step": 14905, + "token_acc": 0.8312244376278118, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.6933694511697122, + "grad_norm": 7.15910530090332, + "learning_rate": 2.35755916267581e-06, + "loss": 0.6422323703765869, + "memory(GiB)": 43.68, + "step": 14910, + "token_acc": 0.8529996027016289, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.6936019694296618, + "grad_norm": 8.158949851989746, + "learning_rate": 2.3542958504733733e-06, + "loss": 0.6427113533020019, + "memory(GiB)": 43.68, + "step": 14915, + "token_acc": 0.8367631670735961, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.6938344876896114, + "grad_norm": 9.417699813842773, + "learning_rate": 2.3510341027574824e-06, + "loss": 0.8722169876098633, + "memory(GiB)": 43.68, + "step": 14920, + "token_acc": 0.8015364916773368, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.694067005949561, + "grad_norm": 9.309544563293457, + "learning_rate": 2.3477739214569124e-06, + "loss": 0.7727357864379882, + "memory(GiB)": 43.68, + "step": 14925, + "token_acc": 0.8069565217391305, + "train_speed(iter/s)": 0.095883 + }, + { + "epoch": 0.6942995242095106, + "grad_norm": 6.384006023406982, + "learning_rate": 2.3445153084995083e-06, + "loss": 0.6519456386566163, + "memory(GiB)": 43.68, + "step": 14930, + "token_acc": 0.8368974266038419, + "train_speed(iter/s)": 0.095901 + }, + { + "epoch": 0.6945320424694602, + "grad_norm": 7.078763961791992, + "learning_rate": 2.3412582658121907e-06, + "loss": 0.7300206184387207, + "memory(GiB)": 43.68, + "step": 14935, + "token_acc": 0.8188202247191011, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.6947645607294097, + "grad_norm": 9.662609100341797, + "learning_rate": 2.3380027953209463e-06, + "loss": 0.6980655670166016, + "memory(GiB)": 43.68, + "step": 14940, + "token_acc": 0.8221408221408222, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.6949970789893594, + "grad_norm": 9.974004745483398, + "learning_rate": 2.3347488989508377e-06, + "loss": 0.6442455768585205, + "memory(GiB)": 43.68, + "step": 14945, + "token_acc": 0.8377208799134511, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.695229597249309, + "grad_norm": 9.717050552368164, + "learning_rate": 2.3314965786259918e-06, + "loss": 0.6342096328735352, + "memory(GiB)": 43.68, + "step": 14950, + "token_acc": 0.8540729635182409, + "train_speed(iter/s)": 0.095972 + }, + { + "epoch": 0.695229597249309, + "eval_loss": 0.5690205097198486, + "eval_runtime": 296.494, + "eval_samples_per_second": 11.72, + "eval_steps_per_second": 11.72, + "step": 14950 + }, + { + "epoch": 0.6954621155092586, + "grad_norm": 8.165386199951172, + "learning_rate": 2.328245836269609e-06, + "loss": 0.6690054893493652, + "memory(GiB)": 43.68, + "step": 14955, + "token_acc": 0.8313861648130477, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.6956946337692081, + "grad_norm": 8.748926162719727, + "learning_rate": 2.32499667380395e-06, + "loss": 0.7164567470550537, + "memory(GiB)": 43.68, + "step": 14960, + "token_acc": 0.8199805384365877, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.6959271520291578, + "grad_norm": 9.91205883026123, + "learning_rate": 2.3217490931503478e-06, + "loss": 0.6394514083862305, + "memory(GiB)": 43.68, + "step": 14965, + "token_acc": 0.8360460500198491, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.6961596702891074, + "grad_norm": 7.21843147277832, + "learning_rate": 2.3185030962291954e-06, + "loss": 0.6726161479949951, + "memory(GiB)": 43.68, + "step": 14970, + "token_acc": 0.8351713859910581, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.696392188549057, + "grad_norm": 9.273655891418457, + "learning_rate": 2.3152586849599544e-06, + "loss": 0.6527256488800048, + "memory(GiB)": 43.68, + "step": 14975, + "token_acc": 0.8383581547402833, + "train_speed(iter/s)": 0.09588 + }, + { + "epoch": 0.6966247068090066, + "grad_norm": 8.348166465759277, + "learning_rate": 2.3120158612611406e-06, + "loss": 0.7497655868530273, + "memory(GiB)": 43.68, + "step": 14980, + "token_acc": 0.8282306163021869, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.6968572250689562, + "grad_norm": 9.885611534118652, + "learning_rate": 2.308774627050338e-06, + "loss": 0.7358698368072509, + "memory(GiB)": 43.68, + "step": 14985, + "token_acc": 0.8208015899304405, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.6970897433289058, + "grad_norm": 10.031953811645508, + "learning_rate": 2.3055349842441903e-06, + "loss": 0.6113825798034668, + "memory(GiB)": 43.68, + "step": 14990, + "token_acc": 0.8391099700470689, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.6973222615888554, + "grad_norm": 7.13198184967041, + "learning_rate": 2.3022969347583944e-06, + "loss": 0.6385971069335937, + "memory(GiB)": 43.68, + "step": 14995, + "token_acc": 0.8495164410058027, + "train_speed(iter/s)": 0.095953 + }, + { + "epoch": 0.697554779848805, + "grad_norm": 10.534452438354492, + "learning_rate": 2.299060480507713e-06, + "loss": 0.7708720684051513, + "memory(GiB)": 43.68, + "step": 15000, + "token_acc": 0.8125, + "train_speed(iter/s)": 0.09597 + }, + { + "epoch": 0.697554779848805, + "eval_loss": 0.5687591433525085, + "eval_runtime": 297.5632, + "eval_samples_per_second": 11.678, + "eval_steps_per_second": 11.678, + "step": 15000 + }, + { + "epoch": 0.6977872981087546, + "grad_norm": 10.113677978515625, + "learning_rate": 2.295825623405958e-06, + "loss": 0.6230460166931152, + "memory(GiB)": 43.68, + "step": 15005, + "token_acc": 0.8312627079297482, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.6980198163687042, + "grad_norm": 6.392548084259033, + "learning_rate": 2.2925923653660017e-06, + "loss": 0.717192268371582, + "memory(GiB)": 43.68, + "step": 15010, + "token_acc": 0.8032128514056225, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.6982523346286538, + "grad_norm": 8.732665061950684, + "learning_rate": 2.2893607082997686e-06, + "loss": 0.7233646392822266, + "memory(GiB)": 43.68, + "step": 15015, + "token_acc": 0.802689075630252, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.6984848528886034, + "grad_norm": 7.695556163787842, + "learning_rate": 2.2861306541182403e-06, + "loss": 0.6288101196289062, + "memory(GiB)": 43.68, + "step": 15020, + "token_acc": 0.8389084507042254, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.698717371148553, + "grad_norm": 8.413089752197266, + "learning_rate": 2.2829022047314436e-06, + "loss": 0.6322573661804199, + "memory(GiB)": 43.68, + "step": 15025, + "token_acc": 0.8373626373626374, + "train_speed(iter/s)": 0.095878 + }, + { + "epoch": 0.6989498894085026, + "grad_norm": 8.214163780212402, + "learning_rate": 2.2796753620484636e-06, + "loss": 0.6683392524719238, + "memory(GiB)": 43.68, + "step": 15030, + "token_acc": 0.8358092259577795, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.6991824076684522, + "grad_norm": 6.662692546844482, + "learning_rate": 2.2764501279774288e-06, + "loss": 0.6149757862091064, + "memory(GiB)": 43.68, + "step": 15035, + "token_acc": 0.8461538461538461, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.6994149259284018, + "grad_norm": 8.121337890625, + "learning_rate": 2.273226504425523e-06, + "loss": 0.621696949005127, + "memory(GiB)": 43.68, + "step": 15040, + "token_acc": 0.8443677439598368, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.6996474441883515, + "grad_norm": 7.089735507965088, + "learning_rate": 2.2700044932989713e-06, + "loss": 0.6983431816101074, + "memory(GiB)": 43.68, + "step": 15045, + "token_acc": 0.8254545454545454, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.699879962448301, + "grad_norm": 9.890140533447266, + "learning_rate": 2.26678409650305e-06, + "loss": 0.4932220458984375, + "memory(GiB)": 43.68, + "step": 15050, + "token_acc": 0.8757187085360459, + "train_speed(iter/s)": 0.095966 + }, + { + "epoch": 0.699879962448301, + "eval_loss": 0.569486141204834, + "eval_runtime": 295.1351, + "eval_samples_per_second": 11.774, + "eval_steps_per_second": 11.774, + "step": 15050 + }, + { + "epoch": 0.7001124807082506, + "grad_norm": 9.600886344909668, + "learning_rate": 2.263565315942078e-06, + "loss": 0.7439414501190186, + "memory(GiB)": 43.68, + "step": 15055, + "token_acc": 0.8303879017496743, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.7003449989682002, + "grad_norm": 7.0749287605285645, + "learning_rate": 2.260348153519423e-06, + "loss": 0.6322244167327881, + "memory(GiB)": 43.68, + "step": 15060, + "token_acc": 0.8393371757925072, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.7005775172281499, + "grad_norm": 9.142419815063477, + "learning_rate": 2.2571326111374876e-06, + "loss": 0.6422764301300049, + "memory(GiB)": 43.68, + "step": 15065, + "token_acc": 0.8480373105324523, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.7008100354880994, + "grad_norm": 8.40816593170166, + "learning_rate": 2.2539186906977256e-06, + "loss": 0.7075412750244141, + "memory(GiB)": 43.68, + "step": 15070, + "token_acc": 0.8350113673270543, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.701042553748049, + "grad_norm": 9.429239273071289, + "learning_rate": 2.2507063941006237e-06, + "loss": 0.5903666973114013, + "memory(GiB)": 43.68, + "step": 15075, + "token_acc": 0.8476021314387211, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.7012750720079987, + "grad_norm": 6.772191524505615, + "learning_rate": 2.2474957232457157e-06, + "loss": 0.6024794101715087, + "memory(GiB)": 43.68, + "step": 15080, + "token_acc": 0.8567311650885138, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.7015075902679482, + "grad_norm": 6.209893226623535, + "learning_rate": 2.2442866800315666e-06, + "loss": 0.5590320587158203, + "memory(GiB)": 43.68, + "step": 15085, + "token_acc": 0.8531830642704843, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.7017401085278978, + "grad_norm": 7.735403060913086, + "learning_rate": 2.2410792663557847e-06, + "loss": 0.654276704788208, + "memory(GiB)": 43.68, + "step": 15090, + "token_acc": 0.8297485610421085, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.7019726267878474, + "grad_norm": 8.271126747131348, + "learning_rate": 2.2378734841150124e-06, + "loss": 0.6791478633880615, + "memory(GiB)": 43.68, + "step": 15095, + "token_acc": 0.826361721336142, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.7022051450477971, + "grad_norm": 7.209582805633545, + "learning_rate": 2.2346693352049305e-06, + "loss": 0.7014167785644532, + "memory(GiB)": 43.68, + "step": 15100, + "token_acc": 0.8235294117647058, + "train_speed(iter/s)": 0.095961 + }, + { + "epoch": 0.7022051450477971, + "eval_loss": 0.5675148367881775, + "eval_runtime": 293.9971, + "eval_samples_per_second": 11.82, + "eval_steps_per_second": 11.82, + "step": 15100 + }, + { + "epoch": 0.7024376633077466, + "grad_norm": 8.525975227355957, + "learning_rate": 2.2314668215202463e-06, + "loss": 0.6555490016937255, + "memory(GiB)": 43.68, + "step": 15105, + "token_acc": 0.8320721442885771, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.7026701815676962, + "grad_norm": 7.5753397941589355, + "learning_rate": 2.2282659449547074e-06, + "loss": 0.7085586071014405, + "memory(GiB)": 43.68, + "step": 15110, + "token_acc": 0.8353581901968998, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.7029026998276459, + "grad_norm": 8.003477096557617, + "learning_rate": 2.2250667074010927e-06, + "loss": 0.6282653331756591, + "memory(GiB)": 43.68, + "step": 15115, + "token_acc": 0.848521668691778, + "train_speed(iter/s)": 0.095835 + }, + { + "epoch": 0.7031352180875955, + "grad_norm": 9.937396049499512, + "learning_rate": 2.221869110751207e-06, + "loss": 0.6093691349029541, + "memory(GiB)": 43.68, + "step": 15120, + "token_acc": 0.836764705882353, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.703367736347545, + "grad_norm": 8.043498992919922, + "learning_rate": 2.2186731568958907e-06, + "loss": 0.6808287143707276, + "memory(GiB)": 43.68, + "step": 15125, + "token_acc": 0.8264291632145816, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.7036002546074946, + "grad_norm": 7.490905284881592, + "learning_rate": 2.215478847725005e-06, + "loss": 0.6184853553771973, + "memory(GiB)": 43.68, + "step": 15130, + "token_acc": 0.8535514764565044, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.7038327728674443, + "grad_norm": 8.327759742736816, + "learning_rate": 2.2122861851274507e-06, + "loss": 0.7480375289916992, + "memory(GiB)": 43.68, + "step": 15135, + "token_acc": 0.8175512665862484, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.7040652911273938, + "grad_norm": 9.702696800231934, + "learning_rate": 2.2090951709911423e-06, + "loss": 0.6199995994567871, + "memory(GiB)": 43.68, + "step": 15140, + "token_acc": 0.8450704225352113, + "train_speed(iter/s)": 0.095923 + }, + { + "epoch": 0.7042978093873434, + "grad_norm": 6.733327388763428, + "learning_rate": 2.2059058072030286e-06, + "loss": 0.712891960144043, + "memory(GiB)": 43.68, + "step": 15145, + "token_acc": 0.8355555555555556, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.7045303276472931, + "grad_norm": 5.543945789337158, + "learning_rate": 2.2027180956490756e-06, + "loss": 0.6161305427551269, + "memory(GiB)": 43.68, + "step": 15150, + "token_acc": 0.8320469280642173, + "train_speed(iter/s)": 0.095958 + }, + { + "epoch": 0.7045303276472931, + "eval_loss": 0.5683196783065796, + "eval_runtime": 291.8022, + "eval_samples_per_second": 11.909, + "eval_steps_per_second": 11.909, + "step": 15150 + }, + { + "epoch": 0.7047628459072427, + "grad_norm": 8.165233612060547, + "learning_rate": 2.199532038214279e-06, + "loss": 0.7056881427764893, + "memory(GiB)": 43.68, + "step": 15155, + "token_acc": 0.831401147343477, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.7049953641671922, + "grad_norm": 6.688375949859619, + "learning_rate": 2.1963476367826488e-06, + "loss": 0.6474667549133301, + "memory(GiB)": 43.68, + "step": 15160, + "token_acc": 0.8420677361853832, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.7052278824271419, + "grad_norm": 8.083352088928223, + "learning_rate": 2.1931648932372222e-06, + "loss": 0.6967349052429199, + "memory(GiB)": 43.68, + "step": 15165, + "token_acc": 0.835667215815486, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.7054604006870915, + "grad_norm": 8.43288803100586, + "learning_rate": 2.189983809460054e-06, + "loss": 0.6387832641601563, + "memory(GiB)": 43.68, + "step": 15170, + "token_acc": 0.8391264226391879, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.7056929189470411, + "grad_norm": 10.101707458496094, + "learning_rate": 2.186804387332218e-06, + "loss": 0.6357023239135742, + "memory(GiB)": 43.68, + "step": 15175, + "token_acc": 0.8369157284203343, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.7059254372069906, + "grad_norm": 9.89687728881836, + "learning_rate": 2.1836266287338026e-06, + "loss": 0.7231058120727539, + "memory(GiB)": 43.68, + "step": 15180, + "token_acc": 0.8384531984098301, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.7061579554669403, + "grad_norm": 7.171428680419922, + "learning_rate": 2.1804505355439167e-06, + "loss": 0.7421711921691895, + "memory(GiB)": 43.68, + "step": 15185, + "token_acc": 0.809905316824472, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.7063904737268899, + "grad_norm": 9.13636589050293, + "learning_rate": 2.177276109640679e-06, + "loss": 0.5785239696502685, + "memory(GiB)": 43.68, + "step": 15190, + "token_acc": 0.8532567049808429, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.7066229919868394, + "grad_norm": 7.7410197257995605, + "learning_rate": 2.1741033529012303e-06, + "loss": 0.7573292255401611, + "memory(GiB)": 43.68, + "step": 15195, + "token_acc": 0.8209109730848861, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.706855510246789, + "grad_norm": 8.934343338012695, + "learning_rate": 2.1709322672017146e-06, + "loss": 0.685581636428833, + "memory(GiB)": 43.68, + "step": 15200, + "token_acc": 0.8298518799848082, + "train_speed(iter/s)": 0.095957 + }, + { + "epoch": 0.706855510246789, + "eval_loss": 0.567160427570343, + "eval_runtime": 292.0237, + "eval_samples_per_second": 11.9, + "eval_steps_per_second": 11.9, + "step": 15200 + }, + { + "epoch": 0.7070880285067387, + "grad_norm": 11.455964088439941, + "learning_rate": 2.167762854417295e-06, + "loss": 0.5708463668823243, + "memory(GiB)": 43.68, + "step": 15205, + "token_acc": 0.8325553885993798, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.7073205467666883, + "grad_norm": 7.0011091232299805, + "learning_rate": 2.1645951164221435e-06, + "loss": 0.7240097999572754, + "memory(GiB)": 43.68, + "step": 15210, + "token_acc": 0.8169977206121785, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.7075530650266378, + "grad_norm": 8.62884521484375, + "learning_rate": 2.161429055089443e-06, + "loss": 0.5619840145111084, + "memory(GiB)": 43.68, + "step": 15215, + "token_acc": 0.8600823045267489, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.7077855832865875, + "grad_norm": 7.956699371337891, + "learning_rate": 2.1582646722913797e-06, + "loss": 0.5228055000305176, + "memory(GiB)": 43.68, + "step": 15220, + "token_acc": 0.8676521141285665, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.7080181015465371, + "grad_norm": 11.059106826782227, + "learning_rate": 2.1551019698991536e-06, + "loss": 0.6586771011352539, + "memory(GiB)": 43.68, + "step": 15225, + "token_acc": 0.846788990825688, + "train_speed(iter/s)": 0.095869 + }, + { + "epoch": 0.7082506198064866, + "grad_norm": 8.307846069335938, + "learning_rate": 2.1519409497829662e-06, + "loss": 0.6575278282165528, + "memory(GiB)": 43.68, + "step": 15230, + "token_acc": 0.815359477124183, + "train_speed(iter/s)": 0.095887 + }, + { + "epoch": 0.7084831380664363, + "grad_norm": 7.24730110168457, + "learning_rate": 2.1487816138120295e-06, + "loss": 0.6653165817260742, + "memory(GiB)": 43.68, + "step": 15235, + "token_acc": 0.8274193548387097, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.7087156563263859, + "grad_norm": 8.117443084716797, + "learning_rate": 2.1456239638545517e-06, + "loss": 0.6554098129272461, + "memory(GiB)": 43.68, + "step": 15240, + "token_acc": 0.843871975019516, + "train_speed(iter/s)": 0.095922 + }, + { + "epoch": 0.7089481745863355, + "grad_norm": 8.179558753967285, + "learning_rate": 2.1424680017777517e-06, + "loss": 0.6279301643371582, + "memory(GiB)": 43.68, + "step": 15245, + "token_acc": 0.8371501272264631, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.709180692846285, + "grad_norm": 9.259252548217773, + "learning_rate": 2.139313729447847e-06, + "loss": 0.7471514225006104, + "memory(GiB)": 43.68, + "step": 15250, + "token_acc": 0.819258693160107, + "train_speed(iter/s)": 0.095958 + }, + { + "epoch": 0.709180692846285, + "eval_loss": 0.5682738423347473, + "eval_runtime": 294.5503, + "eval_samples_per_second": 11.798, + "eval_steps_per_second": 11.798, + "step": 15250 + }, + { + "epoch": 0.7094132111062347, + "grad_norm": 9.733885765075684, + "learning_rate": 2.1361611487300552e-06, + "loss": 0.6511485576629639, + "memory(GiB)": 43.68, + "step": 15255, + "token_acc": 0.8323060028695785, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.7096457293661843, + "grad_norm": 6.685081481933594, + "learning_rate": 2.1330102614885983e-06, + "loss": 0.6439203262329102, + "memory(GiB)": 43.68, + "step": 15260, + "token_acc": 0.8359621451104101, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.7098782476261339, + "grad_norm": 6.545475006103516, + "learning_rate": 2.1298610695866883e-06, + "loss": 0.6626582145690918, + "memory(GiB)": 43.68, + "step": 15265, + "token_acc": 0.848257006151743, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.7101107658860835, + "grad_norm": 8.421649932861328, + "learning_rate": 2.1267135748865434e-06, + "loss": 0.668232011795044, + "memory(GiB)": 43.68, + "step": 15270, + "token_acc": 0.824332712600869, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.7103432841460331, + "grad_norm": 7.355252265930176, + "learning_rate": 2.1235677792493707e-06, + "loss": 0.7098326683044434, + "memory(GiB)": 43.68, + "step": 15275, + "token_acc": 0.8215933558904952, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.7105758024059827, + "grad_norm": 8.247864723205566, + "learning_rate": 2.120423684535381e-06, + "loss": 0.6275835037231445, + "memory(GiB)": 43.68, + "step": 15280, + "token_acc": 0.8363309352517986, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.7108083206659322, + "grad_norm": 10.8916597366333, + "learning_rate": 2.1172812926037693e-06, + "loss": 0.7184661388397217, + "memory(GiB)": 43.68, + "step": 15285, + "token_acc": 0.7979899497487437, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.7110408389258819, + "grad_norm": 8.518646240234375, + "learning_rate": 2.114140605312732e-06, + "loss": 0.5484371185302734, + "memory(GiB)": 43.68, + "step": 15290, + "token_acc": 0.8663251047973917, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.7112733571858315, + "grad_norm": 8.54529857635498, + "learning_rate": 2.1110016245194533e-06, + "loss": 0.44772658348083494, + "memory(GiB)": 43.68, + "step": 15295, + "token_acc": 0.8883770375620128, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.7115058754457811, + "grad_norm": 9.231746673583984, + "learning_rate": 2.1078643520801124e-06, + "loss": 0.6469157695770263, + "memory(GiB)": 43.68, + "step": 15300, + "token_acc": 0.8370279146141215, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.7115058754457811, + "eval_loss": 0.5666142702102661, + "eval_runtime": 292.2752, + "eval_samples_per_second": 11.889, + "eval_steps_per_second": 11.889, + "step": 15300 + }, + { + "epoch": 0.7117383937057307, + "grad_norm": 8.66163158416748, + "learning_rate": 2.1047287898498714e-06, + "loss": 0.6179323673248291, + "memory(GiB)": 43.68, + "step": 15305, + "token_acc": 0.8329655040061411, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.7119709119656803, + "grad_norm": 9.693718910217285, + "learning_rate": 2.1015949396828884e-06, + "loss": 0.5869213581085205, + "memory(GiB)": 43.68, + "step": 15310, + "token_acc": 0.8610321007720438, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.7122034302256299, + "grad_norm": 10.758035659790039, + "learning_rate": 2.0984628034323025e-06, + "loss": 0.6068830490112305, + "memory(GiB)": 43.68, + "step": 15315, + "token_acc": 0.8376518218623482, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.7124359484855796, + "grad_norm": 7.330872058868408, + "learning_rate": 2.095332382950246e-06, + "loss": 0.797172737121582, + "memory(GiB)": 43.68, + "step": 15320, + "token_acc": 0.7954419121734296, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.7126684667455291, + "grad_norm": 8.344979286193848, + "learning_rate": 2.092203680087829e-06, + "loss": 0.8205219268798828, + "memory(GiB)": 43.68, + "step": 15325, + "token_acc": 0.7884267631103075, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.7129009850054787, + "grad_norm": 8.697916984558105, + "learning_rate": 2.089076696695153e-06, + "loss": 0.7596703052520752, + "memory(GiB)": 47.44, + "step": 15330, + "token_acc": 0.764751552795031, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.7131335032654283, + "grad_norm": 9.916091918945312, + "learning_rate": 2.0859514346212993e-06, + "loss": 0.6388274192810058, + "memory(GiB)": 47.44, + "step": 15335, + "token_acc": 0.8622060284862537, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.7133660215253779, + "grad_norm": 8.304482460021973, + "learning_rate": 2.0828278957143332e-06, + "loss": 0.6312067985534668, + "memory(GiB)": 47.44, + "step": 15340, + "token_acc": 0.8490203611217826, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.7135985397853275, + "grad_norm": 9.36806869506836, + "learning_rate": 2.0797060818212972e-06, + "loss": 0.659159803390503, + "memory(GiB)": 47.44, + "step": 15345, + "token_acc": 0.839056681836988, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.7138310580452771, + "grad_norm": 6.6365509033203125, + "learning_rate": 2.0765859947882188e-06, + "loss": 0.6138392925262451, + "memory(GiB)": 47.44, + "step": 15350, + "token_acc": 0.8444, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.7138310580452771, + "eval_loss": 0.5672307014465332, + "eval_runtime": 292.2735, + "eval_samples_per_second": 11.89, + "eval_steps_per_second": 11.89, + "step": 15350 + }, + { + "epoch": 0.7140635763052268, + "grad_norm": 7.136357307434082, + "learning_rate": 2.0734676364600986e-06, + "loss": 0.7098967075347901, + "memory(GiB)": 47.44, + "step": 15355, + "token_acc": 0.8321157404453519, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.7142960945651763, + "grad_norm": 6.81723690032959, + "learning_rate": 2.070351008680922e-06, + "loss": 0.6600059509277344, + "memory(GiB)": 47.44, + "step": 15360, + "token_acc": 0.8320908768193114, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.7145286128251259, + "grad_norm": 6.2631707191467285, + "learning_rate": 2.067236113293643e-06, + "loss": 0.6850215911865234, + "memory(GiB)": 47.44, + "step": 15365, + "token_acc": 0.829415501905972, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.7147611310850756, + "grad_norm": 7.0183796882629395, + "learning_rate": 2.064122952140198e-06, + "loss": 0.6342861175537109, + "memory(GiB)": 47.44, + "step": 15370, + "token_acc": 0.8410546139359699, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.7149936493450251, + "grad_norm": 11.088750839233398, + "learning_rate": 2.061011527061495e-06, + "loss": 0.6724984645843506, + "memory(GiB)": 47.44, + "step": 15375, + "token_acc": 0.8334586466165413, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.7152261676049747, + "grad_norm": 9.443142890930176, + "learning_rate": 2.0579018398974147e-06, + "loss": 0.6077555656433106, + "memory(GiB)": 47.44, + "step": 15380, + "token_acc": 0.8423803779654202, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.7154586858649243, + "grad_norm": 9.812091827392578, + "learning_rate": 2.054793892486815e-06, + "loss": 0.6703172206878663, + "memory(GiB)": 47.44, + "step": 15385, + "token_acc": 0.8240412504028359, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.715691204124874, + "grad_norm": 7.7283477783203125, + "learning_rate": 2.0516876866675155e-06, + "loss": 0.6120262145996094, + "memory(GiB)": 47.44, + "step": 15390, + "token_acc": 0.8498862774829417, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.7159237223848235, + "grad_norm": 6.389163970947266, + "learning_rate": 2.0485832242763164e-06, + "loss": 0.6700059413909912, + "memory(GiB)": 47.44, + "step": 15395, + "token_acc": 0.8351304347826087, + "train_speed(iter/s)": 0.095939 + }, + { + "epoch": 0.7161562406447731, + "grad_norm": 6.533292770385742, + "learning_rate": 2.0454805071489785e-06, + "loss": 0.6029557228088379, + "memory(GiB)": 47.44, + "step": 15400, + "token_acc": 0.8440822111977321, + "train_speed(iter/s)": 0.095956 + }, + { + "epoch": 0.7161562406447731, + "eval_loss": 0.5675437450408936, + "eval_runtime": 292.8256, + "eval_samples_per_second": 11.867, + "eval_steps_per_second": 11.867, + "step": 15400 + }, + { + "epoch": 0.7163887589047228, + "grad_norm": 7.733583450317383, + "learning_rate": 2.042379537120237e-06, + "loss": 0.4989192008972168, + "memory(GiB)": 47.44, + "step": 15405, + "token_acc": 0.8333587351499643, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.7166212771646724, + "grad_norm": 10.462811470031738, + "learning_rate": 2.0392803160237888e-06, + "loss": 0.670336389541626, + "memory(GiB)": 47.44, + "step": 15410, + "token_acc": 0.8332145402708482, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.7168537954246219, + "grad_norm": 10.108728408813477, + "learning_rate": 2.0361828456923e-06, + "loss": 0.6763839721679688, + "memory(GiB)": 47.44, + "step": 15415, + "token_acc": 0.8427698574338085, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.7170863136845715, + "grad_norm": 7.717390537261963, + "learning_rate": 2.0330871279574006e-06, + "loss": 0.6465532302856445, + "memory(GiB)": 47.44, + "step": 15420, + "token_acc": 0.8464551508029768, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.7173188319445212, + "grad_norm": 7.598498344421387, + "learning_rate": 2.0299931646496864e-06, + "loss": 0.8090932846069336, + "memory(GiB)": 47.44, + "step": 15425, + "token_acc": 0.8031572164948454, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.7175513502044707, + "grad_norm": 7.445656776428223, + "learning_rate": 2.0269009575987087e-06, + "loss": 0.6042123794555664, + "memory(GiB)": 47.44, + "step": 15430, + "token_acc": 0.85587018771874, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.7177838684644203, + "grad_norm": 8.982229232788086, + "learning_rate": 2.0238105086329894e-06, + "loss": 0.6927988052368164, + "memory(GiB)": 47.44, + "step": 15435, + "token_acc": 0.8335028823329942, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.71801638672437, + "grad_norm": 8.382378578186035, + "learning_rate": 2.020721819580003e-06, + "loss": 0.6854721546173096, + "memory(GiB)": 47.44, + "step": 15440, + "token_acc": 0.8381818181818181, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.7182489049843196, + "grad_norm": 6.911797046661377, + "learning_rate": 2.0176348922661893e-06, + "loss": 0.6210799217224121, + "memory(GiB)": 47.44, + "step": 15445, + "token_acc": 0.8527648234510327, + "train_speed(iter/s)": 0.095938 + }, + { + "epoch": 0.7184814232442691, + "grad_norm": 8.026445388793945, + "learning_rate": 2.014549728516941e-06, + "loss": 0.7180139064788819, + "memory(GiB)": 47.44, + "step": 15450, + "token_acc": 0.8164603058994901, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.7184814232442691, + "eval_loss": 0.5675672888755798, + "eval_runtime": 294.7585, + "eval_samples_per_second": 11.789, + "eval_steps_per_second": 11.789, + "step": 15450 + }, + { + "epoch": 0.7187139415042187, + "grad_norm": 9.904377937316895, + "learning_rate": 2.0114663301566128e-06, + "loss": 0.6216643333435059, + "memory(GiB)": 47.44, + "step": 15455, + "token_acc": 0.832423668411083, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.7189464597641684, + "grad_norm": 6.8368988037109375, + "learning_rate": 2.0083846990085125e-06, + "loss": 0.6121196746826172, + "memory(GiB)": 47.44, + "step": 15460, + "token_acc": 0.8402468289338362, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.719178978024118, + "grad_norm": 11.11691665649414, + "learning_rate": 2.005304836894906e-06, + "loss": 0.5897928714752197, + "memory(GiB)": 47.44, + "step": 15465, + "token_acc": 0.8488927485887973, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.7194114962840675, + "grad_norm": 9.302391052246094, + "learning_rate": 2.002226745637007e-06, + "loss": 0.5943363666534424, + "memory(GiB)": 47.44, + "step": 15470, + "token_acc": 0.8488794669897032, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.7196440145440172, + "grad_norm": 9.435709953308105, + "learning_rate": 1.9991504270549895e-06, + "loss": 0.5556567668914795, + "memory(GiB)": 47.44, + "step": 15475, + "token_acc": 0.8607260726072608, + "train_speed(iter/s)": 0.095867 + }, + { + "epoch": 0.7198765328039668, + "grad_norm": 9.0478515625, + "learning_rate": 1.996075882967972e-06, + "loss": 0.6758971214294434, + "memory(GiB)": 47.44, + "step": 15480, + "token_acc": 0.831566994700984, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.7201090510639163, + "grad_norm": 9.4107084274292, + "learning_rate": 1.99300311519403e-06, + "loss": 0.5821828365325927, + "memory(GiB)": 47.44, + "step": 15485, + "token_acc": 0.8466947960618847, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.720341569323866, + "grad_norm": 9.905683517456055, + "learning_rate": 1.9899321255501845e-06, + "loss": 0.6144163608551025, + "memory(GiB)": 47.44, + "step": 15490, + "token_acc": 0.8548801369863014, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.7205740875838156, + "grad_norm": 9.403578758239746, + "learning_rate": 1.9868629158524093e-06, + "loss": 0.5417950630187989, + "memory(GiB)": 47.44, + "step": 15495, + "token_acc": 0.8679031037093111, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.7208066058437652, + "grad_norm": 7.02471923828125, + "learning_rate": 1.983795487915619e-06, + "loss": 0.593695592880249, + "memory(GiB)": 47.44, + "step": 15500, + "token_acc": 0.859472049689441, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.7208066058437652, + "eval_loss": 0.5664077401161194, + "eval_runtime": 292.9703, + "eval_samples_per_second": 11.861, + "eval_steps_per_second": 11.861, + "step": 15500 + }, + { + "epoch": 0.7210391241037147, + "grad_norm": 9.336894989013672, + "learning_rate": 1.9807298435536803e-06, + "loss": 0.5820582866668701, + "memory(GiB)": 47.44, + "step": 15505, + "token_acc": 0.8331124976912637, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.7212716423636644, + "grad_norm": 7.727779865264893, + "learning_rate": 1.977665984579405e-06, + "loss": 0.5849403858184814, + "memory(GiB)": 47.44, + "step": 15510, + "token_acc": 0.8396624472573839, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.721504160623614, + "grad_norm": 8.475208282470703, + "learning_rate": 1.974603912804544e-06, + "loss": 0.6912332057952881, + "memory(GiB)": 47.44, + "step": 15515, + "token_acc": 0.8325024925224327, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.7217366788835635, + "grad_norm": 7.069519519805908, + "learning_rate": 1.971543630039799e-06, + "loss": 0.6353631019592285, + "memory(GiB)": 47.44, + "step": 15520, + "token_acc": 0.8291793313069908, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.7219691971435132, + "grad_norm": 11.572453498840332, + "learning_rate": 1.968485138094805e-06, + "loss": 0.7156434059143066, + "memory(GiB)": 47.44, + "step": 15525, + "token_acc": 0.8214027476500362, + "train_speed(iter/s)": 0.095867 + }, + { + "epoch": 0.7222017154034628, + "grad_norm": 11.58348560333252, + "learning_rate": 1.9654284387781453e-06, + "loss": 0.7011518001556396, + "memory(GiB)": 47.44, + "step": 15530, + "token_acc": 0.8055172413793104, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.7224342336634124, + "grad_norm": 10.236895561218262, + "learning_rate": 1.9623735338973404e-06, + "loss": 0.675053882598877, + "memory(GiB)": 47.44, + "step": 15535, + "token_acc": 0.8261376896149358, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.7226667519233619, + "grad_norm": 7.544062614440918, + "learning_rate": 1.9593204252588515e-06, + "loss": 0.593741512298584, + "memory(GiB)": 47.44, + "step": 15540, + "token_acc": 0.8604933279417711, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.7228992701833116, + "grad_norm": 7.399377822875977, + "learning_rate": 1.956269114668073e-06, + "loss": 0.6229902744293213, + "memory(GiB)": 47.44, + "step": 15545, + "token_acc": 0.8310586499446698, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.7231317884432612, + "grad_norm": 9.306867599487305, + "learning_rate": 1.9532196039293415e-06, + "loss": 0.6560436248779297, + "memory(GiB)": 47.44, + "step": 15550, + "token_acc": 0.8388037928519329, + "train_speed(iter/s)": 0.095955 + }, + { + "epoch": 0.7231317884432612, + "eval_loss": 0.5657230019569397, + "eval_runtime": 291.9442, + "eval_samples_per_second": 11.903, + "eval_steps_per_second": 11.903, + "step": 15550 + }, + { + "epoch": 0.7233643067032108, + "grad_norm": 7.044973373413086, + "learning_rate": 1.950171894845924e-06, + "loss": 0.5830237388610839, + "memory(GiB)": 47.44, + "step": 15555, + "token_acc": 0.8333811299110984, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.7235968249631604, + "grad_norm": 6.381565570831299, + "learning_rate": 1.9471259892200296e-06, + "loss": 0.673720407485962, + "memory(GiB)": 47.44, + "step": 15560, + "token_acc": 0.8369430693069307, + "train_speed(iter/s)": 0.095817 + }, + { + "epoch": 0.72382934322311, + "grad_norm": 9.84653377532959, + "learning_rate": 1.9440818888527908e-06, + "loss": 0.7157990455627441, + "memory(GiB)": 47.44, + "step": 15565, + "token_acc": 0.8328240942819729, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.7240618614830596, + "grad_norm": 7.861911296844482, + "learning_rate": 1.941039595544281e-06, + "loss": 0.8552507400512696, + "memory(GiB)": 47.44, + "step": 15570, + "token_acc": 0.7770975056689342, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.7242943797430091, + "grad_norm": 7.185626983642578, + "learning_rate": 1.937999111093502e-06, + "loss": 0.628577184677124, + "memory(GiB)": 47.44, + "step": 15575, + "token_acc": 0.8374384236453202, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.7245268980029588, + "grad_norm": 7.608919620513916, + "learning_rate": 1.9349604372983877e-06, + "loss": 0.8854595184326172, + "memory(GiB)": 47.44, + "step": 15580, + "token_acc": 0.7832712495767017, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.7247594162629084, + "grad_norm": 8.92005443572998, + "learning_rate": 1.9319235759557964e-06, + "loss": 0.5188089847564697, + "memory(GiB)": 47.44, + "step": 15585, + "token_acc": 0.8663994655978624, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.724991934522858, + "grad_norm": 7.952560901641846, + "learning_rate": 1.9288885288615216e-06, + "loss": 0.6974950790405273, + "memory(GiB)": 47.44, + "step": 15590, + "token_acc": 0.8319559228650137, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.7252244527828076, + "grad_norm": 7.014359474182129, + "learning_rate": 1.925855297810277e-06, + "loss": 0.7252201080322266, + "memory(GiB)": 47.44, + "step": 15595, + "token_acc": 0.8212915601023018, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.7254569710427572, + "grad_norm": 7.969059944152832, + "learning_rate": 1.922823884595708e-06, + "loss": 0.644140625, + "memory(GiB)": 47.44, + "step": 15600, + "token_acc": 0.8409179307662388, + "train_speed(iter/s)": 0.095954 + }, + { + "epoch": 0.7254569710427572, + "eval_loss": 0.5666154026985168, + "eval_runtime": 294.9961, + "eval_samples_per_second": 11.78, + "eval_steps_per_second": 11.78, + "step": 15600 + }, + { + "epoch": 0.7256894893027068, + "grad_norm": 11.729976654052734, + "learning_rate": 1.919794291010381e-06, + "loss": 0.7216911315917969, + "memory(GiB)": 47.44, + "step": 15605, + "token_acc": 0.8319603824591902, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.7259220075626565, + "grad_norm": 8.04870319366455, + "learning_rate": 1.9167665188457894e-06, + "loss": 0.6861124992370605, + "memory(GiB)": 47.44, + "step": 15610, + "token_acc": 0.8221914008321776, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.726154525822606, + "grad_norm": 6.789844512939453, + "learning_rate": 1.9137405698923476e-06, + "loss": 0.7341386795043945, + "memory(GiB)": 47.44, + "step": 15615, + "token_acc": 0.826115061409179, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.7263870440825556, + "grad_norm": 7.037516117095947, + "learning_rate": 1.9107164459393956e-06, + "loss": 0.6245403289794922, + "memory(GiB)": 47.44, + "step": 15620, + "token_acc": 0.8459883184752536, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.7266195623425052, + "grad_norm": 10.631331443786621, + "learning_rate": 1.907694148775187e-06, + "loss": 0.683773422241211, + "memory(GiB)": 47.44, + "step": 15625, + "token_acc": 0.8298048982980489, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.7268520806024548, + "grad_norm": 8.501079559326172, + "learning_rate": 1.9046736801869037e-06, + "loss": 0.5465147972106934, + "memory(GiB)": 47.44, + "step": 15630, + "token_acc": 0.86484375, + "train_speed(iter/s)": 0.095883 + }, + { + "epoch": 0.7270845988624044, + "grad_norm": 8.950760841369629, + "learning_rate": 1.9016550419606372e-06, + "loss": 0.6169103622436524, + "memory(GiB)": 47.44, + "step": 15635, + "token_acc": 0.8555262165220672, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.727317117122354, + "grad_norm": 8.562825202941895, + "learning_rate": 1.8986382358814043e-06, + "loss": 0.660850715637207, + "memory(GiB)": 47.44, + "step": 15640, + "token_acc": 0.8426395939086294, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.7275496353823037, + "grad_norm": 9.356790542602539, + "learning_rate": 1.8956232637331368e-06, + "loss": 0.6509898185729981, + "memory(GiB)": 47.44, + "step": 15645, + "token_acc": 0.8205387205387206, + "train_speed(iter/s)": 0.095934 + }, + { + "epoch": 0.7277821536422532, + "grad_norm": 11.473599433898926, + "learning_rate": 1.8926101272986775e-06, + "loss": 0.5593137264251709, + "memory(GiB)": 47.44, + "step": 15650, + "token_acc": 0.869598180439727, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.7277821536422532, + "eval_loss": 0.5655349493026733, + "eval_runtime": 294.0032, + "eval_samples_per_second": 11.82, + "eval_steps_per_second": 11.82, + "step": 15650 + }, + { + "epoch": 0.7280146719022028, + "grad_norm": 5.624059200286865, + "learning_rate": 1.8895988283597894e-06, + "loss": 0.6395047187805176, + "memory(GiB)": 47.44, + "step": 15655, + "token_acc": 0.8326598337554284, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.7282471901621524, + "grad_norm": 8.765081405639648, + "learning_rate": 1.8865893686971454e-06, + "loss": 0.6849233150482178, + "memory(GiB)": 47.44, + "step": 15660, + "token_acc": 0.8255968169761273, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.728479708422102, + "grad_norm": 9.399160385131836, + "learning_rate": 1.883581750090334e-06, + "loss": 0.5533103466033935, + "memory(GiB)": 47.44, + "step": 15665, + "token_acc": 0.8568696206566784, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.7287122266820516, + "grad_norm": 7.852560520172119, + "learning_rate": 1.8805759743178497e-06, + "loss": 0.5719121932983399, + "memory(GiB)": 47.44, + "step": 15670, + "token_acc": 0.8539235412474849, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.7289447449420012, + "grad_norm": 9.14884090423584, + "learning_rate": 1.8775720431571042e-06, + "loss": 0.5568684101104736, + "memory(GiB)": 47.44, + "step": 15675, + "token_acc": 0.8567848191908343, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.7291772632019509, + "grad_norm": 6.8728766441345215, + "learning_rate": 1.8745699583844108e-06, + "loss": 0.6052249908447266, + "memory(GiB)": 47.44, + "step": 15680, + "token_acc": 0.8487972508591065, + "train_speed(iter/s)": 0.095883 + }, + { + "epoch": 0.7294097814619004, + "grad_norm": 7.464291572570801, + "learning_rate": 1.8715697217749985e-06, + "loss": 0.599236249923706, + "memory(GiB)": 47.44, + "step": 15685, + "token_acc": 0.8551859099804305, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.72964229972185, + "grad_norm": 10.347436904907227, + "learning_rate": 1.8685713351029965e-06, + "loss": 0.6958876132965088, + "memory(GiB)": 47.44, + "step": 15690, + "token_acc": 0.8292985723153321, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.7298748179817997, + "grad_norm": 7.473763942718506, + "learning_rate": 1.8655748001414452e-06, + "loss": 0.7635407447814941, + "memory(GiB)": 47.44, + "step": 15695, + "token_acc": 0.8141470180305131, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.7301073362417493, + "grad_norm": 6.556323051452637, + "learning_rate": 1.8625801186622883e-06, + "loss": 0.6744859218597412, + "memory(GiB)": 47.44, + "step": 15700, + "token_acc": 0.836890243902439, + "train_speed(iter/s)": 0.095952 + }, + { + "epoch": 0.7301073362417493, + "eval_loss": 0.565443217754364, + "eval_runtime": 295.1311, + "eval_samples_per_second": 11.774, + "eval_steps_per_second": 11.774, + "step": 15700 + }, + { + "epoch": 0.7303398545016988, + "grad_norm": 7.942391395568848, + "learning_rate": 1.8595872924363744e-06, + "loss": 0.6304332733154296, + "memory(GiB)": 47.44, + "step": 15705, + "token_acc": 0.8331690112620737, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.7305723727616484, + "grad_norm": 10.444586753845215, + "learning_rate": 1.8565963232334516e-06, + "loss": 0.6448088169097901, + "memory(GiB)": 47.44, + "step": 15710, + "token_acc": 0.8362779740871613, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.7308048910215981, + "grad_norm": 7.157534599304199, + "learning_rate": 1.853607212822175e-06, + "loss": 0.6816494941711426, + "memory(GiB)": 47.44, + "step": 15715, + "token_acc": 0.8259624562519885, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.7310374092815476, + "grad_norm": 7.530759811401367, + "learning_rate": 1.8506199629700949e-06, + "loss": 0.6713624000549316, + "memory(GiB)": 47.44, + "step": 15720, + "token_acc": 0.8371550719005052, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.7312699275414972, + "grad_norm": 8.83544635772705, + "learning_rate": 1.847634575443668e-06, + "loss": 0.7240818977355957, + "memory(GiB)": 47.44, + "step": 15725, + "token_acc": 0.8081967213114755, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.7315024458014469, + "grad_norm": 9.414497375488281, + "learning_rate": 1.8446510520082423e-06, + "loss": 0.6359312057495117, + "memory(GiB)": 47.44, + "step": 15730, + "token_acc": 0.8445309964297306, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.7317349640613965, + "grad_norm": 10.996722221374512, + "learning_rate": 1.8416693944280689e-06, + "loss": 0.6379516124725342, + "memory(GiB)": 47.44, + "step": 15735, + "token_acc": 0.8364099299809039, + "train_speed(iter/s)": 0.095899 + }, + { + "epoch": 0.731967482321346, + "grad_norm": 10.735776901245117, + "learning_rate": 1.8386896044662944e-06, + "loss": 0.6942223072052002, + "memory(GiB)": 47.44, + "step": 15740, + "token_acc": 0.8277654046028211, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.7322000005812956, + "grad_norm": 8.521059036254883, + "learning_rate": 1.835711683884962e-06, + "loss": 0.6801161766052246, + "memory(GiB)": 47.44, + "step": 15745, + "token_acc": 0.8320294523699954, + "train_speed(iter/s)": 0.095934 + }, + { + "epoch": 0.7324325188412453, + "grad_norm": 6.813046455383301, + "learning_rate": 1.8327356344450048e-06, + "loss": 0.6779103755950928, + "memory(GiB)": 47.44, + "step": 15750, + "token_acc": 0.8271536102592696, + "train_speed(iter/s)": 0.095951 + }, + { + "epoch": 0.7324325188412453, + "eval_loss": 0.5668007135391235, + "eval_runtime": 294.0271, + "eval_samples_per_second": 11.819, + "eval_steps_per_second": 11.819, + "step": 15750 + }, + { + "epoch": 0.7326650371011949, + "grad_norm": 9.11612319946289, + "learning_rate": 1.8297614579062557e-06, + "loss": 0.5452903270721435, + "memory(GiB)": 47.44, + "step": 15755, + "token_acc": 0.8330476627316548, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.7328975553611444, + "grad_norm": 5.6731038093566895, + "learning_rate": 1.8267891560274342e-06, + "loss": 0.7143843173980713, + "memory(GiB)": 47.44, + "step": 15760, + "token_acc": 0.8132650156561344, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.7331300736210941, + "grad_norm": 8.81519889831543, + "learning_rate": 1.823818730566158e-06, + "loss": 0.6255356311798096, + "memory(GiB)": 47.44, + "step": 15765, + "token_acc": 0.8302339532093581, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.7333625918810437, + "grad_norm": 7.650895118713379, + "learning_rate": 1.8208501832789271e-06, + "loss": 0.5208076953887939, + "memory(GiB)": 47.44, + "step": 15770, + "token_acc": 0.8714511041009464, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.7335951101409932, + "grad_norm": 6.7183756828308105, + "learning_rate": 1.8178835159211371e-06, + "loss": 0.5086612224578857, + "memory(GiB)": 47.44, + "step": 15775, + "token_acc": 0.8686751641560448, + "train_speed(iter/s)": 0.095864 + }, + { + "epoch": 0.7338276284009428, + "grad_norm": 9.220132827758789, + "learning_rate": 1.8149187302470706e-06, + "loss": 0.6746468544006348, + "memory(GiB)": 47.44, + "step": 15780, + "token_acc": 0.8361801242236024, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.7340601466608925, + "grad_norm": 7.657320976257324, + "learning_rate": 1.811955828009896e-06, + "loss": 0.5522412300109864, + "memory(GiB)": 47.44, + "step": 15785, + "token_acc": 0.8663440059568132, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.7342926649208421, + "grad_norm": 10.659828186035156, + "learning_rate": 1.8089948109616712e-06, + "loss": 0.6790434837341308, + "memory(GiB)": 47.44, + "step": 15790, + "token_acc": 0.8299460804645376, + "train_speed(iter/s)": 0.095915 + }, + { + "epoch": 0.7345251831807916, + "grad_norm": 5.570415019989014, + "learning_rate": 1.806035680853333e-06, + "loss": 0.780366849899292, + "memory(GiB)": 47.44, + "step": 15795, + "token_acc": 0.800187617260788, + "train_speed(iter/s)": 0.095931 + }, + { + "epoch": 0.7347577014407413, + "grad_norm": 11.067488670349121, + "learning_rate": 1.8030784394347106e-06, + "loss": 0.7626240730285645, + "memory(GiB)": 47.44, + "step": 15800, + "token_acc": 0.7994816974408812, + "train_speed(iter/s)": 0.095948 + }, + { + "epoch": 0.7347577014407413, + "eval_loss": 0.564397931098938, + "eval_runtime": 296.0923, + "eval_samples_per_second": 11.736, + "eval_steps_per_second": 11.736, + "step": 15800 + }, + { + "epoch": 0.7349902197006909, + "grad_norm": 10.09076976776123, + "learning_rate": 1.8001230884545084e-06, + "loss": 0.6031111240386963, + "memory(GiB)": 47.44, + "step": 15805, + "token_acc": 0.8330454392783456, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.7352227379606405, + "grad_norm": 8.190320014953613, + "learning_rate": 1.797169629660318e-06, + "loss": 0.6044882297515869, + "memory(GiB)": 47.44, + "step": 15810, + "token_acc": 0.8356687898089172, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.73545525622059, + "grad_norm": 6.573525905609131, + "learning_rate": 1.7942180647986113e-06, + "loss": 0.6540366649627686, + "memory(GiB)": 47.44, + "step": 15815, + "token_acc": 0.8233638282899367, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.7356877744805397, + "grad_norm": 10.954277038574219, + "learning_rate": 1.7912683956147415e-06, + "loss": 0.62742018699646, + "memory(GiB)": 47.44, + "step": 15820, + "token_acc": 0.8497256226255804, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.7359202927404893, + "grad_norm": 7.76631498336792, + "learning_rate": 1.788320623852935e-06, + "loss": 0.6723296642303467, + "memory(GiB)": 47.44, + "step": 15825, + "token_acc": 0.821697803998689, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.7361528110004388, + "grad_norm": 10.623085021972656, + "learning_rate": 1.7853747512563042e-06, + "loss": 0.7741125583648681, + "memory(GiB)": 47.44, + "step": 15830, + "token_acc": 0.804755944931164, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.7363853292603885, + "grad_norm": 8.324301719665527, + "learning_rate": 1.782430779566831e-06, + "loss": 0.6203536033630371, + "memory(GiB)": 47.44, + "step": 15835, + "token_acc": 0.8429657794676806, + "train_speed(iter/s)": 0.095894 + }, + { + "epoch": 0.7366178475203381, + "grad_norm": 6.4376115798950195, + "learning_rate": 1.77948871052538e-06, + "loss": 0.6823951244354248, + "memory(GiB)": 47.44, + "step": 15840, + "token_acc": 0.8315889628924833, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.7368503657802877, + "grad_norm": 7.326390266418457, + "learning_rate": 1.7765485458716842e-06, + "loss": 0.6224531650543212, + "memory(GiB)": 47.44, + "step": 15845, + "token_acc": 0.8392242727557084, + "train_speed(iter/s)": 0.095928 + }, + { + "epoch": 0.7370828840402373, + "grad_norm": 8.882484436035156, + "learning_rate": 1.7736102873443555e-06, + "loss": 0.7446016311645508, + "memory(GiB)": 47.44, + "step": 15850, + "token_acc": 0.8142804291527932, + "train_speed(iter/s)": 0.095945 + }, + { + "epoch": 0.7370828840402373, + "eval_loss": 0.5673614144325256, + "eval_runtime": 293.6289, + "eval_samples_per_second": 11.835, + "eval_steps_per_second": 11.835, + "step": 15850 + }, + { + "epoch": 0.7373154023001869, + "grad_norm": 8.337072372436523, + "learning_rate": 1.7706739366808768e-06, + "loss": 0.739326810836792, + "memory(GiB)": 47.44, + "step": 15855, + "token_acc": 0.8325765038482177, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.7375479205601365, + "grad_norm": 6.729470729827881, + "learning_rate": 1.7677394956176042e-06, + "loss": 0.5923904418945313, + "memory(GiB)": 47.44, + "step": 15860, + "token_acc": 0.8559451219512195, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.737780438820086, + "grad_norm": 10.68433666229248, + "learning_rate": 1.7648069658897605e-06, + "loss": 0.8138419151306152, + "memory(GiB)": 47.44, + "step": 15865, + "token_acc": 0.8052335210334548, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.7380129570800357, + "grad_norm": 9.127498626708984, + "learning_rate": 1.7618763492314444e-06, + "loss": 0.6489748001098633, + "memory(GiB)": 47.44, + "step": 15870, + "token_acc": 0.8489361702127659, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.7382454753399853, + "grad_norm": 13.219034194946289, + "learning_rate": 1.7589476473756167e-06, + "loss": 0.5790813446044922, + "memory(GiB)": 47.44, + "step": 15875, + "token_acc": 0.8512685914260717, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.7384779935999349, + "grad_norm": 7.873594284057617, + "learning_rate": 1.756020862054112e-06, + "loss": 0.6810230731964111, + "memory(GiB)": 47.44, + "step": 15880, + "token_acc": 0.8141503046716316, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.7387105118598845, + "grad_norm": 7.8127522468566895, + "learning_rate": 1.7530959949976262e-06, + "loss": 0.719043493270874, + "memory(GiB)": 47.44, + "step": 15885, + "token_acc": 0.8200278164116829, + "train_speed(iter/s)": 0.095893 + }, + { + "epoch": 0.7389430301198341, + "grad_norm": 8.471165657043457, + "learning_rate": 1.7501730479357242e-06, + "loss": 0.6684861660003663, + "memory(GiB)": 47.44, + "step": 15890, + "token_acc": 0.834314880251276, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.7391755483797837, + "grad_norm": 9.835054397583008, + "learning_rate": 1.747252022596836e-06, + "loss": 0.6790287494659424, + "memory(GiB)": 47.44, + "step": 15895, + "token_acc": 0.8317520556609741, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.7394080666397334, + "grad_norm": 8.918397903442383, + "learning_rate": 1.7443329207082548e-06, + "loss": 0.6621670246124267, + "memory(GiB)": 47.44, + "step": 15900, + "token_acc": 0.8318614130434783, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.7394080666397334, + "eval_loss": 0.5644251108169556, + "eval_runtime": 292.8436, + "eval_samples_per_second": 11.866, + "eval_steps_per_second": 11.866, + "step": 15900 + }, + { + "epoch": 0.7396405848996829, + "grad_norm": 7.586838245391846, + "learning_rate": 1.7414157439961332e-06, + "loss": 0.6244229793548584, + "memory(GiB)": 47.44, + "step": 15905, + "token_acc": 0.8333346639309579, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.7398731031596325, + "grad_norm": 7.999522686004639, + "learning_rate": 1.7385004941854888e-06, + "loss": 0.6422570228576661, + "memory(GiB)": 47.44, + "step": 15910, + "token_acc": 0.8473042109405746, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.7401056214195821, + "grad_norm": 7.450502395629883, + "learning_rate": 1.735587173000201e-06, + "loss": 0.7661757946014405, + "memory(GiB)": 47.44, + "step": 15915, + "token_acc": 0.8147856861974047, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.7403381396795317, + "grad_norm": 9.627984046936035, + "learning_rate": 1.7326757821630026e-06, + "loss": 0.6477952480316163, + "memory(GiB)": 47.44, + "step": 15920, + "token_acc": 0.8338368580060423, + "train_speed(iter/s)": 0.095841 + }, + { + "epoch": 0.7405706579394813, + "grad_norm": 9.843599319458008, + "learning_rate": 1.729766323395493e-06, + "loss": 0.6486878395080566, + "memory(GiB)": 47.44, + "step": 15925, + "token_acc": 0.8337614678899082, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.7408031761994309, + "grad_norm": 9.124186515808105, + "learning_rate": 1.7268587984181213e-06, + "loss": 0.7141555309295654, + "memory(GiB)": 47.44, + "step": 15930, + "token_acc": 0.8325718015665796, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.7410356944593806, + "grad_norm": 9.008081436157227, + "learning_rate": 1.7239532089501982e-06, + "loss": 0.6716857433319092, + "memory(GiB)": 47.44, + "step": 15935, + "token_acc": 0.8405483405483406, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.7412682127193301, + "grad_norm": 8.329229354858398, + "learning_rate": 1.7210495567098885e-06, + "loss": 0.6629110813140869, + "memory(GiB)": 47.44, + "step": 15940, + "token_acc": 0.8348040945993647, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.7415007309792797, + "grad_norm": 7.304653644561768, + "learning_rate": 1.7181478434142134e-06, + "loss": 0.49824161529541017, + "memory(GiB)": 47.44, + "step": 15945, + "token_acc": 0.8744855967078189, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.7417332492392293, + "grad_norm": 6.444869041442871, + "learning_rate": 1.715248070779042e-06, + "loss": 0.6452473640441895, + "memory(GiB)": 47.44, + "step": 15950, + "token_acc": 0.8382570162481536, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.7417332492392293, + "eval_loss": 0.5636093020439148, + "eval_runtime": 291.7132, + "eval_samples_per_second": 11.912, + "eval_steps_per_second": 11.912, + "step": 15950 + }, + { + "epoch": 0.741965767499179, + "grad_norm": 7.70983362197876, + "learning_rate": 1.712350240519103e-06, + "loss": 0.6836659908294678, + "memory(GiB)": 47.44, + "step": 15955, + "token_acc": 0.8332400223946252, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.7421982857591285, + "grad_norm": 9.401704788208008, + "learning_rate": 1.709454354347969e-06, + "loss": 0.6634652137756347, + "memory(GiB)": 47.44, + "step": 15960, + "token_acc": 0.8399353274050121, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.7424308040190781, + "grad_norm": 8.098258972167969, + "learning_rate": 1.7065604139780712e-06, + "loss": 0.7540879726409913, + "memory(GiB)": 47.44, + "step": 15965, + "token_acc": 0.827323717948718, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.7426633222790278, + "grad_norm": 10.010939598083496, + "learning_rate": 1.7036684211206817e-06, + "loss": 0.6124141216278076, + "memory(GiB)": 47.44, + "step": 15970, + "token_acc": 0.8495206335973322, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.7428958405389773, + "grad_norm": 10.145440101623535, + "learning_rate": 1.700778377485927e-06, + "loss": 0.713443660736084, + "memory(GiB)": 47.44, + "step": 15975, + "token_acc": 0.830102622576967, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.7431283587989269, + "grad_norm": 8.140713691711426, + "learning_rate": 1.6978902847827793e-06, + "loss": 0.6635231018066406, + "memory(GiB)": 47.44, + "step": 15980, + "token_acc": 0.8262042389210019, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.7433608770588765, + "grad_norm": 6.786279678344727, + "learning_rate": 1.6950041447190584e-06, + "loss": 0.6087878227233887, + "memory(GiB)": 47.44, + "step": 15985, + "token_acc": 0.8394011568560735, + "train_speed(iter/s)": 0.095894 + }, + { + "epoch": 0.7435933953188262, + "grad_norm": 9.08250904083252, + "learning_rate": 1.6921199590014253e-06, + "loss": 0.6415596008300781, + "memory(GiB)": 47.44, + "step": 15990, + "token_acc": 0.8438988640527666, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.7438259135787757, + "grad_norm": 7.580020427703857, + "learning_rate": 1.6892377293353906e-06, + "loss": 0.6963030815124511, + "memory(GiB)": 47.44, + "step": 15995, + "token_acc": 0.8396860986547086, + "train_speed(iter/s)": 0.095929 + }, + { + "epoch": 0.7440584318387253, + "grad_norm": 7.410154819488525, + "learning_rate": 1.6863574574253033e-06, + "loss": 0.635734224319458, + "memory(GiB)": 47.44, + "step": 16000, + "token_acc": 0.8359079986268452, + "train_speed(iter/s)": 0.095946 + }, + { + "epoch": 0.7440584318387253, + "eval_loss": 0.5647992491722107, + "eval_runtime": 291.7146, + "eval_samples_per_second": 11.912, + "eval_steps_per_second": 11.912, + "step": 16000 + }, + { + "epoch": 0.744290950098675, + "grad_norm": 7.568226337432861, + "learning_rate": 1.6834791449743594e-06, + "loss": 0.5388147354125976, + "memory(GiB)": 47.44, + "step": 16005, + "token_acc": 0.8337101532935668, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.7445234683586245, + "grad_norm": 8.07730484008789, + "learning_rate": 1.6806027936845908e-06, + "loss": 0.5757100582122803, + "memory(GiB)": 47.44, + "step": 16010, + "token_acc": 0.8572752548656163, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.7447559866185741, + "grad_norm": 8.20462417602539, + "learning_rate": 1.6777284052568755e-06, + "loss": 0.6828523635864258, + "memory(GiB)": 47.44, + "step": 16015, + "token_acc": 0.8220823798627003, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.7449885048785237, + "grad_norm": 7.580663681030273, + "learning_rate": 1.6748559813909266e-06, + "loss": 0.7655567646026611, + "memory(GiB)": 47.44, + "step": 16020, + "token_acc": 0.8057692307692308, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.7452210231384734, + "grad_norm": 7.321385860443115, + "learning_rate": 1.6719855237853e-06, + "loss": 0.7181625843048096, + "memory(GiB)": 47.44, + "step": 16025, + "token_acc": 0.8347318496898942, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.7454535413984229, + "grad_norm": 10.682913780212402, + "learning_rate": 1.669117034137382e-06, + "loss": 0.6672065734863282, + "memory(GiB)": 47.44, + "step": 16030, + "token_acc": 0.837620578778135, + "train_speed(iter/s)": 0.09588 + }, + { + "epoch": 0.7456860596583725, + "grad_norm": 8.959480285644531, + "learning_rate": 1.6662505141434004e-06, + "loss": 0.6392136573791504, + "memory(GiB)": 47.44, + "step": 16035, + "token_acc": 0.845925925925926, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.7459185779183222, + "grad_norm": 8.299612998962402, + "learning_rate": 1.6633859654984192e-06, + "loss": 0.5677808284759521, + "memory(GiB)": 47.44, + "step": 16040, + "token_acc": 0.8607932875667429, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.7461510961782718, + "grad_norm": 7.424012660980225, + "learning_rate": 1.6605233898963307e-06, + "loss": 0.5513256072998047, + "memory(GiB)": 47.44, + "step": 16045, + "token_acc": 0.8574193548387097, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.7463836144382213, + "grad_norm": 7.407084941864014, + "learning_rate": 1.6576627890298685e-06, + "loss": 0.7417648315429688, + "memory(GiB)": 47.44, + "step": 16050, + "token_acc": 0.8085351787773933, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.7463836144382213, + "eval_loss": 0.5642947554588318, + "eval_runtime": 292.8114, + "eval_samples_per_second": 11.868, + "eval_steps_per_second": 11.868, + "step": 16050 + }, + { + "epoch": 0.746616132698171, + "grad_norm": 7.718512535095215, + "learning_rate": 1.6548041645905894e-06, + "loss": 0.579425048828125, + "memory(GiB)": 47.44, + "step": 16055, + "token_acc": 0.8340791050739113, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.7468486509581206, + "grad_norm": 7.850195407867432, + "learning_rate": 1.6519475182688894e-06, + "loss": 0.5587710857391357, + "memory(GiB)": 47.44, + "step": 16060, + "token_acc": 0.8575553416746872, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.7470811692180701, + "grad_norm": 9.208138465881348, + "learning_rate": 1.6490928517539906e-06, + "loss": 0.8099372863769532, + "memory(GiB)": 47.44, + "step": 16065, + "token_acc": 0.8002898550724638, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.7473136874780197, + "grad_norm": 10.444913864135742, + "learning_rate": 1.6462401667339477e-06, + "loss": 0.6440964221954346, + "memory(GiB)": 47.44, + "step": 16070, + "token_acc": 0.8335183129855716, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.7475462057379694, + "grad_norm": 9.752091407775879, + "learning_rate": 1.6433894648956377e-06, + "loss": 0.6518924236297607, + "memory(GiB)": 47.44, + "step": 16075, + "token_acc": 0.8316729646169702, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.747778723997919, + "grad_norm": 8.187569618225098, + "learning_rate": 1.6405407479247727e-06, + "loss": 0.6321462631225586, + "memory(GiB)": 47.44, + "step": 16080, + "token_acc": 0.8360964581763376, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.7480112422578685, + "grad_norm": 9.651229858398438, + "learning_rate": 1.6376940175058826e-06, + "loss": 0.6507625102996826, + "memory(GiB)": 47.44, + "step": 16085, + "token_acc": 0.8332066869300911, + "train_speed(iter/s)": 0.095896 + }, + { + "epoch": 0.7482437605178182, + "grad_norm": 8.283926010131836, + "learning_rate": 1.634849275322331e-06, + "loss": 0.6909523487091065, + "memory(GiB)": 47.44, + "step": 16090, + "token_acc": 0.8298148749594024, + "train_speed(iter/s)": 0.095913 + }, + { + "epoch": 0.7484762787777678, + "grad_norm": 8.173871040344238, + "learning_rate": 1.632006523056298e-06, + "loss": 0.6695907592773438, + "memory(GiB)": 47.44, + "step": 16095, + "token_acc": 0.8302583025830258, + "train_speed(iter/s)": 0.09593 + }, + { + "epoch": 0.7487087970377174, + "grad_norm": 6.885328769683838, + "learning_rate": 1.6291657623887935e-06, + "loss": 0.6500693321228027, + "memory(GiB)": 47.44, + "step": 16100, + "token_acc": 0.8388214904679376, + "train_speed(iter/s)": 0.095947 + }, + { + "epoch": 0.7487087970377174, + "eval_loss": 0.5655313730239868, + "eval_runtime": 295.4483, + "eval_samples_per_second": 11.762, + "eval_steps_per_second": 11.762, + "step": 16100 + }, + { + "epoch": 0.7489413152976669, + "grad_norm": 8.974815368652344, + "learning_rate": 1.6263269949996457e-06, + "loss": 0.6070386886596679, + "memory(GiB)": 47.44, + "step": 16105, + "token_acc": 0.8336296794656158, + "train_speed(iter/s)": 0.095794 + }, + { + "epoch": 0.7491738335576166, + "grad_norm": 6.081818103790283, + "learning_rate": 1.6234902225675075e-06, + "loss": 0.8032929420471191, + "memory(GiB)": 47.44, + "step": 16110, + "token_acc": 0.7996732026143791, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.7494063518175662, + "grad_norm": 11.375482559204102, + "learning_rate": 1.620655446769847e-06, + "loss": 0.5640523433685303, + "memory(GiB)": 47.44, + "step": 16115, + "token_acc": 0.8554140127388535, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.7496388700775157, + "grad_norm": 8.1347017288208, + "learning_rate": 1.6178226692829579e-06, + "loss": 0.649559736251831, + "memory(GiB)": 47.44, + "step": 16120, + "token_acc": 0.8394160583941606, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.7498713883374654, + "grad_norm": 10.170631408691406, + "learning_rate": 1.6149918917819452e-06, + "loss": 0.6405446052551269, + "memory(GiB)": 47.44, + "step": 16125, + "token_acc": 0.8422807602534178, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.750103906597415, + "grad_norm": 6.244584083557129, + "learning_rate": 1.612163115940739e-06, + "loss": 0.6530537605285645, + "memory(GiB)": 47.44, + "step": 16130, + "token_acc": 0.8325503355704698, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.7503364248573646, + "grad_norm": 7.818486213684082, + "learning_rate": 1.609336343432078e-06, + "loss": 0.5714409828186036, + "memory(GiB)": 47.44, + "step": 16135, + "token_acc": 0.8491704374057315, + "train_speed(iter/s)": 0.095894 + }, + { + "epoch": 0.7505689431173141, + "grad_norm": 9.446545600891113, + "learning_rate": 1.6065115759275224e-06, + "loss": 0.6604638576507569, + "memory(GiB)": 47.44, + "step": 16140, + "token_acc": 0.8426255436931593, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.7508014613772638, + "grad_norm": 7.6879987716674805, + "learning_rate": 1.6036888150974433e-06, + "loss": 0.6549732685089111, + "memory(GiB)": 47.44, + "step": 16145, + "token_acc": 0.8540145985401459, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.7510339796372134, + "grad_norm": 8.272690773010254, + "learning_rate": 1.600868062611029e-06, + "loss": 0.6290201187133789, + "memory(GiB)": 47.44, + "step": 16150, + "token_acc": 0.840620592383639, + "train_speed(iter/s)": 0.095944 + }, + { + "epoch": 0.7510339796372134, + "eval_loss": 0.5634961128234863, + "eval_runtime": 292.755, + "eval_samples_per_second": 11.87, + "eval_steps_per_second": 11.87, + "step": 16150 + }, + { + "epoch": 0.7512664978971629, + "grad_norm": 8.650757789611816, + "learning_rate": 1.5980493201362734e-06, + "loss": 0.6302085876464844, + "memory(GiB)": 47.44, + "step": 16155, + "token_acc": 0.8328231617881513, + "train_speed(iter/s)": 0.095794 + }, + { + "epoch": 0.7514990161571126, + "grad_norm": 9.49666690826416, + "learning_rate": 1.59523258933999e-06, + "loss": 0.8467613220214844, + "memory(GiB)": 47.44, + "step": 16160, + "token_acc": 0.7879041248606466, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.7517315344170622, + "grad_norm": 11.6074800491333, + "learning_rate": 1.5924178718877953e-06, + "loss": 0.6553841590881347, + "memory(GiB)": 47.44, + "step": 16165, + "token_acc": 0.8311300639658848, + "train_speed(iter/s)": 0.095828 + }, + { + "epoch": 0.7519640526770118, + "grad_norm": 7.853658199310303, + "learning_rate": 1.5896051694441195e-06, + "loss": 0.6012135982513428, + "memory(GiB)": 47.44, + "step": 16170, + "token_acc": 0.843441466854725, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.7521965709369614, + "grad_norm": 8.311904907226562, + "learning_rate": 1.5867944836722015e-06, + "loss": 0.565187931060791, + "memory(GiB)": 47.44, + "step": 16175, + "token_acc": 0.8583779333058872, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.752429089196911, + "grad_norm": 7.729232311248779, + "learning_rate": 1.5839858162340854e-06, + "loss": 0.7426953792572022, + "memory(GiB)": 47.44, + "step": 16180, + "token_acc": 0.8138832997987927, + "train_speed(iter/s)": 0.095877 + }, + { + "epoch": 0.7526616074568606, + "grad_norm": 7.0569281578063965, + "learning_rate": 1.5811791687906259e-06, + "loss": 0.6274663925170898, + "memory(GiB)": 47.44, + "step": 16185, + "token_acc": 0.8426877470355731, + "train_speed(iter/s)": 0.095894 + }, + { + "epoch": 0.7528941257168102, + "grad_norm": 7.353418827056885, + "learning_rate": 1.5783745430014763e-06, + "loss": 0.7416880607604981, + "memory(GiB)": 47.44, + "step": 16190, + "token_acc": 0.8261287223823247, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.7531266439767598, + "grad_norm": 10.384500503540039, + "learning_rate": 1.5755719405251014e-06, + "loss": 0.5416950225830078, + "memory(GiB)": 47.44, + "step": 16195, + "token_acc": 0.861673672143676, + "train_speed(iter/s)": 0.095926 + }, + { + "epoch": 0.7533591622367094, + "grad_norm": 8.491443634033203, + "learning_rate": 1.5727713630187635e-06, + "loss": 0.6371305465698243, + "memory(GiB)": 47.44, + "step": 16200, + "token_acc": 0.842686002522068, + "train_speed(iter/s)": 0.095943 + }, + { + "epoch": 0.7533591622367094, + "eval_loss": 0.5626077651977539, + "eval_runtime": 294.2238, + "eval_samples_per_second": 11.811, + "eval_steps_per_second": 11.811, + "step": 16200 + }, + { + "epoch": 0.753591680496659, + "grad_norm": 8.919591903686523, + "learning_rate": 1.5699728121385344e-06, + "loss": 0.5992330551147461, + "memory(GiB)": 47.44, + "step": 16205, + "token_acc": 0.8337118001695105, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.7538241987566086, + "grad_norm": 8.25503158569336, + "learning_rate": 1.5671762895392801e-06, + "loss": 0.6749239921569824, + "memory(GiB)": 47.44, + "step": 16210, + "token_acc": 0.842032967032967, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.7540567170165582, + "grad_norm": 8.522311210632324, + "learning_rate": 1.5643817968746717e-06, + "loss": 0.6455776214599609, + "memory(GiB)": 47.44, + "step": 16215, + "token_acc": 0.8392246294184721, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.7542892352765078, + "grad_norm": 8.548661231994629, + "learning_rate": 1.5615893357971795e-06, + "loss": 0.6254148960113526, + "memory(GiB)": 47.44, + "step": 16220, + "token_acc": 0.8373618784530387, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.7545217535364575, + "grad_norm": 7.641754627227783, + "learning_rate": 1.558798907958074e-06, + "loss": 0.6256637096405029, + "memory(GiB)": 47.44, + "step": 16225, + "token_acc": 0.8421729347476695, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.754754271796407, + "grad_norm": 7.494331359863281, + "learning_rate": 1.5560105150074172e-06, + "loss": 0.7765919208526612, + "memory(GiB)": 47.44, + "step": 16230, + "token_acc": 0.8147335423197493, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.7549867900563566, + "grad_norm": 6.190079689025879, + "learning_rate": 1.553224158594076e-06, + "loss": 0.7715739250183106, + "memory(GiB)": 47.44, + "step": 16235, + "token_acc": 0.8063669182802757, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.7552193083163062, + "grad_norm": 9.882500648498535, + "learning_rate": 1.5504398403657055e-06, + "loss": 0.6255253791809082, + "memory(GiB)": 47.44, + "step": 16240, + "token_acc": 0.8476679503637141, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.7554518265762559, + "grad_norm": 8.659635543823242, + "learning_rate": 1.5476575619687617e-06, + "loss": 0.6734414577484131, + "memory(GiB)": 47.44, + "step": 16245, + "token_acc": 0.8355778264954589, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.7556843448362054, + "grad_norm": 8.26206111907959, + "learning_rate": 1.5448773250484895e-06, + "loss": 0.669712495803833, + "memory(GiB)": 47.44, + "step": 16250, + "token_acc": 0.8396972824217406, + "train_speed(iter/s)": 0.095941 + }, + { + "epoch": 0.7556843448362054, + "eval_loss": 0.563765287399292, + "eval_runtime": 293.9564, + "eval_samples_per_second": 11.821, + "eval_steps_per_second": 11.821, + "step": 16250 + }, + { + "epoch": 0.755916863096155, + "grad_norm": 7.431678295135498, + "learning_rate": 1.5420991312489298e-06, + "loss": 0.7340573787689209, + "memory(GiB)": 47.44, + "step": 16255, + "token_acc": 0.8327177580829151, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.7561493813561047, + "grad_norm": 8.096651077270508, + "learning_rate": 1.5393229822129142e-06, + "loss": 0.6868834018707275, + "memory(GiB)": 47.44, + "step": 16260, + "token_acc": 0.8088379705400982, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.7563818996160542, + "grad_norm": 8.451735496520996, + "learning_rate": 1.536548879582067e-06, + "loss": 0.5752121448516846, + "memory(GiB)": 47.44, + "step": 16265, + "token_acc": 0.8594154642989271, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.7566144178760038, + "grad_norm": 8.332307815551758, + "learning_rate": 1.5337768249967984e-06, + "loss": 0.5772712707519532, + "memory(GiB)": 47.44, + "step": 16270, + "token_acc": 0.851790450928382, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.7568469361359534, + "grad_norm": 7.697171688079834, + "learning_rate": 1.5310068200963119e-06, + "loss": 0.6366849422454834, + "memory(GiB)": 47.44, + "step": 16275, + "token_acc": 0.843067143424712, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.7570794543959031, + "grad_norm": 6.656437873840332, + "learning_rate": 1.5282388665185942e-06, + "loss": 0.5415359020233155, + "memory(GiB)": 47.44, + "step": 16280, + "token_acc": 0.8706157443491817, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.7573119726558526, + "grad_norm": 10.454804420471191, + "learning_rate": 1.5254729659004252e-06, + "loss": 0.6282804489135743, + "memory(GiB)": 47.44, + "step": 16285, + "token_acc": 0.853583916083916, + "train_speed(iter/s)": 0.09589 + }, + { + "epoch": 0.7575444909158022, + "grad_norm": 10.950794219970703, + "learning_rate": 1.5227091198773641e-06, + "loss": 0.6014307975769043, + "memory(GiB)": 47.44, + "step": 16290, + "token_acc": 0.8438438438438438, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.7577770091757519, + "grad_norm": 8.460285186767578, + "learning_rate": 1.519947330083759e-06, + "loss": 0.648493766784668, + "memory(GiB)": 47.44, + "step": 16295, + "token_acc": 0.8322683706070287, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.7580095274357014, + "grad_norm": 9.404953956604004, + "learning_rate": 1.5171875981527429e-06, + "loss": 0.7454773902893066, + "memory(GiB)": 47.44, + "step": 16300, + "token_acc": 0.8106194690265487, + "train_speed(iter/s)": 0.09594 + }, + { + "epoch": 0.7580095274357014, + "eval_loss": 0.5625870227813721, + "eval_runtime": 293.7911, + "eval_samples_per_second": 11.828, + "eval_steps_per_second": 11.828, + "step": 16300 + }, + { + "epoch": 0.758242045695651, + "grad_norm": 7.188810348510742, + "learning_rate": 1.5144299257162293e-06, + "loss": 0.6204580783843994, + "memory(GiB)": 47.44, + "step": 16305, + "token_acc": 0.8339520296974254, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.7584745639556006, + "grad_norm": 10.468605041503906, + "learning_rate": 1.5116743144049174e-06, + "loss": 0.6677374362945556, + "memory(GiB)": 47.44, + "step": 16310, + "token_acc": 0.8274293142671433, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.7587070822155503, + "grad_norm": 12.967514991760254, + "learning_rate": 1.5089207658482818e-06, + "loss": 0.613736343383789, + "memory(GiB)": 47.44, + "step": 16315, + "token_acc": 0.848092404620231, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.7589396004754998, + "grad_norm": 6.795639514923096, + "learning_rate": 1.5061692816745844e-06, + "loss": 0.7447350025177002, + "memory(GiB)": 47.44, + "step": 16320, + "token_acc": 0.8165016501650165, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.7591721187354494, + "grad_norm": 8.063556671142578, + "learning_rate": 1.5034198635108583e-06, + "loss": 0.6021570205688477, + "memory(GiB)": 47.44, + "step": 16325, + "token_acc": 0.8490967056323061, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.7594046369953991, + "grad_norm": 7.179757595062256, + "learning_rate": 1.5006725129829243e-06, + "loss": 0.5568655967712403, + "memory(GiB)": 47.44, + "step": 16330, + "token_acc": 0.8635454181672669, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.7596371552553487, + "grad_norm": 5.947718143463135, + "learning_rate": 1.4979272317153704e-06, + "loss": 0.5865846157073975, + "memory(GiB)": 47.44, + "step": 16335, + "token_acc": 0.8524265434136679, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.7598696735152982, + "grad_norm": 7.1008076667785645, + "learning_rate": 1.4951840213315694e-06, + "loss": 0.5202283382415771, + "memory(GiB)": 47.44, + "step": 16340, + "token_acc": 0.8688915375446961, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.7601021917752478, + "grad_norm": 11.718537330627441, + "learning_rate": 1.4924428834536659e-06, + "loss": 0.6101597785949707, + "memory(GiB)": 47.44, + "step": 16345, + "token_acc": 0.8434628975265017, + "train_speed(iter/s)": 0.095921 + }, + { + "epoch": 0.7603347100351975, + "grad_norm": 10.827213287353516, + "learning_rate": 1.4897038197025805e-06, + "loss": 0.7467214584350585, + "memory(GiB)": 47.44, + "step": 16350, + "token_acc": 0.8111263736263736, + "train_speed(iter/s)": 0.095937 + }, + { + "epoch": 0.7603347100351975, + "eval_loss": 0.5625221133232117, + "eval_runtime": 293.4663, + "eval_samples_per_second": 11.841, + "eval_steps_per_second": 11.841, + "step": 16350 + }, + { + "epoch": 0.760567228295147, + "grad_norm": 6.726182460784912, + "learning_rate": 1.4869668316980034e-06, + "loss": 0.6483430862426758, + "memory(GiB)": 47.44, + "step": 16355, + "token_acc": 0.8333920112661631, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.7607997465550966, + "grad_norm": 9.858255386352539, + "learning_rate": 1.4842319210584033e-06, + "loss": 0.5826794624328613, + "memory(GiB)": 47.44, + "step": 16360, + "token_acc": 0.8565055762081785, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.7610322648150463, + "grad_norm": 6.0644145011901855, + "learning_rate": 1.4814990894010139e-06, + "loss": 0.6393117427825927, + "memory(GiB)": 47.44, + "step": 16365, + "token_acc": 0.8459736456808199, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.7612647830749959, + "grad_norm": 8.901994705200195, + "learning_rate": 1.478768338341846e-06, + "loss": 0.552800464630127, + "memory(GiB)": 47.44, + "step": 16370, + "token_acc": 0.8708815672306323, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.7614973013349454, + "grad_norm": 10.548748016357422, + "learning_rate": 1.476039669495674e-06, + "loss": 0.5374550819396973, + "memory(GiB)": 47.44, + "step": 16375, + "token_acc": 0.8676470588235294, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.761729819594895, + "grad_norm": 8.564501762390137, + "learning_rate": 1.4733130844760456e-06, + "loss": 0.7531012535095215, + "memory(GiB)": 47.44, + "step": 16380, + "token_acc": 0.8209969788519638, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.7619623378548447, + "grad_norm": 7.566102504730225, + "learning_rate": 1.470588584895275e-06, + "loss": 0.6096216678619385, + "memory(GiB)": 47.44, + "step": 16385, + "token_acc": 0.8416728902165795, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.7621948561147943, + "grad_norm": 9.330343246459961, + "learning_rate": 1.4678661723644445e-06, + "loss": 0.6481287479400635, + "memory(GiB)": 47.44, + "step": 16390, + "token_acc": 0.8470254957507082, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.7624273743747438, + "grad_norm": 11.43049144744873, + "learning_rate": 1.4651458484933967e-06, + "loss": 0.6192949771881103, + "memory(GiB)": 47.44, + "step": 16395, + "token_acc": 0.8459622909996443, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.7626598926346935, + "grad_norm": 11.158167839050293, + "learning_rate": 1.4624276148907468e-06, + "loss": 0.7683715343475341, + "memory(GiB)": 47.44, + "step": 16400, + "token_acc": 0.8229934924078091, + "train_speed(iter/s)": 0.095936 + }, + { + "epoch": 0.7626598926346935, + "eval_loss": 0.5630708336830139, + "eval_runtime": 293.6494, + "eval_samples_per_second": 11.834, + "eval_steps_per_second": 11.834, + "step": 16400 + }, + { + "epoch": 0.7628924108946431, + "grad_norm": 7.216041564941406, + "learning_rate": 1.4597114731638674e-06, + "loss": 0.6736807823181152, + "memory(GiB)": 47.44, + "step": 16405, + "token_acc": 0.8332021153922621, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.7631249291545926, + "grad_norm": 9.324705123901367, + "learning_rate": 1.4569974249189e-06, + "loss": 0.6405007362365722, + "memory(GiB)": 47.44, + "step": 16410, + "token_acc": 0.8385508265916286, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.7633574474145423, + "grad_norm": 4.808244705200195, + "learning_rate": 1.4542854717607414e-06, + "loss": 0.7190701961517334, + "memory(GiB)": 47.44, + "step": 16415, + "token_acc": 0.8179800221975583, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.7635899656744919, + "grad_norm": 9.507863998413086, + "learning_rate": 1.4515756152930556e-06, + "loss": 0.546860933303833, + "memory(GiB)": 47.44, + "step": 16420, + "token_acc": 0.8678556951763275, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.7638224839344415, + "grad_norm": 8.980109214782715, + "learning_rate": 1.448867857118264e-06, + "loss": 0.7133777618408204, + "memory(GiB)": 47.44, + "step": 16425, + "token_acc": 0.8264150943396227, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.764055002194391, + "grad_norm": 7.252748966217041, + "learning_rate": 1.4461621988375473e-06, + "loss": 0.6807666301727295, + "memory(GiB)": 47.44, + "step": 16430, + "token_acc": 0.8331565924354896, + "train_speed(iter/s)": 0.095869 + }, + { + "epoch": 0.7642875204543407, + "grad_norm": 7.0165205001831055, + "learning_rate": 1.4434586420508467e-06, + "loss": 0.5628365993499755, + "memory(GiB)": 47.44, + "step": 16435, + "token_acc": 0.8530997304582211, + "train_speed(iter/s)": 0.095885 + }, + { + "epoch": 0.7645200387142903, + "grad_norm": 7.279083251953125, + "learning_rate": 1.440757188356856e-06, + "loss": 0.6637139797210694, + "memory(GiB)": 47.44, + "step": 16440, + "token_acc": 0.8276481149012568, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.7647525569742399, + "grad_norm": 8.89866828918457, + "learning_rate": 1.4380578393530303e-06, + "loss": 0.5501326560974121, + "memory(GiB)": 47.44, + "step": 16445, + "token_acc": 0.8655049151027703, + "train_speed(iter/s)": 0.095919 + }, + { + "epoch": 0.7649850752341895, + "grad_norm": 7.709764003753662, + "learning_rate": 1.435360596635576e-06, + "loss": 0.7855375289916993, + "memory(GiB)": 47.44, + "step": 16450, + "token_acc": 0.8118338846012495, + "train_speed(iter/s)": 0.095935 + }, + { + "epoch": 0.7649850752341895, + "eval_loss": 0.5636538863182068, + "eval_runtime": 296.5811, + "eval_samples_per_second": 11.717, + "eval_steps_per_second": 11.717, + "step": 16450 + }, + { + "epoch": 0.7652175934941391, + "grad_norm": 6.851712703704834, + "learning_rate": 1.4326654617994585e-06, + "loss": 0.7187223911285401, + "memory(GiB)": 47.44, + "step": 16455, + "token_acc": 0.8327603672300612, + "train_speed(iter/s)": 0.095786 + }, + { + "epoch": 0.7654501117540887, + "grad_norm": 6.9142889976501465, + "learning_rate": 1.4299724364383915e-06, + "loss": 0.5832521915435791, + "memory(GiB)": 47.44, + "step": 16460, + "token_acc": 0.8634826711749789, + "train_speed(iter/s)": 0.095802 + }, + { + "epoch": 0.7656826300140382, + "grad_norm": 7.56882381439209, + "learning_rate": 1.427281522144845e-06, + "loss": 0.7005406856536865, + "memory(GiB)": 47.44, + "step": 16465, + "token_acc": 0.8402910762160092, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.7659151482739879, + "grad_norm": 7.536351203918457, + "learning_rate": 1.4245927205100402e-06, + "loss": 0.6036080360412598, + "memory(GiB)": 47.44, + "step": 16470, + "token_acc": 0.8438514244500541, + "train_speed(iter/s)": 0.095835 + }, + { + "epoch": 0.7661476665339375, + "grad_norm": 7.149500370025635, + "learning_rate": 1.4219060331239498e-06, + "loss": 0.7330766677856445, + "memory(GiB)": 47.44, + "step": 16475, + "token_acc": 0.8186915887850468, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.7663801847938871, + "grad_norm": 11.843749046325684, + "learning_rate": 1.419221461575292e-06, + "loss": 0.7601501941680908, + "memory(GiB)": 47.44, + "step": 16480, + "token_acc": 0.8212290502793296, + "train_speed(iter/s)": 0.095867 + }, + { + "epoch": 0.7666127030538367, + "grad_norm": 8.46166706085205, + "learning_rate": 1.41653900745154e-06, + "loss": 0.6596882820129395, + "memory(GiB)": 47.44, + "step": 16485, + "token_acc": 0.8379591836734694, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.7668452213137863, + "grad_norm": 8.52076244354248, + "learning_rate": 1.4138586723389092e-06, + "loss": 0.7091259002685547, + "memory(GiB)": 47.44, + "step": 16490, + "token_acc": 0.8447533929162528, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.7670777395737359, + "grad_norm": 8.565044403076172, + "learning_rate": 1.4111804578223649e-06, + "loss": 0.5640076160430908, + "memory(GiB)": 47.44, + "step": 16495, + "token_acc": 0.8522522522522522, + "train_speed(iter/s)": 0.095917 + }, + { + "epoch": 0.7673102578336854, + "grad_norm": 7.396138668060303, + "learning_rate": 1.4085043654856184e-06, + "loss": 0.6888665199279785, + "memory(GiB)": 47.44, + "step": 16500, + "token_acc": 0.8275146906325613, + "train_speed(iter/s)": 0.095934 + }, + { + "epoch": 0.7673102578336854, + "eval_loss": 0.562106192111969, + "eval_runtime": 291.388, + "eval_samples_per_second": 11.926, + "eval_steps_per_second": 11.926, + "step": 16500 + }, + { + "epoch": 0.7675427760936351, + "grad_norm": 6.648313999176025, + "learning_rate": 1.405830396911128e-06, + "loss": 0.7077539443969727, + "memory(GiB)": 47.44, + "step": 16505, + "token_acc": 0.8332853371730261, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.7677752943535847, + "grad_norm": 8.782613754272461, + "learning_rate": 1.4031585536800896e-06, + "loss": 0.696587085723877, + "memory(GiB)": 47.44, + "step": 16510, + "token_acc": 0.822529224229543, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.7680078126135343, + "grad_norm": 7.743236541748047, + "learning_rate": 1.4004888373724506e-06, + "loss": 0.6986588954925537, + "memory(GiB)": 47.44, + "step": 16515, + "token_acc": 0.8177627535341119, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.7682403308734839, + "grad_norm": 6.123650074005127, + "learning_rate": 1.3978212495668936e-06, + "loss": 0.6276377201080322, + "memory(GiB)": 47.44, + "step": 16520, + "token_acc": 0.8322475570032574, + "train_speed(iter/s)": 0.095835 + }, + { + "epoch": 0.7684728491334335, + "grad_norm": 7.923332214355469, + "learning_rate": 1.3951557918408482e-06, + "loss": 0.5356187343597412, + "memory(GiB)": 47.44, + "step": 16525, + "token_acc": 0.8737796373779637, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.7687053673933831, + "grad_norm": 9.798731803894043, + "learning_rate": 1.392492465770479e-06, + "loss": 0.7026069641113282, + "memory(GiB)": 47.44, + "step": 16530, + "token_acc": 0.8280542986425339, + "train_speed(iter/s)": 0.095868 + }, + { + "epoch": 0.7689378856533328, + "grad_norm": 8.680355072021484, + "learning_rate": 1.389831272930695e-06, + "loss": 0.6817941188812255, + "memory(GiB)": 47.44, + "step": 16535, + "token_acc": 0.8211987809007789, + "train_speed(iter/s)": 0.095884 + }, + { + "epoch": 0.7691704039132823, + "grad_norm": 6.331023693084717, + "learning_rate": 1.3871722148951404e-06, + "loss": 0.6216944217681885, + "memory(GiB)": 47.44, + "step": 16540, + "token_acc": 0.8433771795656164, + "train_speed(iter/s)": 0.0959 + }, + { + "epoch": 0.7694029221732319, + "grad_norm": 9.733691215515137, + "learning_rate": 1.384515293236201e-06, + "loss": 0.8239374160766602, + "memory(GiB)": 47.44, + "step": 16545, + "token_acc": 0.7935423781434338, + "train_speed(iter/s)": 0.095916 + }, + { + "epoch": 0.7696354404331816, + "grad_norm": 7.86898946762085, + "learning_rate": 1.3818605095249932e-06, + "loss": 0.6025336742401123, + "memory(GiB)": 47.44, + "step": 16550, + "token_acc": 0.84384, + "train_speed(iter/s)": 0.095932 + }, + { + "epoch": 0.7696354404331816, + "eval_loss": 0.5639436841011047, + "eval_runtime": 295.1839, + "eval_samples_per_second": 11.772, + "eval_steps_per_second": 11.772, + "step": 16550 + }, + { + "epoch": 0.7698679586931311, + "grad_norm": 7.427320957183838, + "learning_rate": 1.3792078653313757e-06, + "loss": 0.6521914958953857, + "memory(GiB)": 47.44, + "step": 16555, + "token_acc": 0.8331147645854657, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.7701004769530807, + "grad_norm": 8.594679832458496, + "learning_rate": 1.3765573622239354e-06, + "loss": 0.5646349430084229, + "memory(GiB)": 47.44, + "step": 16560, + "token_acc": 0.8466364586964099, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.7703329952130303, + "grad_norm": 8.932754516601562, + "learning_rate": 1.3739090017699985e-06, + "loss": 0.6921500205993653, + "memory(GiB)": 47.44, + "step": 16565, + "token_acc": 0.8274748923959828, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.77056551347298, + "grad_norm": 6.5633463859558105, + "learning_rate": 1.3712627855356241e-06, + "loss": 0.8388358116149902, + "memory(GiB)": 47.44, + "step": 16570, + "token_acc": 0.7907249779864984, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.7707980317329295, + "grad_norm": 9.961206436157227, + "learning_rate": 1.368618715085598e-06, + "loss": 0.6288596153259277, + "memory(GiB)": 47.44, + "step": 16575, + "token_acc": 0.8423252279635258, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.7710305499928791, + "grad_norm": 5.8794636726379395, + "learning_rate": 1.3659767919834426e-06, + "loss": 0.7108618736267089, + "memory(GiB)": 47.44, + "step": 16580, + "token_acc": 0.826055575604475, + "train_speed(iter/s)": 0.095863 + }, + { + "epoch": 0.7712630682528288, + "grad_norm": 7.274999141693115, + "learning_rate": 1.3633370177914086e-06, + "loss": 0.6645435333251953, + "memory(GiB)": 47.44, + "step": 16585, + "token_acc": 0.836635843240863, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.7714955865127784, + "grad_norm": 8.009333610534668, + "learning_rate": 1.3606993940704766e-06, + "loss": 0.58541841506958, + "memory(GiB)": 47.44, + "step": 16590, + "token_acc": 0.8406515580736544, + "train_speed(iter/s)": 0.095895 + }, + { + "epoch": 0.7717281047727279, + "grad_norm": 9.380660057067871, + "learning_rate": 1.3580639223803533e-06, + "loss": 0.6907434940338135, + "memory(GiB)": 47.44, + "step": 16595, + "token_acc": 0.8184210526315789, + "train_speed(iter/s)": 0.095912 + }, + { + "epoch": 0.7719606230326775, + "grad_norm": 7.912655353546143, + "learning_rate": 1.3554306042794769e-06, + "loss": 0.605219030380249, + "memory(GiB)": 47.44, + "step": 16600, + "token_acc": 0.84609375, + "train_speed(iter/s)": 0.095928 + }, + { + "epoch": 0.7719606230326775, + "eval_loss": 0.5613829493522644, + "eval_runtime": 292.7757, + "eval_samples_per_second": 11.869, + "eval_steps_per_second": 11.869, + "step": 16600 + }, + { + "epoch": 0.7721931412926272, + "grad_norm": 8.54576587677002, + "learning_rate": 1.352799441325006e-06, + "loss": 0.8587137222290039, + "memory(GiB)": 47.44, + "step": 16605, + "token_acc": 0.8318083553371571, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.7724256595525767, + "grad_norm": 9.212173461914062, + "learning_rate": 1.3501704350728328e-06, + "loss": 0.6808501720428467, + "memory(GiB)": 47.44, + "step": 16610, + "token_acc": 0.824523396880416, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.7726581778125263, + "grad_norm": 7.881503582000732, + "learning_rate": 1.347543587077566e-06, + "loss": 0.6166458606719971, + "memory(GiB)": 47.44, + "step": 16615, + "token_acc": 0.8433734939759037, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.772890696072476, + "grad_norm": 9.114757537841797, + "learning_rate": 1.3449188988925438e-06, + "loss": 0.6046999454498291, + "memory(GiB)": 47.44, + "step": 16620, + "token_acc": 0.8512783579402232, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.7731232143324256, + "grad_norm": 8.172106742858887, + "learning_rate": 1.3422963720698252e-06, + "loss": 0.5814279556274414, + "memory(GiB)": 47.44, + "step": 16625, + "token_acc": 0.8441821247892074, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.7733557325923751, + "grad_norm": 6.119772911071777, + "learning_rate": 1.3396760081601922e-06, + "loss": 0.5838455200195313, + "memory(GiB)": 47.44, + "step": 16630, + "token_acc": 0.845360824742268, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.7735882508523247, + "grad_norm": 9.693753242492676, + "learning_rate": 1.3370578087131447e-06, + "loss": 0.667085599899292, + "memory(GiB)": 47.44, + "step": 16635, + "token_acc": 0.8405443126308444, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.7738207691122744, + "grad_norm": 10.949572563171387, + "learning_rate": 1.3344417752769079e-06, + "loss": 0.695890235900879, + "memory(GiB)": 47.44, + "step": 16640, + "token_acc": 0.8172002978406553, + "train_speed(iter/s)": 0.095895 + }, + { + "epoch": 0.7740532873722239, + "grad_norm": 6.82157039642334, + "learning_rate": 1.33182790939842e-06, + "loss": 0.5298008441925048, + "memory(GiB)": 47.44, + "step": 16645, + "token_acc": 0.8507552870090634, + "train_speed(iter/s)": 0.095911 + }, + { + "epoch": 0.7742858056321735, + "grad_norm": 8.834753036499023, + "learning_rate": 1.3292162126233426e-06, + "loss": 0.6341611385345459, + "memory(GiB)": 47.44, + "step": 16650, + "token_acc": 0.8390536487837388, + "train_speed(iter/s)": 0.095927 + }, + { + "epoch": 0.7742858056321735, + "eval_loss": 0.5621234178543091, + "eval_runtime": 295.7835, + "eval_samples_per_second": 11.748, + "eval_steps_per_second": 11.748, + "step": 16650 + }, + { + "epoch": 0.7745183238921232, + "grad_norm": 6.885671615600586, + "learning_rate": 1.326606686496051e-06, + "loss": 0.6353133678436279, + "memory(GiB)": 47.44, + "step": 16655, + "token_acc": 0.8338329112300836, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.7747508421520728, + "grad_norm": 6.530189037322998, + "learning_rate": 1.3239993325596396e-06, + "loss": 0.8055611610412597, + "memory(GiB)": 47.44, + "step": 16660, + "token_acc": 0.7988453357642054, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.7749833604120223, + "grad_norm": 8.837427139282227, + "learning_rate": 1.321394152355917e-06, + "loss": 0.6655847549438476, + "memory(GiB)": 47.44, + "step": 16665, + "token_acc": 0.8508305647840532, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.775215878671972, + "grad_norm": 6.300332069396973, + "learning_rate": 1.318791147425409e-06, + "loss": 0.523322868347168, + "memory(GiB)": 47.44, + "step": 16670, + "token_acc": 0.872, + "train_speed(iter/s)": 0.095828 + }, + { + "epoch": 0.7754483969319216, + "grad_norm": 6.994781494140625, + "learning_rate": 1.3161903193073484e-06, + "loss": 0.8038483619689941, + "memory(GiB)": 47.44, + "step": 16675, + "token_acc": 0.796137339055794, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.7756809151918712, + "grad_norm": 7.429974555969238, + "learning_rate": 1.3135916695396893e-06, + "loss": 0.6116120338439941, + "memory(GiB)": 47.44, + "step": 16680, + "token_acc": 0.8417356408327062, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.7759134334518207, + "grad_norm": 9.190269470214844, + "learning_rate": 1.3109951996590902e-06, + "loss": 0.5953320503234864, + "memory(GiB)": 47.44, + "step": 16685, + "token_acc": 0.8514371033967898, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.7761459517117704, + "grad_norm": 8.540185928344727, + "learning_rate": 1.308400911200927e-06, + "loss": 0.6601593971252442, + "memory(GiB)": 47.44, + "step": 16690, + "token_acc": 0.8291079812206573, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.77637846997172, + "grad_norm": 7.5788350105285645, + "learning_rate": 1.3058088056992796e-06, + "loss": 0.594563627243042, + "memory(GiB)": 47.44, + "step": 16695, + "token_acc": 0.8420711974110032, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.7766109882316695, + "grad_norm": 6.727237224578857, + "learning_rate": 1.3032188846869416e-06, + "loss": 0.7008658885955811, + "memory(GiB)": 47.44, + "step": 16700, + "token_acc": 0.8154761904761905, + "train_speed(iter/s)": 0.095924 + }, + { + "epoch": 0.7766109882316695, + "eval_loss": 0.5621960163116455, + "eval_runtime": 295.9438, + "eval_samples_per_second": 11.742, + "eval_steps_per_second": 11.742, + "step": 16700 + }, + { + "epoch": 0.7768435064916192, + "grad_norm": 7.424905300140381, + "learning_rate": 1.3006311496954123e-06, + "loss": 0.5942497730255127, + "memory(GiB)": 47.44, + "step": 16705, + "token_acc": 0.8338894565425634, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.7770760247515688, + "grad_norm": 8.602036476135254, + "learning_rate": 1.2980456022549003e-06, + "loss": 0.774842643737793, + "memory(GiB)": 47.44, + "step": 16710, + "token_acc": 0.8064624705486368, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.7773085430115184, + "grad_norm": 10.397388458251953, + "learning_rate": 1.295462243894321e-06, + "loss": 0.645169734954834, + "memory(GiB)": 47.44, + "step": 16715, + "token_acc": 0.8327734229189996, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.7775410612714679, + "grad_norm": 9.090619087219238, + "learning_rate": 1.2928810761412907e-06, + "loss": 0.6278375625610352, + "memory(GiB)": 47.44, + "step": 16720, + "token_acc": 0.8567083474146672, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.7777735795314176, + "grad_norm": 6.70664119720459, + "learning_rate": 1.2903021005221367e-06, + "loss": 0.62248854637146, + "memory(GiB)": 47.44, + "step": 16725, + "token_acc": 0.8399031811894883, + "train_speed(iter/s)": 0.095841 + }, + { + "epoch": 0.7780060977913672, + "grad_norm": 8.718195915222168, + "learning_rate": 1.2877253185618843e-06, + "loss": 0.6643401145935058, + "memory(GiB)": 47.44, + "step": 16730, + "token_acc": 0.8326480263157895, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.7782386160513168, + "grad_norm": 8.241601943969727, + "learning_rate": 1.2851507317842664e-06, + "loss": 0.7027626037597656, + "memory(GiB)": 47.44, + "step": 16735, + "token_acc": 0.8238440962934658, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.7784711343112664, + "grad_norm": 6.704036712646484, + "learning_rate": 1.2825783417117132e-06, + "loss": 0.6183983325958252, + "memory(GiB)": 47.44, + "step": 16740, + "token_acc": 0.8387401574803149, + "train_speed(iter/s)": 0.095889 + }, + { + "epoch": 0.778703652571216, + "grad_norm": 7.664031505584717, + "learning_rate": 1.2800081498653598e-06, + "loss": 0.6994392395019531, + "memory(GiB)": 47.44, + "step": 16745, + "token_acc": 0.8196496049467537, + "train_speed(iter/s)": 0.095904 + }, + { + "epoch": 0.7789361708311656, + "grad_norm": 10.418838500976562, + "learning_rate": 1.2774401577650403e-06, + "loss": 0.7204785346984863, + "memory(GiB)": 47.44, + "step": 16750, + "token_acc": 0.8209718670076727, + "train_speed(iter/s)": 0.09592 + }, + { + "epoch": 0.7789361708311656, + "eval_loss": 0.5633994340896606, + "eval_runtime": 293.6917, + "eval_samples_per_second": 11.832, + "eval_steps_per_second": 11.832, + "step": 16750 + }, + { + "epoch": 0.7791686890911151, + "grad_norm": 7.190987586975098, + "learning_rate": 1.2748743669292884e-06, + "loss": 0.6094263076782227, + "memory(GiB)": 47.44, + "step": 16755, + "token_acc": 0.8336955915956084, + "train_speed(iter/s)": 0.095774 + }, + { + "epoch": 0.7794012073510648, + "grad_norm": 9.657785415649414, + "learning_rate": 1.272310778875333e-06, + "loss": 0.6889170646667481, + "memory(GiB)": 47.44, + "step": 16760, + "token_acc": 0.8287827076222981, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.7796337256110144, + "grad_norm": 10.186052322387695, + "learning_rate": 1.269749395119106e-06, + "loss": 0.6745296478271484, + "memory(GiB)": 47.44, + "step": 16765, + "token_acc": 0.8335174953959484, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.779866243870964, + "grad_norm": 9.893561363220215, + "learning_rate": 1.2671902171752292e-06, + "loss": 0.6191123008728028, + "memory(GiB)": 47.44, + "step": 16770, + "token_acc": 0.8578020134228188, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.7800987621309136, + "grad_norm": 11.516806602478027, + "learning_rate": 1.2646332465570271e-06, + "loss": 0.6638372898101806, + "memory(GiB)": 47.44, + "step": 16775, + "token_acc": 0.8319570602807597, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.7803312803908632, + "grad_norm": 10.242958068847656, + "learning_rate": 1.2620784847765122e-06, + "loss": 0.648340892791748, + "memory(GiB)": 47.44, + "step": 16780, + "token_acc": 0.8328214129003949, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.7805637986508128, + "grad_norm": 11.26280403137207, + "learning_rate": 1.2595259333443948e-06, + "loss": 0.6338286399841309, + "memory(GiB)": 47.44, + "step": 16785, + "token_acc": 0.8478260869565217, + "train_speed(iter/s)": 0.095871 + }, + { + "epoch": 0.7807963169107623, + "grad_norm": 6.660550117492676, + "learning_rate": 1.2569755937700784e-06, + "loss": 0.6081973552703858, + "memory(GiB)": 47.44, + "step": 16790, + "token_acc": 0.8490693739424704, + "train_speed(iter/s)": 0.095887 + }, + { + "epoch": 0.781028835170712, + "grad_norm": 9.49515151977539, + "learning_rate": 1.2544274675616587e-06, + "loss": 0.6035889148712158, + "memory(GiB)": 47.44, + "step": 16795, + "token_acc": 0.8607979184735473, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.7812613534306616, + "grad_norm": 8.45023250579834, + "learning_rate": 1.251881556225918e-06, + "loss": 0.5794333457946778, + "memory(GiB)": 47.44, + "step": 16800, + "token_acc": 0.8581162324649299, + "train_speed(iter/s)": 0.095918 + }, + { + "epoch": 0.7812613534306616, + "eval_loss": 0.5614963173866272, + "eval_runtime": 296.7538, + "eval_samples_per_second": 11.71, + "eval_steps_per_second": 11.71, + "step": 16800 + }, + { + "epoch": 0.7814938716906112, + "grad_norm": 8.028955459594727, + "learning_rate": 1.2493378612683354e-06, + "loss": 0.6352114677429199, + "memory(GiB)": 47.44, + "step": 16805, + "token_acc": 0.833584542612042, + "train_speed(iter/s)": 0.095771 + }, + { + "epoch": 0.7817263899505608, + "grad_norm": 10.286711692810059, + "learning_rate": 1.2467963841930736e-06, + "loss": 0.8744414329528809, + "memory(GiB)": 47.44, + "step": 16810, + "token_acc": 0.8024602026049205, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.7819589082105104, + "grad_norm": 10.463303565979004, + "learning_rate": 1.2442571265029896e-06, + "loss": 0.5919531345367431, + "memory(GiB)": 47.44, + "step": 16815, + "token_acc": 0.8532846715328467, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.78219142647046, + "grad_norm": 12.795990943908691, + "learning_rate": 1.24172008969962e-06, + "loss": 0.6375868797302247, + "memory(GiB)": 47.44, + "step": 16820, + "token_acc": 0.8484609313338595, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.7824239447304097, + "grad_norm": 7.519901275634766, + "learning_rate": 1.2391852752831989e-06, + "loss": 0.6498225688934326, + "memory(GiB)": 47.44, + "step": 16825, + "token_acc": 0.8424710424710424, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.7826564629903592, + "grad_norm": 10.070960998535156, + "learning_rate": 1.236652684752636e-06, + "loss": 0.6001636981964111, + "memory(GiB)": 47.44, + "step": 16830, + "token_acc": 0.8563958165728077, + "train_speed(iter/s)": 0.09585 + }, + { + "epoch": 0.7828889812503088, + "grad_norm": 8.31798267364502, + "learning_rate": 1.234122319605532e-06, + "loss": 0.608649730682373, + "memory(GiB)": 47.44, + "step": 16835, + "token_acc": 0.8568680261639092, + "train_speed(iter/s)": 0.095866 + }, + { + "epoch": 0.7831214995102584, + "grad_norm": 8.985321998596191, + "learning_rate": 1.2315941813381704e-06, + "loss": 0.6267675399780274, + "memory(GiB)": 47.44, + "step": 16840, + "token_acc": 0.8328173374613003, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.783354017770208, + "grad_norm": 6.239924907684326, + "learning_rate": 1.229068271445516e-06, + "loss": 0.8521398544311524, + "memory(GiB)": 47.44, + "step": 16845, + "token_acc": 0.7998193859121011, + "train_speed(iter/s)": 0.095898 + }, + { + "epoch": 0.7835865360301576, + "grad_norm": 7.774771213531494, + "learning_rate": 1.2265445914212192e-06, + "loss": 0.6899847507476806, + "memory(GiB)": 47.44, + "step": 16850, + "token_acc": 0.8074837310195228, + "train_speed(iter/s)": 0.095914 + }, + { + "epoch": 0.7835865360301576, + "eval_loss": 0.5608976483345032, + "eval_runtime": 297.5458, + "eval_samples_per_second": 11.679, + "eval_steps_per_second": 11.679, + "step": 16850 + }, + { + "epoch": 0.7838190542901072, + "grad_norm": 10.392410278320312, + "learning_rate": 1.2240231427576072e-06, + "loss": 0.5573481559753418, + "memory(GiB)": 47.44, + "step": 16855, + "token_acc": 0.8341591699940668, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.7840515725500569, + "grad_norm": 7.2723565101623535, + "learning_rate": 1.2215039269456919e-06, + "loss": 0.6622334957122803, + "memory(GiB)": 47.44, + "step": 16860, + "token_acc": 0.8323529411764706, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.7842840908100064, + "grad_norm": 8.384167671203613, + "learning_rate": 1.218986945475164e-06, + "loss": 0.7650614738464355, + "memory(GiB)": 47.44, + "step": 16865, + "token_acc": 0.8271112722000725, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.784516609069956, + "grad_norm": 7.8060126304626465, + "learning_rate": 1.2164721998343926e-06, + "loss": 0.5492019653320312, + "memory(GiB)": 47.44, + "step": 16870, + "token_acc": 0.8539993014320643, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.7847491273299056, + "grad_norm": 9.077139854431152, + "learning_rate": 1.2139596915104224e-06, + "loss": 0.6087624549865722, + "memory(GiB)": 47.44, + "step": 16875, + "token_acc": 0.8356775300171526, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.7849816455898553, + "grad_norm": 9.597736358642578, + "learning_rate": 1.21144942198898e-06, + "loss": 0.5296175479888916, + "memory(GiB)": 47.44, + "step": 16880, + "token_acc": 0.8635584504628042, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.7852141638498048, + "grad_norm": 8.164989471435547, + "learning_rate": 1.2089413927544624e-06, + "loss": 0.6467774868011474, + "memory(GiB)": 47.44, + "step": 16885, + "token_acc": 0.8486714193130266, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.7854466821097544, + "grad_norm": 7.90884256362915, + "learning_rate": 1.2064356052899473e-06, + "loss": 0.6473551273345948, + "memory(GiB)": 47.44, + "step": 16890, + "token_acc": 0.8361884368308351, + "train_speed(iter/s)": 0.095878 + }, + { + "epoch": 0.7856792003697041, + "grad_norm": 10.231267929077148, + "learning_rate": 1.203932061077181e-06, + "loss": 0.533270263671875, + "memory(GiB)": 47.44, + "step": 16895, + "token_acc": 0.8644901610017889, + "train_speed(iter/s)": 0.095894 + }, + { + "epoch": 0.7859117186296536, + "grad_norm": 8.054085731506348, + "learning_rate": 1.2014307615965887e-06, + "loss": 0.7031919002532959, + "memory(GiB)": 47.44, + "step": 16900, + "token_acc": 0.8313442211055276, + "train_speed(iter/s)": 0.09591 + }, + { + "epoch": 0.7859117186296536, + "eval_loss": 0.5615659952163696, + "eval_runtime": 293.9684, + "eval_samples_per_second": 11.821, + "eval_steps_per_second": 11.821, + "step": 16900 + }, + { + "epoch": 0.7861442368896032, + "grad_norm": 6.312916278839111, + "learning_rate": 1.1989317083272655e-06, + "loss": 0.6901503562927246, + "memory(GiB)": 47.44, + "step": 16905, + "token_acc": 0.833696296177681, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.7863767551495529, + "grad_norm": 9.281437873840332, + "learning_rate": 1.1964349027469806e-06, + "loss": 0.5565787315368652, + "memory(GiB)": 47.44, + "step": 16910, + "token_acc": 0.8596059113300493, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.7866092734095025, + "grad_norm": 7.677142143249512, + "learning_rate": 1.1939403463321692e-06, + "loss": 0.6321005821228027, + "memory(GiB)": 47.44, + "step": 16915, + "token_acc": 0.8306896551724138, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.786841791669452, + "grad_norm": 8.295937538146973, + "learning_rate": 1.191448040557942e-06, + "loss": 0.6121166706085205, + "memory(GiB)": 47.44, + "step": 16920, + "token_acc": 0.8432967810399717, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.7870743099294016, + "grad_norm": 8.459718704223633, + "learning_rate": 1.188957986898074e-06, + "loss": 0.6047228813171387, + "memory(GiB)": 47.44, + "step": 16925, + "token_acc": 0.8478260869565217, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.7873068281893513, + "grad_norm": 12.994466781616211, + "learning_rate": 1.1864701868250133e-06, + "loss": 0.6566365242004395, + "memory(GiB)": 47.44, + "step": 16930, + "token_acc": 0.8429752066115702, + "train_speed(iter/s)": 0.095845 + }, + { + "epoch": 0.7875393464493008, + "grad_norm": 8.316268920898438, + "learning_rate": 1.1839846418098705e-06, + "loss": 0.6249664783477783, + "memory(GiB)": 47.44, + "step": 16935, + "token_acc": 0.8341105929380414, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.7877718647092504, + "grad_norm": 6.81507682800293, + "learning_rate": 1.1815013533224262e-06, + "loss": 0.5734968185424805, + "memory(GiB)": 47.44, + "step": 16940, + "token_acc": 0.8526754690757471, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.7880043829692001, + "grad_norm": 12.082489013671875, + "learning_rate": 1.1790203228311253e-06, + "loss": 0.6177358627319336, + "memory(GiB)": 47.44, + "step": 16945, + "token_acc": 0.8375468164794008, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.7882369012291497, + "grad_norm": 8.138704299926758, + "learning_rate": 1.17654155180308e-06, + "loss": 0.6662531852722168, + "memory(GiB)": 47.44, + "step": 16950, + "token_acc": 0.8362164151637835, + "train_speed(iter/s)": 0.095908 + }, + { + "epoch": 0.7882369012291497, + "eval_loss": 0.5608460307121277, + "eval_runtime": 290.8127, + "eval_samples_per_second": 11.949, + "eval_steps_per_second": 11.949, + "step": 16950 + }, + { + "epoch": 0.7884694194890992, + "grad_norm": 8.260912895202637, + "learning_rate": 1.17406504170406e-06, + "loss": 0.5241977214813233, + "memory(GiB)": 47.44, + "step": 16955, + "token_acc": 0.8349334059910567, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.7887019377490488, + "grad_norm": 11.177227020263672, + "learning_rate": 1.171590793998505e-06, + "loss": 0.6105329036712647, + "memory(GiB)": 47.44, + "step": 16960, + "token_acc": 0.8349753694581281, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.7889344560089985, + "grad_norm": 8.79315185546875, + "learning_rate": 1.1691188101495142e-06, + "loss": 0.5571176528930664, + "memory(GiB)": 47.44, + "step": 16965, + "token_acc": 0.8521072796934865, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.7891669742689481, + "grad_norm": 9.868995666503906, + "learning_rate": 1.1666490916188462e-06, + "loss": 0.659188985824585, + "memory(GiB)": 47.44, + "step": 16970, + "token_acc": 0.8380414312617702, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.7893994925288976, + "grad_norm": 8.86392593383789, + "learning_rate": 1.1641816398669242e-06, + "loss": 0.6795865535736084, + "memory(GiB)": 47.44, + "step": 16975, + "token_acc": 0.8313253012048193, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.7896320107888473, + "grad_norm": 11.150150299072266, + "learning_rate": 1.161716456352826e-06, + "loss": 0.6437868595123291, + "memory(GiB)": 47.44, + "step": 16980, + "token_acc": 0.8402964959568733, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.7898645290487969, + "grad_norm": 8.442800521850586, + "learning_rate": 1.1592535425342915e-06, + "loss": 0.5874933719635009, + "memory(GiB)": 47.44, + "step": 16985, + "token_acc": 0.8525849335302806, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.7900970473087464, + "grad_norm": 8.539682388305664, + "learning_rate": 1.156792899867718e-06, + "loss": 0.6538908004760742, + "memory(GiB)": 47.44, + "step": 16990, + "token_acc": 0.8395200599925009, + "train_speed(iter/s)": 0.095878 + }, + { + "epoch": 0.790329565568696, + "grad_norm": 8.99543285369873, + "learning_rate": 1.1543345298081614e-06, + "loss": 0.7801726818084717, + "memory(GiB)": 47.44, + "step": 16995, + "token_acc": 0.8106287425149701, + "train_speed(iter/s)": 0.095893 + }, + { + "epoch": 0.7905620838286457, + "grad_norm": 9.519553184509277, + "learning_rate": 1.1518784338093287e-06, + "loss": 0.6260163307189941, + "memory(GiB)": 47.44, + "step": 17000, + "token_acc": 0.8426924507251766, + "train_speed(iter/s)": 0.095909 + }, + { + "epoch": 0.7905620838286457, + "eval_loss": 0.5613806247711182, + "eval_runtime": 293.023, + "eval_samples_per_second": 11.859, + "eval_steps_per_second": 11.859, + "step": 17000 + }, + { + "epoch": 0.7907946020885953, + "grad_norm": 8.278639793395996, + "learning_rate": 1.1494246133235875e-06, + "loss": 0.6730622768402099, + "memory(GiB)": 47.44, + "step": 17005, + "token_acc": 0.8339751955611424, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.7910271203485448, + "grad_norm": 8.14084529876709, + "learning_rate": 1.1469730698019554e-06, + "loss": 0.7279993057250976, + "memory(GiB)": 47.44, + "step": 17010, + "token_acc": 0.8286266924564797, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.7912596386084945, + "grad_norm": 5.9442243576049805, + "learning_rate": 1.1445238046941087e-06, + "loss": 0.7142318725585938, + "memory(GiB)": 47.44, + "step": 17015, + "token_acc": 0.8208279430789134, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.7914921568684441, + "grad_norm": 7.770115375518799, + "learning_rate": 1.1420768194483707e-06, + "loss": 0.6370692729949952, + "memory(GiB)": 47.44, + "step": 17020, + "token_acc": 0.8414426675740048, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.7917246751283937, + "grad_norm": 8.381105422973633, + "learning_rate": 1.1396321155117197e-06, + "loss": 0.6671901226043702, + "memory(GiB)": 47.44, + "step": 17025, + "token_acc": 0.8299632352941176, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.7919571933883433, + "grad_norm": 7.974504470825195, + "learning_rate": 1.1371896943297861e-06, + "loss": 0.6833327770233154, + "memory(GiB)": 47.44, + "step": 17030, + "token_acc": 0.8307752853207399, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.7921897116482929, + "grad_norm": 8.342890739440918, + "learning_rate": 1.1347495573468492e-06, + "loss": 0.678836441040039, + "memory(GiB)": 47.44, + "step": 17035, + "token_acc": 0.8252461322081576, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.7924222299082425, + "grad_norm": 10.222599983215332, + "learning_rate": 1.1323117060058353e-06, + "loss": 0.5971874713897705, + "memory(GiB)": 47.44, + "step": 17040, + "token_acc": 0.8501472754050073, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.792654748168192, + "grad_norm": 9.33965015411377, + "learning_rate": 1.1298761417483235e-06, + "loss": 0.6354103565216065, + "memory(GiB)": 47.44, + "step": 17045, + "token_acc": 0.8423252279635258, + "train_speed(iter/s)": 0.095892 + }, + { + "epoch": 0.7928872664281417, + "grad_norm": 10.085137367248535, + "learning_rate": 1.127442866014536e-06, + "loss": 0.5293305397033692, + "memory(GiB)": 47.44, + "step": 17050, + "token_acc": 0.8703703703703703, + "train_speed(iter/s)": 0.095907 + }, + { + "epoch": 0.7928872664281417, + "eval_loss": 0.5618590712547302, + "eval_runtime": 295.7006, + "eval_samples_per_second": 11.752, + "eval_steps_per_second": 11.752, + "step": 17050 + }, + { + "epoch": 0.7931197846880913, + "grad_norm": 8.592312812805176, + "learning_rate": 1.125011880243345e-06, + "loss": 0.5516678333282471, + "memory(GiB)": 47.44, + "step": 17055, + "token_acc": 0.8342294409410331, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.7933523029480409, + "grad_norm": 9.838059425354004, + "learning_rate": 1.1225831858722668e-06, + "loss": 0.6128196239471435, + "memory(GiB)": 47.44, + "step": 17060, + "token_acc": 0.8487261146496815, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.7935848212079905, + "grad_norm": 8.308734893798828, + "learning_rate": 1.1201567843374639e-06, + "loss": 0.7210927486419678, + "memory(GiB)": 47.44, + "step": 17065, + "token_acc": 0.8256519102486355, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.7938173394679401, + "grad_norm": 9.248513221740723, + "learning_rate": 1.1177326770737418e-06, + "loss": 0.685799503326416, + "memory(GiB)": 47.44, + "step": 17070, + "token_acc": 0.8226449831236575, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.7940498577278897, + "grad_norm": 9.848244667053223, + "learning_rate": 1.1153108655145516e-06, + "loss": 0.6207716464996338, + "memory(GiB)": 47.44, + "step": 17075, + "token_acc": 0.8317911434236616, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.7942823759878392, + "grad_norm": 8.39963436126709, + "learning_rate": 1.1128913510919836e-06, + "loss": 0.556895112991333, + "memory(GiB)": 47.44, + "step": 17080, + "token_acc": 0.8656422379826635, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.7945148942477889, + "grad_norm": 9.106492042541504, + "learning_rate": 1.1104741352367727e-06, + "loss": 0.6832783222198486, + "memory(GiB)": 47.44, + "step": 17085, + "token_acc": 0.8177522780965238, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.7947474125077385, + "grad_norm": 7.3956499099731445, + "learning_rate": 1.1080592193782913e-06, + "loss": 0.6365145683288574, + "memory(GiB)": 47.44, + "step": 17090, + "token_acc": 0.8236943568173852, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.7949799307676881, + "grad_norm": 6.6494927406311035, + "learning_rate": 1.1056466049445547e-06, + "loss": 0.5309269428253174, + "memory(GiB)": 47.44, + "step": 17095, + "token_acc": 0.8577352472089315, + "train_speed(iter/s)": 0.09589 + }, + { + "epoch": 0.7952124490276377, + "grad_norm": 8.483128547668457, + "learning_rate": 1.103236293362218e-06, + "loss": 0.7713404655456543, + "memory(GiB)": 47.44, + "step": 17100, + "token_acc": 0.7899129172714079, + "train_speed(iter/s)": 0.095905 + }, + { + "epoch": 0.7952124490276377, + "eval_loss": 0.5605780482292175, + "eval_runtime": 293.2171, + "eval_samples_per_second": 11.851, + "eval_steps_per_second": 11.851, + "step": 17100 + }, + { + "epoch": 0.7954449672875873, + "grad_norm": 8.216812133789062, + "learning_rate": 1.10082828605657e-06, + "loss": 0.6681423187255859, + "memory(GiB)": 47.44, + "step": 17105, + "token_acc": 0.8337287972552666, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.7956774855475369, + "grad_norm": 10.678299903869629, + "learning_rate": 1.098422584451541e-06, + "loss": 0.6339476108551025, + "memory(GiB)": 47.44, + "step": 17110, + "token_acc": 0.8391070053887606, + "train_speed(iter/s)": 0.095779 + }, + { + "epoch": 0.7959100038074866, + "grad_norm": 7.154067516326904, + "learning_rate": 1.0960191899696965e-06, + "loss": 0.6219954013824462, + "memory(GiB)": 47.44, + "step": 17115, + "token_acc": 0.8290206354405503, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.7961425220674361, + "grad_norm": 9.737374305725098, + "learning_rate": 1.0936181040322402e-06, + "loss": 0.6709455013275146, + "memory(GiB)": 47.44, + "step": 17120, + "token_acc": 0.8210654737698251, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.7963750403273857, + "grad_norm": 6.7336745262146, + "learning_rate": 1.0912193280590044e-06, + "loss": 0.6627488136291504, + "memory(GiB)": 47.44, + "step": 17125, + "token_acc": 0.8298080052066384, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.7966075585873353, + "grad_norm": 7.747184753417969, + "learning_rate": 1.0888228634684623e-06, + "loss": 0.6206040382385254, + "memory(GiB)": 47.44, + "step": 17130, + "token_acc": 0.8367875647668394, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.7968400768472849, + "grad_norm": 10.590611457824707, + "learning_rate": 1.0864287116777157e-06, + "loss": 0.6220110893249512, + "memory(GiB)": 47.44, + "step": 17135, + "token_acc": 0.8553191489361702, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.7970725951072345, + "grad_norm": 8.546687126159668, + "learning_rate": 1.084036874102502e-06, + "loss": 0.59197998046875, + "memory(GiB)": 47.44, + "step": 17140, + "token_acc": 0.8423146473779385, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.7973051133671841, + "grad_norm": 7.821950435638428, + "learning_rate": 1.0816473521571862e-06, + "loss": 0.5261586666107178, + "memory(GiB)": 47.44, + "step": 17145, + "token_acc": 0.8734987990392313, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.7975376316271338, + "grad_norm": 10.940779685974121, + "learning_rate": 1.0792601472547682e-06, + "loss": 0.6301316738128662, + "memory(GiB)": 47.44, + "step": 17150, + "token_acc": 0.8311926605504587, + "train_speed(iter/s)": 0.095903 + }, + { + "epoch": 0.7975376316271338, + "eval_loss": 0.5604414343833923, + "eval_runtime": 296.127, + "eval_samples_per_second": 11.735, + "eval_steps_per_second": 11.735, + "step": 17150 + }, + { + "epoch": 0.7977701498870833, + "grad_norm": 9.065601348876953, + "learning_rate": 1.0768752608068756e-06, + "loss": 0.6107148170471192, + "memory(GiB)": 47.44, + "step": 17155, + "token_acc": 0.8342620960284338, + "train_speed(iter/s)": 0.09576 + }, + { + "epoch": 0.7980026681470329, + "grad_norm": 8.328161239624023, + "learning_rate": 1.074492694223767e-06, + "loss": 0.5185272693634033, + "memory(GiB)": 47.44, + "step": 17160, + "token_acc": 0.8681526256139025, + "train_speed(iter/s)": 0.095776 + }, + { + "epoch": 0.7982351864069825, + "grad_norm": 8.856505393981934, + "learning_rate": 1.0721124489143248e-06, + "loss": 0.6319143772125244, + "memory(GiB)": 47.44, + "step": 17165, + "token_acc": 0.8422652983656405, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.7984677046669322, + "grad_norm": 9.311681747436523, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.777432918548584, + "memory(GiB)": 47.44, + "step": 17170, + "token_acc": 0.8133465477370334, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.7987002229268817, + "grad_norm": 10.043967247009277, + "learning_rate": 1.0673589277451208e-06, + "loss": 0.6721518039703369, + "memory(GiB)": 47.44, + "step": 17175, + "token_acc": 0.8283671036948749, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.7989327411868313, + "grad_norm": 7.669497013092041, + "learning_rate": 1.0649856546962617e-06, + "loss": 0.7782588958740234, + "memory(GiB)": 47.44, + "step": 17180, + "token_acc": 0.8111563044741429, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.799165259446781, + "grad_norm": 9.350188255310059, + "learning_rate": 1.0626147085428761e-06, + "loss": 0.5789762973785401, + "memory(GiB)": 47.44, + "step": 17185, + "token_acc": 0.8528351360209767, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.7993977777067305, + "grad_norm": 8.215911865234375, + "learning_rate": 1.0602460906869799e-06, + "loss": 0.8086822509765625, + "memory(GiB)": 47.44, + "step": 17190, + "token_acc": 0.8049076037564374, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.7996302959666801, + "grad_norm": 7.175421714782715, + "learning_rate": 1.057879802529206e-06, + "loss": 0.6093656539916992, + "memory(GiB)": 47.44, + "step": 17195, + "token_acc": 0.8360609480812641, + "train_speed(iter/s)": 0.095886 + }, + { + "epoch": 0.7998628142266297, + "grad_norm": 8.85153579711914, + "learning_rate": 1.055515845468817e-06, + "loss": 0.6791608810424805, + "memory(GiB)": 47.44, + "step": 17200, + "token_acc": 0.8277543061423465, + "train_speed(iter/s)": 0.095902 + }, + { + "epoch": 0.7998628142266297, + "eval_loss": 0.5614746809005737, + "eval_runtime": 297.9952, + "eval_samples_per_second": 11.661, + "eval_steps_per_second": 11.661, + "step": 17200 + }, + { + "epoch": 0.8000953324865794, + "grad_norm": 9.05666446685791, + "learning_rate": 1.053154220903691e-06, + "loss": 0.6790387630462646, + "memory(GiB)": 47.44, + "step": 17205, + "token_acc": 0.8340710792930423, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.8003278507465289, + "grad_norm": 7.279318809509277, + "learning_rate": 1.0507949302303315e-06, + "loss": 0.6103257656097412, + "memory(GiB)": 47.44, + "step": 17210, + "token_acc": 0.848814862267777, + "train_speed(iter/s)": 0.095773 + }, + { + "epoch": 0.8005603690064785, + "grad_norm": 8.824692726135254, + "learning_rate": 1.0484379748438584e-06, + "loss": 0.5345361232757568, + "memory(GiB)": 47.44, + "step": 17215, + "token_acc": 0.8665330661322646, + "train_speed(iter/s)": 0.095789 + }, + { + "epoch": 0.8007928872664282, + "grad_norm": 10.482314109802246, + "learning_rate": 1.046083356138013e-06, + "loss": 0.6422648429870605, + "memory(GiB)": 47.44, + "step": 17220, + "token_acc": 0.8412544455221468, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.8010254055263778, + "grad_norm": 10.245368957519531, + "learning_rate": 1.0437310755051533e-06, + "loss": 0.6183192253112793, + "memory(GiB)": 47.44, + "step": 17225, + "token_acc": 0.8408354339214981, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.8012579237863273, + "grad_norm": 8.228830337524414, + "learning_rate": 1.0413811343362567e-06, + "loss": 0.7094565868377686, + "memory(GiB)": 47.44, + "step": 17230, + "token_acc": 0.8140299598100109, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.801490442046277, + "grad_norm": 8.723837852478027, + "learning_rate": 1.0390335340209169e-06, + "loss": 0.6350565910339355, + "memory(GiB)": 47.44, + "step": 17235, + "token_acc": 0.8355555555555556, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.8017229603062266, + "grad_norm": 9.939167976379395, + "learning_rate": 1.0366882759473391e-06, + "loss": 0.750972843170166, + "memory(GiB)": 47.44, + "step": 17240, + "token_acc": 0.7975287840494243, + "train_speed(iter/s)": 0.095865 + }, + { + "epoch": 0.8019554785661761, + "grad_norm": 8.15424633026123, + "learning_rate": 1.0343453615023501e-06, + "loss": 0.6216236114501953, + "memory(GiB)": 47.44, + "step": 17245, + "token_acc": 0.8429609131788308, + "train_speed(iter/s)": 0.095881 + }, + { + "epoch": 0.8021879968261257, + "grad_norm": 12.222821235656738, + "learning_rate": 1.0320047920713854e-06, + "loss": 0.5298105239868164, + "memory(GiB)": 47.44, + "step": 17250, + "token_acc": 0.8692610406646262, + "train_speed(iter/s)": 0.095897 + }, + { + "epoch": 0.8021879968261257, + "eval_loss": 0.5603924989700317, + "eval_runtime": 296.949, + "eval_samples_per_second": 11.702, + "eval_steps_per_second": 11.702, + "step": 17250 + }, + { + "epoch": 0.8024205150860754, + "grad_norm": 11.590433120727539, + "learning_rate": 1.0296665690384977e-06, + "loss": 0.721394157409668, + "memory(GiB)": 47.44, + "step": 17255, + "token_acc": 0.8327286584155664, + "train_speed(iter/s)": 0.095754 + }, + { + "epoch": 0.802653033346025, + "grad_norm": 8.545548439025879, + "learning_rate": 1.0273306937863474e-06, + "loss": 0.6448258399963379, + "memory(GiB)": 47.44, + "step": 17260, + "token_acc": 0.8342391304347826, + "train_speed(iter/s)": 0.095769 + }, + { + "epoch": 0.8028855516059745, + "grad_norm": 9.636141777038574, + "learning_rate": 1.0249971676962127e-06, + "loss": 0.5769176006317138, + "memory(GiB)": 47.44, + "step": 17265, + "token_acc": 0.8622431795217245, + "train_speed(iter/s)": 0.095785 + }, + { + "epoch": 0.8031180698659242, + "grad_norm": 6.957939147949219, + "learning_rate": 1.0226659921479782e-06, + "loss": 0.689917802810669, + "memory(GiB)": 47.44, + "step": 17270, + "token_acc": 0.8231624627068077, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.8033505881258738, + "grad_norm": 8.156017303466797, + "learning_rate": 1.020337168520142e-06, + "loss": 0.647607421875, + "memory(GiB)": 47.44, + "step": 17275, + "token_acc": 0.8418691588785047, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.8035831063858233, + "grad_norm": 8.871214866638184, + "learning_rate": 1.0180106981898058e-06, + "loss": 0.6655109882354736, + "memory(GiB)": 47.44, + "step": 17280, + "token_acc": 0.8353528153955809, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.8038156246457729, + "grad_norm": 8.65011215209961, + "learning_rate": 1.0156865825326873e-06, + "loss": 0.7450146198272705, + "memory(GiB)": 47.44, + "step": 17285, + "token_acc": 0.8207972270363951, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.8040481429057226, + "grad_norm": 11.104325294494629, + "learning_rate": 1.0133648229231047e-06, + "loss": 0.5896714687347412, + "memory(GiB)": 47.44, + "step": 17290, + "token_acc": 0.8539137714524906, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.8042806611656722, + "grad_norm": 8.021060943603516, + "learning_rate": 1.011045420733988e-06, + "loss": 0.6420575618743897, + "memory(GiB)": 47.44, + "step": 17295, + "token_acc": 0.8414845646895595, + "train_speed(iter/s)": 0.095878 + }, + { + "epoch": 0.8045131794256217, + "grad_norm": 9.281312942504883, + "learning_rate": 1.00872837733687e-06, + "loss": 0.5397995948791504, + "memory(GiB)": 47.44, + "step": 17300, + "token_acc": 0.8661740558292282, + "train_speed(iter/s)": 0.095893 + }, + { + "epoch": 0.8045131794256217, + "eval_loss": 0.560265064239502, + "eval_runtime": 294.7526, + "eval_samples_per_second": 11.79, + "eval_steps_per_second": 11.79, + "step": 17300 + }, + { + "epoch": 0.8047456976855714, + "grad_norm": 10.986329078674316, + "learning_rate": 1.0064136941018904e-06, + "loss": 0.5388300895690918, + "memory(GiB)": 47.44, + "step": 17305, + "token_acc": 0.8344242987509314, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.804978215945521, + "grad_norm": 12.374367713928223, + "learning_rate": 1.0041013723977933e-06, + "loss": 0.6608255863189697, + "memory(GiB)": 47.44, + "step": 17310, + "token_acc": 0.8333333333333334, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.8052107342054706, + "grad_norm": 8.375582695007324, + "learning_rate": 1.0017914135919265e-06, + "loss": 0.5581004619598389, + "memory(GiB)": 47.44, + "step": 17315, + "token_acc": 0.8572536850271528, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.8054432524654201, + "grad_norm": 5.624876499176025, + "learning_rate": 9.994838190502381e-07, + "loss": 0.7049301624298095, + "memory(GiB)": 47.44, + "step": 17320, + "token_acc": 0.8228299643281808, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.8056757707253698, + "grad_norm": 9.356409072875977, + "learning_rate": 9.971785901372827e-07, + "loss": 0.6245266437530518, + "memory(GiB)": 47.44, + "step": 17325, + "token_acc": 0.8592896174863388, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.8059082889853194, + "grad_norm": 7.530612945556641, + "learning_rate": 9.948757282162103e-07, + "loss": 0.6261490821838379, + "memory(GiB)": 47.44, + "step": 17330, + "token_acc": 0.8362235067437379, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.8061408072452689, + "grad_norm": 8.12159252166748, + "learning_rate": 9.925752346487772e-07, + "loss": 0.6676755428314209, + "memory(GiB)": 47.44, + "step": 17335, + "token_acc": 0.8142649199417759, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.8063733255052186, + "grad_norm": 7.375349998474121, + "learning_rate": 9.902771107953329e-07, + "loss": 0.6126779556274414, + "memory(GiB)": 47.44, + "step": 17340, + "token_acc": 0.8516806722689075, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.8066058437651682, + "grad_norm": 10.835264205932617, + "learning_rate": 9.879813580148312e-07, + "loss": 0.686242151260376, + "memory(GiB)": 47.44, + "step": 17345, + "token_acc": 0.8460502692998204, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.8068383620251178, + "grad_norm": 6.949465274810791, + "learning_rate": 9.856879776648214e-07, + "loss": 0.7318108558654786, + "memory(GiB)": 47.44, + "step": 17350, + "token_acc": 0.8146841936518114, + "train_speed(iter/s)": 0.095891 + }, + { + "epoch": 0.8068383620251178, + "eval_loss": 0.5608267784118652, + "eval_runtime": 295.2159, + "eval_samples_per_second": 11.771, + "eval_steps_per_second": 11.771, + "step": 17350 + }, + { + "epoch": 0.8070708802850673, + "grad_norm": 7.345584392547607, + "learning_rate": 9.833969711014497e-07, + "loss": 0.6408491611480713, + "memory(GiB)": 47.44, + "step": 17355, + "token_acc": 0.8342026527575559, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.807303398545017, + "grad_norm": 10.600166320800781, + "learning_rate": 9.811083396794607e-07, + "loss": 0.6612047672271728, + "memory(GiB)": 47.44, + "step": 17360, + "token_acc": 0.8144783118405627, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.8075359168049666, + "grad_norm": 7.029186248779297, + "learning_rate": 9.788220847521895e-07, + "loss": 0.6378894805908203, + "memory(GiB)": 47.44, + "step": 17365, + "token_acc": 0.8553054662379421, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.8077684350649162, + "grad_norm": 6.6874470710754395, + "learning_rate": 9.76538207671573e-07, + "loss": 0.6131662368774414, + "memory(GiB)": 47.44, + "step": 17370, + "token_acc": 0.841715976331361, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.8080009533248658, + "grad_norm": 6.854613304138184, + "learning_rate": 9.74256709788135e-07, + "loss": 0.6927377700805664, + "memory(GiB)": 47.44, + "step": 17375, + "token_acc": 0.8233151183970856, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.8082334715848154, + "grad_norm": 9.138595581054688, + "learning_rate": 9.719775924509982e-07, + "loss": 0.6806143283843994, + "memory(GiB)": 47.44, + "step": 17380, + "token_acc": 0.8281972265023112, + "train_speed(iter/s)": 0.095828 + }, + { + "epoch": 0.808465989844765, + "grad_norm": 10.136089324951172, + "learning_rate": 9.697008570078726e-07, + "loss": 0.6174187183380127, + "memory(GiB)": 47.44, + "step": 17385, + "token_acc": 0.8403677392394484, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.8086985081047146, + "grad_norm": 9.113638877868652, + "learning_rate": 9.674265048050636e-07, + "loss": 0.7248986721038818, + "memory(GiB)": 47.44, + "step": 17390, + "token_acc": 0.8193146417445483, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.8089310263646642, + "grad_norm": 6.724001407623291, + "learning_rate": 9.65154537187465e-07, + "loss": 0.5666268825531006, + "memory(GiB)": 47.44, + "step": 17395, + "token_acc": 0.8590038314176245, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.8091635446246138, + "grad_norm": 6.4625091552734375, + "learning_rate": 9.628849554985637e-07, + "loss": 0.5667214393615723, + "memory(GiB)": 47.44, + "step": 17400, + "token_acc": 0.8561014263074485, + "train_speed(iter/s)": 0.09589 + }, + { + "epoch": 0.8091635446246138, + "eval_loss": 0.5599117279052734, + "eval_runtime": 294.7125, + "eval_samples_per_second": 11.791, + "eval_steps_per_second": 11.791, + "step": 17400 + }, + { + "epoch": 0.8093960628845634, + "grad_norm": 11.611597061157227, + "learning_rate": 9.606177610804308e-07, + "loss": 0.6227489948272705, + "memory(GiB)": 47.44, + "step": 17405, + "token_acc": 0.8339909898396064, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.809628581144513, + "grad_norm": 7.669189453125, + "learning_rate": 9.583529552737303e-07, + "loss": 0.6733208656311035, + "memory(GiB)": 47.44, + "step": 17410, + "token_acc": 0.8414746543778802, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.8098610994044626, + "grad_norm": 7.762040615081787, + "learning_rate": 9.560905394177096e-07, + "loss": 0.7308223247528076, + "memory(GiB)": 47.44, + "step": 17415, + "token_acc": 0.8231552162849872, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.8100936176644122, + "grad_norm": 9.89555549621582, + "learning_rate": 9.538305148502074e-07, + "loss": 0.5911648750305176, + "memory(GiB)": 47.44, + "step": 17420, + "token_acc": 0.8471774193548387, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.8103261359243618, + "grad_norm": 8.816936492919922, + "learning_rate": 9.515728829076437e-07, + "loss": 0.5312886714935303, + "memory(GiB)": 47.44, + "step": 17425, + "token_acc": 0.8606158833063209, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.8105586541843114, + "grad_norm": 8.899998664855957, + "learning_rate": 9.493176449250274e-07, + "loss": 0.6454525947570801, + "memory(GiB)": 47.44, + "step": 17430, + "token_acc": 0.8451676528599605, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.810791172444261, + "grad_norm": 9.899276733398438, + "learning_rate": 9.470648022359496e-07, + "loss": 0.6515559673309326, + "memory(GiB)": 47.44, + "step": 17435, + "token_acc": 0.8539670371789958, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.8110236907042107, + "grad_norm": 9.048864364624023, + "learning_rate": 9.448143561725881e-07, + "loss": 0.6078864574432373, + "memory(GiB)": 47.44, + "step": 17440, + "token_acc": 0.8472925594078691, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.8112562089641602, + "grad_norm": 7.783231735229492, + "learning_rate": 9.425663080656977e-07, + "loss": 0.6650321960449219, + "memory(GiB)": 47.44, + "step": 17445, + "token_acc": 0.8310636731689364, + "train_speed(iter/s)": 0.095873 + }, + { + "epoch": 0.8114887272241098, + "grad_norm": 7.394621849060059, + "learning_rate": 9.403206592446217e-07, + "loss": 0.6150126934051514, + "memory(GiB)": 47.44, + "step": 17450, + "token_acc": 0.8462922966162707, + "train_speed(iter/s)": 0.095888 + }, + { + "epoch": 0.8114887272241098, + "eval_loss": 0.5597295165061951, + "eval_runtime": 291.778, + "eval_samples_per_second": 11.91, + "eval_steps_per_second": 11.91, + "step": 17450 + }, + { + "epoch": 0.8117212454840594, + "grad_norm": 8.46907901763916, + "learning_rate": 9.380774110372786e-07, + "loss": 0.6139643669128418, + "memory(GiB)": 47.44, + "step": 17455, + "token_acc": 0.834572073875923, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.8119537637440091, + "grad_norm": 7.3746747970581055, + "learning_rate": 9.358365647701734e-07, + "loss": 0.6090128421783447, + "memory(GiB)": 47.44, + "step": 17460, + "token_acc": 0.8393951777686963, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.8121862820039586, + "grad_norm": 10.859003067016602, + "learning_rate": 9.335981217683848e-07, + "loss": 0.6623462200164795, + "memory(GiB)": 47.44, + "step": 17465, + "token_acc": 0.831107903284013, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.8124188002639082, + "grad_norm": 6.167802810668945, + "learning_rate": 9.313620833555742e-07, + "loss": 0.7212738990783691, + "memory(GiB)": 47.44, + "step": 17470, + "token_acc": 0.8365045806906272, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.8126513185238579, + "grad_norm": 9.171575546264648, + "learning_rate": 9.29128450853981e-07, + "loss": 0.6452064037322998, + "memory(GiB)": 47.44, + "step": 17475, + "token_acc": 0.8430935709739019, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.8128838367838074, + "grad_norm": 7.961929798126221, + "learning_rate": 9.268972255844217e-07, + "loss": 0.5232569217681885, + "memory(GiB)": 47.44, + "step": 17480, + "token_acc": 0.8704663212435233, + "train_speed(iter/s)": 0.095828 + }, + { + "epoch": 0.813116355043757, + "grad_norm": 7.137867450714111, + "learning_rate": 9.246684088662861e-07, + "loss": 0.5751725673675537, + "memory(GiB)": 47.44, + "step": 17485, + "token_acc": 0.8549962434259955, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.8133488733037066, + "grad_norm": 10.85989761352539, + "learning_rate": 9.22442002017544e-07, + "loss": 0.5673013210296631, + "memory(GiB)": 47.44, + "step": 17490, + "token_acc": 0.8622823984526112, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.8135813915636563, + "grad_norm": 9.529802322387695, + "learning_rate": 9.202180063547395e-07, + "loss": 0.7671789169311524, + "memory(GiB)": 47.44, + "step": 17495, + "token_acc": 0.8173950670705322, + "train_speed(iter/s)": 0.095875 + }, + { + "epoch": 0.8138139098236058, + "grad_norm": 8.780619621276855, + "learning_rate": 9.179964231929878e-07, + "loss": 0.6349299907684326, + "memory(GiB)": 47.44, + "step": 17500, + "token_acc": 0.8404255319148937, + "train_speed(iter/s)": 0.09589 + }, + { + "epoch": 0.8138139098236058, + "eval_loss": 0.5591466426849365, + "eval_runtime": 295.2355, + "eval_samples_per_second": 11.77, + "eval_steps_per_second": 11.77, + "step": 17500 + }, + { + "epoch": 0.8140464280835554, + "grad_norm": 7.224985599517822, + "learning_rate": 9.157772538459802e-07, + "loss": 0.6598941802978515, + "memory(GiB)": 47.44, + "step": 17505, + "token_acc": 0.8337386284812196, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.8142789463435051, + "grad_norm": 11.278789520263672, + "learning_rate": 9.135604996259806e-07, + "loss": 0.7177794933319092, + "memory(GiB)": 47.44, + "step": 17510, + "token_acc": 0.8180579216354344, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.8145114646034547, + "grad_norm": 11.014581680297852, + "learning_rate": 9.113461618438251e-07, + "loss": 0.5065845012664795, + "memory(GiB)": 47.44, + "step": 17515, + "token_acc": 0.8631626642501132, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.8147439828634042, + "grad_norm": 7.524247646331787, + "learning_rate": 9.091342418089178e-07, + "loss": 0.7104159355163574, + "memory(GiB)": 47.44, + "step": 17520, + "token_acc": 0.8215931533903884, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.8149765011233538, + "grad_norm": 8.67385196685791, + "learning_rate": 9.069247408292375e-07, + "loss": 0.587507963180542, + "memory(GiB)": 47.44, + "step": 17525, + "token_acc": 0.8514241724403387, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.8152090193833035, + "grad_norm": 11.212056159973145, + "learning_rate": 9.047176602113278e-07, + "loss": 0.6599641799926758, + "memory(GiB)": 47.44, + "step": 17530, + "token_acc": 0.8254988163679405, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.815441537643253, + "grad_norm": 10.266169548034668, + "learning_rate": 9.025130012603068e-07, + "loss": 0.5785196781158447, + "memory(GiB)": 47.44, + "step": 17535, + "token_acc": 0.8489553924336533, + "train_speed(iter/s)": 0.095841 + }, + { + "epoch": 0.8156740559032026, + "grad_norm": 10.367372512817383, + "learning_rate": 9.003107652798542e-07, + "loss": 0.699970293045044, + "memory(GiB)": 47.44, + "step": 17540, + "token_acc": 0.8289133247089263, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.8159065741631523, + "grad_norm": 8.538458824157715, + "learning_rate": 8.981109535722215e-07, + "loss": 0.7431015491485595, + "memory(GiB)": 47.44, + "step": 17545, + "token_acc": 0.8228647391159853, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.8161390924231019, + "grad_norm": 8.287872314453125, + "learning_rate": 8.959135674382258e-07, + "loss": 0.5843753337860107, + "memory(GiB)": 47.44, + "step": 17550, + "token_acc": 0.8404864091559371, + "train_speed(iter/s)": 0.095887 + }, + { + "epoch": 0.8161390924231019, + "eval_loss": 0.5593584775924683, + "eval_runtime": 297.4667, + "eval_samples_per_second": 11.682, + "eval_steps_per_second": 11.682, + "step": 17550 + }, + { + "epoch": 0.8163716106830514, + "grad_norm": 7.792442321777344, + "learning_rate": 8.937186081772498e-07, + "loss": 0.5183809757232666, + "memory(GiB)": 47.44, + "step": 17555, + "token_acc": 0.8351830272833206, + "train_speed(iter/s)": 0.095747 + }, + { + "epoch": 0.816604128943001, + "grad_norm": 9.457621574401855, + "learning_rate": 8.915260770872386e-07, + "loss": 0.5537750720977783, + "memory(GiB)": 47.44, + "step": 17560, + "token_acc": 0.8717538953256093, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.8168366472029507, + "grad_norm": 9.286090850830078, + "learning_rate": 8.893359754647063e-07, + "loss": 0.6299626350402832, + "memory(GiB)": 47.44, + "step": 17565, + "token_acc": 0.8482068390325271, + "train_speed(iter/s)": 0.095776 + }, + { + "epoch": 0.8170691654629002, + "grad_norm": 10.740191459655762, + "learning_rate": 8.871483046047247e-07, + "loss": 0.7528077125549316, + "memory(GiB)": 47.44, + "step": 17570, + "token_acc": 0.8146027201145312, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.8173016837228498, + "grad_norm": 8.127224922180176, + "learning_rate": 8.849630658009333e-07, + "loss": 0.6412830829620362, + "memory(GiB)": 47.44, + "step": 17575, + "token_acc": 0.8376005852231163, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.8175342019827995, + "grad_norm": 9.804791450500488, + "learning_rate": 8.827802603455293e-07, + "loss": 0.6093309402465821, + "memory(GiB)": 47.44, + "step": 17580, + "token_acc": 0.8454398708635997, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.8177667202427491, + "grad_norm": 9.034673690795898, + "learning_rate": 8.805998895292745e-07, + "loss": 0.6556478023529053, + "memory(GiB)": 47.44, + "step": 17585, + "token_acc": 0.8411873840445269, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.8179992385026986, + "grad_norm": 6.9533491134643555, + "learning_rate": 8.78421954641489e-07, + "loss": 0.6458122253417968, + "memory(GiB)": 47.44, + "step": 17590, + "token_acc": 0.8348040945993647, + "train_speed(iter/s)": 0.095853 + }, + { + "epoch": 0.8182317567626483, + "grad_norm": 6.9696478843688965, + "learning_rate": 8.76246456970054e-07, + "loss": 0.5575369834899903, + "memory(GiB)": 47.44, + "step": 17595, + "token_acc": 0.8447592067988668, + "train_speed(iter/s)": 0.095867 + }, + { + "epoch": 0.8184642750225979, + "grad_norm": 9.378483772277832, + "learning_rate": 8.740733978014065e-07, + "loss": 0.5304455757141113, + "memory(GiB)": 47.44, + "step": 17600, + "token_acc": 0.8685944363103953, + "train_speed(iter/s)": 0.095882 + }, + { + "epoch": 0.8184642750225979, + "eval_loss": 0.5591949820518494, + "eval_runtime": 297.3176, + "eval_samples_per_second": 11.688, + "eval_steps_per_second": 11.688, + "step": 17600 + }, + { + "epoch": 0.8186967932825475, + "grad_norm": 7.968575954437256, + "learning_rate": 8.719027784205458e-07, + "loss": 0.5970022678375244, + "memory(GiB)": 47.44, + "step": 17605, + "token_acc": 0.8342140503091197, + "train_speed(iter/s)": 0.095742 + }, + { + "epoch": 0.818929311542497, + "grad_norm": 8.821393013000488, + "learning_rate": 8.697346001110235e-07, + "loss": 0.5827204704284668, + "memory(GiB)": 47.44, + "step": 17610, + "token_acc": 0.8548732050106935, + "train_speed(iter/s)": 0.095757 + }, + { + "epoch": 0.8191618298024467, + "grad_norm": 9.860644340515137, + "learning_rate": 8.675688641549529e-07, + "loss": 0.6527186393737793, + "memory(GiB)": 47.44, + "step": 17615, + "token_acc": 0.8406456953642384, + "train_speed(iter/s)": 0.095772 + }, + { + "epoch": 0.8193943480623963, + "grad_norm": 10.435001373291016, + "learning_rate": 8.654055718329979e-07, + "loss": 0.7885581493377686, + "memory(GiB)": 47.44, + "step": 17620, + "token_acc": 0.7981525829627095, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.8196268663223458, + "grad_norm": 7.4888386726379395, + "learning_rate": 8.632447244243814e-07, + "loss": 0.689478588104248, + "memory(GiB)": 47.44, + "step": 17625, + "token_acc": 0.8370837083708371, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.8198593845822955, + "grad_norm": 10.054991722106934, + "learning_rate": 8.610863232068795e-07, + "loss": 0.5457469940185546, + "memory(GiB)": 47.44, + "step": 17630, + "token_acc": 0.8628948281846581, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.8200919028422451, + "grad_norm": 7.449556827545166, + "learning_rate": 8.589303694568213e-07, + "loss": 0.6463999271392822, + "memory(GiB)": 47.44, + "step": 17635, + "token_acc": 0.8249056603773585, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.8203244211021947, + "grad_norm": 9.351618766784668, + "learning_rate": 8.567768644490898e-07, + "loss": 0.5305264472961426, + "memory(GiB)": 47.44, + "step": 17640, + "token_acc": 0.8620582765034098, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.8205569393621442, + "grad_norm": 7.768716335296631, + "learning_rate": 8.54625809457117e-07, + "loss": 0.6108527183532715, + "memory(GiB)": 47.44, + "step": 17645, + "token_acc": 0.8543071161048689, + "train_speed(iter/s)": 0.095864 + }, + { + "epoch": 0.8207894576220939, + "grad_norm": 7.626466274261475, + "learning_rate": 8.524772057528902e-07, + "loss": 0.618977403640747, + "memory(GiB)": 47.44, + "step": 17650, + "token_acc": 0.8455143747835123, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.8207894576220939, + "eval_loss": 0.5600150227546692, + "eval_runtime": 292.3615, + "eval_samples_per_second": 11.886, + "eval_steps_per_second": 11.886, + "step": 17650 + }, + { + "epoch": 0.8210219758820435, + "grad_norm": 13.730313301086426, + "learning_rate": 8.503310546069421e-07, + "loss": 0.600614595413208, + "memory(GiB)": 47.44, + "step": 17655, + "token_acc": 0.8345608248743155, + "train_speed(iter/s)": 0.095742 + }, + { + "epoch": 0.8212544941419931, + "grad_norm": 7.256465911865234, + "learning_rate": 8.48187357288362e-07, + "loss": 0.7289290428161621, + "memory(GiB)": 47.44, + "step": 17660, + "token_acc": 0.8085688240656336, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.8214870124019427, + "grad_norm": 7.9318437576293945, + "learning_rate": 8.460461150647809e-07, + "loss": 0.6045058250427247, + "memory(GiB)": 47.44, + "step": 17665, + "token_acc": 0.8450986952157913, + "train_speed(iter/s)": 0.095773 + }, + { + "epoch": 0.8217195306618923, + "grad_norm": 6.51909875869751, + "learning_rate": 8.439073292023831e-07, + "loss": 0.671275281906128, + "memory(GiB)": 47.44, + "step": 17670, + "token_acc": 0.8170532505717086, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.8219520489218419, + "grad_norm": 8.830435752868652, + "learning_rate": 8.41771000965898e-07, + "loss": 0.6234053611755371, + "memory(GiB)": 47.44, + "step": 17675, + "token_acc": 0.8314095837705492, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.8221845671817914, + "grad_norm": 10.897567749023438, + "learning_rate": 8.396371316186041e-07, + "loss": 0.594688892364502, + "memory(GiB)": 47.44, + "step": 17680, + "token_acc": 0.8461187214611872, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.8224170854417411, + "grad_norm": 8.042645454406738, + "learning_rate": 8.375057224223221e-07, + "loss": 0.7245099067687988, + "memory(GiB)": 47.44, + "step": 17685, + "token_acc": 0.833642089662838, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.8226496037016907, + "grad_norm": 10.030806541442871, + "learning_rate": 8.353767746374225e-07, + "loss": 0.6409588813781738, + "memory(GiB)": 47.44, + "step": 17690, + "token_acc": 0.8301170515659602, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.8228821219616403, + "grad_norm": 8.641294479370117, + "learning_rate": 8.332502895228145e-07, + "loss": 0.771269416809082, + "memory(GiB)": 47.44, + "step": 17695, + "token_acc": 0.804416403785489, + "train_speed(iter/s)": 0.095864 + }, + { + "epoch": 0.8231146402215899, + "grad_norm": 8.572598457336426, + "learning_rate": 8.311262683359583e-07, + "loss": 0.6635285377502441, + "memory(GiB)": 47.44, + "step": 17700, + "token_acc": 0.8299155609167672, + "train_speed(iter/s)": 0.095879 + }, + { + "epoch": 0.8231146402215899, + "eval_loss": 0.5600427985191345, + "eval_runtime": 295.028, + "eval_samples_per_second": 11.779, + "eval_steps_per_second": 11.779, + "step": 17700 + }, + { + "epoch": 0.8233471584815395, + "grad_norm": 7.2865777015686035, + "learning_rate": 8.290047123328493e-07, + "loss": 0.6963011264801026, + "memory(GiB)": 47.44, + "step": 17705, + "token_acc": 0.8337467003740241, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.8235796767414891, + "grad_norm": 8.243517875671387, + "learning_rate": 8.26885622768031e-07, + "loss": 0.7035470962524414, + "memory(GiB)": 47.44, + "step": 17710, + "token_acc": 0.829205807002562, + "train_speed(iter/s)": 0.095755 + }, + { + "epoch": 0.8238121950014387, + "grad_norm": 7.2582173347473145, + "learning_rate": 8.247690008945869e-07, + "loss": 0.6932243347167969, + "memory(GiB)": 47.44, + "step": 17715, + "token_acc": 0.8333333333333334, + "train_speed(iter/s)": 0.09577 + }, + { + "epoch": 0.8240447132613883, + "grad_norm": 6.59385871887207, + "learning_rate": 8.226548479641411e-07, + "loss": 0.6206794738769531, + "memory(GiB)": 47.44, + "step": 17720, + "token_acc": 0.8364427860696517, + "train_speed(iter/s)": 0.095786 + }, + { + "epoch": 0.8242772315213379, + "grad_norm": 9.237839698791504, + "learning_rate": 8.205431652268559e-07, + "loss": 0.5992973804473877, + "memory(GiB)": 47.44, + "step": 17725, + "token_acc": 0.8515418502202643, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.8245097497812875, + "grad_norm": 8.55127239227295, + "learning_rate": 8.184339539314362e-07, + "loss": 0.5941375255584717, + "memory(GiB)": 47.44, + "step": 17730, + "token_acc": 0.8611875737318129, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 8.08411979675293, + "learning_rate": 8.163272153251222e-07, + "loss": 0.5708928108215332, + "memory(GiB)": 47.44, + "step": 17735, + "token_acc": 0.8621107966033158, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.8249747863011867, + "grad_norm": 8.919515609741211, + "learning_rate": 8.142229506536952e-07, + "loss": 0.5553316593170166, + "memory(GiB)": 47.44, + "step": 17740, + "token_acc": 0.8557428459427675, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.8252073045611363, + "grad_norm": 8.339323043823242, + "learning_rate": 8.121211611614699e-07, + "loss": 0.6758771896362304, + "memory(GiB)": 47.44, + "step": 17745, + "token_acc": 0.8269445478228508, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.825439822821086, + "grad_norm": 9.175220489501953, + "learning_rate": 8.100218480913002e-07, + "loss": 0.7087527751922608, + "memory(GiB)": 47.44, + "step": 17750, + "token_acc": 0.8446563369090186, + "train_speed(iter/s)": 0.095876 + }, + { + "epoch": 0.825439822821086, + "eval_loss": 0.5590119957923889, + "eval_runtime": 297.7387, + "eval_samples_per_second": 11.671, + "eval_steps_per_second": 11.671, + "step": 17750 + }, + { + "epoch": 0.8256723410810355, + "grad_norm": 6.341367721557617, + "learning_rate": 8.079250126845745e-07, + "loss": 0.5751936435699463, + "memory(GiB)": 47.44, + "step": 17755, + "token_acc": 0.8348185055654035, + "train_speed(iter/s)": 0.095737 + }, + { + "epoch": 0.8259048593409851, + "grad_norm": 10.396602630615234, + "learning_rate": 8.058306561812168e-07, + "loss": 0.6061834335327149, + "memory(GiB)": 47.44, + "step": 17760, + "token_acc": 0.8371559633027523, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.8261373776009348, + "grad_norm": 12.562670707702637, + "learning_rate": 8.03738779819686e-07, + "loss": 0.610113000869751, + "memory(GiB)": 47.44, + "step": 17765, + "token_acc": 0.8281904761904761, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.8263698958608843, + "grad_norm": 11.741593360900879, + "learning_rate": 8.016493848369711e-07, + "loss": 0.7463398456573487, + "memory(GiB)": 47.44, + "step": 17770, + "token_acc": 0.8205345778532033, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.8266024141208339, + "grad_norm": 8.214224815368652, + "learning_rate": 7.995624724685969e-07, + "loss": 0.6975039958953857, + "memory(GiB)": 47.44, + "step": 17775, + "token_acc": 0.8409549428379287, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.8268349323807835, + "grad_norm": 10.039314270019531, + "learning_rate": 7.97478043948618e-07, + "loss": 0.649553918838501, + "memory(GiB)": 47.44, + "step": 17780, + "token_acc": 0.8373221216041398, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.8270674506407332, + "grad_norm": 7.765139102935791, + "learning_rate": 7.953961005096234e-07, + "loss": 0.795719051361084, + "memory(GiB)": 47.44, + "step": 17785, + "token_acc": 0.7985458951832778, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.8272999689006827, + "grad_norm": 7.198999881744385, + "learning_rate": 7.933166433827277e-07, + "loss": 0.5839630603790283, + "memory(GiB)": 47.44, + "step": 17790, + "token_acc": 0.8532955350815025, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.8275324871606323, + "grad_norm": 9.437348365783691, + "learning_rate": 7.912396737975803e-07, + "loss": 0.5713480472564697, + "memory(GiB)": 47.44, + "step": 17795, + "token_acc": 0.8559150657229525, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.827765005420582, + "grad_norm": 9.003602027893066, + "learning_rate": 7.891651929823562e-07, + "loss": 0.6855646133422851, + "memory(GiB)": 47.44, + "step": 17800, + "token_acc": 0.8108952116585705, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.827765005420582, + "eval_loss": 0.5586514472961426, + "eval_runtime": 295.621, + "eval_samples_per_second": 11.755, + "eval_steps_per_second": 11.755, + "step": 17800 + }, + { + "epoch": 0.8279975236805316, + "grad_norm": 10.380149841308594, + "learning_rate": 7.870932021637622e-07, + "loss": 0.6497800350189209, + "memory(GiB)": 47.44, + "step": 17805, + "token_acc": 0.8343071185029853, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.8282300419404811, + "grad_norm": 8.715871810913086, + "learning_rate": 7.85023702567027e-07, + "loss": 0.6714212417602539, + "memory(GiB)": 47.44, + "step": 17810, + "token_acc": 0.8406572411157814, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.8284625602004307, + "grad_norm": 7.516015529632568, + "learning_rate": 7.829566954159135e-07, + "loss": 0.5611215114593506, + "memory(GiB)": 47.44, + "step": 17815, + "token_acc": 0.8698140200286123, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.8286950784603804, + "grad_norm": 11.052634239196777, + "learning_rate": 7.808921819327025e-07, + "loss": 0.6710268497467041, + "memory(GiB)": 47.44, + "step": 17820, + "token_acc": 0.8282070517629407, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.8289275967203299, + "grad_norm": 7.307531356811523, + "learning_rate": 7.788301633382089e-07, + "loss": 0.6433767795562744, + "memory(GiB)": 47.44, + "step": 17825, + "token_acc": 0.8330897398421514, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.8291601149802795, + "grad_norm": 13.037498474121094, + "learning_rate": 7.767706408517628e-07, + "loss": 0.6921501159667969, + "memory(GiB)": 47.44, + "step": 17830, + "token_acc": 0.8359046283309958, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.8293926332402292, + "grad_norm": 8.326820373535156, + "learning_rate": 7.747136156912294e-07, + "loss": 0.7372483253479004, + "memory(GiB)": 47.44, + "step": 17835, + "token_acc": 0.8138500635324015, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.8296251515001788, + "grad_norm": 8.070684432983398, + "learning_rate": 7.726590890729868e-07, + "loss": 0.5706949234008789, + "memory(GiB)": 47.44, + "step": 17840, + "token_acc": 0.8526561977948547, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.8298576697601283, + "grad_norm": 9.806997299194336, + "learning_rate": 7.706070622119433e-07, + "loss": 0.5731672286987305, + "memory(GiB)": 47.44, + "step": 17845, + "token_acc": 0.8651063829787234, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.830090188020078, + "grad_norm": 8.055378913879395, + "learning_rate": 7.68557536321522e-07, + "loss": 0.7135499477386474, + "memory(GiB)": 47.44, + "step": 17850, + "token_acc": 0.8112104539202201, + "train_speed(iter/s)": 0.095872 + }, + { + "epoch": 0.830090188020078, + "eval_loss": 0.5592818856239319, + "eval_runtime": 293.2615, + "eval_samples_per_second": 11.849, + "eval_steps_per_second": 11.849, + "step": 17850 + }, + { + "epoch": 0.8303227062800276, + "grad_norm": 9.667156219482422, + "learning_rate": 7.66510512613674e-07, + "loss": 0.5140660285949707, + "memory(GiB)": 47.44, + "step": 17855, + "token_acc": 0.8352633724136278, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.8305552245399772, + "grad_norm": 7.986255645751953, + "learning_rate": 7.644659922988657e-07, + "loss": 0.5835517406463623, + "memory(GiB)": 47.44, + "step": 17860, + "token_acc": 0.8530421216848674, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.8307877427999267, + "grad_norm": 8.109107971191406, + "learning_rate": 7.624239765860858e-07, + "loss": 0.5663367748260498, + "memory(GiB)": 47.44, + "step": 17865, + "token_acc": 0.8564120054570259, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.8310202610598764, + "grad_norm": 10.03013801574707, + "learning_rate": 7.603844666828408e-07, + "loss": 0.6867617130279541, + "memory(GiB)": 47.44, + "step": 17870, + "token_acc": 0.839731643682445, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.831252779319826, + "grad_norm": 8.078845977783203, + "learning_rate": 7.583474637951577e-07, + "loss": 0.6239857196807861, + "memory(GiB)": 47.44, + "step": 17875, + "token_acc": 0.8506363027461487, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.8314852975797755, + "grad_norm": 8.256733894348145, + "learning_rate": 7.563129691275767e-07, + "loss": 0.6100050449371338, + "memory(GiB)": 47.44, + "step": 17880, + "token_acc": 0.8483475479744137, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.8317178158397251, + "grad_norm": 7.669572830200195, + "learning_rate": 7.542809838831583e-07, + "loss": 0.7314640045166015, + "memory(GiB)": 47.44, + "step": 17885, + "token_acc": 0.8145631067961165, + "train_speed(iter/s)": 0.095828 + }, + { + "epoch": 0.8319503340996748, + "grad_norm": 7.9997968673706055, + "learning_rate": 7.522515092634791e-07, + "loss": 0.7675912380218506, + "memory(GiB)": 47.44, + "step": 17890, + "token_acc": 0.8113207547169812, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.8321828523596244, + "grad_norm": 7.206613063812256, + "learning_rate": 7.502245464686286e-07, + "loss": 0.5973569869995117, + "memory(GiB)": 47.44, + "step": 17895, + "token_acc": 0.8514957264957265, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.8324153706195739, + "grad_norm": 8.209953308105469, + "learning_rate": 7.482000966972141e-07, + "loss": 0.5934009075164794, + "memory(GiB)": 47.44, + "step": 17900, + "token_acc": 0.8530659467797918, + "train_speed(iter/s)": 0.095874 + }, + { + "epoch": 0.8324153706195739, + "eval_loss": 0.5590567588806152, + "eval_runtime": 296.3204, + "eval_samples_per_second": 11.727, + "eval_steps_per_second": 11.727, + "step": 17900 + }, + { + "epoch": 0.8326478888795236, + "grad_norm": 6.786571502685547, + "learning_rate": 7.461781611463531e-07, + "loss": 0.6626224994659424, + "memory(GiB)": 47.44, + "step": 17905, + "token_acc": 0.8339872506028507, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.8328804071394732, + "grad_norm": 9.307394981384277, + "learning_rate": 7.441587410116796e-07, + "loss": 0.696648359298706, + "memory(GiB)": 47.44, + "step": 17910, + "token_acc": 0.8244766505636071, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.8331129253994227, + "grad_norm": 7.47057580947876, + "learning_rate": 7.42141837487339e-07, + "loss": 0.48722333908081056, + "memory(GiB)": 47.44, + "step": 17915, + "token_acc": 0.8612959719789842, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.8333454436593724, + "grad_norm": 11.0787935256958, + "learning_rate": 7.401274517659901e-07, + "loss": 0.672635269165039, + "memory(GiB)": 47.44, + "step": 17920, + "token_acc": 0.8394547519878833, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.833577961919322, + "grad_norm": 8.495865821838379, + "learning_rate": 7.381155850387988e-07, + "loss": 0.6448636531829834, + "memory(GiB)": 47.44, + "step": 17925, + "token_acc": 0.8262056414922657, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.8338104801792716, + "grad_norm": 6.004539966583252, + "learning_rate": 7.36106238495447e-07, + "loss": 0.7003479480743409, + "memory(GiB)": 47.44, + "step": 17930, + "token_acc": 0.818907697221335, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.8340429984392211, + "grad_norm": 10.699225425720215, + "learning_rate": 7.340994133241197e-07, + "loss": 0.6735119819641113, + "memory(GiB)": 47.44, + "step": 17935, + "token_acc": 0.8234676007005254, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.8342755166991708, + "grad_norm": 13.282435417175293, + "learning_rate": 7.320951107115182e-07, + "loss": 0.7957521438598633, + "memory(GiB)": 47.44, + "step": 17940, + "token_acc": 0.8079071766222604, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.8345080349591204, + "grad_norm": 7.707286357879639, + "learning_rate": 7.30093331842845e-07, + "loss": 0.673992919921875, + "memory(GiB)": 47.44, + "step": 17945, + "token_acc": 0.839835728952772, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.83474055321907, + "grad_norm": 12.175082206726074, + "learning_rate": 7.280940779018158e-07, + "loss": 0.5859807968139649, + "memory(GiB)": 47.44, + "step": 17950, + "token_acc": 0.8603807796917498, + "train_speed(iter/s)": 0.09587 + }, + { + "epoch": 0.83474055321907, + "eval_loss": 0.5583511590957642, + "eval_runtime": 291.943, + "eval_samples_per_second": 11.903, + "eval_steps_per_second": 11.903, + "step": 17950 + }, + { + "epoch": 0.8349730714790196, + "grad_norm": 8.121362686157227, + "learning_rate": 7.260973500706514e-07, + "loss": 0.602592658996582, + "memory(GiB)": 47.44, + "step": 17955, + "token_acc": 0.8347663491657611, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.8352055897389692, + "grad_norm": 13.328322410583496, + "learning_rate": 7.241031495300788e-07, + "loss": 0.7775434970855712, + "memory(GiB)": 47.44, + "step": 17960, + "token_acc": 0.8044280442804428, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.8354381079989188, + "grad_norm": 8.016280174255371, + "learning_rate": 7.221114774593291e-07, + "loss": 0.6770434379577637, + "memory(GiB)": 47.44, + "step": 17965, + "token_acc": 0.8331099195710456, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.8356706262588683, + "grad_norm": 9.899717330932617, + "learning_rate": 7.201223350361408e-07, + "loss": 0.572743272781372, + "memory(GiB)": 47.44, + "step": 17970, + "token_acc": 0.8537030280919372, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.835903144518818, + "grad_norm": 9.240289688110352, + "learning_rate": 7.181357234367531e-07, + "loss": 0.628160047531128, + "memory(GiB)": 47.44, + "step": 17975, + "token_acc": 0.8461538461538461, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.8361356627787676, + "grad_norm": 9.355088233947754, + "learning_rate": 7.16151643835914e-07, + "loss": 0.8679290771484375, + "memory(GiB)": 47.44, + "step": 17980, + "token_acc": 0.7922360248447204, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.8363681810387172, + "grad_norm": 9.195143699645996, + "learning_rate": 7.141700974068678e-07, + "loss": 0.6345166206359864, + "memory(GiB)": 47.44, + "step": 17985, + "token_acc": 0.8389312977099237, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.8366006992986668, + "grad_norm": 7.399878025054932, + "learning_rate": 7.121910853213654e-07, + "loss": 0.6902605533599854, + "memory(GiB)": 47.44, + "step": 17990, + "token_acc": 0.8341933264533884, + "train_speed(iter/s)": 0.095841 + }, + { + "epoch": 0.8368332175586164, + "grad_norm": 9.452262878417969, + "learning_rate": 7.102146087496576e-07, + "loss": 0.6338638305664063, + "memory(GiB)": 47.44, + "step": 17995, + "token_acc": 0.8394355453852022, + "train_speed(iter/s)": 0.095856 + }, + { + "epoch": 0.837065735818566, + "grad_norm": 9.153210639953613, + "learning_rate": 7.082406688604981e-07, + "loss": 0.612544584274292, + "memory(GiB)": 47.44, + "step": 18000, + "token_acc": 0.8403565640194489, + "train_speed(iter/s)": 0.095871 + }, + { + "epoch": 0.837065735818566, + "eval_loss": 0.5581134557723999, + "eval_runtime": 293.9767, + "eval_samples_per_second": 11.821, + "eval_steps_per_second": 11.821, + "step": 18000 + }, + { + "epoch": 0.8372982540785157, + "grad_norm": 9.343351364135742, + "learning_rate": 7.062692668211351e-07, + "loss": 0.7623269557952881, + "memory(GiB)": 47.44, + "step": 18005, + "token_acc": 0.8332722258532465, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8375307723384652, + "grad_norm": 9.574581146240234, + "learning_rate": 7.043004037973222e-07, + "loss": 0.627281904220581, + "memory(GiB)": 47.44, + "step": 18010, + "token_acc": 0.831611174894757, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.8377632905984148, + "grad_norm": 8.998746871948242, + "learning_rate": 7.023340809533064e-07, + "loss": 0.6361284255981445, + "memory(GiB)": 47.44, + "step": 18015, + "token_acc": 0.8403451995685005, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.8379958088583644, + "grad_norm": 8.098225593566895, + "learning_rate": 7.003702994518369e-07, + "loss": 0.5176102161407471, + "memory(GiB)": 47.44, + "step": 18020, + "token_acc": 0.8788109756097561, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.838228327118314, + "grad_norm": 6.4128007888793945, + "learning_rate": 6.984090604541588e-07, + "loss": 0.6634963512420654, + "memory(GiB)": 47.44, + "step": 18025, + "token_acc": 0.8323624595469256, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.8384608453782636, + "grad_norm": 10.500700950622559, + "learning_rate": 6.964503651200111e-07, + "loss": 0.6115920543670654, + "memory(GiB)": 47.44, + "step": 18030, + "token_acc": 0.8481152993348116, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.8386933636382132, + "grad_norm": 9.047125816345215, + "learning_rate": 6.944942146076323e-07, + "loss": 0.5878973484039307, + "memory(GiB)": 47.44, + "step": 18035, + "token_acc": 0.8544600938967136, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.8389258818981629, + "grad_norm": 8.661657333374023, + "learning_rate": 6.925406100737542e-07, + "loss": 0.5965959548950195, + "memory(GiB)": 47.44, + "step": 18040, + "token_acc": 0.8525943396226415, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.8391584001581124, + "grad_norm": 8.694890975952148, + "learning_rate": 6.905895526736051e-07, + "loss": 0.6183953285217285, + "memory(GiB)": 47.44, + "step": 18045, + "token_acc": 0.8443489755452743, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.839390918418062, + "grad_norm": 8.679917335510254, + "learning_rate": 6.886410435609025e-07, + "loss": 0.5601376056671142, + "memory(GiB)": 47.44, + "step": 18050, + "token_acc": 0.8597122302158273, + "train_speed(iter/s)": 0.095869 + }, + { + "epoch": 0.839390918418062, + "eval_loss": 0.5584567785263062, + "eval_runtime": 292.551, + "eval_samples_per_second": 11.878, + "eval_steps_per_second": 11.878, + "step": 18050 + }, + { + "epoch": 0.8396234366780116, + "grad_norm": 8.077510833740234, + "learning_rate": 6.866950838878628e-07, + "loss": 0.5764139652252197, + "memory(GiB)": 47.44, + "step": 18055, + "token_acc": 0.8349615180680303, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.8398559549379612, + "grad_norm": 10.266379356384277, + "learning_rate": 6.847516748051897e-07, + "loss": 0.6612685680389404, + "memory(GiB)": 47.44, + "step": 18060, + "token_acc": 0.8329888383629599, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.8400884731979108, + "grad_norm": 9.361281394958496, + "learning_rate": 6.828108174620835e-07, + "loss": 0.655527400970459, + "memory(GiB)": 47.44, + "step": 18065, + "token_acc": 0.8432756794917049, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.8403209914578604, + "grad_norm": 7.153183937072754, + "learning_rate": 6.808725130062299e-07, + "loss": 0.603009843826294, + "memory(GiB)": 47.44, + "step": 18070, + "token_acc": 0.8513646826701743, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.8405535097178101, + "grad_norm": 8.630191802978516, + "learning_rate": 6.789367625838106e-07, + "loss": 0.6014632225036621, + "memory(GiB)": 47.44, + "step": 18075, + "token_acc": 0.8404099560761347, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.8407860279777596, + "grad_norm": 6.671989440917969, + "learning_rate": 6.770035673394931e-07, + "loss": 0.5919332504272461, + "memory(GiB)": 47.44, + "step": 18080, + "token_acc": 0.8455786736020806, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.8410185462377092, + "grad_norm": 8.644976615905762, + "learning_rate": 6.750729284164381e-07, + "loss": 0.6521486282348633, + "memory(GiB)": 47.44, + "step": 18085, + "token_acc": 0.8404094010614102, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.8412510644976589, + "grad_norm": 8.989290237426758, + "learning_rate": 6.731448469562885e-07, + "loss": 0.5394969463348389, + "memory(GiB)": 47.44, + "step": 18090, + "token_acc": 0.8701393983859135, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.8414835827576085, + "grad_norm": 14.609663963317871, + "learning_rate": 6.71219324099181e-07, + "loss": 0.6068337917327881, + "memory(GiB)": 47.44, + "step": 18095, + "token_acc": 0.8634204275534442, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.841716101017558, + "grad_norm": 8.150311470031738, + "learning_rate": 6.69296360983735e-07, + "loss": 0.6084172248840332, + "memory(GiB)": 47.44, + "step": 18100, + "token_acc": 0.8488570453770045, + "train_speed(iter/s)": 0.095869 + }, + { + "epoch": 0.841716101017558, + "eval_loss": 0.5587973594665527, + "eval_runtime": 292.6113, + "eval_samples_per_second": 11.876, + "eval_steps_per_second": 11.876, + "step": 18100 + }, + { + "epoch": 0.8419486192775076, + "grad_norm": 9.314252853393555, + "learning_rate": 6.673759587470596e-07, + "loss": 0.7102108001708984, + "memory(GiB)": 47.44, + "step": 18105, + "token_acc": 0.8340893328337625, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8421811375374573, + "grad_norm": 10.721854209899902, + "learning_rate": 6.65458118524746e-07, + "loss": 0.5791438579559326, + "memory(GiB)": 47.44, + "step": 18110, + "token_acc": 0.8577524893314367, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.8424136557974068, + "grad_norm": 9.090928077697754, + "learning_rate": 6.635428414508738e-07, + "loss": 0.789949607849121, + "memory(GiB)": 47.44, + "step": 18115, + "token_acc": 0.7949775112443778, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.8426461740573564, + "grad_norm": 5.7705583572387695, + "learning_rate": 6.616301286580046e-07, + "loss": 0.7491995811462402, + "memory(GiB)": 47.44, + "step": 18120, + "token_acc": 0.8068391866913124, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.842878692317306, + "grad_norm": 7.779153347015381, + "learning_rate": 6.59719981277186e-07, + "loss": 0.7720121383666992, + "memory(GiB)": 47.44, + "step": 18125, + "token_acc": 0.8114950393431406, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.8431112105772557, + "grad_norm": 10.119519233703613, + "learning_rate": 6.578124004379449e-07, + "loss": 0.6325667381286622, + "memory(GiB)": 47.44, + "step": 18130, + "token_acc": 0.8481435127242386, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.8433437288372052, + "grad_norm": 9.072452545166016, + "learning_rate": 6.559073872682953e-07, + "loss": 0.7809335231781006, + "memory(GiB)": 47.44, + "step": 18135, + "token_acc": 0.8059299191374663, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.8435762470971548, + "grad_norm": 8.513379096984863, + "learning_rate": 6.540049428947276e-07, + "loss": 0.6669524669647217, + "memory(GiB)": 47.44, + "step": 18140, + "token_acc": 0.835621521335807, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.8438087653571045, + "grad_norm": 7.087943077087402, + "learning_rate": 6.521050684422187e-07, + "loss": 0.5620086193084717, + "memory(GiB)": 47.44, + "step": 18145, + "token_acc": 0.8636363636363636, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.8440412836170541, + "grad_norm": 7.02882719039917, + "learning_rate": 6.502077650342204e-07, + "loss": 0.6077769279479981, + "memory(GiB)": 47.44, + "step": 18150, + "token_acc": 0.8466188137514167, + "train_speed(iter/s)": 0.095867 + }, + { + "epoch": 0.8440412836170541, + "eval_loss": 0.5583994388580322, + "eval_runtime": 296.5266, + "eval_samples_per_second": 11.719, + "eval_steps_per_second": 11.719, + "step": 18150 + }, + { + "epoch": 0.8442738018770036, + "grad_norm": 9.41649341583252, + "learning_rate": 6.483130337926675e-07, + "loss": 0.5184135913848877, + "memory(GiB)": 47.44, + "step": 18155, + "token_acc": 0.83554784055548, + "train_speed(iter/s)": 0.095731 + }, + { + "epoch": 0.8445063201369533, + "grad_norm": 9.200589179992676, + "learning_rate": 6.464208758379736e-07, + "loss": 0.6405242443084717, + "memory(GiB)": 47.44, + "step": 18160, + "token_acc": 0.8416289592760181, + "train_speed(iter/s)": 0.095746 + }, + { + "epoch": 0.8447388383969029, + "grad_norm": 6.7900872230529785, + "learning_rate": 6.445312922890301e-07, + "loss": 0.5888498306274415, + "memory(GiB)": 47.44, + "step": 18165, + "token_acc": 0.8449640287769784, + "train_speed(iter/s)": 0.09576 + }, + { + "epoch": 0.8449713566568524, + "grad_norm": 8.711414337158203, + "learning_rate": 6.426442842632075e-07, + "loss": 0.5557902336120606, + "memory(GiB)": 47.44, + "step": 18170, + "token_acc": 0.8608058608058609, + "train_speed(iter/s)": 0.095775 + }, + { + "epoch": 0.845203874916802, + "grad_norm": 7.435390949249268, + "learning_rate": 6.407598528763492e-07, + "loss": 0.6613565444946289, + "memory(GiB)": 47.44, + "step": 18175, + "token_acc": 0.8336182336182336, + "train_speed(iter/s)": 0.095789 + }, + { + "epoch": 0.8454363931767517, + "grad_norm": 8.605823516845703, + "learning_rate": 6.388779992427796e-07, + "loss": 0.6953454971313476, + "memory(GiB)": 47.44, + "step": 18180, + "token_acc": 0.8309314586994727, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.8456689114367013, + "grad_norm": 6.164673328399658, + "learning_rate": 6.369987244752951e-07, + "loss": 0.7036227226257324, + "memory(GiB)": 47.44, + "step": 18185, + "token_acc": 0.8248962655601659, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.8459014296966508, + "grad_norm": 11.071667671203613, + "learning_rate": 6.351220296851701e-07, + "loss": 0.5725512027740478, + "memory(GiB)": 47.44, + "step": 18190, + "token_acc": 0.8512280701754386, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.8461339479566005, + "grad_norm": 13.574658393859863, + "learning_rate": 6.33247915982152e-07, + "loss": 0.7959741115570068, + "memory(GiB)": 47.44, + "step": 18195, + "token_acc": 0.7978000647039792, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.8463664662165501, + "grad_norm": 6.559995174407959, + "learning_rate": 6.313763844744636e-07, + "loss": 0.6718171119689942, + "memory(GiB)": 47.44, + "step": 18200, + "token_acc": 0.8321273516642547, + "train_speed(iter/s)": 0.095862 + }, + { + "epoch": 0.8463664662165501, + "eval_loss": 0.5578708648681641, + "eval_runtime": 290.8952, + "eval_samples_per_second": 11.946, + "eval_steps_per_second": 11.946, + "step": 18200 + }, + { + "epoch": 0.8465989844764996, + "grad_norm": 8.805997848510742, + "learning_rate": 6.295074362687959e-07, + "loss": 0.6185959815979004, + "memory(GiB)": 47.44, + "step": 18205, + "token_acc": 0.8348420808731294, + "train_speed(iter/s)": 0.09573 + }, + { + "epoch": 0.8468315027364492, + "grad_norm": 9.289185523986816, + "learning_rate": 6.276410724703191e-07, + "loss": 0.6430187225341797, + "memory(GiB)": 47.44, + "step": 18210, + "token_acc": 0.8297029702970297, + "train_speed(iter/s)": 0.095745 + }, + { + "epoch": 0.8470640209963989, + "grad_norm": 8.73018741607666, + "learning_rate": 6.25777294182669e-07, + "loss": 0.7390905857086182, + "memory(GiB)": 47.44, + "step": 18215, + "token_acc": 0.8018839258584017, + "train_speed(iter/s)": 0.095759 + }, + { + "epoch": 0.8472965392563485, + "grad_norm": 7.182153224945068, + "learning_rate": 6.239161025079577e-07, + "loss": 0.7059009075164795, + "memory(GiB)": 47.44, + "step": 18220, + "token_acc": 0.8162042875157629, + "train_speed(iter/s)": 0.095774 + }, + { + "epoch": 0.847529057516298, + "grad_norm": 8.976435661315918, + "learning_rate": 6.220574985467625e-07, + "loss": 0.6841076850891114, + "memory(GiB)": 47.44, + "step": 18225, + "token_acc": 0.8229736449527598, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.8477615757762477, + "grad_norm": 7.973471164703369, + "learning_rate": 6.202014833981351e-07, + "loss": 0.5512941837310791, + "memory(GiB)": 47.44, + "step": 18230, + "token_acc": 0.8602552131963896, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.8479940940361973, + "grad_norm": 8.258837699890137, + "learning_rate": 6.183480581595941e-07, + "loss": 0.6314374923706054, + "memory(GiB)": 47.44, + "step": 18235, + "token_acc": 0.8396159317211949, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.8482266122961469, + "grad_norm": 9.106829643249512, + "learning_rate": 6.164972239271288e-07, + "loss": 0.6298882484436035, + "memory(GiB)": 47.44, + "step": 18240, + "token_acc": 0.8451190065539841, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.8484591305560965, + "grad_norm": 9.711416244506836, + "learning_rate": 6.146489817951917e-07, + "loss": 0.5575997829437256, + "memory(GiB)": 47.44, + "step": 18245, + "token_acc": 0.8538461538461538, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.8486916488160461, + "grad_norm": 7.332555294036865, + "learning_rate": 6.128033328567079e-07, + "loss": 0.5691449165344238, + "memory(GiB)": 47.44, + "step": 18250, + "token_acc": 0.8534361851332398, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.8486916488160461, + "eval_loss": 0.5585102438926697, + "eval_runtime": 291.6188, + "eval_samples_per_second": 11.916, + "eval_steps_per_second": 11.916, + "step": 18250 + }, + { + "epoch": 0.8489241670759957, + "grad_norm": 7.50723123550415, + "learning_rate": 6.109602782030644e-07, + "loss": 0.6675292015075683, + "memory(GiB)": 47.44, + "step": 18255, + "token_acc": 0.8349338967316495, + "train_speed(iter/s)": 0.095729 + }, + { + "epoch": 0.8491566853359452, + "grad_norm": 7.4508233070373535, + "learning_rate": 6.091198189241182e-07, + "loss": 0.6501538276672363, + "memory(GiB)": 47.44, + "step": 18260, + "token_acc": 0.8348740835192859, + "train_speed(iter/s)": 0.095744 + }, + { + "epoch": 0.8493892035958949, + "grad_norm": 11.345195770263672, + "learning_rate": 6.072819561081883e-07, + "loss": 0.7528467655181885, + "memory(GiB)": 47.44, + "step": 18265, + "token_acc": 0.8289156626506025, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.8496217218558445, + "grad_norm": 9.892457962036133, + "learning_rate": 6.054466908420604e-07, + "loss": 0.7120511054992675, + "memory(GiB)": 47.44, + "step": 18270, + "token_acc": 0.8325581395348837, + "train_speed(iter/s)": 0.095773 + }, + { + "epoch": 0.8498542401157941, + "grad_norm": 9.037054061889648, + "learning_rate": 6.036140242109834e-07, + "loss": 0.6773629188537598, + "memory(GiB)": 47.44, + "step": 18275, + "token_acc": 0.8391859537110934, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.8500867583757437, + "grad_norm": 9.72851848602295, + "learning_rate": 6.017839572986695e-07, + "loss": 0.48967576026916504, + "memory(GiB)": 47.44, + "step": 18280, + "token_acc": 0.8733153638814016, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.8503192766356933, + "grad_norm": 11.348685264587402, + "learning_rate": 5.999564911872952e-07, + "loss": 0.6709898948669434, + "memory(GiB)": 47.44, + "step": 18285, + "token_acc": 0.8287549054584374, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.8505517948956429, + "grad_norm": 11.713817596435547, + "learning_rate": 5.981316269574955e-07, + "loss": 0.5737239360809326, + "memory(GiB)": 47.44, + "step": 18290, + "token_acc": 0.8550358196375896, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.8507843131555926, + "grad_norm": 8.777615547180176, + "learning_rate": 5.963093656883706e-07, + "loss": 0.5813944816589356, + "memory(GiB)": 47.44, + "step": 18295, + "token_acc": 0.8532901833872708, + "train_speed(iter/s)": 0.095847 + }, + { + "epoch": 0.8510168314155421, + "grad_norm": 9.691421508789062, + "learning_rate": 5.944897084574786e-07, + "loss": 0.5308315277099609, + "memory(GiB)": 47.44, + "step": 18300, + "token_acc": 0.8565149136577708, + "train_speed(iter/s)": 0.095861 + }, + { + "epoch": 0.8510168314155421, + "eval_loss": 0.5576656460762024, + "eval_runtime": 292.1527, + "eval_samples_per_second": 11.894, + "eval_steps_per_second": 11.894, + "step": 18300 + }, + { + "epoch": 0.8512493496754917, + "grad_norm": 9.326038360595703, + "learning_rate": 5.926726563408402e-07, + "loss": 0.6760049343109131, + "memory(GiB)": 47.44, + "step": 18305, + "token_acc": 0.8342113442732699, + "train_speed(iter/s)": 0.095729 + }, + { + "epoch": 0.8514818679354413, + "grad_norm": 5.767177104949951, + "learning_rate": 5.908582104129329e-07, + "loss": 0.7487131595611572, + "memory(GiB)": 47.44, + "step": 18310, + "token_acc": 0.8074925816023739, + "train_speed(iter/s)": 0.095744 + }, + { + "epoch": 0.8517143861953909, + "grad_norm": 7.889694690704346, + "learning_rate": 5.890463717466954e-07, + "loss": 0.7449358940124512, + "memory(GiB)": 47.44, + "step": 18315, + "token_acc": 0.8118628359592215, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.8519469044553405, + "grad_norm": 7.965071678161621, + "learning_rate": 5.872371414135241e-07, + "loss": 0.6570825576782227, + "memory(GiB)": 47.44, + "step": 18320, + "token_acc": 0.8311509303928325, + "train_speed(iter/s)": 0.095773 + }, + { + "epoch": 0.8521794227152901, + "grad_norm": 10.874944686889648, + "learning_rate": 5.854305204832733e-07, + "loss": 0.7097513675689697, + "memory(GiB)": 47.44, + "step": 18325, + "token_acc": 0.8337330135891287, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.8524119409752398, + "grad_norm": 8.72624397277832, + "learning_rate": 5.836265100242522e-07, + "loss": 0.7030339241027832, + "memory(GiB)": 47.44, + "step": 18330, + "token_acc": 0.8376623376623377, + "train_speed(iter/s)": 0.095802 + }, + { + "epoch": 0.8526444592351893, + "grad_norm": 10.563741683959961, + "learning_rate": 5.818251111032297e-07, + "loss": 0.5798059463500976, + "memory(GiB)": 47.44, + "step": 18335, + "token_acc": 0.8491039426523298, + "train_speed(iter/s)": 0.095817 + }, + { + "epoch": 0.8528769774951389, + "grad_norm": 8.762702941894531, + "learning_rate": 5.800263247854265e-07, + "loss": 0.7150753021240235, + "memory(GiB)": 47.44, + "step": 18340, + "token_acc": 0.8208005985783764, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.8531094957550885, + "grad_norm": 8.711050987243652, + "learning_rate": 5.782301521345224e-07, + "loss": 0.5809697151184082, + "memory(GiB)": 47.44, + "step": 18345, + "token_acc": 0.8659476117103235, + "train_speed(iter/s)": 0.095845 + }, + { + "epoch": 0.8533420140150381, + "grad_norm": 9.37763500213623, + "learning_rate": 5.764365942126482e-07, + "loss": 0.6817828178405761, + "memory(GiB)": 47.44, + "step": 18350, + "token_acc": 0.8369325694138386, + "train_speed(iter/s)": 0.09586 + }, + { + "epoch": 0.8533420140150381, + "eval_loss": 0.5577992796897888, + "eval_runtime": 294.47, + "eval_samples_per_second": 11.801, + "eval_steps_per_second": 11.801, + "step": 18350 + }, + { + "epoch": 0.8535745322749877, + "grad_norm": 10.525474548339844, + "learning_rate": 5.746456520803906e-07, + "loss": 0.6407979488372803, + "memory(GiB)": 47.44, + "step": 18355, + "token_acc": 0.83432, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.8538070505349373, + "grad_norm": 8.024861335754395, + "learning_rate": 5.728573267967891e-07, + "loss": 0.6994614601135254, + "memory(GiB)": 47.44, + "step": 18360, + "token_acc": 0.8156079854809437, + "train_speed(iter/s)": 0.095742 + }, + { + "epoch": 0.854039568794887, + "grad_norm": 7.524238109588623, + "learning_rate": 5.710716194193367e-07, + "loss": 0.6358412265777588, + "memory(GiB)": 47.44, + "step": 18365, + "token_acc": 0.8227394807520143, + "train_speed(iter/s)": 0.095756 + }, + { + "epoch": 0.8542720870548365, + "grad_norm": 8.940537452697754, + "learning_rate": 5.692885310039753e-07, + "loss": 0.5769914150238037, + "memory(GiB)": 47.44, + "step": 18370, + "token_acc": 0.8603263032232391, + "train_speed(iter/s)": 0.095771 + }, + { + "epoch": 0.8545046053147861, + "grad_norm": 9.157532691955566, + "learning_rate": 5.675080626051021e-07, + "loss": 0.805474853515625, + "memory(GiB)": 47.44, + "step": 18375, + "token_acc": 0.7917448405253283, + "train_speed(iter/s)": 0.095786 + }, + { + "epoch": 0.8547371235747357, + "grad_norm": 10.06224536895752, + "learning_rate": 5.657302152755612e-07, + "loss": 0.6651059627532959, + "memory(GiB)": 47.44, + "step": 18380, + "token_acc": 0.8368121442125237, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.8549696418346854, + "grad_norm": 6.025383949279785, + "learning_rate": 5.639549900666508e-07, + "loss": 0.6011961936950684, + "memory(GiB)": 47.44, + "step": 18385, + "token_acc": 0.8556067588325653, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.8552021600946349, + "grad_norm": 7.602214336395264, + "learning_rate": 5.621823880281135e-07, + "loss": 0.5666591167449951, + "memory(GiB)": 47.44, + "step": 18390, + "token_acc": 0.8518024032042724, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.8554346783545845, + "grad_norm": 7.266246318817139, + "learning_rate": 5.604124102081454e-07, + "loss": 0.48615288734436035, + "memory(GiB)": 47.44, + "step": 18395, + "token_acc": 0.8771049802938015, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.8556671966145342, + "grad_norm": 9.022258758544922, + "learning_rate": 5.586450576533892e-07, + "loss": 0.7559492588043213, + "memory(GiB)": 47.44, + "step": 18400, + "token_acc": 0.8222374742621825, + "train_speed(iter/s)": 0.095859 + }, + { + "epoch": 0.8556671966145342, + "eval_loss": 0.5577629804611206, + "eval_runtime": 292.6222, + "eval_samples_per_second": 11.875, + "eval_steps_per_second": 11.875, + "step": 18400 + }, + { + "epoch": 0.8558997148744837, + "grad_norm": 9.033079147338867, + "learning_rate": 5.568803314089349e-07, + "loss": 0.6726770401000977, + "memory(GiB)": 47.44, + "step": 18405, + "token_acc": 0.8342267810279715, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.8561322331344333, + "grad_norm": 9.249860763549805, + "learning_rate": 5.551182325183191e-07, + "loss": 0.6182631969451904, + "memory(GiB)": 47.44, + "step": 18410, + "token_acc": 0.8520777735417461, + "train_speed(iter/s)": 0.095742 + }, + { + "epoch": 0.856364751394383, + "grad_norm": 8.269856452941895, + "learning_rate": 5.533587620235254e-07, + "loss": 0.6515116691589355, + "memory(GiB)": 47.44, + "step": 18415, + "token_acc": 0.8340821566110398, + "train_speed(iter/s)": 0.095756 + }, + { + "epoch": 0.8565972696543326, + "grad_norm": 9.464628219604492, + "learning_rate": 5.516019209649837e-07, + "loss": 0.6144441127777099, + "memory(GiB)": 47.44, + "step": 18420, + "token_acc": 0.8478441127694859, + "train_speed(iter/s)": 0.095771 + }, + { + "epoch": 0.8568297879142821, + "grad_norm": 10.576383590698242, + "learning_rate": 5.498477103815669e-07, + "loss": 0.6464351654052735, + "memory(GiB)": 47.44, + "step": 18425, + "token_acc": 0.8510998307952623, + "train_speed(iter/s)": 0.095785 + }, + { + "epoch": 0.8570623061742317, + "grad_norm": 7.952290058135986, + "learning_rate": 5.480961313105964e-07, + "loss": 0.6013254642486572, + "memory(GiB)": 47.44, + "step": 18430, + "token_acc": 0.8617727450214759, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.8572948244341814, + "grad_norm": 8.244209289550781, + "learning_rate": 5.463471847878321e-07, + "loss": 0.7223697662353515, + "memory(GiB)": 47.44, + "step": 18435, + "token_acc": 0.8237899398924391, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.857527342694131, + "grad_norm": 7.9289350509643555, + "learning_rate": 5.446008718474811e-07, + "loss": 0.7440857410430908, + "memory(GiB)": 47.44, + "step": 18440, + "token_acc": 0.8179434896591902, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.8577598609540805, + "grad_norm": 8.39036750793457, + "learning_rate": 5.428571935221927e-07, + "loss": 0.6755226612091064, + "memory(GiB)": 47.44, + "step": 18445, + "token_acc": 0.8333333333333334, + "train_speed(iter/s)": 0.095844 + }, + { + "epoch": 0.8579923792140302, + "grad_norm": 7.686507701873779, + "learning_rate": 5.411161508430585e-07, + "loss": 0.5707077503204345, + "memory(GiB)": 47.44, + "step": 18450, + "token_acc": 0.8539792387543252, + "train_speed(iter/s)": 0.095858 + }, + { + "epoch": 0.8579923792140302, + "eval_loss": 0.5588163733482361, + "eval_runtime": 295.8192, + "eval_samples_per_second": 11.747, + "eval_steps_per_second": 11.747, + "step": 18450 + }, + { + "epoch": 0.8582248974739798, + "grad_norm": 7.943417072296143, + "learning_rate": 5.393777448396081e-07, + "loss": 0.8202121734619141, + "memory(GiB)": 47.44, + "step": 18455, + "token_acc": 0.833792784753649, + "train_speed(iter/s)": 0.095725 + }, + { + "epoch": 0.8584574157339293, + "grad_norm": 7.825821399688721, + "learning_rate": 5.376419765398183e-07, + "loss": 0.7452343463897705, + "memory(GiB)": 47.44, + "step": 18460, + "token_acc": 0.8056155507559395, + "train_speed(iter/s)": 0.095739 + }, + { + "epoch": 0.8586899339938789, + "grad_norm": 9.753142356872559, + "learning_rate": 5.359088469700985e-07, + "loss": 0.6943498611450195, + "memory(GiB)": 47.44, + "step": 18465, + "token_acc": 0.8412244897959184, + "train_speed(iter/s)": 0.095754 + }, + { + "epoch": 0.8589224522538286, + "grad_norm": 7.731417179107666, + "learning_rate": 5.341783571553056e-07, + "loss": 0.6089068412780761, + "memory(GiB)": 47.44, + "step": 18470, + "token_acc": 0.8480913026367571, + "train_speed(iter/s)": 0.095769 + }, + { + "epoch": 0.8591549705137782, + "grad_norm": 10.879091262817383, + "learning_rate": 5.324505081187281e-07, + "loss": 0.657605504989624, + "memory(GiB)": 47.44, + "step": 18475, + "token_acc": 0.834061135371179, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.8593874887737277, + "grad_norm": 9.33606243133545, + "learning_rate": 5.307253008820984e-07, + "loss": 0.6699440002441406, + "memory(GiB)": 47.44, + "step": 18480, + "token_acc": 0.8354124748490945, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.8596200070336774, + "grad_norm": 9.070079803466797, + "learning_rate": 5.290027364655842e-07, + "loss": 0.7180463314056397, + "memory(GiB)": 47.44, + "step": 18485, + "token_acc": 0.8165910563836681, + "train_speed(iter/s)": 0.095813 + }, + { + "epoch": 0.859852525293627, + "grad_norm": 8.304587364196777, + "learning_rate": 5.27282815887793e-07, + "loss": 0.5282202243804932, + "memory(GiB)": 47.44, + "step": 18490, + "token_acc": 0.8615094339622642, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.8600850435535765, + "grad_norm": 7.470762729644775, + "learning_rate": 5.255655401657639e-07, + "loss": 0.5523840427398682, + "memory(GiB)": 47.44, + "step": 18495, + "token_acc": 0.8608035431825372, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.8603175618135261, + "grad_norm": 8.002354621887207, + "learning_rate": 5.238509103149774e-07, + "loss": 0.7330649852752685, + "memory(GiB)": 47.44, + "step": 18500, + "token_acc": 0.8097764304660856, + "train_speed(iter/s)": 0.095857 + }, + { + "epoch": 0.8603175618135261, + "eval_loss": 0.5571265816688538, + "eval_runtime": 296.8144, + "eval_samples_per_second": 11.708, + "eval_steps_per_second": 11.708, + "step": 18500 + }, + { + "epoch": 0.8605500800734758, + "grad_norm": 6.639530181884766, + "learning_rate": 5.221389273493449e-07, + "loss": 0.6615922451019287, + "memory(GiB)": 47.44, + "step": 18505, + "token_acc": 0.8347044071370382, + "train_speed(iter/s)": 0.095724 + }, + { + "epoch": 0.8607825983334254, + "grad_norm": 8.321513175964355, + "learning_rate": 5.204295922812175e-07, + "loss": 0.7088462829589843, + "memory(GiB)": 47.44, + "step": 18510, + "token_acc": 0.821732283464567, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.8610151165933749, + "grad_norm": 10.790754318237305, + "learning_rate": 5.187229061213739e-07, + "loss": 0.6691200256347656, + "memory(GiB)": 47.44, + "step": 18515, + "token_acc": 0.819551282051282, + "train_speed(iter/s)": 0.095753 + }, + { + "epoch": 0.8612476348533246, + "grad_norm": 9.01181411743164, + "learning_rate": 5.170188698790352e-07, + "loss": 0.7157214641571045, + "memory(GiB)": 47.44, + "step": 18520, + "token_acc": 0.8334043459735833, + "train_speed(iter/s)": 0.095768 + }, + { + "epoch": 0.8614801531132742, + "grad_norm": 9.205448150634766, + "learning_rate": 5.15317484561847e-07, + "loss": 0.7251205444335938, + "memory(GiB)": 47.44, + "step": 18525, + "token_acc": 0.8173913043478261, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.8617126713732238, + "grad_norm": 9.783936500549316, + "learning_rate": 5.136187511758927e-07, + "loss": 0.7142038345336914, + "memory(GiB)": 47.44, + "step": 18530, + "token_acc": 0.830752990851513, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.8619451896331733, + "grad_norm": 8.41704273223877, + "learning_rate": 5.119226707256847e-07, + "loss": 0.5512996673583984, + "memory(GiB)": 47.44, + "step": 18535, + "token_acc": 0.8576122672508215, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.862177707893123, + "grad_norm": 8.004241943359375, + "learning_rate": 5.102292442141693e-07, + "loss": 0.5882785320281982, + "memory(GiB)": 47.44, + "step": 18540, + "token_acc": 0.851418439716312, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.8624102261530726, + "grad_norm": 7.506921768188477, + "learning_rate": 5.085384726427195e-07, + "loss": 0.7991962432861328, + "memory(GiB)": 47.44, + "step": 18545, + "token_acc": 0.7994314592545799, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.8626427444130221, + "grad_norm": 8.778756141662598, + "learning_rate": 5.068503570111422e-07, + "loss": 0.6244667053222657, + "memory(GiB)": 47.44, + "step": 18550, + "token_acc": 0.8412140575079872, + "train_speed(iter/s)": 0.095855 + }, + { + "epoch": 0.8626427444130221, + "eval_loss": 0.5570151209831238, + "eval_runtime": 294.7339, + "eval_samples_per_second": 11.79, + "eval_steps_per_second": 11.79, + "step": 18550 + }, + { + "epoch": 0.8628752626729718, + "grad_norm": 8.074554443359375, + "learning_rate": 5.051648983176722e-07, + "loss": 0.6556201457977295, + "memory(GiB)": 47.44, + "step": 18555, + "token_acc": 0.834621143575495, + "train_speed(iter/s)": 0.095724 + }, + { + "epoch": 0.8631077809329214, + "grad_norm": 8.432696342468262, + "learning_rate": 5.034820975589732e-07, + "loss": 0.59825758934021, + "memory(GiB)": 47.44, + "step": 18560, + "token_acc": 0.8419255718675316, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.863340299192871, + "grad_norm": 8.92758560180664, + "learning_rate": 5.018019557301385e-07, + "loss": 0.6601286888122558, + "memory(GiB)": 47.44, + "step": 18565, + "token_acc": 0.837074583635047, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.8635728174528206, + "grad_norm": 7.215491771697998, + "learning_rate": 5.001244738246852e-07, + "loss": 0.5904222965240479, + "memory(GiB)": 47.44, + "step": 18570, + "token_acc": 0.844404973357016, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.8638053357127702, + "grad_norm": 10.66939926147461, + "learning_rate": 4.984496528345628e-07, + "loss": 0.5614672660827636, + "memory(GiB)": 47.44, + "step": 18575, + "token_acc": 0.863036303630363, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.8640378539727198, + "grad_norm": 9.135615348815918, + "learning_rate": 4.967774937501424e-07, + "loss": 0.679977560043335, + "memory(GiB)": 47.44, + "step": 18580, + "token_acc": 0.837968561064087, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.8642703722326694, + "grad_norm": 9.716190338134766, + "learning_rate": 4.951079975602257e-07, + "loss": 0.6977860927581787, + "memory(GiB)": 47.44, + "step": 18585, + "token_acc": 0.8257796257796258, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.864502890492619, + "grad_norm": 8.211316108703613, + "learning_rate": 4.934411652520344e-07, + "loss": 0.6847392559051514, + "memory(GiB)": 47.44, + "step": 18590, + "token_acc": 0.834858734580183, + "train_speed(iter/s)": 0.095825 + }, + { + "epoch": 0.8647354087525686, + "grad_norm": 6.788343906402588, + "learning_rate": 4.917769978112196e-07, + "loss": 0.5711318969726562, + "memory(GiB)": 47.44, + "step": 18595, + "token_acc": 0.8474124809741248, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.8649679270125182, + "grad_norm": 8.990870475769043, + "learning_rate": 4.901154962218552e-07, + "loss": 0.6872212409973144, + "memory(GiB)": 47.44, + "step": 18600, + "token_acc": 0.8271604938271605, + "train_speed(iter/s)": 0.095854 + }, + { + "epoch": 0.8649679270125182, + "eval_loss": 0.5573961734771729, + "eval_runtime": 295.4738, + "eval_samples_per_second": 11.761, + "eval_steps_per_second": 11.761, + "step": 18600 + }, + { + "epoch": 0.8652004452724678, + "grad_norm": 11.367814064025879, + "learning_rate": 4.884566614664383e-07, + "loss": 0.6198303699493408, + "memory(GiB)": 47.44, + "step": 18605, + "token_acc": 0.8351723307787987, + "train_speed(iter/s)": 0.095722 + }, + { + "epoch": 0.8654329635324174, + "grad_norm": 10.426321983337402, + "learning_rate": 4.868004945258881e-07, + "loss": 0.649804162979126, + "memory(GiB)": 47.44, + "step": 18610, + "token_acc": 0.8430807248764415, + "train_speed(iter/s)": 0.095737 + }, + { + "epoch": 0.865665481792367, + "grad_norm": 7.450739860534668, + "learning_rate": 4.851469963795485e-07, + "loss": 0.6020886898040771, + "memory(GiB)": 47.44, + "step": 18615, + "token_acc": 0.8388082505729565, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.8658980000523167, + "grad_norm": 7.820985317230225, + "learning_rate": 4.834961680051825e-07, + "loss": 0.6482667446136474, + "memory(GiB)": 47.44, + "step": 18620, + "token_acc": 0.8368228647391159, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.8661305183122662, + "grad_norm": 9.831792831420898, + "learning_rate": 4.81848010378978e-07, + "loss": 0.7055737972259521, + "memory(GiB)": 47.44, + "step": 18625, + "token_acc": 0.8175155734701356, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.8663630365722158, + "grad_norm": 7.0892109870910645, + "learning_rate": 4.802025244755392e-07, + "loss": 0.5333632469177246, + "memory(GiB)": 47.44, + "step": 18630, + "token_acc": 0.8736027515047291, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.8665955548321654, + "grad_norm": 10.063704490661621, + "learning_rate": 4.785597112678941e-07, + "loss": 0.6076772689819336, + "memory(GiB)": 47.44, + "step": 18635, + "token_acc": 0.8538344722854974, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.8668280730921151, + "grad_norm": 8.082415580749512, + "learning_rate": 4.769195717274882e-07, + "loss": 0.5907391548156739, + "memory(GiB)": 47.44, + "step": 18640, + "token_acc": 0.8345757898473554, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.8670605913520646, + "grad_norm": 9.68175220489502, + "learning_rate": 4.7528210682418807e-07, + "loss": 0.6756155014038085, + "memory(GiB)": 47.44, + "step": 18645, + "token_acc": 0.8227746591820368, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.8672931096120142, + "grad_norm": 6.71742057800293, + "learning_rate": 4.7364731752627514e-07, + "loss": 0.625584888458252, + "memory(GiB)": 47.44, + "step": 18650, + "token_acc": 0.8328366296396641, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.8672931096120142, + "eval_loss": 0.5572828650474548, + "eval_runtime": 293.8674, + "eval_samples_per_second": 11.825, + "eval_steps_per_second": 11.825, + "step": 18650 + }, + { + "epoch": 0.8675256278719639, + "grad_norm": 6.967619895935059, + "learning_rate": 4.7201520480045284e-07, + "loss": 0.5962310314178467, + "memory(GiB)": 47.44, + "step": 18655, + "token_acc": 0.8348475509697664, + "train_speed(iter/s)": 0.095722 + }, + { + "epoch": 0.8677581461319134, + "grad_norm": 8.604484558105469, + "learning_rate": 4.70385769611838e-07, + "loss": 0.6598502159118652, + "memory(GiB)": 47.44, + "step": 18660, + "token_acc": 0.8352527191298784, + "train_speed(iter/s)": 0.095737 + }, + { + "epoch": 0.867990664391863, + "grad_norm": 10.318936347961426, + "learning_rate": 4.687590129239672e-07, + "loss": 0.7001275062561035, + "memory(GiB)": 47.44, + "step": 18665, + "token_acc": 0.8137119113573407, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.8682231826518126, + "grad_norm": 10.031474113464355, + "learning_rate": 4.671349356987909e-07, + "loss": 0.7455158233642578, + "memory(GiB)": 47.44, + "step": 18670, + "token_acc": 0.8162195497995683, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.8684557009117623, + "grad_norm": 8.311787605285645, + "learning_rate": 4.6551353889667693e-07, + "loss": 0.7342134952545166, + "memory(GiB)": 47.44, + "step": 18675, + "token_acc": 0.8178947368421052, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.8686882191717118, + "grad_norm": 8.125550270080566, + "learning_rate": 4.638948234764068e-07, + "loss": 0.6627881526947021, + "memory(GiB)": 47.44, + "step": 18680, + "token_acc": 0.8397740784780023, + "train_speed(iter/s)": 0.095794 + }, + { + "epoch": 0.8689207374316614, + "grad_norm": 10.395947456359863, + "learning_rate": 4.6227879039517754e-07, + "loss": 0.540507698059082, + "memory(GiB)": 47.44, + "step": 18685, + "token_acc": 0.8612343686970553, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.8691532556916111, + "grad_norm": 7.6285200119018555, + "learning_rate": 4.606654406085992e-07, + "loss": 0.5860920429229737, + "memory(GiB)": 47.44, + "step": 18690, + "token_acc": 0.8527542372881356, + "train_speed(iter/s)": 0.095823 + }, + { + "epoch": 0.8693857739515606, + "grad_norm": 9.4238862991333, + "learning_rate": 4.5905477507069473e-07, + "loss": 0.6630524158477783, + "memory(GiB)": 47.44, + "step": 18695, + "token_acc": 0.828101644245142, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.8696182922115102, + "grad_norm": 8.019683837890625, + "learning_rate": 4.574467947339017e-07, + "loss": 0.616099214553833, + "memory(GiB)": 47.44, + "step": 18700, + "token_acc": 0.841552142279709, + "train_speed(iter/s)": 0.095852 + }, + { + "epoch": 0.8696182922115102, + "eval_loss": 0.557950496673584, + "eval_runtime": 294.649, + "eval_samples_per_second": 11.794, + "eval_steps_per_second": 11.794, + "step": 18700 + }, + { + "epoch": 0.8698508104714598, + "grad_norm": 7.529056549072266, + "learning_rate": 4.5584150054906626e-07, + "loss": 0.5641360759735108, + "memory(GiB)": 47.44, + "step": 18705, + "token_acc": 0.8352028927280032, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.8700833287314095, + "grad_norm": 7.795188903808594, + "learning_rate": 4.5423889346545125e-07, + "loss": 0.6588688850402832, + "memory(GiB)": 47.44, + "step": 18710, + "token_acc": 0.8301944106925881, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.870315846991359, + "grad_norm": 7.797845363616943, + "learning_rate": 4.5263897443072525e-07, + "loss": 0.7449404239654541, + "memory(GiB)": 47.44, + "step": 18715, + "token_acc": 0.8107714701601164, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.8705483652513086, + "grad_norm": 9.954667091369629, + "learning_rate": 4.5104174439097037e-07, + "loss": 0.5302114963531495, + "memory(GiB)": 47.44, + "step": 18720, + "token_acc": 0.8644132168628941, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.8707808835112583, + "grad_norm": 9.294275283813477, + "learning_rate": 4.494472042906789e-07, + "loss": 0.6280638694763183, + "memory(GiB)": 47.44, + "step": 18725, + "token_acc": 0.8386212299255777, + "train_speed(iter/s)": 0.095779 + }, + { + "epoch": 0.8710134017712079, + "grad_norm": 8.05164623260498, + "learning_rate": 4.478553550727521e-07, + "loss": 0.7135934352874755, + "memory(GiB)": 47.44, + "step": 18730, + "token_acc": 0.8139735480161012, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.8712459200311574, + "grad_norm": 7.834764003753662, + "learning_rate": 4.4626619767849764e-07, + "loss": 0.6254189968109131, + "memory(GiB)": 47.44, + "step": 18735, + "token_acc": 0.8413329257107918, + "train_speed(iter/s)": 0.095807 + }, + { + "epoch": 0.871478438291107, + "grad_norm": 10.09204387664795, + "learning_rate": 4.446797330476349e-07, + "loss": 0.7326918601989746, + "memory(GiB)": 47.44, + "step": 18740, + "token_acc": 0.8233387358184765, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.8717109565510567, + "grad_norm": 9.581809043884277, + "learning_rate": 4.430959621182884e-07, + "loss": 0.5726360321044922, + "memory(GiB)": 47.44, + "step": 18745, + "token_acc": 0.8482712319570326, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.8719434748110062, + "grad_norm": 7.136640548706055, + "learning_rate": 4.4151488582699186e-07, + "loss": 0.5546000480651856, + "memory(GiB)": 47.44, + "step": 18750, + "token_acc": 0.8588807785888077, + "train_speed(iter/s)": 0.095851 + }, + { + "epoch": 0.8719434748110062, + "eval_loss": 0.5570844411849976, + "eval_runtime": 295.122, + "eval_samples_per_second": 11.775, + "eval_steps_per_second": 11.775, + "step": 18750 + }, + { + "epoch": 0.8721759930709558, + "grad_norm": 9.215892791748047, + "learning_rate": 4.3993650510868347e-07, + "loss": 0.6979795455932617, + "memory(GiB)": 47.44, + "step": 18755, + "token_acc": 0.8343617539294181, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.8724085113309055, + "grad_norm": 7.706457138061523, + "learning_rate": 4.383608208967083e-07, + "loss": 0.5588757514953613, + "memory(GiB)": 47.44, + "step": 18760, + "token_acc": 0.8442001516300227, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8726410295908551, + "grad_norm": 10.519378662109375, + "learning_rate": 4.367878341228182e-07, + "loss": 0.7292253017425537, + "memory(GiB)": 47.44, + "step": 18765, + "token_acc": 0.8270793036750483, + "train_speed(iter/s)": 0.095748 + }, + { + "epoch": 0.8728735478508046, + "grad_norm": 8.943109512329102, + "learning_rate": 4.3521754571716865e-07, + "loss": 0.6495208263397216, + "memory(GiB)": 47.44, + "step": 18770, + "token_acc": 0.8444714459295262, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.8731060661107543, + "grad_norm": 9.06867504119873, + "learning_rate": 4.336499566083191e-07, + "loss": 0.6166380882263184, + "memory(GiB)": 47.44, + "step": 18775, + "token_acc": 0.8551223241590215, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.8733385843707039, + "grad_norm": 8.140872955322266, + "learning_rate": 4.320850677232341e-07, + "loss": 0.43561468124389646, + "memory(GiB)": 47.44, + "step": 18780, + "token_acc": 0.9012189995796553, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.8735711026306535, + "grad_norm": 9.459227561950684, + "learning_rate": 4.305228799872796e-07, + "loss": 0.5819206237792969, + "memory(GiB)": 47.44, + "step": 18785, + "token_acc": 0.8629981024667932, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.873803620890603, + "grad_norm": 6.878607273101807, + "learning_rate": 4.289633943242266e-07, + "loss": 0.5773335456848144, + "memory(GiB)": 47.44, + "step": 18790, + "token_acc": 0.8589874277947672, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.8740361391505527, + "grad_norm": 10.950090408325195, + "learning_rate": 4.2740661165624585e-07, + "loss": 0.6097473621368408, + "memory(GiB)": 47.44, + "step": 18795, + "token_acc": 0.8536082474226804, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.8742686574105023, + "grad_norm": 10.073168754577637, + "learning_rate": 4.2585253290391205e-07, + "loss": 0.7397896766662597, + "memory(GiB)": 47.44, + "step": 18800, + "token_acc": 0.8026701400195376, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.8742686574105023, + "eval_loss": 0.5571832060813904, + "eval_runtime": 291.4443, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 11.923, + "step": 18800 + }, + { + "epoch": 0.8745011756704518, + "grad_norm": 8.847938537597656, + "learning_rate": 4.243011589861995e-07, + "loss": 0.5326892852783203, + "memory(GiB)": 47.44, + "step": 18805, + "token_acc": 0.8355497269157877, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.8747336939304015, + "grad_norm": 7.569638729095459, + "learning_rate": 4.227524908204833e-07, + "loss": 0.6248403549194336, + "memory(GiB)": 47.44, + "step": 18810, + "token_acc": 0.846929422548121, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8749662121903511, + "grad_norm": 7.655401229858398, + "learning_rate": 4.2120652932254036e-07, + "loss": 0.5118704795837402, + "memory(GiB)": 47.44, + "step": 18815, + "token_acc": 0.8706986444212722, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.8751987304503007, + "grad_norm": 9.32550048828125, + "learning_rate": 4.1966327540654327e-07, + "loss": 0.539171838760376, + "memory(GiB)": 47.44, + "step": 18820, + "token_acc": 0.8633844147375286, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.8754312487102502, + "grad_norm": 7.188518524169922, + "learning_rate": 4.1812272998506765e-07, + "loss": 0.6495114803314209, + "memory(GiB)": 47.44, + "step": 18825, + "token_acc": 0.8364995328558081, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.8756637669701999, + "grad_norm": 10.491996765136719, + "learning_rate": 4.165848939690836e-07, + "loss": 0.7552758693695069, + "memory(GiB)": 47.44, + "step": 18830, + "token_acc": 0.8204585537918871, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.8758962852301495, + "grad_norm": 11.448698997497559, + "learning_rate": 4.1504976826796327e-07, + "loss": 0.6858109474182129, + "memory(GiB)": 47.44, + "step": 18835, + "token_acc": 0.8332103321033211, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.876128803490099, + "grad_norm": 9.617608070373535, + "learning_rate": 4.1351735378947043e-07, + "loss": 0.6972774028778076, + "memory(GiB)": 47.44, + "step": 18840, + "token_acc": 0.831145584725537, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.8763613217500487, + "grad_norm": 8.686561584472656, + "learning_rate": 4.11987651439773e-07, + "loss": 0.7151837825775147, + "memory(GiB)": 47.44, + "step": 18845, + "token_acc": 0.8249015449863678, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.8765938400099983, + "grad_norm": 9.857751846313477, + "learning_rate": 4.104606621234286e-07, + "loss": 0.5853623390197754, + "memory(GiB)": 47.44, + "step": 18850, + "token_acc": 0.8560057887120116, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.8765938400099983, + "eval_loss": 0.557105541229248, + "eval_runtime": 289.6076, + "eval_samples_per_second": 11.999, + "eval_steps_per_second": 11.999, + "step": 18850 + }, + { + "epoch": 0.8768263582699479, + "grad_norm": 9.501344680786133, + "learning_rate": 4.089363867433954e-07, + "loss": 0.7283553123474121, + "memory(GiB)": 47.44, + "step": 18855, + "token_acc": 0.8342097049148839, + "train_speed(iter/s)": 0.09572 + }, + { + "epoch": 0.8770588765298974, + "grad_norm": 9.011695861816406, + "learning_rate": 4.074148262010219e-07, + "loss": 0.6737423419952393, + "memory(GiB)": 47.44, + "step": 18860, + "token_acc": 0.8236682400539447, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8772913947898471, + "grad_norm": 9.307239532470703, + "learning_rate": 4.058959813960556e-07, + "loss": 0.5290046215057373, + "memory(GiB)": 47.44, + "step": 18865, + "token_acc": 0.8745748299319728, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.8775239130497967, + "grad_norm": 6.860555648803711, + "learning_rate": 4.043798532266352e-07, + "loss": 0.6110732555389404, + "memory(GiB)": 47.44, + "step": 18870, + "token_acc": 0.8466997870830376, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.8777564313097463, + "grad_norm": 7.625909328460693, + "learning_rate": 4.0286644258929476e-07, + "loss": 0.6237162590026856, + "memory(GiB)": 47.44, + "step": 18875, + "token_acc": 0.8529635258358662, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.8779889495696959, + "grad_norm": 9.900529861450195, + "learning_rate": 4.0135575037896056e-07, + "loss": 0.6516123294830323, + "memory(GiB)": 47.44, + "step": 18880, + "token_acc": 0.8463067240808214, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.8782214678296455, + "grad_norm": 11.614465713500977, + "learning_rate": 3.9984777748895253e-07, + "loss": 0.6818142414093018, + "memory(GiB)": 47.44, + "step": 18885, + "token_acc": 0.8380439659039928, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.8784539860895951, + "grad_norm": 9.174266815185547, + "learning_rate": 3.983425248109796e-07, + "loss": 0.5818466663360595, + "memory(GiB)": 47.44, + "step": 18890, + "token_acc": 0.8540983606557377, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.8786865043495447, + "grad_norm": 7.933727741241455, + "learning_rate": 3.968399932351463e-07, + "loss": 0.6590875148773193, + "memory(GiB)": 47.44, + "step": 18895, + "token_acc": 0.8458536585365853, + "train_speed(iter/s)": 0.095835 + }, + { + "epoch": 0.8789190226094943, + "grad_norm": 9.447802543640137, + "learning_rate": 3.953401836499443e-07, + "loss": 0.8476919174194336, + "memory(GiB)": 47.44, + "step": 18900, + "token_acc": 0.8137555328566565, + "train_speed(iter/s)": 0.095849 + }, + { + "epoch": 0.8789190226094943, + "eval_loss": 0.556678295135498, + "eval_runtime": 292.5011, + "eval_samples_per_second": 11.88, + "eval_steps_per_second": 11.88, + "step": 18900 + }, + { + "epoch": 0.8791515408694439, + "grad_norm": 7.718998432159424, + "learning_rate": 3.9384309694225855e-07, + "loss": 0.48586311340332033, + "memory(GiB)": 47.44, + "step": 18905, + "token_acc": 0.8355619891552968, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.8793840591293935, + "grad_norm": 8.555072784423828, + "learning_rate": 3.9234873399736086e-07, + "loss": 0.6869643211364747, + "memory(GiB)": 47.44, + "step": 18910, + "token_acc": 0.8362164151637835, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8796165773893431, + "grad_norm": 13.64516830444336, + "learning_rate": 3.908570956989155e-07, + "loss": 0.832066535949707, + "memory(GiB)": 47.44, + "step": 18915, + "token_acc": 0.807448159119763, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.8798490956492927, + "grad_norm": 8.657330513000488, + "learning_rate": 3.893681829289736e-07, + "loss": 0.6564640998840332, + "memory(GiB)": 47.44, + "step": 18920, + "token_acc": 0.8369565217391305, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.8800816139092423, + "grad_norm": 8.098555564880371, + "learning_rate": 3.87881996567977e-07, + "loss": 0.6175037860870362, + "memory(GiB)": 47.44, + "step": 18925, + "token_acc": 0.8464987926871335, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.880314132169192, + "grad_norm": 10.006739616394043, + "learning_rate": 3.8639853749475153e-07, + "loss": 0.5666879653930664, + "memory(GiB)": 47.44, + "step": 18930, + "token_acc": 0.8589272593681117, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.8805466504291415, + "grad_norm": 9.115056037902832, + "learning_rate": 3.849178065865139e-07, + "loss": 0.6132634162902832, + "memory(GiB)": 47.44, + "step": 18935, + "token_acc": 0.8485299590621511, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.8807791686890911, + "grad_norm": 6.633612632751465, + "learning_rate": 3.8343980471886424e-07, + "loss": 0.6955962657928467, + "memory(GiB)": 47.44, + "step": 18940, + "token_acc": 0.8203592814371258, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.8810116869490408, + "grad_norm": 7.961598873138428, + "learning_rate": 3.8196453276579173e-07, + "loss": 0.7367173194885254, + "memory(GiB)": 47.44, + "step": 18945, + "token_acc": 0.8152793614595211, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.8812442052089903, + "grad_norm": 8.601881980895996, + "learning_rate": 3.8049199159967075e-07, + "loss": 0.6443628787994384, + "memory(GiB)": 47.44, + "step": 18950, + "token_acc": 0.8405551794177386, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.8812442052089903, + "eval_loss": 0.5564034581184387, + "eval_runtime": 291.7319, + "eval_samples_per_second": 11.912, + "eval_steps_per_second": 11.912, + "step": 18950 + }, + { + "epoch": 0.8814767234689399, + "grad_norm": 9.404342651367188, + "learning_rate": 3.790221820912593e-07, + "loss": 0.6218993186950683, + "memory(GiB)": 47.44, + "step": 18955, + "token_acc": 0.8352454482045362, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.8817092417288895, + "grad_norm": 8.895606994628906, + "learning_rate": 3.775551051097015e-07, + "loss": 0.6595926761627198, + "memory(GiB)": 47.44, + "step": 18960, + "token_acc": 0.8220284237726099, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8819417599888392, + "grad_norm": 9.177611351013184, + "learning_rate": 3.7609076152252513e-07, + "loss": 0.7080647945404053, + "memory(GiB)": 47.44, + "step": 18965, + "token_acc": 0.8249733191035219, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.8821742782487887, + "grad_norm": 9.349613189697266, + "learning_rate": 3.7462915219564244e-07, + "loss": 0.6600799083709716, + "memory(GiB)": 47.44, + "step": 18970, + "token_acc": 0.829050279329609, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.8824067965087383, + "grad_norm": 8.506677627563477, + "learning_rate": 3.731702779933477e-07, + "loss": 0.6581337451934814, + "memory(GiB)": 47.44, + "step": 18975, + "token_acc": 0.8370720188902007, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.882639314768688, + "grad_norm": 8.334615707397461, + "learning_rate": 3.717141397783186e-07, + "loss": 0.6230666160583496, + "memory(GiB)": 47.44, + "step": 18980, + "token_acc": 0.8457613814756672, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.8828718330286375, + "grad_norm": 7.229907035827637, + "learning_rate": 3.702607384116136e-07, + "loss": 0.687413501739502, + "memory(GiB)": 47.44, + "step": 18985, + "token_acc": 0.8236486486486486, + "train_speed(iter/s)": 0.095805 + }, + { + "epoch": 0.8831043512885871, + "grad_norm": 8.757658004760742, + "learning_rate": 3.6881007475267515e-07, + "loss": 0.7368191242218017, + "memory(GiB)": 47.44, + "step": 18990, + "token_acc": 0.8106930693069307, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.8833368695485367, + "grad_norm": 7.911934852600098, + "learning_rate": 3.673621496593238e-07, + "loss": 0.542356300354004, + "memory(GiB)": 47.44, + "step": 18995, + "token_acc": 0.8660508083140878, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.8835693878084864, + "grad_norm": 9.330195426940918, + "learning_rate": 3.6591696398776353e-07, + "loss": 0.5444394111633301, + "memory(GiB)": 47.44, + "step": 19000, + "token_acc": 0.8651315789473685, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.8835693878084864, + "eval_loss": 0.5561711192131042, + "eval_runtime": 292.1787, + "eval_samples_per_second": 11.893, + "eval_steps_per_second": 11.893, + "step": 19000 + }, + { + "epoch": 0.8838019060684359, + "grad_norm": 10.371062278747559, + "learning_rate": 3.6447451859257685e-07, + "loss": 0.6457038879394531, + "memory(GiB)": 47.44, + "step": 19005, + "token_acc": 0.8348006338738335, + "train_speed(iter/s)": 0.09572 + }, + { + "epoch": 0.8840344243283855, + "grad_norm": 10.796051979064941, + "learning_rate": 3.630348143267276e-07, + "loss": 0.6592034816741943, + "memory(GiB)": 47.44, + "step": 19010, + "token_acc": 0.837495475931958, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8842669425883352, + "grad_norm": 8.91750717163086, + "learning_rate": 3.615978520415553e-07, + "loss": 0.6092522144317627, + "memory(GiB)": 47.44, + "step": 19015, + "token_acc": 0.8453038674033149, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.8844994608482848, + "grad_norm": 9.327253341674805, + "learning_rate": 3.6016363258678187e-07, + "loss": 0.6067311763763428, + "memory(GiB)": 47.44, + "step": 19020, + "token_acc": 0.8439821693907875, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.8847319791082343, + "grad_norm": 10.384136199951172, + "learning_rate": 3.58732156810504e-07, + "loss": 0.6553625106811524, + "memory(GiB)": 47.44, + "step": 19025, + "token_acc": 0.8404518178609248, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.8849644973681839, + "grad_norm": 10.37766170501709, + "learning_rate": 3.5730342555919896e-07, + "loss": 0.6201920986175538, + "memory(GiB)": 47.44, + "step": 19030, + "token_acc": 0.8405219282348677, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.8851970156281336, + "grad_norm": 8.837468147277832, + "learning_rate": 3.5587743967771816e-07, + "loss": 0.6653841018676758, + "memory(GiB)": 47.44, + "step": 19035, + "token_acc": 0.8508612873980055, + "train_speed(iter/s)": 0.095805 + }, + { + "epoch": 0.8854295338880831, + "grad_norm": 10.655569076538086, + "learning_rate": 3.544542000092921e-07, + "loss": 0.57546067237854, + "memory(GiB)": 47.44, + "step": 19040, + "token_acc": 0.8496751329001772, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.8856620521480327, + "grad_norm": 8.41292667388916, + "learning_rate": 3.530337073955259e-07, + "loss": 0.6289780139923096, + "memory(GiB)": 47.44, + "step": 19045, + "token_acc": 0.8403242862178357, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.8858945704079824, + "grad_norm": 7.690162658691406, + "learning_rate": 3.5161596267640153e-07, + "loss": 0.564287519454956, + "memory(GiB)": 47.44, + "step": 19050, + "token_acc": 0.8630756578947368, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.8858945704079824, + "eval_loss": 0.556488037109375, + "eval_runtime": 291.1297, + "eval_samples_per_second": 11.936, + "eval_steps_per_second": 11.936, + "step": 19050 + }, + { + "epoch": 0.886127088667932, + "grad_norm": 12.681654930114746, + "learning_rate": 3.5020096669027395e-07, + "loss": 0.6333631038665771, + "memory(GiB)": 47.44, + "step": 19055, + "token_acc": 0.8349946314962902, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.8863596069278815, + "grad_norm": 6.69144344329834, + "learning_rate": 3.4878872027387545e-07, + "loss": 0.7488809585571289, + "memory(GiB)": 47.44, + "step": 19060, + "token_acc": 0.8173852085524009, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8865921251878311, + "grad_norm": 8.09365463256836, + "learning_rate": 3.473792242623092e-07, + "loss": 0.6257463455200195, + "memory(GiB)": 47.44, + "step": 19065, + "token_acc": 0.8425925925925926, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.8868246434477808, + "grad_norm": 6.565597057342529, + "learning_rate": 3.459724794890551e-07, + "loss": 0.7603954792022705, + "memory(GiB)": 47.44, + "step": 19070, + "token_acc": 0.8020090732339599, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.8870571617077304, + "grad_norm": 8.496455192565918, + "learning_rate": 3.4456848678596387e-07, + "loss": 0.5489050865173339, + "memory(GiB)": 47.44, + "step": 19075, + "token_acc": 0.8712174524982407, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.8872896799676799, + "grad_norm": 6.798948287963867, + "learning_rate": 3.4316724698325976e-07, + "loss": 0.7015841007232666, + "memory(GiB)": 47.44, + "step": 19080, + "token_acc": 0.8179470626210459, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.8875221982276296, + "grad_norm": 8.605868339538574, + "learning_rate": 3.4176876090953993e-07, + "loss": 0.629077959060669, + "memory(GiB)": 47.44, + "step": 19085, + "token_acc": 0.8442776735459663, + "train_speed(iter/s)": 0.095805 + }, + { + "epoch": 0.8877547164875792, + "grad_norm": 7.39762020111084, + "learning_rate": 3.4037302939177174e-07, + "loss": 0.7615146160125732, + "memory(GiB)": 47.44, + "step": 19090, + "token_acc": 0.8089795918367347, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.8879872347475287, + "grad_norm": 8.316370010375977, + "learning_rate": 3.3898005325529506e-07, + "loss": 0.6413861274719238, + "memory(GiB)": 47.44, + "step": 19095, + "token_acc": 0.8319242475481907, + "train_speed(iter/s)": 0.095834 + }, + { + "epoch": 0.8882197530074784, + "grad_norm": 9.32655143737793, + "learning_rate": 3.3758983332381865e-07, + "loss": 0.6109539031982422, + "memory(GiB)": 47.44, + "step": 19100, + "token_acc": 0.8414420721036052, + "train_speed(iter/s)": 0.095848 + }, + { + "epoch": 0.8882197530074784, + "eval_loss": 0.5563390851020813, + "eval_runtime": 293.6699, + "eval_samples_per_second": 11.833, + "eval_steps_per_second": 11.833, + "step": 19100 + }, + { + "epoch": 0.888452271267428, + "grad_norm": 8.048409461975098, + "learning_rate": 3.3620237041942396e-07, + "loss": 0.7133908748626709, + "memory(GiB)": 47.44, + "step": 19105, + "token_acc": 0.8342901337925975, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.8886847895273776, + "grad_norm": 8.017596244812012, + "learning_rate": 3.3481766536255845e-07, + "loss": 0.6549528121948243, + "memory(GiB)": 47.44, + "step": 19110, + "token_acc": 0.8401264933239635, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.8889173077873271, + "grad_norm": 7.829741954803467, + "learning_rate": 3.334357189720433e-07, + "loss": 0.5810675144195556, + "memory(GiB)": 47.44, + "step": 19115, + "token_acc": 0.8514382402707276, + "train_speed(iter/s)": 0.095748 + }, + { + "epoch": 0.8891498260472768, + "grad_norm": 6.798005104064941, + "learning_rate": 3.320565320650637e-07, + "loss": 0.6954770565032959, + "memory(GiB)": 47.44, + "step": 19120, + "token_acc": 0.8231875191189967, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.8893823443072264, + "grad_norm": 7.311758995056152, + "learning_rate": 3.306801054571773e-07, + "loss": 0.6007462501525879, + "memory(GiB)": 47.44, + "step": 19125, + "token_acc": 0.8505388542257516, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.8896148625671759, + "grad_norm": 8.575642585754395, + "learning_rate": 3.2930643996230696e-07, + "loss": 0.5701055526733398, + "memory(GiB)": 47.44, + "step": 19130, + "token_acc": 0.8495435945860875, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.8898473808271256, + "grad_norm": 8.63494873046875, + "learning_rate": 3.2793553639274447e-07, + "loss": 0.7098838329315186, + "memory(GiB)": 47.44, + "step": 19135, + "token_acc": 0.8214892893573614, + "train_speed(iter/s)": 0.095805 + }, + { + "epoch": 0.8900798990870752, + "grad_norm": 7.499673843383789, + "learning_rate": 3.265673955591453e-07, + "loss": 0.6947129249572754, + "memory(GiB)": 47.44, + "step": 19140, + "token_acc": 0.8269592476489028, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.8903124173470248, + "grad_norm": 7.8279709815979, + "learning_rate": 3.2520201827053564e-07, + "loss": 0.6508459568023681, + "memory(GiB)": 47.44, + "step": 19145, + "token_acc": 0.8448480063166206, + "train_speed(iter/s)": 0.095832 + }, + { + "epoch": 0.8905449356069743, + "grad_norm": 10.999360084533691, + "learning_rate": 3.2383940533430355e-07, + "loss": 0.6100882053375244, + "memory(GiB)": 47.44, + "step": 19150, + "token_acc": 0.848823294774631, + "train_speed(iter/s)": 0.095846 + }, + { + "epoch": 0.8905449356069743, + "eval_loss": 0.5568610429763794, + "eval_runtime": 293.2618, + "eval_samples_per_second": 11.849, + "eval_steps_per_second": 11.849, + "step": 19150 + }, + { + "epoch": 0.890777453866924, + "grad_norm": 7.643911838531494, + "learning_rate": 3.2247955755620566e-07, + "loss": 0.5627256870269776, + "memory(GiB)": 47.44, + "step": 19155, + "token_acc": 0.835324033594571, + "train_speed(iter/s)": 0.095719 + }, + { + "epoch": 0.8910099721268736, + "grad_norm": 8.361198425292969, + "learning_rate": 3.2112247574036025e-07, + "loss": 0.6052700996398925, + "memory(GiB)": 47.44, + "step": 19160, + "token_acc": 0.8570234113712375, + "train_speed(iter/s)": 0.095733 + }, + { + "epoch": 0.8912424903868232, + "grad_norm": 12.711145401000977, + "learning_rate": 3.1976816068925274e-07, + "loss": 0.5738406658172608, + "memory(GiB)": 47.44, + "step": 19165, + "token_acc": 0.8722176422093982, + "train_speed(iter/s)": 0.095747 + }, + { + "epoch": 0.8914750086467728, + "grad_norm": 10.067715644836426, + "learning_rate": 3.1841661320373086e-07, + "loss": 0.6505169868469238, + "memory(GiB)": 47.44, + "step": 19170, + "token_acc": 0.8445133772780148, + "train_speed(iter/s)": 0.09576 + }, + { + "epoch": 0.8917075269067224, + "grad_norm": 8.287736892700195, + "learning_rate": 3.17067834083008e-07, + "loss": 0.5217358112335205, + "memory(GiB)": 47.44, + "step": 19175, + "token_acc": 0.8596858638743455, + "train_speed(iter/s)": 0.095774 + }, + { + "epoch": 0.891940045166672, + "grad_norm": 8.864876747131348, + "learning_rate": 3.157218241246562e-07, + "loss": 0.6427960395812988, + "memory(GiB)": 47.44, + "step": 19180, + "token_acc": 0.844559585492228, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.8921725634266215, + "grad_norm": 9.918158531188965, + "learning_rate": 3.143785841246155e-07, + "loss": 0.5956603527069092, + "memory(GiB)": 47.44, + "step": 19185, + "token_acc": 0.8509154315605929, + "train_speed(iter/s)": 0.095802 + }, + { + "epoch": 0.8924050816865712, + "grad_norm": 12.312911033630371, + "learning_rate": 3.130381148771827e-07, + "loss": 0.700811243057251, + "memory(GiB)": 47.44, + "step": 19190, + "token_acc": 0.820032310177706, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.8926375999465208, + "grad_norm": 9.34288215637207, + "learning_rate": 3.11700417175021e-07, + "loss": 0.7641796112060547, + "memory(GiB)": 47.44, + "step": 19195, + "token_acc": 0.7960568842921784, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.8928701182064704, + "grad_norm": 6.648347854614258, + "learning_rate": 3.1036549180914923e-07, + "loss": 0.6445430278778076, + "memory(GiB)": 47.44, + "step": 19200, + "token_acc": 0.8562091503267973, + "train_speed(iter/s)": 0.095843 + }, + { + "epoch": 0.8928701182064704, + "eval_loss": 0.556313157081604, + "eval_runtime": 294.234, + "eval_samples_per_second": 11.81, + "eval_steps_per_second": 11.81, + "step": 19200 + }, + { + "epoch": 0.89310263646642, + "grad_norm": 9.468454360961914, + "learning_rate": 3.0903333956895266e-07, + "loss": 0.5887749195098877, + "memory(GiB)": 47.44, + "step": 19205, + "token_acc": 0.8352915735601528, + "train_speed(iter/s)": 0.095716 + }, + { + "epoch": 0.8933351547263696, + "grad_norm": 9.351832389831543, + "learning_rate": 3.077039612421745e-07, + "loss": 0.5907858371734619, + "memory(GiB)": 47.44, + "step": 19210, + "token_acc": 0.850547195622435, + "train_speed(iter/s)": 0.09573 + }, + { + "epoch": 0.8935676729863192, + "grad_norm": 11.74577522277832, + "learning_rate": 3.063773576149143e-07, + "loss": 0.7539110660552979, + "memory(GiB)": 47.44, + "step": 19215, + "token_acc": 0.8197879858657244, + "train_speed(iter/s)": 0.095744 + }, + { + "epoch": 0.8938001912462689, + "grad_norm": 10.45272159576416, + "learning_rate": 3.0505352947163667e-07, + "loss": 0.5318979263305664, + "memory(GiB)": 47.44, + "step": 19220, + "token_acc": 0.8765799256505576, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.8940327095062184, + "grad_norm": 8.622464179992676, + "learning_rate": 3.0373247759516e-07, + "loss": 0.7953807353973389, + "memory(GiB)": 47.44, + "step": 19225, + "token_acc": 0.7910621009866512, + "train_speed(iter/s)": 0.095772 + }, + { + "epoch": 0.894265227766168, + "grad_norm": 8.542391777038574, + "learning_rate": 3.024142027666649e-07, + "loss": 0.652153205871582, + "memory(GiB)": 47.44, + "step": 19230, + "token_acc": 0.8414383561643836, + "train_speed(iter/s)": 0.095786 + }, + { + "epoch": 0.8944977460261176, + "grad_norm": 9.397589683532715, + "learning_rate": 3.010987057656861e-07, + "loss": 0.5987738132476806, + "memory(GiB)": 47.44, + "step": 19235, + "token_acc": 0.8518365662401981, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.8947302642860672, + "grad_norm": 9.284369468688965, + "learning_rate": 2.997859873701181e-07, + "loss": 0.6676845550537109, + "memory(GiB)": 47.44, + "step": 19240, + "token_acc": 0.8496299181924425, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.8949627825460168, + "grad_norm": 9.94536018371582, + "learning_rate": 2.9847604835621216e-07, + "loss": 0.5450184345245361, + "memory(GiB)": 47.44, + "step": 19245, + "token_acc": 0.8580777096114519, + "train_speed(iter/s)": 0.095828 + }, + { + "epoch": 0.8951953008059664, + "grad_norm": 6.833744049072266, + "learning_rate": 2.9716888949857635e-07, + "loss": 0.5578155517578125, + "memory(GiB)": 47.44, + "step": 19250, + "token_acc": 0.8539007092198582, + "train_speed(iter/s)": 0.095842 + }, + { + "epoch": 0.8951953008059664, + "eval_loss": 0.5563004016876221, + "eval_runtime": 292.9507, + "eval_samples_per_second": 11.862, + "eval_steps_per_second": 11.862, + "step": 19250 + }, + { + "epoch": 0.8954278190659161, + "grad_norm": 8.075855255126953, + "learning_rate": 2.9586451157017304e-07, + "loss": 0.6162880420684814, + "memory(GiB)": 47.44, + "step": 19255, + "token_acc": 0.8352875220246676, + "train_speed(iter/s)": 0.095716 + }, + { + "epoch": 0.8956603373258656, + "grad_norm": 7.645998001098633, + "learning_rate": 2.9456291534232185e-07, + "loss": 0.6005894660949707, + "memory(GiB)": 47.44, + "step": 19260, + "token_acc": 0.8496168582375478, + "train_speed(iter/s)": 0.09573 + }, + { + "epoch": 0.8958928555858152, + "grad_norm": 9.750205993652344, + "learning_rate": 2.9326410158469543e-07, + "loss": 0.5538561344146729, + "memory(GiB)": 47.44, + "step": 19265, + "token_acc": 0.84688995215311, + "train_speed(iter/s)": 0.095744 + }, + { + "epoch": 0.8961253738457648, + "grad_norm": 7.047579288482666, + "learning_rate": 2.9196807106532443e-07, + "loss": 0.561616325378418, + "memory(GiB)": 47.44, + "step": 19270, + "token_acc": 0.8524930747922438, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.8963578921057145, + "grad_norm": 6.872752666473389, + "learning_rate": 2.906748245505903e-07, + "loss": 0.6426856517791748, + "memory(GiB)": 47.44, + "step": 19275, + "token_acc": 0.8539862645565841, + "train_speed(iter/s)": 0.095771 + }, + { + "epoch": 0.896590410365664, + "grad_norm": 6.302699565887451, + "learning_rate": 2.893843628052301e-07, + "loss": 0.6119589805603027, + "memory(GiB)": 47.44, + "step": 19280, + "token_acc": 0.8436103663985701, + "train_speed(iter/s)": 0.095785 + }, + { + "epoch": 0.8968229286256136, + "grad_norm": 8.906804084777832, + "learning_rate": 2.8809668659233346e-07, + "loss": 0.6653795719146729, + "memory(GiB)": 47.44, + "step": 19285, + "token_acc": 0.8278719397363465, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.8970554468855633, + "grad_norm": 7.2945709228515625, + "learning_rate": 2.8681179667334356e-07, + "loss": 0.6008943080902099, + "memory(GiB)": 47.44, + "step": 19290, + "token_acc": 0.8466141732283464, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.8972879651455128, + "grad_norm": 7.817905902862549, + "learning_rate": 2.855296938080554e-07, + "loss": 0.6331915855407715, + "memory(GiB)": 47.44, + "step": 19295, + "token_acc": 0.8435329143235197, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.8975204834054624, + "grad_norm": 8.829483032226562, + "learning_rate": 2.842503787546158e-07, + "loss": 0.521914291381836, + "memory(GiB)": 47.44, + "step": 19300, + "token_acc": 0.8705882352941177, + "train_speed(iter/s)": 0.09584 + }, + { + "epoch": 0.8975204834054624, + "eval_loss": 0.5560697913169861, + "eval_runtime": 291.6114, + "eval_samples_per_second": 11.917, + "eval_steps_per_second": 11.917, + "step": 19300 + }, + { + "epoch": 0.897753001665412, + "grad_norm": 7.60385799407959, + "learning_rate": 2.82973852269523e-07, + "loss": 0.5124347686767579, + "memory(GiB)": 47.44, + "step": 19305, + "token_acc": 0.8356065526448726, + "train_speed(iter/s)": 0.095714 + }, + { + "epoch": 0.8979855199253617, + "grad_norm": 7.674799919128418, + "learning_rate": 2.8170011510762706e-07, + "loss": 0.7254348754882812, + "memory(GiB)": 47.44, + "step": 19310, + "token_acc": 0.8155107618722242, + "train_speed(iter/s)": 0.095728 + }, + { + "epoch": 0.8982180381853112, + "grad_norm": 8.559470176696777, + "learning_rate": 2.804291680221277e-07, + "loss": 0.6556037902832031, + "memory(GiB)": 47.44, + "step": 19315, + "token_acc": 0.8498039215686275, + "train_speed(iter/s)": 0.095742 + }, + { + "epoch": 0.8984505564452608, + "grad_norm": 9.701505661010742, + "learning_rate": 2.791610117645749e-07, + "loss": 0.5915635108947754, + "memory(GiB)": 47.44, + "step": 19320, + "token_acc": 0.8635002139495079, + "train_speed(iter/s)": 0.095756 + }, + { + "epoch": 0.8986830747052105, + "grad_norm": 7.899413585662842, + "learning_rate": 2.7789564708486874e-07, + "loss": 0.6606907844543457, + "memory(GiB)": 47.44, + "step": 19325, + "token_acc": 0.8327725437415882, + "train_speed(iter/s)": 0.095769 + }, + { + "epoch": 0.89891559296516, + "grad_norm": 9.57420539855957, + "learning_rate": 2.766330747312601e-07, + "loss": 0.5400222301483154, + "memory(GiB)": 47.44, + "step": 19330, + "token_acc": 0.8736942070275404, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.8991481112251096, + "grad_norm": 9.367339134216309, + "learning_rate": 2.7537329545034407e-07, + "loss": 0.6600838661193847, + "memory(GiB)": 47.44, + "step": 19335, + "token_acc": 0.8266504657756176, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.8993806294850593, + "grad_norm": 8.384690284729004, + "learning_rate": 2.7411630998706917e-07, + "loss": 0.5784670352935791, + "memory(GiB)": 47.44, + "step": 19340, + "token_acc": 0.8525245187068652, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.8996131477450089, + "grad_norm": 9.686371803283691, + "learning_rate": 2.7286211908472916e-07, + "loss": 0.6274217128753662, + "memory(GiB)": 47.44, + "step": 19345, + "token_acc": 0.8458904109589042, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.8998456660049584, + "grad_norm": 9.73272705078125, + "learning_rate": 2.7161072348496576e-07, + "loss": 0.6328916549682617, + "memory(GiB)": 47.44, + "step": 19350, + "token_acc": 0.8475350963108064, + "train_speed(iter/s)": 0.095839 + }, + { + "epoch": 0.8998456660049584, + "eval_loss": 0.5558338165283203, + "eval_runtime": 293.162, + "eval_samples_per_second": 11.854, + "eval_steps_per_second": 11.854, + "step": 19350 + }, + { + "epoch": 0.900078184264908, + "grad_norm": 9.117722511291504, + "learning_rate": 2.703621239277682e-07, + "loss": 0.6286519527435303, + "memory(GiB)": 47.44, + "step": 19355, + "token_acc": 0.8351062975044592, + "train_speed(iter/s)": 0.095713 + }, + { + "epoch": 0.9003107025248577, + "grad_norm": 8.885711669921875, + "learning_rate": 2.691163211514708e-07, + "loss": 0.7022455215454102, + "memory(GiB)": 47.44, + "step": 19360, + "token_acc": 0.8235294117647058, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.9005432207848073, + "grad_norm": 12.919280052185059, + "learning_rate": 2.67873315892756e-07, + "loss": 0.7374226570129394, + "memory(GiB)": 47.44, + "step": 19365, + "token_acc": 0.8167853128512551, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.9007757390447568, + "grad_norm": 8.281429290771484, + "learning_rate": 2.6663310888665085e-07, + "loss": 0.6170114994049072, + "memory(GiB)": 47.44, + "step": 19370, + "token_acc": 0.846723044397463, + "train_speed(iter/s)": 0.095754 + }, + { + "epoch": 0.9010082573047065, + "grad_norm": 8.904351234436035, + "learning_rate": 2.653957008665298e-07, + "loss": 0.6403151988983155, + "memory(GiB)": 47.44, + "step": 19375, + "token_acc": 0.8420386198163976, + "train_speed(iter/s)": 0.095768 + }, + { + "epoch": 0.9012407755646561, + "grad_norm": 10.579533576965332, + "learning_rate": 2.641610925641075e-07, + "loss": 0.6673800468444824, + "memory(GiB)": 47.44, + "step": 19380, + "token_acc": 0.8332737030411449, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.9014732938246056, + "grad_norm": 8.038885116577148, + "learning_rate": 2.629292847094489e-07, + "loss": 0.661448621749878, + "memory(GiB)": 47.44, + "step": 19385, + "token_acc": 0.8331445828614572, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.9017058120845552, + "grad_norm": 10.71098518371582, + "learning_rate": 2.6170027803095685e-07, + "loss": 0.623293113708496, + "memory(GiB)": 47.44, + "step": 19390, + "token_acc": 0.8488517745302714, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.9019383303445049, + "grad_norm": 11.028908729553223, + "learning_rate": 2.60474073255384e-07, + "loss": 0.6097889900207519, + "memory(GiB)": 47.44, + "step": 19395, + "token_acc": 0.8461862621154657, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.9021708486044545, + "grad_norm": 9.047528266906738, + "learning_rate": 2.592506711078213e-07, + "loss": 0.6516981601715088, + "memory(GiB)": 47.44, + "step": 19400, + "token_acc": 0.8423571642237512, + "train_speed(iter/s)": 0.095838 + }, + { + "epoch": 0.9021708486044545, + "eval_loss": 0.5555245876312256, + "eval_runtime": 291.9705, + "eval_samples_per_second": 11.902, + "eval_steps_per_second": 11.902, + "step": 19400 + }, + { + "epoch": 0.902403366864404, + "grad_norm": 8.492449760437012, + "learning_rate": 2.580300723117041e-07, + "loss": 0.5659789085388184, + "memory(GiB)": 47.44, + "step": 19405, + "token_acc": 0.8352052408057097, + "train_speed(iter/s)": 0.095714 + }, + { + "epoch": 0.9026358851243537, + "grad_norm": 8.339861869812012, + "learning_rate": 2.56812277588811e-07, + "loss": 0.7192893028259277, + "memory(GiB)": 47.44, + "step": 19410, + "token_acc": 0.8159884767734966, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.9028684033843033, + "grad_norm": 10.295209884643555, + "learning_rate": 2.555972876592616e-07, + "loss": 0.6294547080993652, + "memory(GiB)": 47.44, + "step": 19415, + "token_acc": 0.8360107095046854, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.9031009216442529, + "grad_norm": 10.894436836242676, + "learning_rate": 2.543851032415162e-07, + "loss": 0.6193259239196778, + "memory(GiB)": 47.44, + "step": 19420, + "token_acc": 0.8415937803692906, + "train_speed(iter/s)": 0.095754 + }, + { + "epoch": 0.9033334399042025, + "grad_norm": 8.60874080657959, + "learning_rate": 2.531757250523781e-07, + "loss": 0.6329953670501709, + "memory(GiB)": 47.44, + "step": 19425, + "token_acc": 0.8524132429198245, + "train_speed(iter/s)": 0.095768 + }, + { + "epoch": 0.9035659581641521, + "grad_norm": 9.994233131408691, + "learning_rate": 2.519691538069885e-07, + "loss": 0.6359256744384766, + "memory(GiB)": 47.44, + "step": 19430, + "token_acc": 0.8427230046948356, + "train_speed(iter/s)": 0.095782 + }, + { + "epoch": 0.9037984764241017, + "grad_norm": 7.38197135925293, + "learning_rate": 2.507653902188317e-07, + "loss": 0.5808037757873535, + "memory(GiB)": 47.44, + "step": 19435, + "token_acc": 0.8666881859264042, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.9040309946840512, + "grad_norm": 7.369668006896973, + "learning_rate": 2.495644349997289e-07, + "loss": 0.6205277919769288, + "memory(GiB)": 47.44, + "step": 19440, + "token_acc": 0.8471252907942838, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.9042635129440009, + "grad_norm": 7.311388969421387, + "learning_rate": 2.483662888598426e-07, + "loss": 0.7141438484191894, + "memory(GiB)": 47.44, + "step": 19445, + "token_acc": 0.8237831176833025, + "train_speed(iter/s)": 0.095823 + }, + { + "epoch": 0.9044960312039505, + "grad_norm": 7.697142124176025, + "learning_rate": 2.471709525076732e-07, + "loss": 0.8054816246032714, + "memory(GiB)": 47.44, + "step": 19450, + "token_acc": 0.7954380883417813, + "train_speed(iter/s)": 0.095837 + }, + { + "epoch": 0.9044960312039505, + "eval_loss": 0.5556977391242981, + "eval_runtime": 292.9688, + "eval_samples_per_second": 11.861, + "eval_steps_per_second": 11.861, + "step": 19450 + }, + { + "epoch": 0.9047285494639001, + "grad_norm": 8.828365325927734, + "learning_rate": 2.4597842665006146e-07, + "loss": 0.601045799255371, + "memory(GiB)": 47.44, + "step": 19455, + "token_acc": 0.8351867989542776, + "train_speed(iter/s)": 0.095712 + }, + { + "epoch": 0.9049610677238497, + "grad_norm": 6.522864818572998, + "learning_rate": 2.447887119921827e-07, + "loss": 0.6024109840393066, + "memory(GiB)": 47.44, + "step": 19460, + "token_acc": 0.8365089121081746, + "train_speed(iter/s)": 0.095726 + }, + { + "epoch": 0.9051935859837993, + "grad_norm": 10.061827659606934, + "learning_rate": 2.436018092375542e-07, + "loss": 0.7021986961364746, + "memory(GiB)": 47.44, + "step": 19465, + "token_acc": 0.8153013910355487, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.9054261042437489, + "grad_norm": 12.953351974487305, + "learning_rate": 2.424177190880256e-07, + "loss": 0.6061929225921631, + "memory(GiB)": 47.44, + "step": 19470, + "token_acc": 0.8400309119010819, + "train_speed(iter/s)": 0.095753 + }, + { + "epoch": 0.9056586225036984, + "grad_norm": 11.2939453125, + "learning_rate": 2.412364422437874e-07, + "loss": 0.6210571765899658, + "memory(GiB)": 47.44, + "step": 19475, + "token_acc": 0.8498431196772748, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.9058911407636481, + "grad_norm": 7.610166549682617, + "learning_rate": 2.4005797940336585e-07, + "loss": 0.5850472450256348, + "memory(GiB)": 47.44, + "step": 19480, + "token_acc": 0.848943661971831, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.9061236590235977, + "grad_norm": 11.881199836730957, + "learning_rate": 2.388823312636207e-07, + "loss": 0.6663021087646485, + "memory(GiB)": 47.44, + "step": 19485, + "token_acc": 0.839766081871345, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.9063561772835473, + "grad_norm": 11.320914268493652, + "learning_rate": 2.3770949851974977e-07, + "loss": 0.7544370174407959, + "memory(GiB)": 47.44, + "step": 19490, + "token_acc": 0.8081852004960728, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.9065886955434969, + "grad_norm": 11.551216125488281, + "learning_rate": 2.3653948186528552e-07, + "loss": 0.6839058876037598, + "memory(GiB)": 47.44, + "step": 19495, + "token_acc": 0.8361934477379095, + "train_speed(iter/s)": 0.095822 + }, + { + "epoch": 0.9068212138034465, + "grad_norm": 10.214497566223145, + "learning_rate": 2.3537228199209505e-07, + "loss": 0.640674352645874, + "memory(GiB)": 47.44, + "step": 19500, + "token_acc": 0.8455008488964346, + "train_speed(iter/s)": 0.095836 + }, + { + "epoch": 0.9068212138034465, + "eval_loss": 0.5556568503379822, + "eval_runtime": 295.0302, + "eval_samples_per_second": 11.778, + "eval_steps_per_second": 11.778, + "step": 19500 + }, + { + "epoch": 0.9070537320633961, + "grad_norm": 7.023237705230713, + "learning_rate": 2.3420789959037903e-07, + "loss": 0.8054847717285156, + "memory(GiB)": 47.44, + "step": 19505, + "token_acc": 0.8330771668515291, + "train_speed(iter/s)": 0.09571 + }, + { + "epoch": 0.9072862503233458, + "grad_norm": 9.729205131530762, + "learning_rate": 2.330463353486734e-07, + "loss": 0.5784210205078125, + "memory(GiB)": 47.44, + "step": 19510, + "token_acc": 0.8446054750402576, + "train_speed(iter/s)": 0.095724 + }, + { + "epoch": 0.9075187685832953, + "grad_norm": 11.617768287658691, + "learning_rate": 2.3188758995384585e-07, + "loss": 0.6469026565551758, + "memory(GiB)": 47.44, + "step": 19515, + "token_acc": 0.8218425869432581, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.9077512868432449, + "grad_norm": 9.739715576171875, + "learning_rate": 2.3073166409110004e-07, + "loss": 0.6391276836395263, + "memory(GiB)": 47.44, + "step": 19520, + "token_acc": 0.8408128704487722, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9079838051031945, + "grad_norm": 8.337206840515137, + "learning_rate": 2.2957855844396804e-07, + "loss": 0.5951900959014893, + "memory(GiB)": 47.44, + "step": 19525, + "token_acc": 0.8453547046601365, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.9082163233631441, + "grad_norm": 8.902191162109375, + "learning_rate": 2.2842827369431997e-07, + "loss": 0.5920657157897949, + "memory(GiB)": 47.44, + "step": 19530, + "token_acc": 0.8576858108108109, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.9084488416230937, + "grad_norm": 7.1681904792785645, + "learning_rate": 2.2728081052235228e-07, + "loss": 0.6802764892578125, + "memory(GiB)": 47.44, + "step": 19535, + "token_acc": 0.8195467422096318, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.9086813598830433, + "grad_norm": 8.618856430053711, + "learning_rate": 2.2613616960659723e-07, + "loss": 0.6406535148620606, + "memory(GiB)": 47.44, + "step": 19540, + "token_acc": 0.8352027610008628, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.908913878142993, + "grad_norm": 8.298696517944336, + "learning_rate": 2.2499435162391448e-07, + "loss": 0.6611350536346435, + "memory(GiB)": 47.44, + "step": 19545, + "token_acc": 0.8341480446927374, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.9091463964029425, + "grad_norm": 8.729609489440918, + "learning_rate": 2.238553572494978e-07, + "loss": 0.706944465637207, + "memory(GiB)": 47.44, + "step": 19550, + "token_acc": 0.8442392613408269, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.9091463964029425, + "eval_loss": 0.5558927059173584, + "eval_runtime": 292.7905, + "eval_samples_per_second": 11.869, + "eval_steps_per_second": 11.869, + "step": 19550 + }, + { + "epoch": 0.9093789146628921, + "grad_norm": 8.666666984558105, + "learning_rate": 2.2271918715686792e-07, + "loss": 0.7575130939483643, + "memory(GiB)": 47.44, + "step": 19555, + "token_acc": 0.8341863254698121, + "train_speed(iter/s)": 0.095709 + }, + { + "epoch": 0.9096114329228417, + "grad_norm": 7.633399486541748, + "learning_rate": 2.2158584201787903e-07, + "loss": 0.6025700569152832, + "memory(GiB)": 47.44, + "step": 19560, + "token_acc": 0.8566856330014224, + "train_speed(iter/s)": 0.095723 + }, + { + "epoch": 0.9098439511827914, + "grad_norm": 6.445369720458984, + "learning_rate": 2.2045532250271228e-07, + "loss": 0.6307368278503418, + "memory(GiB)": 47.44, + "step": 19565, + "token_acc": 0.8414866581956798, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.9100764694427409, + "grad_norm": 10.30349063873291, + "learning_rate": 2.193276292798796e-07, + "loss": 0.5543890953063965, + "memory(GiB)": 47.44, + "step": 19570, + "token_acc": 0.8581584292484766, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.9103089877026905, + "grad_norm": 6.994936466217041, + "learning_rate": 2.1820276301621978e-07, + "loss": 0.6006917953491211, + "memory(GiB)": 47.44, + "step": 19575, + "token_acc": 0.846836191602602, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.9105415059626402, + "grad_norm": 8.902931213378906, + "learning_rate": 2.1708072437690186e-07, + "loss": 0.8253030776977539, + "memory(GiB)": 47.44, + "step": 19580, + "token_acc": 0.8120567375886525, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.9107740242225897, + "grad_norm": 11.879782676696777, + "learning_rate": 2.1596151402542065e-07, + "loss": 0.6510292053222656, + "memory(GiB)": 47.44, + "step": 19585, + "token_acc": 0.8594360086767896, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.9110065424825393, + "grad_norm": 8.595463752746582, + "learning_rate": 2.1484513262360174e-07, + "loss": 0.629924726486206, + "memory(GiB)": 47.44, + "step": 19590, + "token_acc": 0.8402530644523527, + "train_speed(iter/s)": 0.095805 + }, + { + "epoch": 0.911239060742489, + "grad_norm": 8.18715763092041, + "learning_rate": 2.1373158083159374e-07, + "loss": 0.6486124038696289, + "memory(GiB)": 47.44, + "step": 19595, + "token_acc": 0.849500998003992, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.9114715790024386, + "grad_norm": 7.39414119720459, + "learning_rate": 2.1262085930787546e-07, + "loss": 0.5645887851715088, + "memory(GiB)": 47.44, + "step": 19600, + "token_acc": 0.8555596601403768, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.9114715790024386, + "eval_loss": 0.5564916729927063, + "eval_runtime": 291.8498, + "eval_samples_per_second": 11.907, + "eval_steps_per_second": 11.907, + "step": 19600 + }, + { + "epoch": 0.9117040972623881, + "grad_norm": 9.974652290344238, + "learning_rate": 2.1151296870925042e-07, + "loss": 0.655756664276123, + "memory(GiB)": 47.44, + "step": 19605, + "token_acc": 0.8347886103174857, + "train_speed(iter/s)": 0.09571 + }, + { + "epoch": 0.9119366155223377, + "grad_norm": 9.164752960205078, + "learning_rate": 2.1040790969084846e-07, + "loss": 0.6285743713378906, + "memory(GiB)": 47.44, + "step": 19610, + "token_acc": 0.8381852551984877, + "train_speed(iter/s)": 0.095724 + }, + { + "epoch": 0.9121691337822874, + "grad_norm": 7.622852325439453, + "learning_rate": 2.093056829061263e-07, + "loss": 0.5900651454925537, + "memory(GiB)": 47.44, + "step": 19615, + "token_acc": 0.8457163170991824, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.9124016520422369, + "grad_norm": 7.594034671783447, + "learning_rate": 2.0820628900686313e-07, + "loss": 0.6741018295288086, + "memory(GiB)": 47.44, + "step": 19620, + "token_acc": 0.83187190240183, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9126341703021865, + "grad_norm": 7.638304710388184, + "learning_rate": 2.071097286431656e-07, + "loss": 0.571320915222168, + "memory(GiB)": 47.44, + "step": 19625, + "token_acc": 0.8505747126436781, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.9128666885621362, + "grad_norm": 7.910111904144287, + "learning_rate": 2.0601600246346275e-07, + "loss": 0.641974401473999, + "memory(GiB)": 47.44, + "step": 19630, + "token_acc": 0.8341346153846154, + "train_speed(iter/s)": 0.095779 + }, + { + "epoch": 0.9130992068220858, + "grad_norm": 10.283453941345215, + "learning_rate": 2.0492511111450953e-07, + "loss": 0.5953716278076172, + "memory(GiB)": 47.44, + "step": 19635, + "token_acc": 0.8551294657489348, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.9133317250820353, + "grad_norm": 10.159235000610352, + "learning_rate": 2.038370552413832e-07, + "loss": 0.5992490291595459, + "memory(GiB)": 47.44, + "step": 19640, + "token_acc": 0.8570114942528736, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.9135642433419849, + "grad_norm": 7.889383316040039, + "learning_rate": 2.027518354874841e-07, + "loss": 0.5519330024719238, + "memory(GiB)": 47.44, + "step": 19645, + "token_acc": 0.8550350424197714, + "train_speed(iter/s)": 0.09582 + }, + { + "epoch": 0.9137967616019346, + "grad_norm": 8.540481567382812, + "learning_rate": 2.0166945249453728e-07, + "loss": 0.5851781368255615, + "memory(GiB)": 47.44, + "step": 19650, + "token_acc": 0.8482142857142857, + "train_speed(iter/s)": 0.095833 + }, + { + "epoch": 0.9137967616019346, + "eval_loss": 0.556018054485321, + "eval_runtime": 293.7189, + "eval_samples_per_second": 11.831, + "eval_steps_per_second": 11.831, + "step": 19650 + }, + { + "epoch": 0.9140292798618842, + "grad_norm": 8.106054306030273, + "learning_rate": 2.0058990690258963e-07, + "loss": 0.6603738784790039, + "memory(GiB)": 47.44, + "step": 19655, + "token_acc": 0.8345389579901481, + "train_speed(iter/s)": 0.09571 + }, + { + "epoch": 0.9142617981218337, + "grad_norm": 7.776497840881348, + "learning_rate": 1.9951319935000767e-07, + "loss": 0.8171131134033203, + "memory(GiB)": 47.44, + "step": 19660, + "token_acc": 0.8034281546316557, + "train_speed(iter/s)": 0.095724 + }, + { + "epoch": 0.9144943163817834, + "grad_norm": 9.407666206359863, + "learning_rate": 1.984393304734844e-07, + "loss": 0.5196086883544921, + "memory(GiB)": 47.44, + "step": 19665, + "token_acc": 0.8706225680933852, + "train_speed(iter/s)": 0.095737 + }, + { + "epoch": 0.914726834641733, + "grad_norm": 9.289840698242188, + "learning_rate": 1.9736830090802962e-07, + "loss": 0.6363831520080566, + "memory(GiB)": 47.44, + "step": 19670, + "token_acc": 0.8487730061349693, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.9149593529016825, + "grad_norm": 8.724823951721191, + "learning_rate": 1.963001112869778e-07, + "loss": 0.624758243560791, + "memory(GiB)": 47.44, + "step": 19675, + "token_acc": 0.8428571428571429, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.9151918711616321, + "grad_norm": 9.431694984436035, + "learning_rate": 1.952347622419809e-07, + "loss": 0.5540366172790527, + "memory(GiB)": 47.44, + "step": 19680, + "token_acc": 0.8601036269430051, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.9154243894215818, + "grad_norm": 9.550405502319336, + "learning_rate": 1.9417225440301335e-07, + "loss": 0.6447968006134033, + "memory(GiB)": 47.44, + "step": 19685, + "token_acc": 0.8426339285714286, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.9156569076815314, + "grad_norm": 9.518305778503418, + "learning_rate": 1.9311258839836865e-07, + "loss": 0.5623124122619629, + "memory(GiB)": 47.44, + "step": 19690, + "token_acc": 0.8611342785654712, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.9158894259414809, + "grad_norm": 7.255014896392822, + "learning_rate": 1.9205576485466114e-07, + "loss": 0.6957176685333252, + "memory(GiB)": 47.44, + "step": 19695, + "token_acc": 0.8264137437365784, + "train_speed(iter/s)": 0.095818 + }, + { + "epoch": 0.9161219442014306, + "grad_norm": 10.805427551269531, + "learning_rate": 1.9100178439682148e-07, + "loss": 0.8065167427062988, + "memory(GiB)": 47.44, + "step": 19700, + "token_acc": 0.7643504531722054, + "train_speed(iter/s)": 0.095831 + }, + { + "epoch": 0.9161219442014306, + "eval_loss": 0.5560776591300964, + "eval_runtime": 294.8996, + "eval_samples_per_second": 11.784, + "eval_steps_per_second": 11.784, + "step": 19700 + }, + { + "epoch": 0.9163544624613802, + "grad_norm": 6.1116719245910645, + "learning_rate": 1.8995064764810278e-07, + "loss": 0.6948729515075683, + "memory(GiB)": 47.44, + "step": 19705, + "token_acc": 0.8349877016883314, + "train_speed(iter/s)": 0.095707 + }, + { + "epoch": 0.9165869807213298, + "grad_norm": 8.436870574951172, + "learning_rate": 1.8890235523007283e-07, + "loss": 0.6319416999816895, + "memory(GiB)": 47.44, + "step": 19710, + "token_acc": 0.8398151713515595, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.9168194989812793, + "grad_norm": 9.345169067382812, + "learning_rate": 1.8785690776262023e-07, + "loss": 0.6956872463226318, + "memory(GiB)": 47.44, + "step": 19715, + "token_acc": 0.8270460358056266, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.917052017241229, + "grad_norm": 7.356499195098877, + "learning_rate": 1.8681430586394988e-07, + "loss": 0.5555019855499268, + "memory(GiB)": 47.44, + "step": 19720, + "token_acc": 0.8647566235366605, + "train_speed(iter/s)": 0.095748 + }, + { + "epoch": 0.9172845355011786, + "grad_norm": 6.536581039428711, + "learning_rate": 1.8577455015058477e-07, + "loss": 0.9057181358337403, + "memory(GiB)": 47.44, + "step": 19725, + "token_acc": 0.7832127351664254, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.9175170537611281, + "grad_norm": 10.982614517211914, + "learning_rate": 1.847376412373647e-07, + "loss": 0.5344199657440185, + "memory(GiB)": 47.44, + "step": 19730, + "token_acc": 0.8694404591104734, + "train_speed(iter/s)": 0.095775 + }, + { + "epoch": 0.9177495720210778, + "grad_norm": 9.253430366516113, + "learning_rate": 1.837035797374459e-07, + "loss": 0.7098407745361328, + "memory(GiB)": 47.44, + "step": 19735, + "token_acc": 0.8177966101694916, + "train_speed(iter/s)": 0.095789 + }, + { + "epoch": 0.9179820902810274, + "grad_norm": 8.579312324523926, + "learning_rate": 1.826723662623009e-07, + "loss": 0.6048952102661133, + "memory(GiB)": 47.44, + "step": 19740, + "token_acc": 0.8423377505946313, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.918214608540977, + "grad_norm": 8.829182624816895, + "learning_rate": 1.8164400142171744e-07, + "loss": 0.5817587852478028, + "memory(GiB)": 47.44, + "step": 19745, + "token_acc": 0.8563344860106885, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.9184471268009265, + "grad_norm": 11.861217498779297, + "learning_rate": 1.8061848582380081e-07, + "loss": 0.6502750396728516, + "memory(GiB)": 47.44, + "step": 19750, + "token_acc": 0.8443877551020408, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.9184471268009265, + "eval_loss": 0.5561826229095459, + "eval_runtime": 293.7773, + "eval_samples_per_second": 11.829, + "eval_steps_per_second": 11.829, + "step": 19750 + }, + { + "epoch": 0.9186796450608762, + "grad_norm": 8.321170806884766, + "learning_rate": 1.7959582007496813e-07, + "loss": 0.6396979331970215, + "memory(GiB)": 47.44, + "step": 19755, + "token_acc": 0.8350863981927277, + "train_speed(iter/s)": 0.095707 + }, + { + "epoch": 0.9189121633208258, + "grad_norm": 6.915319442749023, + "learning_rate": 1.7857600477995507e-07, + "loss": 0.5526315212249756, + "memory(GiB)": 47.44, + "step": 19760, + "token_acc": 0.8581314878892734, + "train_speed(iter/s)": 0.09572 + }, + { + "epoch": 0.9191446815807753, + "grad_norm": 8.197853088378906, + "learning_rate": 1.7755904054180817e-07, + "loss": 0.6716443061828613, + "memory(GiB)": 47.44, + "step": 19765, + "token_acc": 0.8348387096774194, + "train_speed(iter/s)": 0.095734 + }, + { + "epoch": 0.919377199840725, + "grad_norm": 8.290502548217773, + "learning_rate": 1.7654492796189082e-07, + "loss": 0.7327411651611329, + "memory(GiB)": 47.44, + "step": 19770, + "token_acc": 0.8147554129911788, + "train_speed(iter/s)": 0.095748 + }, + { + "epoch": 0.9196097181006746, + "grad_norm": 7.795845985412598, + "learning_rate": 1.755336676398789e-07, + "loss": 0.5660604000091553, + "memory(GiB)": 47.44, + "step": 19775, + "token_acc": 0.8619830592924763, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.9198422363606242, + "grad_norm": 7.266628742218018, + "learning_rate": 1.7452526017376238e-07, + "loss": 0.6886507034301758, + "memory(GiB)": 47.44, + "step": 19780, + "token_acc": 0.8298239558163617, + "train_speed(iter/s)": 0.095776 + }, + { + "epoch": 0.9200747546205738, + "grad_norm": 7.9375691413879395, + "learning_rate": 1.7351970615984258e-07, + "loss": 0.6057173728942871, + "memory(GiB)": 47.44, + "step": 19785, + "token_acc": 0.8486120077469335, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.9203072728805234, + "grad_norm": 8.474169731140137, + "learning_rate": 1.7251700619273616e-07, + "loss": 0.6554636478424072, + "memory(GiB)": 47.44, + "step": 19790, + "token_acc": 0.8410298324478954, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.920539791140473, + "grad_norm": 10.37956714630127, + "learning_rate": 1.7151716086536873e-07, + "loss": 0.6464309215545654, + "memory(GiB)": 47.44, + "step": 19795, + "token_acc": 0.8384670487106017, + "train_speed(iter/s)": 0.095817 + }, + { + "epoch": 0.9207723094004227, + "grad_norm": 7.706953525543213, + "learning_rate": 1.705201707689813e-07, + "loss": 0.617960786819458, + "memory(GiB)": 47.44, + "step": 19800, + "token_acc": 0.8595564941921858, + "train_speed(iter/s)": 0.09583 + }, + { + "epoch": 0.9207723094004227, + "eval_loss": 0.5560248494148254, + "eval_runtime": 293.0751, + "eval_samples_per_second": 11.857, + "eval_steps_per_second": 11.857, + "step": 19800 + }, + { + "epoch": 0.9210048276603722, + "grad_norm": 10.098922729492188, + "learning_rate": 1.6952603649312392e-07, + "loss": 0.6413202285766602, + "memory(GiB)": 47.44, + "step": 19805, + "token_acc": 0.8346519114043657, + "train_speed(iter/s)": 0.095708 + }, + { + "epoch": 0.9212373459203218, + "grad_norm": 10.483457565307617, + "learning_rate": 1.6853475862565916e-07, + "loss": 0.7492653369903565, + "memory(GiB)": 47.44, + "step": 19810, + "token_acc": 0.8183652875882946, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.9214698641802714, + "grad_norm": 8.322340965270996, + "learning_rate": 1.675463377527603e-07, + "loss": 0.5719735145568847, + "memory(GiB)": 47.44, + "step": 19815, + "token_acc": 0.8572583906186818, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.921702382440221, + "grad_norm": 11.469117164611816, + "learning_rate": 1.665607744589115e-07, + "loss": 0.6902899742126465, + "memory(GiB)": 47.44, + "step": 19820, + "token_acc": 0.8308492201039861, + "train_speed(iter/s)": 0.095748 + }, + { + "epoch": 0.9219349007001706, + "grad_norm": 7.651955604553223, + "learning_rate": 1.65578069326906e-07, + "loss": 0.7511250495910644, + "memory(GiB)": 47.44, + "step": 19825, + "token_acc": 0.8129323308270676, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.9221674189601202, + "grad_norm": 7.940315246582031, + "learning_rate": 1.64598222937849e-07, + "loss": 0.49587244987487794, + "memory(GiB)": 47.44, + "step": 19830, + "token_acc": 0.8828061638280617, + "train_speed(iter/s)": 0.095775 + }, + { + "epoch": 0.9223999372200699, + "grad_norm": 8.776915550231934, + "learning_rate": 1.6362123587115198e-07, + "loss": 0.6938657283782959, + "memory(GiB)": 47.44, + "step": 19835, + "token_acc": 0.8298710601719198, + "train_speed(iter/s)": 0.095789 + }, + { + "epoch": 0.9226324554800194, + "grad_norm": 10.612738609313965, + "learning_rate": 1.6264710870453893e-07, + "loss": 0.560774564743042, + "memory(GiB)": 47.44, + "step": 19840, + "token_acc": 0.8556254917387883, + "train_speed(iter/s)": 0.095802 + }, + { + "epoch": 0.922864973739969, + "grad_norm": 7.882064342498779, + "learning_rate": 1.6167584201404074e-07, + "loss": 0.598447322845459, + "memory(GiB)": 47.44, + "step": 19845, + "token_acc": 0.844487552537989, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.9230974919999186, + "grad_norm": 11.58521842956543, + "learning_rate": 1.6070743637399745e-07, + "loss": 0.7264031887054443, + "memory(GiB)": 47.44, + "step": 19850, + "token_acc": 0.8180930686625447, + "train_speed(iter/s)": 0.095829 + }, + { + "epoch": 0.9230974919999186, + "eval_loss": 0.555980384349823, + "eval_runtime": 294.0643, + "eval_samples_per_second": 11.817, + "eval_steps_per_second": 11.817, + "step": 19850 + }, + { + "epoch": 0.9233300102598683, + "grad_norm": 6.906254768371582, + "learning_rate": 1.5974189235705761e-07, + "loss": 0.570603322982788, + "memory(GiB)": 47.44, + "step": 19855, + "token_acc": 0.8350892057807904, + "train_speed(iter/s)": 0.095706 + }, + { + "epoch": 0.9235625285198178, + "grad_norm": 6.352476119995117, + "learning_rate": 1.5877921053417732e-07, + "loss": 0.5671680450439454, + "memory(GiB)": 47.44, + "step": 19860, + "token_acc": 0.853103448275862, + "train_speed(iter/s)": 0.09572 + }, + { + "epoch": 0.9237950467797674, + "grad_norm": 8.458511352539062, + "learning_rate": 1.57819391474619e-07, + "loss": 0.7611868858337403, + "memory(GiB)": 47.44, + "step": 19865, + "token_acc": 0.8183606557377049, + "train_speed(iter/s)": 0.095733 + }, + { + "epoch": 0.9240275650397171, + "grad_norm": 11.120638847351074, + "learning_rate": 1.5686243574595416e-07, + "loss": 0.5099010467529297, + "memory(GiB)": 47.44, + "step": 19870, + "token_acc": 0.8711592178770949, + "train_speed(iter/s)": 0.095747 + }, + { + "epoch": 0.9242600832996666, + "grad_norm": 9.012869834899902, + "learning_rate": 1.5590834391406072e-07, + "loss": 0.6600170612335206, + "memory(GiB)": 47.44, + "step": 19875, + "token_acc": 0.8373001776198934, + "train_speed(iter/s)": 0.09576 + }, + { + "epoch": 0.9244926015596162, + "grad_norm": 7.926135063171387, + "learning_rate": 1.5495711654312128e-07, + "loss": 0.6755642890930176, + "memory(GiB)": 47.44, + "step": 19880, + "token_acc": 0.8345549738219895, + "train_speed(iter/s)": 0.095774 + }, + { + "epoch": 0.9247251198195658, + "grad_norm": 12.261335372924805, + "learning_rate": 1.54008754195627e-07, + "loss": 0.5872795581817627, + "memory(GiB)": 47.44, + "step": 19885, + "token_acc": 0.8427717200140696, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.9249576380795155, + "grad_norm": 7.186399459838867, + "learning_rate": 1.5306325743237316e-07, + "loss": 0.666871976852417, + "memory(GiB)": 47.44, + "step": 19890, + "token_acc": 0.8331595411887383, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.925190156339465, + "grad_norm": 8.706633567810059, + "learning_rate": 1.5212062681246252e-07, + "loss": 0.6248205661773681, + "memory(GiB)": 47.44, + "step": 19895, + "token_acc": 0.8479046242774566, + "train_speed(iter/s)": 0.095815 + }, + { + "epoch": 0.9254226745994146, + "grad_norm": 7.308403491973877, + "learning_rate": 1.511808628932998e-07, + "loss": 0.6236941814422607, + "memory(GiB)": 47.44, + "step": 19900, + "token_acc": 0.8420720151610865, + "train_speed(iter/s)": 0.095828 + }, + { + "epoch": 0.9254226745994146, + "eval_loss": 0.5559277534484863, + "eval_runtime": 293.4352, + "eval_samples_per_second": 11.842, + "eval_steps_per_second": 11.842, + "step": 19900 + }, + { + "epoch": 0.9256551928593643, + "grad_norm": 8.53604793548584, + "learning_rate": 1.5024396623059767e-07, + "loss": 0.7425576210021972, + "memory(GiB)": 47.44, + "step": 19905, + "token_acc": 0.834554529892652, + "train_speed(iter/s)": 0.095706 + }, + { + "epoch": 0.9258877111193138, + "grad_norm": 8.350555419921875, + "learning_rate": 1.493099373783713e-07, + "loss": 0.6046389102935791, + "memory(GiB)": 47.44, + "step": 19910, + "token_acc": 0.8460446967009578, + "train_speed(iter/s)": 0.09572 + }, + { + "epoch": 0.9261202293792634, + "grad_norm": 9.9799222946167, + "learning_rate": 1.483787768889422e-07, + "loss": 0.6118096828460693, + "memory(GiB)": 47.44, + "step": 19915, + "token_acc": 0.8490630323679728, + "train_speed(iter/s)": 0.095733 + }, + { + "epoch": 0.926352747639213, + "grad_norm": 7.102067947387695, + "learning_rate": 1.4745048531293217e-07, + "loss": 0.5712886810302734, + "memory(GiB)": 47.44, + "step": 19920, + "token_acc": 0.8495081967213115, + "train_speed(iter/s)": 0.095747 + }, + { + "epoch": 0.9265852658991627, + "grad_norm": 10.693116188049316, + "learning_rate": 1.465250631992704e-07, + "loss": 0.5446754455566406, + "memory(GiB)": 47.44, + "step": 19925, + "token_acc": 0.8637228778073849, + "train_speed(iter/s)": 0.09576 + }, + { + "epoch": 0.9268177841591122, + "grad_norm": 9.30135726928711, + "learning_rate": 1.4560251109518642e-07, + "loss": 0.566897201538086, + "memory(GiB)": 47.44, + "step": 19930, + "token_acc": 0.8544303797468354, + "train_speed(iter/s)": 0.095773 + }, + { + "epoch": 0.9270503024190618, + "grad_norm": 12.602972030639648, + "learning_rate": 1.4468282954621493e-07, + "loss": 0.7747231960296631, + "memory(GiB)": 47.44, + "step": 19935, + "token_acc": 0.8014571948998178, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.9272828206790115, + "grad_norm": 10.188447952270508, + "learning_rate": 1.4376601909619092e-07, + "loss": 0.6704387664794922, + "memory(GiB)": 47.44, + "step": 19940, + "token_acc": 0.8310099573257468, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.9275153389389611, + "grad_norm": 11.103850364685059, + "learning_rate": 1.4285208028725406e-07, + "loss": 0.6308998107910156, + "memory(GiB)": 47.44, + "step": 19945, + "token_acc": 0.8469015795868773, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.9277478571989106, + "grad_norm": 8.788395881652832, + "learning_rate": 1.419410136598426e-07, + "loss": 0.5965378284454346, + "memory(GiB)": 47.44, + "step": 19950, + "token_acc": 0.8495362418412916, + "train_speed(iter/s)": 0.095827 + }, + { + "epoch": 0.9277478571989106, + "eval_loss": 0.55594402551651, + "eval_runtime": 295.1143, + "eval_samples_per_second": 11.775, + "eval_steps_per_second": 11.775, + "step": 19950 + }, + { + "epoch": 0.9279803754588603, + "grad_norm": 9.410270690917969, + "learning_rate": 1.4103281975270055e-07, + "loss": 0.600709056854248, + "memory(GiB)": 47.44, + "step": 19955, + "token_acc": 0.8349722480669897, + "train_speed(iter/s)": 0.095705 + }, + { + "epoch": 0.9282128937188099, + "grad_norm": 8.508853912353516, + "learning_rate": 1.4012749910286948e-07, + "loss": 0.6112794399261474, + "memory(GiB)": 47.44, + "step": 19960, + "token_acc": 0.8422978412001464, + "train_speed(iter/s)": 0.095718 + }, + { + "epoch": 0.9284454119787594, + "grad_norm": 10.257251739501953, + "learning_rate": 1.3922505224569338e-07, + "loss": 0.6842128276824951, + "memory(GiB)": 47.44, + "step": 19965, + "token_acc": 0.8187641296156745, + "train_speed(iter/s)": 0.095732 + }, + { + "epoch": 0.928677930238709, + "grad_norm": 8.22706127166748, + "learning_rate": 1.3832547971481813e-07, + "loss": 0.715480899810791, + "memory(GiB)": 47.44, + "step": 19970, + "token_acc": 0.8302420622445772, + "train_speed(iter/s)": 0.095745 + }, + { + "epoch": 0.9289104484986587, + "grad_norm": 6.012380123138428, + "learning_rate": 1.3742878204218823e-07, + "loss": 0.6090039730072021, + "memory(GiB)": 47.44, + "step": 19975, + "token_acc": 0.832556471853711, + "train_speed(iter/s)": 0.095759 + }, + { + "epoch": 0.9291429667586083, + "grad_norm": 7.759617328643799, + "learning_rate": 1.3653495975804786e-07, + "loss": 0.6397760391235352, + "memory(GiB)": 47.44, + "step": 19980, + "token_acc": 0.8429378531073446, + "train_speed(iter/s)": 0.095772 + }, + { + "epoch": 0.9293754850185578, + "grad_norm": 7.183525562286377, + "learning_rate": 1.3564401339094312e-07, + "loss": 0.6927905559539795, + "memory(GiB)": 47.44, + "step": 19985, + "token_acc": 0.8360916613621897, + "train_speed(iter/s)": 0.095785 + }, + { + "epoch": 0.9296080032785075, + "grad_norm": 5.6819329261779785, + "learning_rate": 1.3475594346771703e-07, + "loss": 0.5243013381958008, + "memory(GiB)": 47.44, + "step": 19990, + "token_acc": 0.8552537526804861, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.9298405215384571, + "grad_norm": 12.953128814697266, + "learning_rate": 1.338707505135134e-07, + "loss": 0.6449570178985595, + "memory(GiB)": 47.44, + "step": 19995, + "token_acc": 0.8425643262102049, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.9300730397984067, + "grad_norm": 8.761154174804688, + "learning_rate": 1.329884350517735e-07, + "loss": 0.5618272304534913, + "memory(GiB)": 47.44, + "step": 20000, + "token_acc": 0.8585131894484412, + "train_speed(iter/s)": 0.095826 + }, + { + "epoch": 0.9300730397984067, + "eval_loss": 0.5558544397354126, + "eval_runtime": 294.6168, + "eval_samples_per_second": 11.795, + "eval_steps_per_second": 11.795, + "step": 20000 + }, + { + "epoch": 0.9303055580583562, + "grad_norm": 8.427507400512695, + "learning_rate": 1.3210899760423834e-07, + "loss": 0.6308176517486572, + "memory(GiB)": 47.44, + "step": 20005, + "token_acc": 0.8346826826826826, + "train_speed(iter/s)": 0.095704 + }, + { + "epoch": 0.9305380763183059, + "grad_norm": 8.652649879455566, + "learning_rate": 1.312324386909458e-07, + "loss": 0.6576950550079346, + "memory(GiB)": 47.44, + "step": 20010, + "token_acc": 0.8384502923976608, + "train_speed(iter/s)": 0.095717 + }, + { + "epoch": 0.9307705945782555, + "grad_norm": 7.4929304122924805, + "learning_rate": 1.3035875883023298e-07, + "loss": 0.6439097404479981, + "memory(GiB)": 47.44, + "step": 20015, + "token_acc": 0.8359495229301324, + "train_speed(iter/s)": 0.095731 + }, + { + "epoch": 0.931003112838205, + "grad_norm": 8.559335708618164, + "learning_rate": 1.2948795853873374e-07, + "loss": 0.5946200370788575, + "memory(GiB)": 47.44, + "step": 20020, + "token_acc": 0.8393613554903877, + "train_speed(iter/s)": 0.095744 + }, + { + "epoch": 0.9312356310981547, + "grad_norm": 7.127233982086182, + "learning_rate": 1.2862003833137848e-07, + "loss": 0.7438712596893311, + "memory(GiB)": 47.44, + "step": 20025, + "token_acc": 0.8122285332442366, + "train_speed(iter/s)": 0.095757 + }, + { + "epoch": 0.9314681493581043, + "grad_norm": 9.78085708618164, + "learning_rate": 1.2775499872139553e-07, + "loss": 0.7858536720275879, + "memory(GiB)": 47.44, + "step": 20030, + "token_acc": 0.7917398945518453, + "train_speed(iter/s)": 0.095771 + }, + { + "epoch": 0.9317006676180539, + "grad_norm": 10.972649574279785, + "learning_rate": 1.2689284022030956e-07, + "loss": 0.581892728805542, + "memory(GiB)": 47.44, + "step": 20035, + "token_acc": 0.8570875290472502, + "train_speed(iter/s)": 0.095785 + }, + { + "epoch": 0.9319331858780034, + "grad_norm": 7.946092128753662, + "learning_rate": 1.260335633379417e-07, + "loss": 0.6852340698242188, + "memory(GiB)": 47.44, + "step": 20040, + "token_acc": 0.8281821203057494, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.9321657041379531, + "grad_norm": 10.450540542602539, + "learning_rate": 1.2517716858240824e-07, + "loss": 0.6736576080322265, + "memory(GiB)": 47.44, + "step": 20045, + "token_acc": 0.8368560105680317, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.9323982223979027, + "grad_norm": 8.191557884216309, + "learning_rate": 1.2432365646012245e-07, + "loss": 0.6913845062255859, + "memory(GiB)": 47.44, + "step": 20050, + "token_acc": 0.8282527881040892, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.9323982223979027, + "eval_loss": 0.5560406446456909, + "eval_runtime": 293.3635, + "eval_samples_per_second": 11.845, + "eval_steps_per_second": 11.845, + "step": 20050 + }, + { + "epoch": 0.9326307406578523, + "grad_norm": 6.211060047149658, + "learning_rate": 1.2347302747579225e-07, + "loss": 0.7071500778198242, + "memory(GiB)": 47.44, + "step": 20055, + "token_acc": 0.8342621275508575, + "train_speed(iter/s)": 0.095703 + }, + { + "epoch": 0.9328632589178019, + "grad_norm": 8.84935474395752, + "learning_rate": 1.2262528213242142e-07, + "loss": 0.6677957057952881, + "memory(GiB)": 47.44, + "step": 20060, + "token_acc": 0.8243108601793424, + "train_speed(iter/s)": 0.095717 + }, + { + "epoch": 0.9330957771777515, + "grad_norm": 9.605239868164062, + "learning_rate": 1.217804209313067e-07, + "loss": 0.569607925415039, + "memory(GiB)": 47.44, + "step": 20065, + "token_acc": 0.8461019237259534, + "train_speed(iter/s)": 0.09573 + }, + { + "epoch": 0.9333282954377011, + "grad_norm": 8.329422950744629, + "learning_rate": 1.2093844437204182e-07, + "loss": 0.6272897720336914, + "memory(GiB)": 47.44, + "step": 20070, + "token_acc": 0.8355795148247979, + "train_speed(iter/s)": 0.095743 + }, + { + "epoch": 0.9335608136976506, + "grad_norm": 11.321281433105469, + "learning_rate": 1.200993529525124e-07, + "loss": 0.695442008972168, + "memory(GiB)": 47.44, + "step": 20075, + "token_acc": 0.8292591199699135, + "train_speed(iter/s)": 0.095757 + }, + { + "epoch": 0.9337933319576003, + "grad_norm": 9.605953216552734, + "learning_rate": 1.192631471689004e-07, + "loss": 0.5914440631866456, + "memory(GiB)": 47.44, + "step": 20080, + "token_acc": 0.8651951123374064, + "train_speed(iter/s)": 0.09577 + }, + { + "epoch": 0.9340258502175499, + "grad_norm": 8.124171257019043, + "learning_rate": 1.1842982751567866e-07, + "loss": 0.6247981548309326, + "memory(GiB)": 47.44, + "step": 20085, + "token_acc": 0.8412457273072541, + "train_speed(iter/s)": 0.095784 + }, + { + "epoch": 0.9342583684774995, + "grad_norm": 8.131589889526367, + "learning_rate": 1.1759939448561575e-07, + "loss": 0.5428094387054443, + "memory(GiB)": 47.44, + "step": 20090, + "token_acc": 0.8647786198643798, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.9344908867374491, + "grad_norm": 10.977629661560059, + "learning_rate": 1.167718485697722e-07, + "loss": 0.6608882427215577, + "memory(GiB)": 47.44, + "step": 20095, + "token_acc": 0.8396728016359918, + "train_speed(iter/s)": 0.095811 + }, + { + "epoch": 0.9347234049973987, + "grad_norm": 9.835230827331543, + "learning_rate": 1.1594719025750156e-07, + "loss": 0.6709197521209717, + "memory(GiB)": 47.44, + "step": 20100, + "token_acc": 0.831494184312556, + "train_speed(iter/s)": 0.095824 + }, + { + "epoch": 0.9347234049973987, + "eval_loss": 0.556010901927948, + "eval_runtime": 294.2096, + "eval_samples_per_second": 11.811, + "eval_steps_per_second": 11.811, + "step": 20100 + }, + { + "epoch": 0.9349559232573483, + "grad_norm": 8.349122047424316, + "learning_rate": 1.1512542003644933e-07, + "loss": 0.6077350616455078, + "memory(GiB)": 47.44, + "step": 20105, + "token_acc": 0.834884296794498, + "train_speed(iter/s)": 0.095703 + }, + { + "epoch": 0.9351884415172979, + "grad_norm": 9.29889965057373, + "learning_rate": 1.1430653839255402e-07, + "loss": 0.675541353225708, + "memory(GiB)": 47.44, + "step": 20110, + "token_acc": 0.8397224058162591, + "train_speed(iter/s)": 0.095716 + }, + { + "epoch": 0.9354209597772475, + "grad_norm": 8.753215789794922, + "learning_rate": 1.1349054581004548e-07, + "loss": 0.5333482265472412, + "memory(GiB)": 47.44, + "step": 20115, + "token_acc": 0.866890756302521, + "train_speed(iter/s)": 0.09573 + }, + { + "epoch": 0.9356534780371971, + "grad_norm": 7.72360372543335, + "learning_rate": 1.1267744277144554e-07, + "loss": 0.5998239994049073, + "memory(GiB)": 47.44, + "step": 20120, + "token_acc": 0.844138303619665, + "train_speed(iter/s)": 0.095743 + }, + { + "epoch": 0.9358859962971467, + "grad_norm": 12.028573989868164, + "learning_rate": 1.1186722975756626e-07, + "loss": 0.656134843826294, + "memory(GiB)": 47.44, + "step": 20125, + "token_acc": 0.8383878691141261, + "train_speed(iter/s)": 0.095756 + }, + { + "epoch": 0.9361185145570963, + "grad_norm": 8.484295845031738, + "learning_rate": 1.110599072475127e-07, + "loss": 0.5953815460205079, + "memory(GiB)": 47.44, + "step": 20130, + "token_acc": 0.8507862161257945, + "train_speed(iter/s)": 0.095769 + }, + { + "epoch": 0.9363510328170459, + "grad_norm": 9.174735069274902, + "learning_rate": 1.1025547571867856e-07, + "loss": 0.6517370700836181, + "memory(GiB)": 47.44, + "step": 20135, + "token_acc": 0.8399729912221472, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.9365835510769955, + "grad_norm": 8.110981941223145, + "learning_rate": 1.0945393564675055e-07, + "loss": 0.6983220100402832, + "memory(GiB)": 47.44, + "step": 20140, + "token_acc": 0.8345406023637056, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.9368160693369452, + "grad_norm": 7.590916633605957, + "learning_rate": 1.0865528750570286e-07, + "loss": 0.5777697563171387, + "memory(GiB)": 47.44, + "step": 20145, + "token_acc": 0.8619783108774236, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.9370485875968947, + "grad_norm": 8.5171480178833, + "learning_rate": 1.0785953176780107e-07, + "loss": 0.5863373756408692, + "memory(GiB)": 47.44, + "step": 20150, + "token_acc": 0.8566164154103852, + "train_speed(iter/s)": 0.095823 + }, + { + "epoch": 0.9370485875968947, + "eval_loss": 0.5557586550712585, + "eval_runtime": 294.9179, + "eval_samples_per_second": 11.783, + "eval_steps_per_second": 11.783, + "step": 20150 + }, + { + "epoch": 0.9372811058568443, + "grad_norm": 9.852570533752441, + "learning_rate": 1.0706666890359985e-07, + "loss": 0.5910586357116699, + "memory(GiB)": 47.44, + "step": 20155, + "token_acc": 0.835281461738544, + "train_speed(iter/s)": 0.095702 + }, + { + "epoch": 0.937513624116794, + "grad_norm": 7.415332794189453, + "learning_rate": 1.0627669938194418e-07, + "loss": 0.5437919616699218, + "memory(GiB)": 47.44, + "step": 20160, + "token_acc": 0.8614267676767676, + "train_speed(iter/s)": 0.095715 + }, + { + "epoch": 0.9377461423767435, + "grad_norm": 12.079816818237305, + "learning_rate": 1.0548962366996707e-07, + "loss": 0.766736364364624, + "memory(GiB)": 47.44, + "step": 20165, + "token_acc": 0.7975959674292361, + "train_speed(iter/s)": 0.095729 + }, + { + "epoch": 0.9379786606366931, + "grad_norm": 8.337579727172852, + "learning_rate": 1.047054422330901e-07, + "loss": 0.625864315032959, + "memory(GiB)": 47.44, + "step": 20170, + "token_acc": 0.837708066581306, + "train_speed(iter/s)": 0.095742 + }, + { + "epoch": 0.9382111788966427, + "grad_norm": 8.03079605102539, + "learning_rate": 1.0392415553502455e-07, + "loss": 0.5572741985321045, + "memory(GiB)": 47.44, + "step": 20175, + "token_acc": 0.8698746187732972, + "train_speed(iter/s)": 0.095755 + }, + { + "epoch": 0.9384436971565924, + "grad_norm": 7.4281182289123535, + "learning_rate": 1.0314576403776977e-07, + "loss": 0.5888198375701904, + "memory(GiB)": 47.44, + "step": 20180, + "token_acc": 0.8555512869765655, + "train_speed(iter/s)": 0.095768 + }, + { + "epoch": 0.9386762154165419, + "grad_norm": 8.945042610168457, + "learning_rate": 1.02370268201612e-07, + "loss": 0.6920148372650147, + "memory(GiB)": 47.44, + "step": 20185, + "token_acc": 0.828472755180353, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.9389087336764915, + "grad_norm": 7.534639358520508, + "learning_rate": 1.0159766848512608e-07, + "loss": 0.6225242137908935, + "memory(GiB)": 47.44, + "step": 20190, + "token_acc": 0.8326589595375723, + "train_speed(iter/s)": 0.095795 + }, + { + "epoch": 0.9391412519364412, + "grad_norm": 8.891571044921875, + "learning_rate": 1.0082796534517436e-07, + "loss": 0.5637755393981934, + "memory(GiB)": 47.44, + "step": 20195, + "token_acc": 0.8522033898305085, + "train_speed(iter/s)": 0.095808 + }, + { + "epoch": 0.9393737701963908, + "grad_norm": 9.549708366394043, + "learning_rate": 1.0006115923690551e-07, + "loss": 0.7433938026428223, + "memory(GiB)": 47.44, + "step": 20200, + "token_acc": 0.831905344757242, + "train_speed(iter/s)": 0.095821 + }, + { + "epoch": 0.9393737701963908, + "eval_loss": 0.5558546185493469, + "eval_runtime": 294.94, + "eval_samples_per_second": 11.782, + "eval_steps_per_second": 11.782, + "step": 20200 + }, + { + "epoch": 0.9396062884563403, + "grad_norm": 7.152247905731201, + "learning_rate": 9.929725061375627e-08, + "loss": 0.6077425479888916, + "memory(GiB)": 47.44, + "step": 20205, + "token_acc": 0.835157187425006, + "train_speed(iter/s)": 0.0957 + }, + { + "epoch": 0.9398388067162899, + "grad_norm": 10.471779823303223, + "learning_rate": 9.85362399274481e-08, + "loss": 0.5760631084442138, + "memory(GiB)": 47.44, + "step": 20210, + "token_acc": 0.868144690781797, + "train_speed(iter/s)": 0.095713 + }, + { + "epoch": 0.9400713249762396, + "grad_norm": 8.634541511535645, + "learning_rate": 9.777812762799211e-08, + "loss": 0.6034083843231202, + "memory(GiB)": 47.44, + "step": 20215, + "token_acc": 0.8574080950612699, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.9403038432361891, + "grad_norm": 7.538180828094482, + "learning_rate": 9.702291416368193e-08, + "loss": 0.6724761486053467, + "memory(GiB)": 47.44, + "step": 20220, + "token_acc": 0.8304682868998222, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.9405363614961387, + "grad_norm": 10.402557373046875, + "learning_rate": 9.627059998109978e-08, + "loss": 0.7383048057556152, + "memory(GiB)": 47.44, + "step": 20225, + "token_acc": 0.8202247191011236, + "train_speed(iter/s)": 0.095753 + }, + { + "epoch": 0.9407688797560884, + "grad_norm": 8.302450180053711, + "learning_rate": 9.552118552511147e-08, + "loss": 0.6783876895904541, + "memory(GiB)": 47.44, + "step": 20230, + "token_acc": 0.8281358281358281, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.941001398016038, + "grad_norm": 8.443572998046875, + "learning_rate": 9.477467123886919e-08, + "loss": 0.540089750289917, + "memory(GiB)": 47.44, + "step": 20235, + "token_acc": 0.8699566522174058, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.9412339162759875, + "grad_norm": 11.566424369812012, + "learning_rate": 9.403105756380926e-08, + "loss": 0.7472939968109131, + "memory(GiB)": 47.44, + "step": 20240, + "token_acc": 0.8125202724618877, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.9414664345359371, + "grad_norm": 10.177603721618652, + "learning_rate": 9.329034493965383e-08, + "loss": 0.7187521457672119, + "memory(GiB)": 47.44, + "step": 20245, + "token_acc": 0.8234134804887663, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.9416989527958868, + "grad_norm": 10.402929306030273, + "learning_rate": 9.255253380440921e-08, + "loss": 0.6262013912200928, + "memory(GiB)": 47.44, + "step": 20250, + "token_acc": 0.8423997513211067, + "train_speed(iter/s)": 0.095819 + }, + { + "epoch": 0.9416989527958868, + "eval_loss": 0.5554139018058777, + "eval_runtime": 294.8259, + "eval_samples_per_second": 11.787, + "eval_steps_per_second": 11.787, + "step": 20250 + }, + { + "epoch": 0.9419314710558363, + "grad_norm": 8.747036933898926, + "learning_rate": 9.181762459436694e-08, + "loss": 0.5820370674133301, + "memory(GiB)": 47.44, + "step": 20255, + "token_acc": 0.8354214893126188, + "train_speed(iter/s)": 0.095699 + }, + { + "epoch": 0.9421639893157859, + "grad_norm": 9.884243965148926, + "learning_rate": 9.108561774409941e-08, + "loss": 0.6614628314971924, + "memory(GiB)": 47.44, + "step": 20260, + "token_acc": 0.832986832986833, + "train_speed(iter/s)": 0.095712 + }, + { + "epoch": 0.9423965075757356, + "grad_norm": 7.948294162750244, + "learning_rate": 9.035651368646647e-08, + "loss": 0.6848249912261963, + "memory(GiB)": 47.44, + "step": 20265, + "token_acc": 0.8344437041972018, + "train_speed(iter/s)": 0.095725 + }, + { + "epoch": 0.9426290258356852, + "grad_norm": 6.9303693771362305, + "learning_rate": 8.963031285260937e-08, + "loss": 0.541897201538086, + "memory(GiB)": 47.44, + "step": 20270, + "token_acc": 0.8590909090909091, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.9428615440956347, + "grad_norm": 6.2741241455078125, + "learning_rate": 8.890701567195292e-08, + "loss": 0.7256217002868652, + "memory(GiB)": 47.44, + "step": 20275, + "token_acc": 0.8132061260356516, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.9430940623555844, + "grad_norm": 9.887829780578613, + "learning_rate": 8.818662257220556e-08, + "loss": 0.6676210880279541, + "memory(GiB)": 47.44, + "step": 20280, + "token_acc": 0.8412804268089363, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.943326580615534, + "grad_norm": 11.911397933959961, + "learning_rate": 8.746913397935708e-08, + "loss": 0.5624986171722413, + "memory(GiB)": 47.44, + "step": 20285, + "token_acc": 0.8604972375690608, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.9435590988754836, + "grad_norm": 13.02669906616211, + "learning_rate": 8.675455031768143e-08, + "loss": 0.6928697109222413, + "memory(GiB)": 47.44, + "step": 20290, + "token_acc": 0.8278443113772455, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.9437916171354331, + "grad_norm": 9.055578231811523, + "learning_rate": 8.604287200973394e-08, + "loss": 0.58877592086792, + "memory(GiB)": 47.44, + "step": 20295, + "token_acc": 0.8360365673842525, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.9440241353953828, + "grad_norm": 9.44039535522461, + "learning_rate": 8.533409947635185e-08, + "loss": 0.6449069023132324, + "memory(GiB)": 47.44, + "step": 20300, + "token_acc": 0.8507737656595431, + "train_speed(iter/s)": 0.095817 + }, + { + "epoch": 0.9440241353953828, + "eval_loss": 0.5554980635643005, + "eval_runtime": 295.2399, + "eval_samples_per_second": 11.77, + "eval_steps_per_second": 11.77, + "step": 20300 + }, + { + "epoch": 0.9442566536553324, + "grad_norm": 10.0753755569458, + "learning_rate": 8.46282331366538e-08, + "loss": 0.6532928943634033, + "memory(GiB)": 47.44, + "step": 20305, + "token_acc": 0.8349886082854667, + "train_speed(iter/s)": 0.095697 + }, + { + "epoch": 0.9444891719152819, + "grad_norm": 8.442564010620117, + "learning_rate": 8.392527340804146e-08, + "loss": 0.5748683452606201, + "memory(GiB)": 47.44, + "step": 20310, + "token_acc": 0.8664459161147903, + "train_speed(iter/s)": 0.09571 + }, + { + "epoch": 0.9447216901752316, + "grad_norm": 7.529089450836182, + "learning_rate": 8.32252207061951e-08, + "loss": 0.6530350685119629, + "memory(GiB)": 47.44, + "step": 20315, + "token_acc": 0.8451369216241738, + "train_speed(iter/s)": 0.095723 + }, + { + "epoch": 0.9449542084351812, + "grad_norm": 8.566681861877441, + "learning_rate": 8.252807544507913e-08, + "loss": 0.6495450496673584, + "memory(GiB)": 47.44, + "step": 20320, + "token_acc": 0.8388909704008992, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.9451867266951308, + "grad_norm": 6.657361030578613, + "learning_rate": 8.183383803693545e-08, + "loss": 0.586691427230835, + "memory(GiB)": 47.44, + "step": 20325, + "token_acc": 0.8495850622406639, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.9454192449550803, + "grad_norm": 7.880705833435059, + "learning_rate": 8.114250889228848e-08, + "loss": 0.5361662864685058, + "memory(GiB)": 47.44, + "step": 20330, + "token_acc": 0.8539553752535497, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.94565176321503, + "grad_norm": 7.656473636627197, + "learning_rate": 8.04540884199434e-08, + "loss": 0.6294970512390137, + "memory(GiB)": 47.44, + "step": 20335, + "token_acc": 0.8405292479108635, + "train_speed(iter/s)": 0.095776 + }, + { + "epoch": 0.9458842814749796, + "grad_norm": 6.87350606918335, + "learning_rate": 7.97685770269846e-08, + "loss": 0.6477347373962402, + "memory(GiB)": 47.44, + "step": 20340, + "token_acc": 0.8395100502512562, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.9461167997349292, + "grad_norm": 10.930252075195312, + "learning_rate": 7.908597511877447e-08, + "loss": 0.5632180213928223, + "memory(GiB)": 47.44, + "step": 20345, + "token_acc": 0.8588528678304239, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.9463493179948788, + "grad_norm": 7.769409656524658, + "learning_rate": 7.840628309895848e-08, + "loss": 0.6253656864166259, + "memory(GiB)": 47.44, + "step": 20350, + "token_acc": 0.8249168430601754, + "train_speed(iter/s)": 0.095816 + }, + { + "epoch": 0.9463493179948788, + "eval_loss": 0.5555700659751892, + "eval_runtime": 296.4564, + "eval_samples_per_second": 11.722, + "eval_steps_per_second": 11.722, + "step": 20350 + }, + { + "epoch": 0.9465818362548284, + "grad_norm": 10.457148551940918, + "learning_rate": 7.772950136945789e-08, + "loss": 0.74940505027771, + "memory(GiB)": 47.44, + "step": 20355, + "token_acc": 0.8343391286300429, + "train_speed(iter/s)": 0.095695 + }, + { + "epoch": 0.946814354514778, + "grad_norm": 8.147481918334961, + "learning_rate": 7.705563033047592e-08, + "loss": 0.6334987163543702, + "memory(GiB)": 47.44, + "step": 20360, + "token_acc": 0.8368891947694426, + "train_speed(iter/s)": 0.095708 + }, + { + "epoch": 0.9470468727747275, + "grad_norm": 10.947805404663086, + "learning_rate": 7.638467038049214e-08, + "loss": 0.5573566913604736, + "memory(GiB)": 47.44, + "step": 20365, + "token_acc": 0.8695842450765864, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.9472793910346772, + "grad_norm": 8.894417762756348, + "learning_rate": 7.571662191626694e-08, + "loss": 0.60349440574646, + "memory(GiB)": 47.44, + "step": 20370, + "token_acc": 0.8537735849056604, + "train_speed(iter/s)": 0.095734 + }, + { + "epoch": 0.9475119092946268, + "grad_norm": 9.178512573242188, + "learning_rate": 7.505148533283712e-08, + "loss": 0.5638972282409668, + "memory(GiB)": 47.44, + "step": 20375, + "token_acc": 0.853031465848043, + "train_speed(iter/s)": 0.095748 + }, + { + "epoch": 0.9477444275545764, + "grad_norm": 7.611563682556152, + "learning_rate": 7.438926102351973e-08, + "loss": 0.6906840324401855, + "memory(GiB)": 47.44, + "step": 20380, + "token_acc": 0.8227810650887574, + "train_speed(iter/s)": 0.095761 + }, + { + "epoch": 0.947976945814526, + "grad_norm": 9.940587997436523, + "learning_rate": 7.372994937990707e-08, + "loss": 0.6728063106536866, + "memory(GiB)": 47.44, + "step": 20385, + "token_acc": 0.8305246422893482, + "train_speed(iter/s)": 0.095774 + }, + { + "epoch": 0.9482094640744756, + "grad_norm": 8.792899131774902, + "learning_rate": 7.307355079187118e-08, + "loss": 0.7505878448486328, + "memory(GiB)": 47.44, + "step": 20390, + "token_acc": 0.82015065913371, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.9484419823344252, + "grad_norm": 7.857633113861084, + "learning_rate": 7.242006564756043e-08, + "loss": 0.7554164409637452, + "memory(GiB)": 47.44, + "step": 20395, + "token_acc": 0.8082595870206489, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.9486745005943747, + "grad_norm": 10.25812816619873, + "learning_rate": 7.17694943334013e-08, + "loss": 0.678457498550415, + "memory(GiB)": 47.44, + "step": 20400, + "token_acc": 0.8347305389221557, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.9486745005943747, + "eval_loss": 0.5555104613304138, + "eval_runtime": 292.2526, + "eval_samples_per_second": 11.89, + "eval_steps_per_second": 11.89, + "step": 20400 + }, + { + "epoch": 0.9489070188543244, + "grad_norm": 10.560473442077637, + "learning_rate": 7.11218372340966e-08, + "loss": 0.7394631385803223, + "memory(GiB)": 47.44, + "step": 20405, + "token_acc": 0.8341056755207338, + "train_speed(iter/s)": 0.095695 + }, + { + "epoch": 0.949139537114274, + "grad_norm": 11.776728630065918, + "learning_rate": 7.047709473262554e-08, + "loss": 0.5568135261535645, + "memory(GiB)": 47.44, + "step": 20410, + "token_acc": 0.8602316602316602, + "train_speed(iter/s)": 0.095709 + }, + { + "epoch": 0.9493720553742236, + "grad_norm": 10.002482414245605, + "learning_rate": 6.983526721024425e-08, + "loss": 0.7164381504058838, + "memory(GiB)": 47.44, + "step": 20415, + "token_acc": 0.8240400667779633, + "train_speed(iter/s)": 0.095722 + }, + { + "epoch": 0.9496045736341732, + "grad_norm": 11.498485565185547, + "learning_rate": 6.919635504648581e-08, + "loss": 0.6015612125396729, + "memory(GiB)": 47.44, + "step": 20420, + "token_acc": 0.8594515181194907, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.9498370918941228, + "grad_norm": 7.4594902992248535, + "learning_rate": 6.856035861915855e-08, + "loss": 0.6615743637084961, + "memory(GiB)": 47.44, + "step": 20425, + "token_acc": 0.8398541114058355, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.9500696101540724, + "grad_norm": 8.488970756530762, + "learning_rate": 6.792727830434608e-08, + "loss": 0.6310092449188233, + "memory(GiB)": 47.44, + "step": 20430, + "token_acc": 0.8501026694045175, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.9503021284140221, + "grad_norm": 6.498387336730957, + "learning_rate": 6.729711447640897e-08, + "loss": 0.6058262348175049, + "memory(GiB)": 47.44, + "step": 20435, + "token_acc": 0.8434754311747478, + "train_speed(iter/s)": 0.095774 + }, + { + "epoch": 0.9505346466739716, + "grad_norm": 10.151055335998535, + "learning_rate": 6.666986750798244e-08, + "loss": 0.7225804805755616, + "memory(GiB)": 47.44, + "step": 20440, + "token_acc": 0.8169456066945606, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.9507671649339212, + "grad_norm": 7.998047351837158, + "learning_rate": 6.604553776997702e-08, + "loss": 0.6553145408630371, + "memory(GiB)": 47.44, + "step": 20445, + "token_acc": 0.8338098641887062, + "train_speed(iter/s)": 0.095801 + }, + { + "epoch": 0.9509996831938708, + "grad_norm": 9.505949974060059, + "learning_rate": 6.542412563157796e-08, + "loss": 0.5239490509033203, + "memory(GiB)": 47.44, + "step": 20450, + "token_acc": 0.8669275929549902, + "train_speed(iter/s)": 0.095814 + }, + { + "epoch": 0.9509996831938708, + "eval_loss": 0.5556869506835938, + "eval_runtime": 296.071, + "eval_samples_per_second": 11.737, + "eval_steps_per_second": 11.737, + "step": 20450 + }, + { + "epoch": 0.9512322014538204, + "grad_norm": 8.004267692565918, + "learning_rate": 6.480563146024521e-08, + "loss": 0.7079615116119384, + "memory(GiB)": 47.44, + "step": 20455, + "token_acc": 0.834247492705327, + "train_speed(iter/s)": 0.095695 + }, + { + "epoch": 0.95146471971377, + "grad_norm": 8.829005241394043, + "learning_rate": 6.419005562171399e-08, + "loss": 0.6616856575012207, + "memory(GiB)": 47.44, + "step": 20460, + "token_acc": 0.8278097982708934, + "train_speed(iter/s)": 0.095708 + }, + { + "epoch": 0.9516972379737196, + "grad_norm": 10.461058616638184, + "learning_rate": 6.357739847999367e-08, + "loss": 0.571187686920166, + "memory(GiB)": 47.44, + "step": 20465, + "token_acc": 0.8574660633484162, + "train_speed(iter/s)": 0.095721 + }, + { + "epoch": 0.9519297562336693, + "grad_norm": 6.821761131286621, + "learning_rate": 6.296766039736613e-08, + "loss": 0.5766401767730713, + "memory(GiB)": 47.44, + "step": 20470, + "token_acc": 0.8541473943879124, + "train_speed(iter/s)": 0.095734 + }, + { + "epoch": 0.9521622744936188, + "grad_norm": 5.286739349365234, + "learning_rate": 6.236084173438961e-08, + "loss": 0.6574409961700439, + "memory(GiB)": 47.44, + "step": 20475, + "token_acc": 0.8148148148148148, + "train_speed(iter/s)": 0.095747 + }, + { + "epoch": 0.9523947927535684, + "grad_norm": 8.371713638305664, + "learning_rate": 6.175694284989375e-08, + "loss": 0.5862714767456054, + "memory(GiB)": 47.44, + "step": 20480, + "token_acc": 0.8502436863092601, + "train_speed(iter/s)": 0.09576 + }, + { + "epoch": 0.952627311013518, + "grad_norm": 7.867913722991943, + "learning_rate": 6.11559641009829e-08, + "loss": 0.760695219039917, + "memory(GiB)": 47.44, + "step": 20485, + "token_acc": 0.8002534854245881, + "train_speed(iter/s)": 0.095773 + }, + { + "epoch": 0.9528598292734677, + "grad_norm": 8.1065034866333, + "learning_rate": 6.055790584303445e-08, + "loss": 0.6531691074371337, + "memory(GiB)": 47.44, + "step": 20490, + "token_acc": 0.8368972746331237, + "train_speed(iter/s)": 0.095786 + }, + { + "epoch": 0.9530923475334172, + "grad_norm": 12.392067909240723, + "learning_rate": 5.996276842969828e-08, + "loss": 0.6888431549072266, + "memory(GiB)": 47.44, + "step": 20495, + "token_acc": 0.8311335403726708, + "train_speed(iter/s)": 0.095799 + }, + { + "epoch": 0.9533248657933668, + "grad_norm": 7.902614593505859, + "learning_rate": 5.937055221289845e-08, + "loss": 0.5516024589538574, + "memory(GiB)": 47.44, + "step": 20500, + "token_acc": 0.8628333910633876, + "train_speed(iter/s)": 0.095812 + }, + { + "epoch": 0.9533248657933668, + "eval_loss": 0.5554400086402893, + "eval_runtime": 293.2617, + "eval_samples_per_second": 11.849, + "eval_steps_per_second": 11.849, + "step": 20500 + }, + { + "epoch": 0.9535573840533165, + "grad_norm": 8.178373336791992, + "learning_rate": 5.878125754283037e-08, + "loss": 0.5706425666809082, + "memory(GiB)": 47.44, + "step": 20505, + "token_acc": 0.8351757426336981, + "train_speed(iter/s)": 0.095693 + }, + { + "epoch": 0.953789902313266, + "grad_norm": 6.950657367706299, + "learning_rate": 5.8194884767961424e-08, + "loss": 0.7144189834594726, + "memory(GiB)": 47.44, + "step": 20510, + "token_acc": 0.8302277432712215, + "train_speed(iter/s)": 0.095706 + }, + { + "epoch": 0.9540224205732156, + "grad_norm": 7.927690505981445, + "learning_rate": 5.761143423503257e-08, + "loss": 0.663585901260376, + "memory(GiB)": 47.44, + "step": 20515, + "token_acc": 0.8332219251336899, + "train_speed(iter/s)": 0.095719 + }, + { + "epoch": 0.9542549388331653, + "grad_norm": 14.15100383758545, + "learning_rate": 5.703090628905617e-08, + "loss": 0.679119062423706, + "memory(GiB)": 47.44, + "step": 20520, + "token_acc": 0.8324175824175825, + "train_speed(iter/s)": 0.095732 + }, + { + "epoch": 0.9544874570931149, + "grad_norm": 8.4072847366333, + "learning_rate": 5.645330127331594e-08, + "loss": 0.5841912269592285, + "memory(GiB)": 47.44, + "step": 20525, + "token_acc": 0.8558432470258922, + "train_speed(iter/s)": 0.095746 + }, + { + "epoch": 0.9547199753530644, + "grad_norm": 8.089521408081055, + "learning_rate": 5.587861952936813e-08, + "loss": 0.6344331741333008, + "memory(GiB)": 47.44, + "step": 20530, + "token_acc": 0.8492678725236865, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.954952493613014, + "grad_norm": 9.23490047454834, + "learning_rate": 5.5306861397038666e-08, + "loss": 0.6122418403625488, + "memory(GiB)": 47.44, + "step": 20535, + "token_acc": 0.8401052323881906, + "train_speed(iter/s)": 0.095771 + }, + { + "epoch": 0.9551850118729637, + "grad_norm": 11.101957321166992, + "learning_rate": 5.4738027214427114e-08, + "loss": 0.5774817943572998, + "memory(GiB)": 47.44, + "step": 20540, + "token_acc": 0.8593969144460029, + "train_speed(iter/s)": 0.095784 + }, + { + "epoch": 0.9554175301329132, + "grad_norm": 8.920694351196289, + "learning_rate": 5.417211731790217e-08, + "loss": 0.6202130794525147, + "memory(GiB)": 47.44, + "step": 20545, + "token_acc": 0.8487694300518135, + "train_speed(iter/s)": 0.095797 + }, + { + "epoch": 0.9556500483928628, + "grad_norm": 7.193459510803223, + "learning_rate": 5.360913204210394e-08, + "loss": 0.6225608825683594, + "memory(GiB)": 47.44, + "step": 20550, + "token_acc": 0.8276497695852535, + "train_speed(iter/s)": 0.09581 + }, + { + "epoch": 0.9556500483928628, + "eval_loss": 0.5555076599121094, + "eval_runtime": 292.2433, + "eval_samples_per_second": 11.891, + "eval_steps_per_second": 11.891, + "step": 20550 + }, + { + "epoch": 0.9558825666528125, + "grad_norm": 10.845667839050293, + "learning_rate": 5.304907171994278e-08, + "loss": 0.64171142578125, + "memory(GiB)": 47.44, + "step": 20555, + "token_acc": 0.834728835767473, + "train_speed(iter/s)": 0.095693 + }, + { + "epoch": 0.9561150849127621, + "grad_norm": 7.467350482940674, + "learning_rate": 5.249193668259989e-08, + "loss": 0.7335173130035401, + "memory(GiB)": 47.44, + "step": 20560, + "token_acc": 0.8039755351681958, + "train_speed(iter/s)": 0.095705 + }, + { + "epoch": 0.9563476031727116, + "grad_norm": 7.151541233062744, + "learning_rate": 5.1937727259525615e-08, + "loss": 0.5967055320739746, + "memory(GiB)": 47.44, + "step": 20565, + "token_acc": 0.8547701815372731, + "train_speed(iter/s)": 0.095718 + }, + { + "epoch": 0.9565801214326612, + "grad_norm": 7.254678249359131, + "learning_rate": 5.1386443778442264e-08, + "loss": 0.6981623649597168, + "memory(GiB)": 47.44, + "step": 20570, + "token_acc": 0.80312415836251, + "train_speed(iter/s)": 0.095731 + }, + { + "epoch": 0.9568126396926109, + "grad_norm": 7.200235366821289, + "learning_rate": 5.083808656534017e-08, + "loss": 0.7622573852539063, + "memory(GiB)": 47.44, + "step": 20575, + "token_acc": 0.8102981029810298, + "train_speed(iter/s)": 0.095744 + }, + { + "epoch": 0.9570451579525605, + "grad_norm": 11.583197593688965, + "learning_rate": 5.0292655944479963e-08, + "loss": 0.7474843978881835, + "memory(GiB)": 47.44, + "step": 20580, + "token_acc": 0.8325881768504719, + "train_speed(iter/s)": 0.095757 + }, + { + "epoch": 0.95727767621251, + "grad_norm": 10.911505699157715, + "learning_rate": 4.975015223839197e-08, + "loss": 0.6490874767303467, + "memory(GiB)": 47.44, + "step": 20585, + "token_acc": 0.8389866895663375, + "train_speed(iter/s)": 0.09577 + }, + { + "epoch": 0.9575101944724597, + "grad_norm": 9.79134750366211, + "learning_rate": 4.921057576787458e-08, + "loss": 0.6671240329742432, + "memory(GiB)": 47.44, + "step": 20590, + "token_acc": 0.8255982596084119, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.9577427127324093, + "grad_norm": 7.199132442474365, + "learning_rate": 4.8673926851996454e-08, + "loss": 0.7733827590942383, + "memory(GiB)": 47.44, + "step": 20595, + "token_acc": 0.8150917743031951, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.9579752309923588, + "grad_norm": 8.266303062438965, + "learning_rate": 4.8140205808094865e-08, + "loss": 0.6054050922393799, + "memory(GiB)": 47.44, + "step": 20600, + "token_acc": 0.8480492813141683, + "train_speed(iter/s)": 0.095809 + }, + { + "epoch": 0.9579752309923588, + "eval_loss": 0.5556042194366455, + "eval_runtime": 294.0624, + "eval_samples_per_second": 11.817, + "eval_steps_per_second": 11.817, + "step": 20600 + }, + { + "epoch": 0.9582077492523084, + "grad_norm": 7.836030960083008, + "learning_rate": 4.760941295177568e-08, + "loss": 0.6465342044830322, + "memory(GiB)": 47.44, + "step": 20605, + "token_acc": 0.8347251800626929, + "train_speed(iter/s)": 0.095691 + }, + { + "epoch": 0.9584402675122581, + "grad_norm": 7.720570087432861, + "learning_rate": 4.7081548596912276e-08, + "loss": 0.6976634502410889, + "memory(GiB)": 47.44, + "step": 20610, + "token_acc": 0.8268502581755593, + "train_speed(iter/s)": 0.095704 + }, + { + "epoch": 0.9586727857722077, + "grad_norm": 10.879193305969238, + "learning_rate": 4.655661305564774e-08, + "loss": 0.6851535320281983, + "memory(GiB)": 47.44, + "step": 20615, + "token_acc": 0.8421271292064811, + "train_speed(iter/s)": 0.095717 + }, + { + "epoch": 0.9589053040321572, + "grad_norm": 8.683979988098145, + "learning_rate": 4.6034606638392654e-08, + "loss": 0.697068738937378, + "memory(GiB)": 47.44, + "step": 20620, + "token_acc": 0.8208109719737626, + "train_speed(iter/s)": 0.09573 + }, + { + "epoch": 0.9591378222921069, + "grad_norm": 8.056221961975098, + "learning_rate": 4.551552965382511e-08, + "loss": 0.6238224029541015, + "memory(GiB)": 47.44, + "step": 20625, + "token_acc": 0.8367801463569837, + "train_speed(iter/s)": 0.095743 + }, + { + "epoch": 0.9593703405520565, + "grad_norm": 11.859223365783691, + "learning_rate": 4.4999382408892345e-08, + "loss": 0.5613128185272217, + "memory(GiB)": 47.44, + "step": 20630, + "token_acc": 0.8601036269430051, + "train_speed(iter/s)": 0.095756 + }, + { + "epoch": 0.9596028588120061, + "grad_norm": 7.768198490142822, + "learning_rate": 4.4486165208806885e-08, + "loss": 0.6417368412017822, + "memory(GiB)": 47.44, + "step": 20635, + "token_acc": 0.8403361344537815, + "train_speed(iter/s)": 0.095768 + }, + { + "epoch": 0.9598353770719557, + "grad_norm": 9.254332542419434, + "learning_rate": 4.397587835705097e-08, + "loss": 0.6034894466400147, + "memory(GiB)": 47.44, + "step": 20640, + "token_acc": 0.8504885993485342, + "train_speed(iter/s)": 0.095781 + }, + { + "epoch": 0.9600678953319053, + "grad_norm": 6.1051926612854, + "learning_rate": 4.346852215537267e-08, + "loss": 0.6086849212646485, + "memory(GiB)": 47.44, + "step": 20645, + "token_acc": 0.8502259522272434, + "train_speed(iter/s)": 0.095794 + }, + { + "epoch": 0.9603004135918549, + "grad_norm": 8.756834983825684, + "learning_rate": 4.296409690378644e-08, + "loss": 0.5037965297698974, + "memory(GiB)": 47.44, + "step": 20650, + "token_acc": 0.8769622401357658, + "train_speed(iter/s)": 0.095806 + }, + { + "epoch": 0.9603004135918549, + "eval_loss": 0.5555453896522522, + "eval_runtime": 292.1663, + "eval_samples_per_second": 11.894, + "eval_steps_per_second": 11.894, + "step": 20650 + }, + { + "epoch": 0.9605329318518044, + "grad_norm": 8.111796379089355, + "learning_rate": 4.246260290057591e-08, + "loss": 0.8531887054443359, + "memory(GiB)": 47.44, + "step": 20655, + "token_acc": 0.8338415120845439, + "train_speed(iter/s)": 0.095689 + }, + { + "epoch": 0.9607654501117541, + "grad_norm": 10.552480697631836, + "learning_rate": 4.196404044228941e-08, + "loss": 0.6422392368316651, + "memory(GiB)": 47.44, + "step": 20660, + "token_acc": 0.83687374749499, + "train_speed(iter/s)": 0.095702 + }, + { + "epoch": 0.9609979683717037, + "grad_norm": 8.833476066589355, + "learning_rate": 4.146840982374223e-08, + "loss": 0.6003233909606933, + "memory(GiB)": 47.44, + "step": 20665, + "token_acc": 0.8502259522272434, + "train_speed(iter/s)": 0.095715 + }, + { + "epoch": 0.9612304866316533, + "grad_norm": 9.322022438049316, + "learning_rate": 4.097571133801548e-08, + "loss": 0.541776466369629, + "memory(GiB)": 47.44, + "step": 20670, + "token_acc": 0.8672470076169749, + "train_speed(iter/s)": 0.095728 + }, + { + "epoch": 0.9614630048916029, + "grad_norm": 8.25760269165039, + "learning_rate": 4.048594527645833e-08, + "loss": 0.6594733238220215, + "memory(GiB)": 47.44, + "step": 20675, + "token_acc": 0.8320663441603318, + "train_speed(iter/s)": 0.095741 + }, + { + "epoch": 0.9616955231515525, + "grad_norm": 9.2846040725708, + "learning_rate": 3.9999111928683554e-08, + "loss": 0.5512380123138427, + "memory(GiB)": 47.44, + "step": 20680, + "token_acc": 0.8586731167933656, + "train_speed(iter/s)": 0.095754 + }, + { + "epoch": 0.9619280414115021, + "grad_norm": 8.768184661865234, + "learning_rate": 3.951521158257143e-08, + "loss": 0.6587899208068848, + "memory(GiB)": 47.44, + "step": 20685, + "token_acc": 0.83515731874145, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.9621605596714518, + "grad_norm": 6.437694072723389, + "learning_rate": 3.9034244524266406e-08, + "loss": 0.6556053638458252, + "memory(GiB)": 47.44, + "step": 20690, + "token_acc": 0.8247656915648963, + "train_speed(iter/s)": 0.095779 + }, + { + "epoch": 0.9623930779314013, + "grad_norm": 9.403351783752441, + "learning_rate": 3.8556211038179304e-08, + "loss": 0.682905912399292, + "memory(GiB)": 47.44, + "step": 20695, + "token_acc": 0.8239623196938475, + "train_speed(iter/s)": 0.095792 + }, + { + "epoch": 0.9626255961913509, + "grad_norm": 8.33271312713623, + "learning_rate": 3.808111140698623e-08, + "loss": 0.5829206466674804, + "memory(GiB)": 47.44, + "step": 20700, + "token_acc": 0.8585293019783501, + "train_speed(iter/s)": 0.095805 + }, + { + "epoch": 0.9626255961913509, + "eval_loss": 0.5555470585823059, + "eval_runtime": 294.213, + "eval_samples_per_second": 11.811, + "eval_steps_per_second": 11.811, + "step": 20700 + }, + { + "epoch": 0.9628581144513005, + "grad_norm": 8.44229793548584, + "learning_rate": 3.760894591162911e-08, + "loss": 0.7375020503997802, + "memory(GiB)": 47.44, + "step": 20705, + "token_acc": 0.834630381412365, + "train_speed(iter/s)": 0.095687 + }, + { + "epoch": 0.9630906327112501, + "grad_norm": 8.542226791381836, + "learning_rate": 3.71397148313124e-08, + "loss": 0.7269775867462158, + "memory(GiB)": 47.44, + "step": 20710, + "token_acc": 0.8133986928104575, + "train_speed(iter/s)": 0.0957 + }, + { + "epoch": 0.9633231509711997, + "grad_norm": 8.211197853088379, + "learning_rate": 3.667341844350803e-08, + "loss": 0.5958492755889893, + "memory(GiB)": 47.44, + "step": 20715, + "token_acc": 0.8416696653472472, + "train_speed(iter/s)": 0.095713 + }, + { + "epoch": 0.9635556692311493, + "grad_norm": 9.82761287689209, + "learning_rate": 3.621005702395153e-08, + "loss": 0.6712447166442871, + "memory(GiB)": 47.44, + "step": 20720, + "token_acc": 0.8185196655311242, + "train_speed(iter/s)": 0.095726 + }, + { + "epoch": 0.963788187491099, + "grad_norm": 11.418320655822754, + "learning_rate": 3.574963084664207e-08, + "loss": 0.6232163429260253, + "memory(GiB)": 47.44, + "step": 20725, + "token_acc": 0.8422480620155038, + "train_speed(iter/s)": 0.095739 + }, + { + "epoch": 0.9640207057510485, + "grad_norm": 8.077664375305176, + "learning_rate": 3.529214018384408e-08, + "loss": 0.7407678127288818, + "memory(GiB)": 47.44, + "step": 20730, + "token_acc": 0.8097859327217125, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9642532240109981, + "grad_norm": 7.364772319793701, + "learning_rate": 3.483758530608616e-08, + "loss": 0.5969498634338379, + "memory(GiB)": 47.44, + "step": 20735, + "token_acc": 0.84688995215311, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.9644857422709477, + "grad_norm": 8.380683898925781, + "learning_rate": 3.4385966482160525e-08, + "loss": 0.6390859603881835, + "memory(GiB)": 47.44, + "step": 20740, + "token_acc": 0.8565385971096229, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.9647182605308973, + "grad_norm": 8.919743537902832, + "learning_rate": 3.393728397912355e-08, + "loss": 0.6278938770294189, + "memory(GiB)": 47.44, + "step": 20745, + "token_acc": 0.8445541064504759, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.9649507787908469, + "grad_norm": 8.967925071716309, + "learning_rate": 3.34915380622941e-08, + "loss": 0.7339792728424073, + "memory(GiB)": 47.44, + "step": 20750, + "token_acc": 0.8144448713023434, + "train_speed(iter/s)": 0.095804 + }, + { + "epoch": 0.9649507787908469, + "eval_loss": 0.5556846261024475, + "eval_runtime": 295.7774, + "eval_samples_per_second": 11.749, + "eval_steps_per_second": 11.749, + "step": 20750 + }, + { + "epoch": 0.9651832970507965, + "grad_norm": 8.481380462646484, + "learning_rate": 3.304872899525691e-08, + "loss": 0.6525393962860108, + "memory(GiB)": 47.44, + "step": 20755, + "token_acc": 0.8348432948106412, + "train_speed(iter/s)": 0.095686 + }, + { + "epoch": 0.9654158153107462, + "grad_norm": 13.112621307373047, + "learning_rate": 3.260885703985806e-08, + "loss": 0.6459598541259766, + "memory(GiB)": 47.44, + "step": 20760, + "token_acc": 0.8370715192551841, + "train_speed(iter/s)": 0.095699 + }, + { + "epoch": 0.9656483335706957, + "grad_norm": 9.978148460388184, + "learning_rate": 3.217192245620726e-08, + "loss": 0.63280029296875, + "memory(GiB)": 47.44, + "step": 20765, + "token_acc": 0.8340011448196909, + "train_speed(iter/s)": 0.095712 + }, + { + "epoch": 0.9658808518306453, + "grad_norm": 8.677961349487305, + "learning_rate": 3.173792550267729e-08, + "loss": 0.749193525314331, + "memory(GiB)": 47.44, + "step": 20770, + "token_acc": 0.790771484375, + "train_speed(iter/s)": 0.095725 + }, + { + "epoch": 0.966113370090595, + "grad_norm": 8.47712230682373, + "learning_rate": 3.1306866435903974e-08, + "loss": 0.5897305488586426, + "memory(GiB)": 47.44, + "step": 20775, + "token_acc": 0.843000773395205, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.9663458883505446, + "grad_norm": 8.103322982788086, + "learning_rate": 3.087874551078673e-08, + "loss": 0.6301196575164795, + "memory(GiB)": 47.44, + "step": 20780, + "token_acc": 0.8404820985466147, + "train_speed(iter/s)": 0.095751 + }, + { + "epoch": 0.9665784066104941, + "grad_norm": 10.327425956726074, + "learning_rate": 3.045356298048529e-08, + "loss": 0.5524001121520996, + "memory(GiB)": 47.44, + "step": 20785, + "token_acc": 0.8594551914725622, + "train_speed(iter/s)": 0.095764 + }, + { + "epoch": 0.9668109248704437, + "grad_norm": 9.891942024230957, + "learning_rate": 3.003131909642409e-08, + "loss": 0.6475227355957032, + "memory(GiB)": 47.44, + "step": 20790, + "token_acc": 0.8433402346445825, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.9670434431303934, + "grad_norm": 7.341713905334473, + "learning_rate": 2.9612014108288955e-08, + "loss": 0.5846897602081299, + "memory(GiB)": 47.44, + "step": 20795, + "token_acc": 0.8507265521796565, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.9672759613903429, + "grad_norm": 8.49909782409668, + "learning_rate": 2.9195648264027653e-08, + "loss": 0.7312897682189942, + "memory(GiB)": 47.44, + "step": 20800, + "token_acc": 0.8227060653188181, + "train_speed(iter/s)": 0.095803 + }, + { + "epoch": 0.9672759613903429, + "eval_loss": 0.5553971529006958, + "eval_runtime": 295.0521, + "eval_samples_per_second": 11.778, + "eval_steps_per_second": 11.778, + "step": 20800 + }, + { + "epoch": 0.9675084796502925, + "grad_norm": 7.44534158706665, + "learning_rate": 2.8782221809850464e-08, + "loss": 0.6029882907867432, + "memory(GiB)": 47.44, + "step": 20805, + "token_acc": 0.8352805884091072, + "train_speed(iter/s)": 0.095686 + }, + { + "epoch": 0.9677409979102422, + "grad_norm": 8.211577415466309, + "learning_rate": 2.837173499022905e-08, + "loss": 0.5521889209747315, + "memory(GiB)": 47.44, + "step": 20810, + "token_acc": 0.8668499607227023, + "train_speed(iter/s)": 0.095699 + }, + { + "epoch": 0.9679735161701918, + "grad_norm": 9.065325736999512, + "learning_rate": 2.7964188047895913e-08, + "loss": 0.6344101428985596, + "memory(GiB)": 47.44, + "step": 20815, + "token_acc": 0.8490566037735849, + "train_speed(iter/s)": 0.095712 + }, + { + "epoch": 0.9682060344301413, + "grad_norm": 8.109495162963867, + "learning_rate": 2.755958122384772e-08, + "loss": 0.7012411117553711, + "memory(GiB)": 47.44, + "step": 20820, + "token_acc": 0.8378076062639821, + "train_speed(iter/s)": 0.095725 + }, + { + "epoch": 0.9684385526900909, + "grad_norm": 7.52706241607666, + "learning_rate": 2.715791475734031e-08, + "loss": 0.6769590854644776, + "memory(GiB)": 47.44, + "step": 20825, + "token_acc": 0.8325183374083129, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.9686710709500406, + "grad_norm": 7.486845970153809, + "learning_rate": 2.6759188885891462e-08, + "loss": 0.7046610832214355, + "memory(GiB)": 47.44, + "step": 20830, + "token_acc": 0.8220979020979021, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.9689035892099902, + "grad_norm": 10.00309944152832, + "learning_rate": 2.6363403845280355e-08, + "loss": 0.686539888381958, + "memory(GiB)": 47.44, + "step": 20835, + "token_acc": 0.809255784865541, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.9691361074699397, + "grad_norm": 9.596781730651855, + "learning_rate": 2.597055986954644e-08, + "loss": 0.7272031784057618, + "memory(GiB)": 47.44, + "step": 20840, + "token_acc": 0.8149327671620665, + "train_speed(iter/s)": 0.095775 + }, + { + "epoch": 0.9693686257298894, + "grad_norm": 5.8002142906188965, + "learning_rate": 2.5580657190991122e-08, + "loss": 0.650816535949707, + "memory(GiB)": 47.44, + "step": 20845, + "token_acc": 0.840092317837125, + "train_speed(iter/s)": 0.095789 + }, + { + "epoch": 0.969601143989839, + "grad_norm": 9.024754524230957, + "learning_rate": 2.5193696040174964e-08, + "loss": 0.6397728443145752, + "memory(GiB)": 47.44, + "step": 20850, + "token_acc": 0.8487957181088314, + "train_speed(iter/s)": 0.095802 + }, + { + "epoch": 0.969601143989839, + "eval_loss": 0.5555434823036194, + "eval_runtime": 294.1615, + "eval_samples_per_second": 11.813, + "eval_steps_per_second": 11.813, + "step": 20850 + }, + { + "epoch": 0.9698336622497885, + "grad_norm": 7.542553424835205, + "learning_rate": 2.4809676645921042e-08, + "loss": 0.6364368915557861, + "memory(GiB)": 47.44, + "step": 20855, + "token_acc": 0.8351431391905232, + "train_speed(iter/s)": 0.095685 + }, + { + "epoch": 0.9700661805097381, + "grad_norm": 9.540937423706055, + "learning_rate": 2.4428599235311045e-08, + "loss": 0.6640225410461426, + "memory(GiB)": 47.44, + "step": 20860, + "token_acc": 0.8276329156789706, + "train_speed(iter/s)": 0.095698 + }, + { + "epoch": 0.9702986987696878, + "grad_norm": 5.910898685455322, + "learning_rate": 2.4050464033688048e-08, + "loss": 0.6791872978210449, + "memory(GiB)": 47.44, + "step": 20865, + "token_acc": 0.8174807197943444, + "train_speed(iter/s)": 0.095711 + }, + { + "epoch": 0.9705312170296374, + "grad_norm": 5.9184370040893555, + "learning_rate": 2.3675271264655407e-08, + "loss": 0.6619283199310303, + "memory(GiB)": 47.44, + "step": 20870, + "token_acc": 0.8163421153111875, + "train_speed(iter/s)": 0.095724 + }, + { + "epoch": 0.9707637352895869, + "grad_norm": 9.528105735778809, + "learning_rate": 2.330302115007621e-08, + "loss": 0.5713191986083984, + "memory(GiB)": 47.44, + "step": 20875, + "token_acc": 0.8521199586349535, + "train_speed(iter/s)": 0.095736 + }, + { + "epoch": 0.9709962535495366, + "grad_norm": 11.298874855041504, + "learning_rate": 2.2933713910073262e-08, + "loss": 0.5619585990905762, + "memory(GiB)": 47.44, + "step": 20880, + "token_acc": 0.8574626865671642, + "train_speed(iter/s)": 0.095749 + }, + { + "epoch": 0.9712287718094862, + "grad_norm": 10.252700805664062, + "learning_rate": 2.25673497630291e-08, + "loss": 0.54613356590271, + "memory(GiB)": 47.44, + "step": 20885, + "token_acc": 0.8730092204526404, + "train_speed(iter/s)": 0.095762 + }, + { + "epoch": 0.9714612900694357, + "grad_norm": 5.918463230133057, + "learning_rate": 2.2203928925585984e-08, + "loss": 0.5828773498535156, + "memory(GiB)": 47.44, + "step": 20890, + "token_acc": 0.846976401179941, + "train_speed(iter/s)": 0.095775 + }, + { + "epoch": 0.9716938083293853, + "grad_norm": 10.07883358001709, + "learning_rate": 2.1843451612646448e-08, + "loss": 0.6059688091278076, + "memory(GiB)": 47.44, + "step": 20895, + "token_acc": 0.8476046774801962, + "train_speed(iter/s)": 0.095787 + }, + { + "epoch": 0.971926326589335, + "grad_norm": 9.270331382751465, + "learning_rate": 2.148591803737221e-08, + "loss": 0.541387939453125, + "memory(GiB)": 47.44, + "step": 20900, + "token_acc": 0.8590014064697609, + "train_speed(iter/s)": 0.0958 + }, + { + "epoch": 0.971926326589335, + "eval_loss": 0.5555252432823181, + "eval_runtime": 295.9192, + "eval_samples_per_second": 11.743, + "eval_steps_per_second": 11.743, + "step": 20900 + }, + { + "epoch": 0.9721588448492846, + "grad_norm": 8.093807220458984, + "learning_rate": 2.1131328411182484e-08, + "loss": 0.6522313594818115, + "memory(GiB)": 47.44, + "step": 20905, + "token_acc": 0.8349668238868015, + "train_speed(iter/s)": 0.095683 + }, + { + "epoch": 0.9723913631092341, + "grad_norm": 9.61971664428711, + "learning_rate": 2.0779682943758428e-08, + "loss": 0.5763451099395752, + "memory(GiB)": 47.44, + "step": 20910, + "token_acc": 0.8618947368421053, + "train_speed(iter/s)": 0.095696 + }, + { + "epoch": 0.9726238813691838, + "grad_norm": 9.078413963317871, + "learning_rate": 2.043098184303871e-08, + "loss": 0.5425615787506104, + "memory(GiB)": 47.44, + "step": 20915, + "token_acc": 0.8592896174863388, + "train_speed(iter/s)": 0.095709 + }, + { + "epoch": 0.9728563996291334, + "grad_norm": 12.81851863861084, + "learning_rate": 2.008522531522006e-08, + "loss": 0.6862669467926026, + "memory(GiB)": 47.44, + "step": 20920, + "token_acc": 0.832632464255677, + "train_speed(iter/s)": 0.095722 + }, + { + "epoch": 0.973088917889083, + "grad_norm": 10.916457176208496, + "learning_rate": 1.9742413564760033e-08, + "loss": 0.6592337608337402, + "memory(GiB)": 47.44, + "step": 20925, + "token_acc": 0.8250401284109149, + "train_speed(iter/s)": 0.095734 + }, + { + "epoch": 0.9733214361490325, + "grad_norm": 8.57726764678955, + "learning_rate": 1.9402546794373146e-08, + "loss": 0.7035943508148194, + "memory(GiB)": 47.44, + "step": 20930, + "token_acc": 0.825030012004802, + "train_speed(iter/s)": 0.095747 + }, + { + "epoch": 0.9735539544089822, + "grad_norm": 7.987157344818115, + "learning_rate": 1.9065625205033632e-08, + "loss": 0.6475473403930664, + "memory(GiB)": 47.44, + "step": 20935, + "token_acc": 0.828650711064863, + "train_speed(iter/s)": 0.095759 + }, + { + "epoch": 0.9737864726689318, + "grad_norm": 8.405952453613281, + "learning_rate": 1.8731648995972685e-08, + "loss": 0.5332788467407227, + "memory(GiB)": 47.44, + "step": 20940, + "token_acc": 0.8770635756937126, + "train_speed(iter/s)": 0.095772 + }, + { + "epoch": 0.9740189909288813, + "grad_norm": 8.0729398727417, + "learning_rate": 1.840061836468232e-08, + "loss": 0.6800761699676514, + "memory(GiB)": 47.44, + "step": 20945, + "token_acc": 0.826472675656494, + "train_speed(iter/s)": 0.095785 + }, + { + "epoch": 0.974251509188831, + "grad_norm": 9.692399978637695, + "learning_rate": 1.807253350690985e-08, + "loss": 0.6131781101226806, + "memory(GiB)": 47.44, + "step": 20950, + "token_acc": 0.8350717079530638, + "train_speed(iter/s)": 0.095798 + }, + { + "epoch": 0.974251509188831, + "eval_loss": 0.555717408657074, + "eval_runtime": 294.7045, + "eval_samples_per_second": 11.791, + "eval_steps_per_second": 11.791, + "step": 20950 + }, + { + "epoch": 0.9744840274487806, + "grad_norm": 8.899210929870605, + "learning_rate": 1.7747394616662862e-08, + "loss": 0.7808675765991211, + "memory(GiB)": 47.44, + "step": 20955, + "token_acc": 0.834443311452425, + "train_speed(iter/s)": 0.095681 + }, + { + "epoch": 0.9747165457087302, + "grad_norm": 6.883942604064941, + "learning_rate": 1.7425201886205333e-08, + "loss": 0.6193971633911133, + "memory(GiB)": 47.44, + "step": 20960, + "token_acc": 0.8425504229017566, + "train_speed(iter/s)": 0.095693 + }, + { + "epoch": 0.9749490639686798, + "grad_norm": 7.886797904968262, + "learning_rate": 1.7105955506059867e-08, + "loss": 0.6988693714141846, + "memory(GiB)": 47.44, + "step": 20965, + "token_acc": 0.8223776223776224, + "train_speed(iter/s)": 0.095706 + }, + { + "epoch": 0.9751815822286294, + "grad_norm": 11.55400562286377, + "learning_rate": 1.6789655665006565e-08, + "loss": 0.6574934005737305, + "memory(GiB)": 47.44, + "step": 20970, + "token_acc": 0.8405327573794097, + "train_speed(iter/s)": 0.095719 + }, + { + "epoch": 0.975414100488579, + "grad_norm": 6.187772750854492, + "learning_rate": 1.6476302550084145e-08, + "loss": 0.7456907272338867, + "memory(GiB)": 47.44, + "step": 20975, + "token_acc": 0.8103341584158416, + "train_speed(iter/s)": 0.095732 + }, + { + "epoch": 0.9756466187485286, + "grad_norm": 8.612384796142578, + "learning_rate": 1.6165896346587162e-08, + "loss": 0.6755566596984863, + "memory(GiB)": 47.44, + "step": 20980, + "token_acc": 0.8424725822532403, + "train_speed(iter/s)": 0.095744 + }, + { + "epoch": 0.9758791370084782, + "grad_norm": 6.694667339324951, + "learning_rate": 1.585843723806879e-08, + "loss": 0.7141738414764405, + "memory(GiB)": 47.44, + "step": 20985, + "token_acc": 0.8280895231916964, + "train_speed(iter/s)": 0.095757 + }, + { + "epoch": 0.9761116552684278, + "grad_norm": 8.811197280883789, + "learning_rate": 1.55539254063386e-08, + "loss": 0.6681375503540039, + "memory(GiB)": 47.44, + "step": 20990, + "token_acc": 0.8291426840351409, + "train_speed(iter/s)": 0.09577 + }, + { + "epoch": 0.9763441735283774, + "grad_norm": 8.207998275756836, + "learning_rate": 1.525236103146477e-08, + "loss": 0.661448621749878, + "memory(GiB)": 47.44, + "step": 20995, + "token_acc": 0.8250607427976397, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.976576691788327, + "grad_norm": 7.307313919067383, + "learning_rate": 1.4953744291770766e-08, + "loss": 0.663054084777832, + "memory(GiB)": 47.44, + "step": 21000, + "token_acc": 0.839825263924281, + "train_speed(iter/s)": 0.095796 + }, + { + "epoch": 0.976576691788327, + "eval_loss": 0.5556771159172058, + "eval_runtime": 295.8602, + "eval_samples_per_second": 11.745, + "eval_steps_per_second": 11.745, + "step": 21000 + }, + { + "epoch": 0.9768092100482766, + "grad_norm": 8.349247932434082, + "learning_rate": 1.4658075363838121e-08, + "loss": 0.5215034484863281, + "memory(GiB)": 47.44, + "step": 21005, + "token_acc": 0.8354958257626087, + "train_speed(iter/s)": 0.095679 + }, + { + "epoch": 0.9770417283082262, + "grad_norm": 8.144487380981445, + "learning_rate": 1.4365354422504751e-08, + "loss": 0.707400131225586, + "memory(GiB)": 47.44, + "step": 21010, + "token_acc": 0.8157653528872594, + "train_speed(iter/s)": 0.095691 + }, + { + "epoch": 0.9772742465681759, + "grad_norm": 10.015386581420898, + "learning_rate": 1.4075581640866088e-08, + "loss": 0.6027235507965087, + "memory(GiB)": 47.44, + "step": 21015, + "token_acc": 0.8569099062372605, + "train_speed(iter/s)": 0.095704 + }, + { + "epoch": 0.9775067648281254, + "grad_norm": 6.5005412101745605, + "learning_rate": 1.3788757190273394e-08, + "loss": 0.7321415901184082, + "memory(GiB)": 47.44, + "step": 21020, + "token_acc": 0.808440366972477, + "train_speed(iter/s)": 0.095717 + }, + { + "epoch": 0.977739283088075, + "grad_norm": 6.8954548835754395, + "learning_rate": 1.3504881240334888e-08, + "loss": 0.5201037406921387, + "memory(GiB)": 47.44, + "step": 21025, + "token_acc": 0.858516909711449, + "train_speed(iter/s)": 0.095729 + }, + { + "epoch": 0.9779718013480246, + "grad_norm": 7.919386863708496, + "learning_rate": 1.3223953958915736e-08, + "loss": 0.7142354011535644, + "memory(GiB)": 47.44, + "step": 21030, + "token_acc": 0.8170170827858082, + "train_speed(iter/s)": 0.095742 + }, + { + "epoch": 0.9782043196079742, + "grad_norm": 8.009725570678711, + "learning_rate": 1.2945975512135833e-08, + "loss": 0.665521764755249, + "memory(GiB)": 47.44, + "step": 21035, + "token_acc": 0.8361138370951914, + "train_speed(iter/s)": 0.095755 + }, + { + "epoch": 0.9784368378679238, + "grad_norm": 10.154017448425293, + "learning_rate": 1.2670946064373135e-08, + "loss": 0.5645842552185059, + "memory(GiB)": 47.44, + "step": 21040, + "token_acc": 0.8581791802684077, + "train_speed(iter/s)": 0.095767 + }, + { + "epoch": 0.9786693561278734, + "grad_norm": 8.321249961853027, + "learning_rate": 1.2398865778261438e-08, + "loss": 0.6524802207946777, + "memory(GiB)": 47.44, + "step": 21045, + "token_acc": 0.843369300382875, + "train_speed(iter/s)": 0.09578 + }, + { + "epoch": 0.9789018743878231, + "grad_norm": 8.507335662841797, + "learning_rate": 1.2129734814689265e-08, + "loss": 0.5944690227508544, + "memory(GiB)": 47.44, + "step": 21050, + "token_acc": 0.8567393058918482, + "train_speed(iter/s)": 0.095793 + }, + { + "epoch": 0.9789018743878231, + "eval_loss": 0.5553185343742371, + "eval_runtime": 296.9087, + "eval_samples_per_second": 11.704, + "eval_steps_per_second": 11.704, + "step": 21050 + }, + { + "epoch": 0.9791343926477726, + "grad_norm": 11.044922828674316, + "learning_rate": 1.1863553332802091e-08, + "loss": 0.6195722579956054, + "memory(GiB)": 47.44, + "step": 21055, + "token_acc": 0.8350491511708955, + "train_speed(iter/s)": 0.095676 + }, + { + "epoch": 0.9793669109077222, + "grad_norm": 8.964325904846191, + "learning_rate": 1.1600321490001786e-08, + "loss": 0.6727898120880127, + "memory(GiB)": 47.44, + "step": 21060, + "token_acc": 0.8264599142197294, + "train_speed(iter/s)": 0.095689 + }, + { + "epoch": 0.9795994291676718, + "grad_norm": 8.259568214416504, + "learning_rate": 1.1340039441945505e-08, + "loss": 0.56252121925354, + "memory(GiB)": 47.44, + "step": 21065, + "token_acc": 0.8704742478327384, + "train_speed(iter/s)": 0.095702 + }, + { + "epoch": 0.9798319474276215, + "grad_norm": 6.8413615226745605, + "learning_rate": 1.108270734254624e-08, + "loss": 0.5423484802246094, + "memory(GiB)": 47.44, + "step": 21070, + "token_acc": 0.8570935450466, + "train_speed(iter/s)": 0.095714 + }, + { + "epoch": 0.980064465687571, + "grad_norm": 8.64388656616211, + "learning_rate": 1.0828325343971713e-08, + "loss": 0.5598763465881348, + "memory(GiB)": 47.44, + "step": 21075, + "token_acc": 0.8564008554842653, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.9802969839475206, + "grad_norm": 8.684253692626953, + "learning_rate": 1.0576893596646043e-08, + "loss": 0.5953333377838135, + "memory(GiB)": 47.44, + "step": 21080, + "token_acc": 0.8587755102040816, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.9805295022074703, + "grad_norm": 8.789764404296875, + "learning_rate": 1.0328412249248632e-08, + "loss": 0.577445650100708, + "memory(GiB)": 47.44, + "step": 21085, + "token_acc": 0.8483353884093712, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9807620204674198, + "grad_norm": 7.451662540435791, + "learning_rate": 1.0082881448714721e-08, + "loss": 0.6344650268554688, + "memory(GiB)": 47.44, + "step": 21090, + "token_acc": 0.8471953578336557, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.9809945387273694, + "grad_norm": 9.513747215270996, + "learning_rate": 9.840301340233171e-09, + "loss": 0.6367983818054199, + "memory(GiB)": 47.44, + "step": 21095, + "token_acc": 0.8491570541259982, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.981227056987319, + "grad_norm": 10.594439506530762, + "learning_rate": 9.60067206725035e-09, + "loss": 0.7203311443328857, + "memory(GiB)": 47.44, + "step": 21100, + "token_acc": 0.8204283360790774, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.981227056987319, + "eval_loss": 0.5554095506668091, + "eval_runtime": 293.9435, + "eval_samples_per_second": 11.822, + "eval_steps_per_second": 11.822, + "step": 21100 + }, + { + "epoch": 0.9814595752472687, + "grad_norm": 7.318916320800781, + "learning_rate": 9.36399377146513e-09, + "loss": 0.5712925434112549, + "memory(GiB)": 47.44, + "step": 21105, + "token_acc": 0.8353805212137679, + "train_speed(iter/s)": 0.095676 + }, + { + "epoch": 0.9816920935072182, + "grad_norm": 9.158342361450195, + "learning_rate": 9.130266592833891e-09, + "loss": 0.7172455310821533, + "memory(GiB)": 47.44, + "step": 21110, + "token_acc": 0.8085735402808574, + "train_speed(iter/s)": 0.095689 + }, + { + "epoch": 0.9819246117671678, + "grad_norm": 8.45883846282959, + "learning_rate": 8.89949066956608e-09, + "loss": 0.5606085777282714, + "memory(GiB)": 47.44, + "step": 21115, + "token_acc": 0.8552323637579405, + "train_speed(iter/s)": 0.095702 + }, + { + "epoch": 0.9821571300271175, + "grad_norm": 8.795999526977539, + "learning_rate": 8.671666138126423e-09, + "loss": 0.5041959762573243, + "memory(GiB)": 47.44, + "step": 21120, + "token_acc": 0.8740388136213841, + "train_speed(iter/s)": 0.095715 + }, + { + "epoch": 0.9823896482870671, + "grad_norm": 8.338200569152832, + "learning_rate": 8.446793133235486e-09, + "loss": 0.5370265483856201, + "memory(GiB)": 47.44, + "step": 21125, + "token_acc": 0.8650176678445229, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.9826221665470166, + "grad_norm": 8.376656532287598, + "learning_rate": 8.224871787866906e-09, + "loss": 0.6197101593017578, + "memory(GiB)": 47.44, + "step": 21130, + "token_acc": 0.8482264665757162, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.9828546848069663, + "grad_norm": 11.890416145324707, + "learning_rate": 8.005902233249596e-09, + "loss": 0.6680188179016113, + "memory(GiB)": 47.44, + "step": 21135, + "token_acc": 0.8227696004806249, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9830872030669159, + "grad_norm": 7.547930717468262, + "learning_rate": 7.789884598867203e-09, + "loss": 0.6532802581787109, + "memory(GiB)": 47.44, + "step": 21140, + "token_acc": 0.8363082281236395, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.9833197213268654, + "grad_norm": 9.293731689453125, + "learning_rate": 7.576819012457548e-09, + "loss": 0.5700969219207763, + "memory(GiB)": 47.44, + "step": 21145, + "token_acc": 0.8506117192530586, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.983552239586815, + "grad_norm": 11.201024055480957, + "learning_rate": 7.36670560001318e-09, + "loss": 0.71249418258667, + "memory(GiB)": 47.44, + "step": 21150, + "token_acc": 0.8229683326974437, + "train_speed(iter/s)": 0.095791 + }, + { + "epoch": 0.983552239586815, + "eval_loss": 0.5556346774101257, + "eval_runtime": 293.5457, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 11.838, + "step": 21150 + }, + { + "epoch": 0.9837847578467647, + "grad_norm": 10.607219696044922, + "learning_rate": 7.159544485780268e-09, + "loss": 0.5556900978088379, + "memory(GiB)": 47.44, + "step": 21155, + "token_acc": 0.8356220487616844, + "train_speed(iter/s)": 0.095676 + }, + { + "epoch": 0.9840172761067143, + "grad_norm": 7.971781253814697, + "learning_rate": 6.95533579225971e-09, + "loss": 0.6896307945251465, + "memory(GiB)": 47.44, + "step": 21160, + "token_acc": 0.8268027658873889, + "train_speed(iter/s)": 0.095689 + }, + { + "epoch": 0.9842497943666638, + "grad_norm": 8.960981369018555, + "learning_rate": 6.754079640206579e-09, + "loss": 0.614405345916748, + "memory(GiB)": 47.44, + "step": 21165, + "token_acc": 0.8401153476449856, + "train_speed(iter/s)": 0.095701 + }, + { + "epoch": 0.9844823126266135, + "grad_norm": 9.977701187133789, + "learning_rate": 6.555776148629567e-09, + "loss": 0.606471061706543, + "memory(GiB)": 47.44, + "step": 21170, + "token_acc": 0.844391244870041, + "train_speed(iter/s)": 0.095714 + }, + { + "epoch": 0.9847148308865631, + "grad_norm": 10.329645156860352, + "learning_rate": 6.3604254347920945e-09, + "loss": 0.6224015235900879, + "memory(GiB)": 47.44, + "step": 21175, + "token_acc": 0.839622641509434, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.9849473491465126, + "grad_norm": 10.578011512756348, + "learning_rate": 6.168027614210093e-09, + "loss": 0.5204459667205811, + "memory(GiB)": 47.44, + "step": 21180, + "token_acc": 0.8709796672828096, + "train_speed(iter/s)": 0.095739 + }, + { + "epoch": 0.9851798674064622, + "grad_norm": 7.306653022766113, + "learning_rate": 5.978582800655886e-09, + "loss": 0.6697455883026123, + "memory(GiB)": 47.44, + "step": 21185, + "token_acc": 0.8152958152958153, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9854123856664119, + "grad_norm": 7.48827600479126, + "learning_rate": 5.792091106152642e-09, + "loss": 0.6770011425018311, + "memory(GiB)": 47.44, + "step": 21190, + "token_acc": 0.8324396782841823, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.9856449039263615, + "grad_norm": 7.64422607421875, + "learning_rate": 5.60855264097937e-09, + "loss": 0.5752118587493896, + "memory(GiB)": 47.44, + "step": 21195, + "token_acc": 0.8565545641729582, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.985877422186311, + "grad_norm": 7.497743606567383, + "learning_rate": 5.4279675136675866e-09, + "loss": 0.5968385696411133, + "memory(GiB)": 47.44, + "step": 21200, + "token_acc": 0.8383076418497868, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.985877422186311, + "eval_loss": 0.5555036664009094, + "eval_runtime": 292.1935, + "eval_samples_per_second": 11.893, + "eval_steps_per_second": 11.893, + "step": 21200 + }, + { + "epoch": 0.9861099404462607, + "grad_norm": 9.114083290100098, + "learning_rate": 5.250335831003539e-09, + "loss": 0.7167449474334717, + "memory(GiB)": 47.44, + "step": 21205, + "token_acc": 0.8349151290327761, + "train_speed(iter/s)": 0.095676 + }, + { + "epoch": 0.9863424587062103, + "grad_norm": 8.94640064239502, + "learning_rate": 5.075657698025427e-09, + "loss": 0.6159820556640625, + "memory(GiB)": 47.44, + "step": 21210, + "token_acc": 0.839986235375086, + "train_speed(iter/s)": 0.095688 + }, + { + "epoch": 0.9865749769661599, + "grad_norm": 12.260013580322266, + "learning_rate": 4.903933218026735e-09, + "loss": 0.6116408824920654, + "memory(GiB)": 47.44, + "step": 21215, + "token_acc": 0.8501515807708965, + "train_speed(iter/s)": 0.095701 + }, + { + "epoch": 0.9868074952261094, + "grad_norm": 8.043527603149414, + "learning_rate": 4.735162492552347e-09, + "loss": 0.6420509815216064, + "memory(GiB)": 47.44, + "step": 21220, + "token_acc": 0.8346534653465346, + "train_speed(iter/s)": 0.095714 + }, + { + "epoch": 0.9870400134860591, + "grad_norm": 8.910659790039062, + "learning_rate": 4.569345621402988e-09, + "loss": 0.5657378196716308, + "memory(GiB)": 47.44, + "step": 21225, + "token_acc": 0.8534538411878632, + "train_speed(iter/s)": 0.095726 + }, + { + "epoch": 0.9872725317460087, + "grad_norm": 8.287259101867676, + "learning_rate": 4.406482702630222e-09, + "loss": 0.5949373245239258, + "memory(GiB)": 47.44, + "step": 21230, + "token_acc": 0.8417182662538699, + "train_speed(iter/s)": 0.095739 + }, + { + "epoch": 0.9875050500059582, + "grad_norm": 9.379225730895996, + "learning_rate": 4.246573832540346e-09, + "loss": 0.5357473850250244, + "memory(GiB)": 47.44, + "step": 21235, + "token_acc": 0.8592964824120602, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9877375682659079, + "grad_norm": 7.861101150512695, + "learning_rate": 4.089619105692166e-09, + "loss": 0.5065900802612304, + "memory(GiB)": 47.44, + "step": 21240, + "token_acc": 0.8750491159135559, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.9879700865258575, + "grad_norm": 8.914447784423828, + "learning_rate": 3.9356186148981065e-09, + "loss": 0.6780075073242188, + "memory(GiB)": 47.44, + "step": 21245, + "token_acc": 0.8344851416724257, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.9882026047858071, + "grad_norm": 9.360499382019043, + "learning_rate": 3.7845724512231005e-09, + "loss": 0.6165886402130127, + "memory(GiB)": 47.44, + "step": 21250, + "token_acc": 0.8479569211276529, + "train_speed(iter/s)": 0.09579 + }, + { + "epoch": 0.9882026047858071, + "eval_loss": 0.5553276538848877, + "eval_runtime": 292.8059, + "eval_samples_per_second": 11.868, + "eval_steps_per_second": 11.868, + "step": 21250 + }, + { + "epoch": 0.9884351230457566, + "grad_norm": 10.547829627990723, + "learning_rate": 3.636480703986256e-09, + "loss": 0.6729447841644287, + "memory(GiB)": 47.44, + "step": 21255, + "token_acc": 0.8349352710797725, + "train_speed(iter/s)": 0.095676 + }, + { + "epoch": 0.9886676413057063, + "grad_norm": 11.017034530639648, + "learning_rate": 3.4913434607575235e-09, + "loss": 0.6142584323883057, + "memory(GiB)": 47.44, + "step": 21260, + "token_acc": 0.8463054187192118, + "train_speed(iter/s)": 0.095689 + }, + { + "epoch": 0.9889001595656559, + "grad_norm": 6.795493125915527, + "learning_rate": 3.3491608073621394e-09, + "loss": 0.6216294765472412, + "memory(GiB)": 47.44, + "step": 21265, + "token_acc": 0.8422927461139896, + "train_speed(iter/s)": 0.095701 + }, + { + "epoch": 0.9891326778256055, + "grad_norm": 8.335597038269043, + "learning_rate": 3.2099328278767385e-09, + "loss": 0.6300849914550781, + "memory(GiB)": 47.44, + "step": 21270, + "token_acc": 0.8495707353490108, + "train_speed(iter/s)": 0.095713 + }, + { + "epoch": 0.9893651960855551, + "grad_norm": 9.009703636169434, + "learning_rate": 3.0736596046304634e-09, + "loss": 0.6418187141418457, + "memory(GiB)": 47.44, + "step": 21275, + "token_acc": 0.8273195876288659, + "train_speed(iter/s)": 0.095725 + }, + { + "epoch": 0.9895977143455047, + "grad_norm": 7.719686031341553, + "learning_rate": 2.9403412182071876e-09, + "loss": 0.605592679977417, + "memory(GiB)": 47.44, + "step": 21280, + "token_acc": 0.8451736221726664, + "train_speed(iter/s)": 0.095738 + }, + { + "epoch": 0.9898302326054543, + "grad_norm": 8.592754364013672, + "learning_rate": 2.809977747441073e-09, + "loss": 0.5792980670928956, + "memory(GiB)": 47.44, + "step": 21285, + "token_acc": 0.8473509933774834, + "train_speed(iter/s)": 0.09575 + }, + { + "epoch": 0.9900627508654039, + "grad_norm": 7.238741397857666, + "learning_rate": 2.682569269419899e-09, + "loss": 0.7391181945800781, + "memory(GiB)": 47.44, + "step": 21290, + "token_acc": 0.8089855891494773, + "train_speed(iter/s)": 0.095763 + }, + { + "epoch": 0.9902952691253535, + "grad_norm": 11.120532989501953, + "learning_rate": 2.5581158594856215e-09, + "loss": 0.6632680892944336, + "memory(GiB)": 47.44, + "step": 21295, + "token_acc": 0.8408516780945507, + "train_speed(iter/s)": 0.095776 + }, + { + "epoch": 0.9905277873853031, + "grad_norm": 9.001179695129395, + "learning_rate": 2.4366175912299283e-09, + "loss": 0.6990299701690674, + "memory(GiB)": 47.44, + "step": 21300, + "token_acc": 0.8209459459459459, + "train_speed(iter/s)": 0.095788 + }, + { + "epoch": 0.9905277873853031, + "eval_loss": 0.5553578734397888, + "eval_runtime": 297.4219, + "eval_samples_per_second": 11.684, + "eval_steps_per_second": 11.684, + "step": 21300 + }, + { + "epoch": 0.9907603056452527, + "grad_norm": 8.921056747436523, + "learning_rate": 2.3180745364997927e-09, + "loss": 0.6302893161773682, + "memory(GiB)": 47.44, + "step": 21305, + "token_acc": 0.8346371664426954, + "train_speed(iter/s)": 0.095672 + }, + { + "epoch": 0.9909928239052023, + "grad_norm": 8.292360305786133, + "learning_rate": 2.20248676539192e-09, + "loss": 0.6054904937744141, + "memory(GiB)": 47.44, + "step": 21310, + "token_acc": 0.8438385269121813, + "train_speed(iter/s)": 0.095685 + }, + { + "epoch": 0.9912253421651519, + "grad_norm": 9.087489128112793, + "learning_rate": 2.089854346258302e-09, + "loss": 0.538211727142334, + "memory(GiB)": 47.44, + "step": 21315, + "token_acc": 0.8768577494692145, + "train_speed(iter/s)": 0.095697 + }, + { + "epoch": 0.9914578604251015, + "grad_norm": 9.042856216430664, + "learning_rate": 1.9801773457012173e-09, + "loss": 0.5818132877349853, + "memory(GiB)": 47.44, + "step": 21320, + "token_acc": 0.8537444933920705, + "train_speed(iter/s)": 0.09571 + }, + { + "epoch": 0.991690378685051, + "grad_norm": 11.697620391845703, + "learning_rate": 1.8734558285760095e-09, + "loss": 0.7718028068542481, + "memory(GiB)": 47.44, + "step": 21325, + "token_acc": 0.8196051735874744, + "train_speed(iter/s)": 0.095722 + }, + { + "epoch": 0.9919228969450007, + "grad_norm": 8.305510520935059, + "learning_rate": 1.7696898579905309e-09, + "loss": 0.6060296058654785, + "memory(GiB)": 47.44, + "step": 21330, + "token_acc": 0.8515681983953318, + "train_speed(iter/s)": 0.095735 + }, + { + "epoch": 0.9921554152049503, + "grad_norm": 10.319281578063965, + "learning_rate": 1.6688794953051424e-09, + "loss": 0.6432456016540528, + "memory(GiB)": 47.44, + "step": 21335, + "token_acc": 0.8345031400073882, + "train_speed(iter/s)": 0.095747 + }, + { + "epoch": 0.9923879334649, + "grad_norm": 7.902041912078857, + "learning_rate": 1.5710248001316041e-09, + "loss": 0.6019167423248291, + "memory(GiB)": 47.44, + "step": 21340, + "token_acc": 0.8467714954834393, + "train_speed(iter/s)": 0.09576 + }, + { + "epoch": 0.9926204517248495, + "grad_norm": 9.371742248535156, + "learning_rate": 1.4761258303352954e-09, + "loss": 0.6643675327301025, + "memory(GiB)": 47.44, + "step": 21345, + "token_acc": 0.8371451104100947, + "train_speed(iter/s)": 0.095773 + }, + { + "epoch": 0.9928529699847991, + "grad_norm": 6.8133544921875, + "learning_rate": 1.3841826420318838e-09, + "loss": 0.6819211483001709, + "memory(GiB)": 47.44, + "step": 21350, + "token_acc": 0.8228238519533927, + "train_speed(iter/s)": 0.095785 + }, + { + "epoch": 0.9928529699847991, + "eval_loss": 0.5555744767189026, + "eval_runtime": 295.7567, + "eval_samples_per_second": 11.75, + "eval_steps_per_second": 11.75, + "step": 21350 + }, + { + "epoch": 0.9930854882447487, + "grad_norm": 8.499656677246094, + "learning_rate": 1.295195289590101e-09, + "loss": 0.5959889888763428, + "memory(GiB)": 47.44, + "step": 21355, + "token_acc": 0.83490411440722, + "train_speed(iter/s)": 0.09567 + }, + { + "epoch": 0.9933180065046984, + "grad_norm": 6.653784275054932, + "learning_rate": 1.2091638256322979e-09, + "loss": 0.6795511722564698, + "memory(GiB)": 47.44, + "step": 21360, + "token_acc": 0.8144393241167435, + "train_speed(iter/s)": 0.095683 + }, + { + "epoch": 0.9935505247646479, + "grad_norm": 10.100828170776367, + "learning_rate": 1.126088301030004e-09, + "loss": 0.5773494243621826, + "memory(GiB)": 47.44, + "step": 21365, + "token_acc": 0.8461000349772648, + "train_speed(iter/s)": 0.095695 + }, + { + "epoch": 0.9937830430245975, + "grad_norm": 9.322258949279785, + "learning_rate": 1.0459687649094773e-09, + "loss": 0.6679564952850342, + "memory(GiB)": 47.44, + "step": 21370, + "token_acc": 0.8149339049660593, + "train_speed(iter/s)": 0.095708 + }, + { + "epoch": 0.9940155612845472, + "grad_norm": 7.6209516525268555, + "learning_rate": 9.688052646472656e-10, + "loss": 0.6366386890411377, + "memory(GiB)": 47.44, + "step": 21375, + "token_acc": 0.8308026030368764, + "train_speed(iter/s)": 0.09572 + }, + { + "epoch": 0.9942480795444967, + "grad_norm": 8.107213020324707, + "learning_rate": 8.945978458724247e-10, + "loss": 0.6723571300506592, + "memory(GiB)": 47.44, + "step": 21380, + "token_acc": 0.8251320544898526, + "train_speed(iter/s)": 0.095733 + }, + { + "epoch": 0.9944805978044463, + "grad_norm": 9.466985702514648, + "learning_rate": 8.233465524670748e-10, + "loss": 0.6097723960876464, + "memory(GiB)": 47.44, + "step": 21385, + "token_acc": 0.8440553745928339, + "train_speed(iter/s)": 0.095745 + }, + { + "epoch": 0.9947131160643959, + "grad_norm": 15.100635528564453, + "learning_rate": 7.550514265630693e-10, + "loss": 0.6184077262878418, + "memory(GiB)": 47.44, + "step": 21390, + "token_acc": 0.8390658174097665, + "train_speed(iter/s)": 0.095758 + }, + { + "epoch": 0.9949456343243456, + "grad_norm": 9.579854965209961, + "learning_rate": 6.897125085458811e-10, + "loss": 0.5857309818267822, + "memory(GiB)": 47.44, + "step": 21395, + "token_acc": 0.8594069529652352, + "train_speed(iter/s)": 0.09577 + }, + { + "epoch": 0.9951781525842951, + "grad_norm": 8.905710220336914, + "learning_rate": 6.273298370523818e-10, + "loss": 0.6910581111907959, + "memory(GiB)": 47.44, + "step": 21400, + "token_acc": 0.8338809784592918, + "train_speed(iter/s)": 0.095783 + }, + { + "epoch": 0.9951781525842951, + "eval_loss": 0.5554383993148804, + "eval_runtime": 298.0918, + "eval_samples_per_second": 11.657, + "eval_steps_per_second": 11.657, + "step": 21400 + }, + { + "epoch": 0.9954106708442447, + "grad_norm": 8.429669380187988, + "learning_rate": 5.679034489713964e-10, + "loss": 0.8259492874145508, + "memory(GiB)": 47.44, + "step": 21405, + "token_acc": 0.8338284840423413, + "train_speed(iter/s)": 0.095667 + }, + { + "epoch": 0.9956431891041944, + "grad_norm": 7.528830528259277, + "learning_rate": 5.114333794437043e-10, + "loss": 0.6541305541992187, + "memory(GiB)": 47.44, + "step": 21410, + "token_acc": 0.8467771639042357, + "train_speed(iter/s)": 0.09568 + }, + { + "epoch": 0.995875707364144, + "grad_norm": 7.162056922912598, + "learning_rate": 4.579196618620385e-10, + "loss": 0.6312575340270996, + "memory(GiB)": 47.44, + "step": 21415, + "token_acc": 0.8346333853354134, + "train_speed(iter/s)": 0.095692 + }, + { + "epoch": 0.9961082256240935, + "grad_norm": 10.457352638244629, + "learning_rate": 4.0736232787053075e-10, + "loss": 0.6251492023468017, + "memory(GiB)": 47.44, + "step": 21420, + "token_acc": 0.8450704225352113, + "train_speed(iter/s)": 0.095704 + }, + { + "epoch": 0.9963407438840431, + "grad_norm": 7.633652210235596, + "learning_rate": 3.5976140736471153e-10, + "loss": 0.6230262279510498, + "memory(GiB)": 47.44, + "step": 21425, + "token_acc": 0.8563084112149533, + "train_speed(iter/s)": 0.095716 + }, + { + "epoch": 0.9965732621439928, + "grad_norm": 8.276957511901855, + "learning_rate": 3.1511692849317545e-10, + "loss": 0.6544853687286377, + "memory(GiB)": 47.44, + "step": 21430, + "token_acc": 0.8370973713439467, + "train_speed(iter/s)": 0.095729 + }, + { + "epoch": 0.9968057804039423, + "grad_norm": 7.65335750579834, + "learning_rate": 2.7342891765536064e-10, + "loss": 0.5899259567260742, + "memory(GiB)": 47.44, + "step": 21435, + "token_acc": 0.84364492266308, + "train_speed(iter/s)": 0.095741 + }, + { + "epoch": 0.9970382986638919, + "grad_norm": 8.185805320739746, + "learning_rate": 2.3469739950265913e-10, + "loss": 0.6896131992340088, + "memory(GiB)": 47.44, + "step": 21440, + "token_acc": 0.8392932267564156, + "train_speed(iter/s)": 0.095754 + }, + { + "epoch": 0.9972708169238416, + "grad_norm": 10.828086853027344, + "learning_rate": 1.9892239693786176e-10, + "loss": 0.6359089851379395, + "memory(GiB)": 47.44, + "step": 21445, + "token_acc": 0.8455346876197777, + "train_speed(iter/s)": 0.095766 + }, + { + "epoch": 0.9975033351837912, + "grad_norm": 7.987010955810547, + "learning_rate": 1.6610393111682331e-10, + "loss": 0.6796002388000488, + "memory(GiB)": 47.44, + "step": 21450, + "token_acc": 0.8279637159837347, + "train_speed(iter/s)": 0.095778 + }, + { + "epoch": 0.9975033351837912, + "eval_loss": 0.5553656816482544, + "eval_runtime": 295.3935, + "eval_samples_per_second": 11.764, + "eval_steps_per_second": 11.764, + "step": 21450 + }, + { + "epoch": 0.9977358534437407, + "grad_norm": 8.468503952026367, + "learning_rate": 1.362420214451321e-10, + "loss": 0.5804356575012207, + "memory(GiB)": 47.44, + "step": 21455, + "token_acc": 0.8353789285028483, + "train_speed(iter/s)": 0.095664 + }, + { + "epoch": 0.9979683717036903, + "grad_norm": 7.758851528167725, + "learning_rate": 1.093366855814404e-10, + "loss": 0.7208607196807861, + "memory(GiB)": 47.44, + "step": 21460, + "token_acc": 0.8289473684210527, + "train_speed(iter/s)": 0.095677 + }, + { + "epoch": 0.99820088996364, + "grad_norm": 8.039312362670898, + "learning_rate": 8.538793943579926e-11, + "loss": 0.6910472393035889, + "memory(GiB)": 47.44, + "step": 21465, + "token_acc": 0.8290655138011307, + "train_speed(iter/s)": 0.09569 + }, + { + "epoch": 0.9984334082235896, + "grad_norm": 8.54366397857666, + "learning_rate": 6.43957971696585e-11, + "loss": 0.7338367938995362, + "memory(GiB)": 47.44, + "step": 21470, + "token_acc": 0.8101694915254237, + "train_speed(iter/s)": 0.095702 + }, + { + "epoch": 0.9986659264835391, + "grad_norm": 8.529617309570312, + "learning_rate": 4.6360271195866614e-11, + "loss": 0.5459727287292481, + "memory(GiB)": 47.44, + "step": 21475, + "token_acc": 0.8498957609451008, + "train_speed(iter/s)": 0.095715 + }, + { + "epoch": 0.9988984447434888, + "grad_norm": 8.786166191101074, + "learning_rate": 3.128137218033622e-11, + "loss": 0.6408785820007324, + "memory(GiB)": 47.44, + "step": 21480, + "token_acc": 0.8443396226415094, + "train_speed(iter/s)": 0.095727 + }, + { + "epoch": 0.9991309630034384, + "grad_norm": 7.954626560211182, + "learning_rate": 1.915910903926843e-11, + "loss": 0.6573843955993652, + "memory(GiB)": 47.44, + "step": 21485, + "token_acc": 0.8392993145468393, + "train_speed(iter/s)": 0.09574 + }, + { + "epoch": 0.9993634812633879, + "grad_norm": 10.057504653930664, + "learning_rate": 9.993488940818197e-12, + "loss": 0.7026410579681397, + "memory(GiB)": 47.44, + "step": 21490, + "token_acc": 0.8170683524298696, + "train_speed(iter/s)": 0.095752 + }, + { + "epoch": 0.9995959995233376, + "grad_norm": 7.514813423156738, + "learning_rate": 3.7845173050943265e-12, + "loss": 0.6564667224884033, + "memory(GiB)": 47.44, + "step": 21495, + "token_acc": 0.8414082687338501, + "train_speed(iter/s)": 0.095765 + }, + { + "epoch": 0.9998285177832872, + "grad_norm": 8.590625762939453, + "learning_rate": 5.321978036043618e-13, + "loss": 0.6297463417053223, + "memory(GiB)": 47.44, + "step": 21500, + "token_acc": 0.8341463414634146, + "train_speed(iter/s)": 0.095777 + }, + { + "epoch": 0.9998285177832872, + "eval_loss": 0.5553836226463318, + "eval_runtime": 292.3276, + "eval_samples_per_second": 11.887, + "eval_steps_per_second": 11.887, + "step": 21500 + }, + { + "epoch": 0.9999680287392569, + "eval_loss": 0.5553876757621765, + "eval_runtime": 296.6555, + "eval_samples_per_second": 11.714, + "eval_steps_per_second": 11.714, + "step": 21503 + } + ], + "logging_steps": 5, + "max_steps": 21503, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1957843844152115e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}